예제 #1
0
 def _files_to_subtype(self, subname_to_file: Dict[str, Path]) -> datapoints.BaseVocabDatapoint:
     """Build a ``BaseVocabDatapoint`` from the base and vocab files.

     Args:
         subname_to_file: mapping that is indexed with
             ``constants.X_BASE_BASE_NAME`` and ``constants.X_VOCAB_BASE_NAME``;
             each looked-up value is handed to ``utils.load_csv``.

     Returns:
         A ``datapoints.BaseVocabDatapoint`` whose ``base`` and ``vocab`` are
         the two loaded CSVs.
     """
     # Debug trace of the incoming mapping.
     print('BaseVocabXHandler._files_to_subtype.subname_to_file')
     utils.print_dict(subname_to_file)
     print(' Done with print dict')

     base_file = subname_to_file[constants.X_BASE_BASE_NAME]
     base_data = utils.load_csv(base_file)

     vocab_file = subname_to_file[constants.X_VOCAB_BASE_NAME]
     vocab_data = utils.load_csv(vocab_file)

     return datapoints.BaseVocabDatapoint(base=base_data, vocab=vocab_data)
예제 #2
0
    def run(self, fast_dev_run=False, use_gpus=False):
        """Launch the Ray Tune search for ``self.search_params`` and report the best trial.

        Calls ``utils.set_seeds``, runs ``tune.run``, saves the returned
        analysis with ``utils.save_cloudpickle`` inside the logs output
        directory, then prints the best trial's last result and config.

        Args:
            fast_dev_run: forwarded to ``self._get_train_fn``.
            use_gpus: forwarded to ``self._get_train_fn`` (as ``include_gpus``)
                and to ``self.get_resources_per_trial`` (as ``include_gpu``).
        """
        params = self.search_params
        opt = params.opt
        tune_hp = self.tune_hp

        utils.set_seeds(params.data.seed)

        search_dict = params.to_ray_tune_search_dict()
        # see tune.utils.UtilMonitor
        search_dict['log_sys_usage'] = True

        # A SyncConfig is only created when the output dir is a remote URI.
        output_dir = params.logs.output_dir
        remote_schemes = ('s3://', 'gs://', 'hdfs://')
        sync_config = None
        if str(output_dir).startswith(remote_schemes):
            sync_config = tune.SyncConfig(upload_dir=output_dir)

        analysis = tune.run(
            run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run,
                                                 include_gpus=use_gpus),
            name=params.exp.get_project_exp_name(),
            stop=self.get_tune_stopper(opt.num_epochs),
            config=search_dict,
            resources_per_trial=self.get_resources_per_trial(
                params, include_gpu=use_gpus),
            num_samples=tune_hp.num_hp_samples,
            sync_config=sync_config,
            loggers=self.get_tune_loggers(),
            log_to_file=tune_hp.log_to_file and not tune_hp.ray_local_mode,
            keep_checkpoints_num=2,
            checkpoint_score_attr=f'{opt.search_mode}-{opt.search_metric}',
            fail_fast=False,
            scheduler=self.get_tune_scheduler(params, tune_hp),
            verbose=2,
            progress_reporter=self.get_cli_reporter(),
            reuse_actors=False,
        )

        utils.hprint("done with tune.run")

        # The analysis file name carries a short hash of the search params.
        param_hash = params.get_short_hash(num_chars=8)
        analysis_name = f'tune_analysis_{param_hash}.cloudpickle'
        analysis_file = output_dir / analysis_name
        print(f"Saving {analysis_file}")
        utils.save_cloudpickle(analysis_file, analysis)

        best_trial = analysis.get_best_trial(opt.search_metric,
                                             opt.search_mode,
                                             "last-5-avg")

        utils.hprint('best_trial.last_result', do_include_pre_break_line=True)
        utils.print_dict(best_trial.last_result)

        utils.hprint('best_trial.config', do_include_pre_break_line=True)
        utils.print_dict(best_trial.config)
예제 #3
0
    def _train_fn(self,
                  config: Dict,
                  checkpoint_dir=None,
                  fast_dev_run=False,
                  include_gpus=False):
        """Run one trial: build the net described by ``config`` and fit it.

        Args:
            config: dict accepted by ``self._model_param_class.from_dict``.
            checkpoint_dir: resuming is not supported; any truthy value raises.
            fast_dev_run: forwarded to ``pl.Trainer``.
            include_gpus: when true, ``hp.data.num_gpus`` is passed as ``gpus``;
                otherwise ``gpus`` is ``None``.

        Returns:
            Whatever ``trainer.fit`` returns.

        Raises:
            NotImplementedError: if ``checkpoint_dir`` is truthy.
        """
        utils.hprint('Starting train function with config:')
        utils.print_dict(config)
        print()

        utils.set_pandas_disp(width=200)

        hp = self._model_param_class.from_dict(config)
        assert isinstance(hp, self._model_param_class)
        print('  hp:', hp)

        if checkpoint_dir:
            # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
            raise NotImplementedError(
                f"Got checkpoint_dir in train_fn: {checkpoint_dir}")

        utils.hprint("About to create net in TuneRunner")
        net = hp.build()
        # import torch.autograd.profiler as profiler
        # with profiler.profile(record_shapes=True, use_cuda=True, profile_memory=True) as prof:
        #     net = self._factored_lightning_module_class.from_hp(hp=hp)
        # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=1000))

        # NOTE(review): seeds are set only after the net has been built, so any
        # randomness inside hp.build() is not covered by this call -- confirm
        # that this ordering is intended.
        utils.set_seeds(hp.data.seed)

        # noinspection PyTypeChecker
        trainer = pl.Trainer(
            logger=logs_mod.get_pl_logger(hp=hp.exp, tune=tune),
            default_root_dir=tune.get_trial_dir(),
            callbacks=self.extra_pl_callbacks +
            self.get_pl_callbacks_for_tune(),
            max_epochs=hp.opt.num_epochs,
            gpus=hp.data.num_gpus if include_gpus else None,
            weights_summary='full',
            fast_dev_run=fast_dev_run,
            accumulate_grad_batches=1,
            profiler='simple',
            deterministic=True,
            log_every_n_steps=hp.logs.num_steps_per_metric_log,
            log_gpu_memory=hp.logs.log_gpu_memory,
        )
        utils.hprint('About to start tune_runner\'s trainer.fit...')
        # The datamodule is taken from the net itself.
        fit_out = trainer.fit(net, datamodule=net.dm)
        utils.hprint('Done with tune_runner._train_fn')

        return fit_out
예제 #4
0
    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        """Count classes in the training data and set per-head positive-class weights.

        The counts come from ``self.get_field_to_class_counts`` over
        ``pl_module.train_dataloader()`` and are always logged through
        ``pl_module.log_lossmetrics_dict``.  When ``self.hp`` is given, every
        head listed in ``self.field_name_to_head_name`` whose weights were not
        set already receives the ``self.INV_PORTIONS`` row of its counts table,
        capped at ``self.max_pos_class_weight``.

        Args:
            trainer: not used by this hook.
            pl_module: module whose train dataloader is counted and whose heads
                are updated.

        Raises:
            NotImplementedError: if ``self.hp`` is set and ``self.hp.type`` is
                not ``'weighted'``.
        """
        field_to_class_counts = self.get_field_to_class_counts(dataloader=pl_module.train_dataloader())
        if self.verbose:
            utils.hprint('ClassCounterCallback Class Counts:')
            utils.print_dict(field_to_class_counts)
            print()

        if self.hp is None:
            if self.verbose:
                print('  Not setting head_params.pos_class_weights because you did not pass hp to my init')
        else:
            if self.hp.type != 'weighted':
                raise NotImplementedError(
                    f'hp == {self.hp} but this is only implemented for WeightedHeadParams'
                )
            for field_name, class_counts_df in field_to_class_counts.items():
                if field_name not in self.field_name_to_head_name:
                    # we might not be using all fields in heads
                    continue
                head_name = self.field_name_to_head_name[field_name]
                head = pl_module.head.heads[head_name]

                if head.did_set_pos_class_weights:
                    pos_class_weights = head.pos_class_weights
                    if self.verbose:
                        weights_str = ', '.join([f'{e:.2f}' for e in pos_class_weights])
                        print(f'  head_params["{field_name}"].pos_class_weights was already set to [{weights_str}]')
                        print()

                else:
                    # Cap the weights with np.minimum, which allocates a new
                    # array.  Writing the cap into ``.values`` in place could
                    # alter the counts table that is logged below (``.values``
                    # may be a view of the frame) and fails outright when pandas
                    # hands back a read-only array (copy-on-write).
                    pos_class_weights = np.minimum(
                        class_counts_df.loc[self.INV_PORTIONS].values,
                        self.max_pos_class_weight,
                    )

                    head.set_pos_class_weights(
                        torch.tensor(pos_class_weights, dtype=torch.float, device=pl_module.device)
                    )
                    if self.verbose:
                        weights_str = ', '.join([f'{e:.2f}' for e in pos_class_weights])
                        print(f'  Setting head_params["{field_name}"].pos_class_weights = [{weights_str}]')
                        print()

        pl_module.log_lossmetrics_dict(
            phase=utils.Phase.train,
            d={self.CLASS_COUNTS: field_to_class_counts},
            do_log_to_progbar=False,
        )
예제 #5
0
    def _train_fn(self,
                  config: Dict,
                  checkpoint_dir=None,
                  fast_dev_run=False,
                  include_gpus=False):
        """Run one trial: build the net described by ``config`` and fit it.

        Args:
            config: trial config; must contain a ``'tune'`` key, which is left
                out before the rest is handed to ``self._param_class.from_dict``.
                The dict passed in is not modified.
            checkpoint_dir: resuming is not supported; any truthy value raises.
            fast_dev_run: forwarded to ``pl.Trainer`` and, as ``offline_mode``,
                to ``torch_mod.get_pl_logger``.
            include_gpus: when true, ``hp.data.num_gpus`` is passed as ``gpus``;
                otherwise ``gpus`` is ``None``.

        Returns:
            Whatever ``trainer.fit`` returns.

        Raises:
            KeyError: if ``config`` has no ``'tune'`` key.
            NotImplementedError: if ``checkpoint_dir`` is truthy.
        """
        utils.hprint('Starting train function with config:')
        utils.print_dict(config)

        # Drop 'tune' from a shallow copy rather than from the caller's dict:
        # deleting in place would make a second call with the same dict raise
        # KeyError and would alter whatever else holds a reference to it.
        hp_config = dict(config)
        del hp_config['tune']
        hp = self._param_class.from_dict(hp_config)
        assert isinstance(hp, self._param_class)

        if checkpoint_dir:
            # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
            raise NotImplementedError(
                f"Got checkpoint_dir in train_fn: {checkpoint_dir}")

        net = self._factored_lightning_module_class.from_hp(hp=hp)

        utils.set_seeds(hp.data.seed)

        # noinspection PyTypeChecker
        trainer = pl.Trainer(
            logger=torch_mod.get_pl_logger(hp=hp.exp,
                                           tune=tune,
                                           offline_mode=fast_dev_run),
            default_root_dir=tune.get_trial_dir(),
            callbacks=self.extra_pl_callbacks + self.get_tune_callbacks(),
            max_epochs=hp.opt.num_epochs,
            gpus=hp.data.num_gpus if include_gpus else None,
            weights_summary='full',
            fast_dev_run=fast_dev_run,
            accumulate_grad_batches=1,
            profiler='simple',
            deterministic=True,
            log_every_n_steps=hp.metrics.num_steps_per_metric_log,
        )
        # The datamodule is taken from the net itself.
        fit_out = trainer.fit(net, datamodule=net.dm)

        # Same output as before: the config without its 'tune' entry.
        utils.print_dict(hp_config)
        utils.hprint('Done with tune_runner._train_fn')

        return fit_out
예제 #6
0
    # PHASE 2
    # Fragment of a larger function (its header is not visible here): for each
    # OCR result, launch the remote join/split/save task, then build and save
    # the dataset.
    joined_dfs = []
    doc_dirs = []
    for ocr_output in ocr_outputs:
        # Fetch this document's OCR result before launching its follow-up task.
        # NOTE(review): doc_ind is unpacked but not used in the visible code.
        doc_ind, ocr_df, ocr_df_file, words_df, colored_page_image_files, this_doc_dir = ray.get(
            ocr_output)
        joined_df = join_and_split_and_save_dfs.remote(
            ocr_df,
            ocr_df_file,
            words_df,
            colored_page_image_files,
            this_doc_dir,
        )
        # Despite the name, this is what .remote() returned, not a DataFrame.
        joined_dfs.append(joined_df)
        # NOTE(review): doc_dirs is filled but never read in the visible code.
        doc_dirs.append(this_doc_dir)

    # Presumably waits for the remote tasks while showing progress -- confirm in utils.
    utils.ray_prog_bar(joined_dfs)

    # The dataset is built from doc_settings.docs_dir, not from joined_dfs.
    with utils.Timer('Reading csvs into Dataset'):
        dataset = data.TablestakesDataset(docs_dir=doc_settings.docs_dir)

    dataset_file = doc_settings.get_dataset_file()
    with utils.Timer(
            f'Saving Dataset of type {type((dataset))} to {dataset_file}'):
        dataset.save(dataset_file)

    # Final summary: where things were saved and the settings used.
    print()
    print(f'Saved to {str(doc_settings.docs_dir)} and {dataset_file}')
    print(f'Settings:')
    utils.print_dict((doc_settings.to_dict()))
예제 #7
0
        return DataLoader(self.train_dataset, batch_size=self.hp.batch_size, num_workers=self.hp.num_workers)

    def val_dataloader(self):
        """Return a ``DataLoader`` over ``self.valid_dataset``.

        Batch size and worker count are read from ``self.hp``.
        """
        loader_kwargs = dict(
            batch_size=self.hp.batch_size,
            num_workers=self.hp.num_workers,
        )
        return DataLoader(self.valid_dataset, **loader_kwargs)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.hp.batch_size, num_workers=self.hp.num_workers)


if __name__ == '__main__':
    # Script entry point: fit a TrapezoidConv1Module on one named dataset with
    # default LearningParams, logging to TensorBoard.
    dataset_name = 'num=1000_extra=0'

    hp = LearningParams()

    tensorboard_logger = pl_loggers.TensorBoardLogger(
        'tensorboard_logs/',
        name="conv1d_trial",
    )
    trainer = pl.Trainer(
        logger=tensorboard_logger,
        max_epochs=hp.num_epochs,
        weights_summary='full',
        fast_dev_run=False,
    )

    dataset_dir = constants.DOCS_DIR / dataset_name
    net = TrapezoidConv1Module(hp, dataset_dir)

    # Show the hyper-parameters before fitting.
    print("HP:")
    utils.print_dict(hp.to_dict())

    fit_out = trainer.fit(net)
    print('fit_out:', fit_out)
예제 #8
0
 def _default_convert_dict_of_dfs_to_tensors(d: dict):
     """Convert every value of ``d`` to a tensor via ``torch.tensor(v.values)``.

     Args:
         d: mapping whose values expose a ``.values`` attribute that
             ``torch.tensor`` accepts.

     Returns:
         A new dict with the same keys and one tensor per value.

     Raises:
         Exception: whatever the conversion raised; the input dict is printed
             with ``utils.print_dict`` before the error propagates.
     """
     try:
         return {k: torch.tensor(v.values) for k, v in d.items()}
     except Exception:
         # Only real errors are intercepted (not KeyboardInterrupt/SystemExit).
         # Dump the offending input, then re-raise with the original traceback.
         utils.print_dict(d)
         raise