def _files_to_subtype(self, subname_to_file: Dict[str, Path]) -> datapoints.BaseVocabDatapoint:
    print('BaseVocabXHandler._files_to_subtype.subname_to_file')
    utils.print_dict(subname_to_file)
    print(' Done with print dict')

    return datapoints.BaseVocabDatapoint(
        base=utils.load_csv(subname_to_file[constants.X_BASE_BASE_NAME]),
        vocab=utils.load_csv(subname_to_file[constants.X_VOCAB_BASE_NAME]),
    )
def run(self, fast_dev_run=False, use_gpus=False):
    """Run the ray tune hyperparameter search, save the analysis, and report the best trial."""
    utils.set_seeds(self.search_params.data.seed)

    search_dict = self.search_params.to_ray_tune_search_dict()
    # see tune.utils.UtilMonitor
    search_dict['log_sys_usage'] = True

    # Sync trial outputs to remote storage only when the output dir is a cloud path.
    output_str = str(self.search_params.logs.output_dir)
    if output_str.startswith('s3://') or output_str.startswith('gs://') or output_str.startswith('hdfs://'):
        sync_config = tune.SyncConfig(upload_dir=self.search_params.logs.output_dir)
    else:
        sync_config = None

    analysis = tune.run(
        run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run, include_gpus=use_gpus),
        name=self.search_params.exp.get_project_exp_name(),
        stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
        config=search_dict,
        resources_per_trial=self.get_resources_per_trial(self.search_params, include_gpu=use_gpus),
        num_samples=self.tune_hp.num_hp_samples,
        sync_config=sync_config,
        loggers=self.get_tune_loggers(),
        log_to_file=self.tune_hp.log_to_file and not self.tune_hp.ray_local_mode,
        keep_checkpoints_num=2,
        checkpoint_score_attr=f'{self.search_params.opt.search_mode}-{self.search_params.opt.search_metric}',
        fail_fast=False,
        scheduler=self.get_tune_scheduler(self.search_params, self.tune_hp),
        verbose=2,
        progress_reporter=self.get_cli_reporter(),
        reuse_actors=False,
    )

    utils.hprint("done with tune.run")

    param_hash = self.search_params.get_short_hash(num_chars=8)
    analysis_file = self.search_params.logs.output_dir / f'tune_analysis_{param_hash}.cloudpickle'
    print(f"Saving {analysis_file}")
    utils.save_cloudpickle(analysis_file, analysis)

    best_trial = analysis.get_best_trial(
        self.search_params.opt.search_metric,
        self.search_params.opt.search_mode,
        "last-5-avg",
    )

    utils.hprint('best_trial.last_result', do_include_pre_break_line=True)
    utils.print_dict(best_trial.last_result)

    utils.hprint('best_trial.config', do_include_pre_break_line=True)
    utils.print_dict(best_trial.config)
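# --- Hedged usage sketch (not part of the runner above) ---
# A minimal, self-contained illustration of the tune.run() pattern used in run():
# a trainable that reports a metric, a small search space, and selection of the
# best trial. The names `toy_trainable`, `_toy_tune_search_sketch`, and the 'loss'
# metric are hypothetical and exist only for this sketch; the real runner builds
# its config from search_params and trains a lightning module instead.
from ray import tune


def toy_trainable(config):
    # Report a synthetic "loss" so tune has something to optimize.
    loss = (config['lr'] - 0.01) ** 2
    tune.report(loss=loss)


def _toy_tune_search_sketch():
    analysis = tune.run(
        toy_trainable,
        config={'lr': tune.loguniform(1e-4, 1e-1)},
        num_samples=4,
        verbose=1,
    )
    best_trial = analysis.get_best_trial(metric='loss', mode='min', scope='last')
    print(best_trial.config)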
def _train_fn(self, config: Dict, checkpoint_dir=None, fast_dev_run=False, include_gpus=False):
    """Train a single tune trial: build the model from the sampled config and fit it with pytorch-lightning."""
    utils.hprint('Starting train function with config:')
    utils.print_dict(config)
    print()

    utils.set_pandas_disp(width=200)

    hp = self._model_param_class.from_dict(config)
    assert isinstance(hp, self._model_param_class)
    print(' hp:', hp)

    if checkpoint_dir:
        # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
        raise NotImplementedError(f"Got checkpoint_dir in _train_fn: {checkpoint_dir}")

    utils.hprint("About to create net in TuneRunner")
    net = hp.build()

    # import torch.autograd.profiler as profiler
    # with profiler.profile(record_shapes=True, use_cuda=True, profile_memory=True) as prof:
    #     net = self._factored_lightning_module_class.from_hp(hp=hp)
    # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=1000))

    utils.set_seeds(hp.data.seed)

    # noinspection PyTypeChecker
    trainer = pl.Trainer(
        logger=logs_mod.get_pl_logger(hp=hp.exp, tune=tune),
        default_root_dir=tune.get_trial_dir(),
        callbacks=self.extra_pl_callbacks + self.get_pl_callbacks_for_tune(),
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus if include_gpus else None,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        log_every_n_steps=hp.logs.num_steps_per_metric_log,
        log_gpu_memory=hp.logs.log_gpu_memory,
    )

    utils.hprint('About to start tune_runner\'s trainer.fit...')
    fit_out = trainer.fit(net, datamodule=net.dm)
    utils.hprint('Done with tune_runner._train_fn')

    return fit_out
def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
    """Count classes in the training dataloader and, for weighted heads, set per-head positive class weights."""
    field_to_class_counts = self.get_field_to_class_counts(dataloader=pl_module.train_dataloader())

    if self.verbose:
        utils.hprint('ClassCounterCallback Class Counts:')
        utils.print_dict(field_to_class_counts)
        print()

    if self.hp is None:
        if self.verbose:
            print(' Not setting head_params.pos_class_weights because you did not pass hp to my init')
    else:
        if self.hp.type != 'weighted':
            raise NotImplementedError(
                f'hp == {self.hp} but this is only implemented for WeightedHeadParams'
            )

        for field_name, class_counts_df in field_to_class_counts.items():
            if field_name not in self.field_name_to_head_name:
                # we might not be using all fields in heads
                continue
            head_name = self.field_name_to_head_name[field_name]
            head = pl_module.head.heads[head_name]

            if head.did_set_pos_class_weights:
                pos_class_weights = head.pos_class_weights
                if self.verbose:
                    weights_str = ', '.join([f'{e:.2f}' for e in pos_class_weights])
                    print(f' head_params["{field_name}"].pos_class_weights was already set to [{weights_str}]')
                    print()
            else:
                # Weight positive classes by inverse class portion, capped at max_pos_class_weight.
                pos_class_weights = class_counts_df.loc[self.INV_PORTIONS].values
                max_inds = np.where(pos_class_weights > self.max_pos_class_weight)[0]
                pos_class_weights[max_inds] = self.max_pos_class_weight
                head.set_pos_class_weights(
                    torch.tensor(pos_class_weights, dtype=torch.float, device=pl_module.device)
                )
                if self.verbose:
                    weights_str = ', '.join([f'{e:.2f}' for e in pos_class_weights])
                    print(f' Setting head_params["{field_name}"].pos_class_weights = [{weights_str}]')
                    print()

    pl_module.log_lossmetrics_dict(
        phase=utils.Phase.train,
        d={self.CLASS_COUNTS: field_to_class_counts},
        do_log_to_progbar=False,
    )
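# --- Hedged sketch of the positive-class-weight computation above ---
# Illustrates, in isolation, how inverse-frequency weights clipped at a maximum
# might be derived from raw class counts. The names `class_counts` and
# `max_pos_class_weight` are hypothetical; the callback above reads precomputed
# inverse portions from a DataFrame row (self.INV_PORTIONS) instead of recomputing them.
import numpy as np
import torch


def clipped_inverse_frequency_weights(class_counts: np.ndarray, max_pos_class_weight: float) -> torch.Tensor:
    portions = class_counts / class_counts.sum()
    inv_portions = 1.0 / np.clip(portions, a_min=1e-12, a_max=None)
    inv_portions = np.minimum(inv_portions, max_pos_class_weight)
    return torch.tensor(inv_portions, dtype=torch.float)


# e.g. counts [900, 90, 10] give inverse portions of roughly [1.1, 11.1, 100.0];
# with max_pos_class_weight=50.0 the last weight is clipped to 50.0.
# print(clipped_inverse_frequency_weights(np.array([900, 90, 10]), max_pos_class_weight=50.0))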
def _train_fn(self, config: Dict, checkpoint_dir=None, fast_dev_run=False, include_gpus=False):
    """Train a single tune trial: build the lightning module from the sampled config and fit it."""
    utils.hprint('Starting train function with config:')
    utils.print_dict(config)

    del config['tune']
    hp = self._param_class.from_dict(config)
    assert isinstance(hp, self._param_class)

    if checkpoint_dir:
        # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
        raise NotImplementedError(f"Got checkpoint_dir in _train_fn: {checkpoint_dir}")

    net = self._factored_lightning_module_class.from_hp(hp=hp)

    utils.set_seeds(hp.data.seed)

    # noinspection PyTypeChecker
    trainer = pl.Trainer(
        logger=torch_mod.get_pl_logger(hp=hp.exp, tune=tune, offline_mode=fast_dev_run),
        default_root_dir=tune.get_trial_dir(),
        callbacks=self.extra_pl_callbacks + self.get_tune_callbacks(),
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus if include_gpus else None,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        log_every_n_steps=hp.metrics.num_steps_per_metric_log,
    )
    fit_out = trainer.fit(net, datamodule=net.dm)

    utils.print_dict(config)
    utils.hprint('Done with tune_runner._train_fn')

    return fit_out
# PHASE 2: fan out one join/split/save task per OCR output, then build and save the Dataset.
joined_dfs = []
doc_dirs = []
for ocr_output in ocr_outputs:
    doc_ind, ocr_df, ocr_df_file, words_df, colored_page_image_files, this_doc_dir = ray.get(ocr_output)
    joined_df = join_and_split_and_save_dfs.remote(
        ocr_df,
        ocr_df_file,
        words_df,
        colored_page_image_files,
        this_doc_dir,
    )
    joined_dfs.append(joined_df)
    doc_dirs.append(this_doc_dir)
utils.ray_prog_bar(joined_dfs)

with utils.Timer('Reading csvs into Dataset'):
    dataset = data.TablestakesDataset(docs_dir=doc_settings.docs_dir)

dataset_file = doc_settings.get_dataset_file()
with utils.Timer(f'Saving Dataset of type {type(dataset)} to {dataset_file}'):
    dataset.save(dataset_file)

print()
print(f'Saved to {str(doc_settings.docs_dir)} and {dataset_file}')
print('Settings:')
utils.print_dict(doc_settings.to_dict())
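# --- Hedged sketch of the ray fan-out pattern used in PHASE 2 ---
# A self-contained illustration of submitting remote tasks in a loop and then
# waiting on all of the resulting futures. `square` and `_ray_fanout_sketch`
# are hypothetical stand-ins for join_and_split_and_save_dfs and the driver
# loop; the blocking ray.get here stands in for utils.ray_prog_bar, which
# presumably also waits on the futures while showing progress.
import ray


@ray.remote
def square(x: int) -> int:
    return x * x


def _ray_fanout_sketch():
    ray.init(ignore_reinit_error=True)
    futures = [square.remote(i) for i in range(8)]  # fan out: returns ObjectRefs immediately
    results = ray.get(futures)                      # block until every task has finished
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]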
    return DataLoader(self.train_dataset, batch_size=self.hp.batch_size, num_workers=self.hp.num_workers)

def val_dataloader(self):
    return DataLoader(self.valid_dataset, batch_size=self.hp.batch_size, num_workers=self.hp.num_workers)

def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=self.hp.batch_size, num_workers=self.hp.num_workers)


if __name__ == '__main__':
    # from torchtext.data import Field, BucketIterator
    # from torchnlp.encoders.text import WhitespaceEncoder
    # from torchnlp.word_to_vector import GloVe

    dataset_name = 'num=1000_extra=0'

    hp = LearningParams()
    trainer = pl.Trainer(
        logger=pl_loggers.TensorBoardLogger('tensorboard_logs/', name="conv1d_trial"),
        max_epochs=hp.num_epochs,
        weights_summary='full',
        fast_dev_run=False,
    )
    net = TrapezoidConv1Module(hp, constants.DOCS_DIR / dataset_name)

    print("HP:")
    utils.print_dict(hp.to_dict())

    fit_out = trainer.fit(net)
    print('fit_out:', fit_out)
def _default_convert_dict_of_dfs_to_tensors(d: dict):
    """Convert a dict of DataFrames to a dict of tensors, dumping the dict for debugging if conversion fails."""
    try:
        return {k: torch.tensor(v.values) for k, v in d.items()}
    except BaseException as e:
        utils.print_dict(d)
        raise e
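# --- Hedged usage sketch ---
# Shows the conversion the helper above performs on a dict of numeric pandas
# DataFrames. The keys, columns, and values are made up for illustration; the
# real inputs come from the dataset's per-field DataFrames.
import pandas as pd
import torch


def _sketch_convert_dict_of_dfs():
    d = {
        'x': pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]}),
        'y': pd.DataFrame({'label': [0, 1]}),
    }
    tensors = {k: torch.tensor(v.values) for k, v in d.items()}
    print({k: t.shape for k, t in tensors.items()})  # {'x': torch.Size([2, 2]), 'y': torch.Size([2, 1])}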