def update(self, preds: torch.Tensor, target: torch.Tensor):
    """Accumulate correct / total counts for one batch.

    Both tensors are reduced with ``argmax(dim=1)`` and compared element-wise.
    The number of matches is added to ``self.correct`` and the number of rows
    of the reduced target is added to ``self.total``.

    Args:
        preds: prediction tensor; ``argmax(dim=1)`` is taken as the predicted
            class (presumably per-class scores -- verify against caller).
        target: tensor with exactly the same shape as ``preds``;
            ``argmax(dim=1)`` is taken as the true class.

    Raises:
        AssertionError: if the shapes differ, or if the reduced target is not
            1-D.
    """
    # Any falsy ``print_every`` (None *or* 0) means "never print".  Checking
    # only ``is not None`` let 0 through and crashed on ``counter % 0``.
    do_print = bool(self.print_every) and self.counter % self.print_every == 0
    if do_print:
        utils.hprint(f'BetterAccuracy is set to print every {self.print_every} and you at {self.counter}:')
        print(f"BetterAccuracy: preds: \n{preds}")
        print(f"BetterAccuracy: target: \n{target}")
        print()
    self.counter += 1

    assert preds.shape == target.shape, f'preds.shape = {preds.shape} != target.shape = {target.shape}'
    preds = preds.argmax(dim=1)
    target = target.argmax(dim=1)
    if do_print:
        print(f"BetterAccuracy: preds post argmax: \n{preds}")
        print(f"BetterAccuracy: target post argmax: \n{target}")
        print()
    assert target.dim() == 1, f'got target of shape {target.shape}'

    eqs = preds.eq(target)
    if do_print:
        # NOTE(review): entries equal to Y_VALUE_TO_IGNORE are only *reported*
        # here; they are still counted in ``correct`` / ``total`` below.
        print(
            f"BetterAccuracy: new_correct: {eqs.sum()}, "
            f" numel: {target.numel()}, "
            f" shape[0]: {target.shape[0]}, "
            f" ignore: {target.eq(self.Y_VALUE_TO_IGNORE).sum()}"
        )
    # Re-bind rather than update in place, as the original code did.
    self.correct = self.correct + torch.sum(eqs)
    self.total = self.total + target.shape[0]
def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
    """Compute class counts on the training data and set per-head class weights.

    Class counts come from ``self.get_field_to_class_counts`` applied to the
    module's train dataloader.  When ``self.hp`` is given, every head that has
    not already had weights set receives the ``self.INV_PORTIONS`` row of its
    field's counts frame, capped at ``self.max_pos_class_weight``.  The counts
    are finally logged through ``pl_module.log_lossmetrics_dict``.

    Args:
        trainer: the running trainer (unused here).
        pl_module: module providing ``train_dataloader()``, ``head.heads``,
            ``device`` and ``log_lossmetrics_dict``.

    Raises:
        NotImplementedError: if ``self.hp`` is set but ``self.hp.type`` is not
            ``'weighted'``.
    """
    field_to_class_counts = self.get_field_to_class_counts(dataloader=pl_module.train_dataloader())
    if self.verbose:
        utils.hprint('ClassCounterCallback Class Counts:')
        utils.print_dict(field_to_class_counts)
        print()

    if self.hp is None:
        if self.verbose:
            print(' Not setting head_params.pos_class_weights because you did not pass hp to my init')
    else:
        if self.hp.type != 'weighted':
            raise NotImplementedError(
                f'hp == {self.hp} but this is only implemented for WeightedHeadParams'
            )
        for field_name, class_counts_df in field_to_class_counts.items():
            if field_name not in self.field_name_to_head_name:
                # we might not be using all fields in heads
                continue
            head_name = self.field_name_to_head_name[field_name]
            head = pl_module.head.heads[head_name]

            if head.did_set_pos_class_weights:
                pos_class_weights = head.pos_class_weights
                if self.verbose:
                    weights_str = ', '.join(f'{e:.2f}' for e in pos_class_weights)
                    print(f' head_params["{field_name}"].pos_class_weights was already set to [{weights_str}]')
                    print()
            else:
                raw_weights = class_counts_df.loc[self.INV_PORTIONS].values
                # ``.values`` may be a view onto the DataFrame.  Clip into a
                # NEW array so the counts logged below are not silently
                # modified (and so a read-only view cannot make this fail).
                pos_class_weights = np.where(
                    raw_weights > self.max_pos_class_weight,
                    self.max_pos_class_weight,
                    raw_weights,
                )
                head.set_pos_class_weights(
                    torch.tensor(pos_class_weights, dtype=torch.float, device=pl_module.device)
                )
                if self.verbose:
                    weights_str = ', '.join(f'{e:.2f}' for e in pos_class_weights)
                    print(f' Setting head_params["{field_name}"].pos_class_weights = [{weights_str}]')
                    print()

    pl_module.log_lossmetrics_dict(
        phase=utils.Phase.train,
        d={self.CLASS_COUNTS: field_to_class_counts},
        do_log_to_progbar=False,
    )
def run(self, fast_dev_run=False, use_gpus=False, log_to_file=False):
    """Run the Ray Tune search, pickle the analysis and report the best trial.

    Args:
        fast_dev_run: forwarded to the train function built by
            ``self._get_train_fn``.
        use_gpus: whether trials should request / use GPUs.
        log_to_file: forwarded to ``tune.run(log_to_file=...)``.
    """
    search_metric = self.search_params.opt.search_metric
    search_mode = self.search_params.opt.search_mode

    search_dict = self.search_params.to_ray_tune_search_dict()
    # see tune.utils.UtilMonitor
    search_dict['log_sys_usage'] = True

    # Ray only gives meaning to a ``min-`` prefix on checkpoint_score_attr;
    # ``max-<metric>`` would be looked up verbatim as a (missing) result key.
    if search_mode == 'min':
        checkpoint_score_attr = f'min-{search_metric}'
    else:
        checkpoint_score_attr = search_metric

    # noinspection PyTypeChecker
    analysis = tune.run(
        run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run, include_gpus=use_gpus),
        name=self.search_params.exp.get_project_exp_name(),
        stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
        config=search_dict,
        resources_per_trial=self.get_resources_per_trial(
            self.search_params, include_gpu=use_gpus),
        num_samples=self.search_params.tune.num_hp_samples,
        sync_config=tune.SyncConfig(
            upload_dir=self.search_params.metrics.output_dir),
        loggers=self.get_tune_loggers(),
        log_to_file=log_to_file,
        keep_checkpoints_num=2,
        checkpoint_score_attr=checkpoint_score_attr,
        fail_fast=False,
        scheduler=self.get_tune_scheduler(self.search_params),
        verbose=2,
        progress_reporter=self.get_cli_reporter(),
        reuse_actors=False,
    )
    utils.hprint("done with tune.run")

    param_hash = self.search_params.get_short_hash(num_chars=8)
    analysis_file = self.search_params.metrics.output_dir / f'tune_analysis_{param_hash}.pkl'
    print(f"Saving {analysis_file}")
    utils.save_pickle(analysis_file, analysis)

    best_trial = analysis.get_best_trial(search_metric, search_mode, "last-5-avg")
    if best_trial is None:
        # get_best_trial returns None when no trial reported the metric
        # (e.g. every trial failed, which fail_fast=False allows).
        print(f'No best trial found for search_metric={search_metric!r} '
              f'with search_mode={search_mode!r}')
        return

    print(f'best_trial.last_result: {best_trial.last_result}')
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final search_metric: {}".format(
        best_trial.last_result[search_metric]))
def _train_fn(self, config: Dict, checkpoint_dir=None, fast_dev_run=False, include_gpus=False):
    """Train one trial: build params and network from ``config`` and fit it.

    Args:
        config: trial configuration.  Its ``'tune'`` entry (if any) is
            ignored when building the model params; the dict itself is not
            modified.
        checkpoint_dir: must be falsy; resuming from a checkpoint is not
            implemented.
        fast_dev_run: forwarded to ``pl.Trainer`` and used as the logger's
            ``offline_mode``.
        include_gpus: when true the trainer uses ``hp.data.num_gpus`` GPUs,
            otherwise none.

    Returns:
        Whatever ``trainer.fit`` returns.

    Raises:
        NotImplementedError: if ``checkpoint_dir`` is truthy.
    """
    utils.hprint('Starting train function with config:')
    utils.print_dict(config)

    # Filter into a new dict instead of ``del config['tune']``: that mutated
    # the caller's config and raised KeyError when the key was absent.
    model_config = {key: value for key, value in config.items() if key != 'tune'}
    hp = self._param_class.from_dict(model_config)
    assert isinstance(hp, self._param_class)

    if checkpoint_dir:
        # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
        raise NotImplementedError(
            f"Got checkpoint_dir in trian_fn: {checkpoint_dir}")

    # Seed BEFORE building the network so weight initialisation is
    # reproducible too (the trainer is created with deterministic=True).
    utils.set_seeds(hp.data.seed)
    net = self._factored_lightning_module_class.from_hp(hp=hp)

    # noinspection PyTypeChecker
    trainer = pl.Trainer(
        logger=torch_mod.get_pl_logger(hp=hp.exp, tune=tune, offline_mode=fast_dev_run),
        default_root_dir=tune.get_trial_dir(),
        callbacks=self.extra_pl_callbacks + self.get_tune_callbacks(),
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus if include_gpus else None,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        log_every_n_steps=hp.metrics.num_steps_per_metric_log,
    )
    fit_out = trainer.fit(net, datamodule=net.dm)

    # Same content the original printed here (config without 'tune').
    utils.print_dict(model_config)
    utils.hprint('Done with tune_runner._train_fn')
    return fit_out
def run(
    net: pl.LightningModule,
    dm: pl.LightningDataModule,
    hp: ModelBertConvTransTClass.Params,
    fast_dev_run=False,
    do_find_lr=False,
):
    """Create a ``pl.Trainer`` from ``hp`` and either tune or fit ``net``.

    Args:
        net: the module to train.
        dm: data module passed to the trainer as ``datamodule``.
        hp: parameters; ``exp``, ``metrics``, ``opt`` and ``data`` are read.
        fast_dev_run: forwarded to the trainer; when truthy the trainer's
            default logger (``logger=True``) is used instead of the project
            logger.
        do_find_lr: when truthy, ``trainer.tune`` is run instead of
            ``trainer.fit`` (and ``auto_lr_find`` is enabled).
    """
    print("model run about to create trainer")

    if fast_dev_run:
        pl_logger = True
    else:
        pl_logger = torch_mod.get_pl_logger(hp.exp)

    trainer_kwargs = dict(
        logger=pl_logger,
        default_root_dir=hp.metrics.output_dir,
        callbacks=[metrics_mod.CounterTimerCallback()],
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler=True,
        deterministic=True,
        auto_lr_find=do_find_lr,
        log_every_n_steps=hp.metrics.num_steps_per_metric_log,
    )
    trainer = pl.Trainer(**trainer_kwargs)
    print("model run done creating trainer")

    if not do_find_lr:
        # Regular training path.
        utils.hprint("Starting trainer.fit:")
        print(f' Dataset file: {hp.data.get_dataset_file()}')
        trainer.fit(net, datamodule=dm)
    else:
        # Tuning path (auto_lr_find was enabled on the trainer above).
        utils.hprint("Starting trainer.tune:")
        tune_result = trainer.tune(net, datamodule=dm)
        print(f' Tune out: {tune_result}')

    utils.hprint('Done with model run fn')
def _train_fn(self, config: Dict, checkpoint_dir=None, fast_dev_run=False, include_gpus=False):
    """Train one trial: build params and network from ``config`` and fit it.

    Args:
        config: trial configuration, converted with
            ``self._model_param_class.from_dict``.
        checkpoint_dir: must be falsy; resuming from a checkpoint is not
            implemented.
        fast_dev_run: forwarded to ``pl.Trainer``.
        include_gpus: when true the trainer uses ``hp.data.num_gpus`` GPUs,
            otherwise none.

    Returns:
        Whatever ``trainer.fit`` returns.

    Raises:
        NotImplementedError: if ``checkpoint_dir`` is truthy.
    """
    utils.hprint('Starting train function with config:')
    utils.print_dict(config)
    print()
    utils.set_pandas_disp(width=200)

    hp = self._model_param_class.from_dict(config)
    assert isinstance(hp, self._model_param_class)
    print(' hp:', hp)

    if checkpoint_dir:
        # see https://docs.ray.io/en/master/tune/user-guide.html#checkpointing
        raise NotImplementedError(
            f"Got checkpoint_dir in trian_fn: {checkpoint_dir}")

    # Seed BEFORE building the network so weight initialisation is
    # reproducible too (the trainer is created with deterministic=True).
    utils.set_seeds(hp.data.seed)

    utils.hprint("About to create net in TuneRunner")
    net = hp.build()

    # noinspection PyTypeChecker
    trainer = pl.Trainer(
        logger=logs_mod.get_pl_logger(hp=hp.exp, tune=tune),
        default_root_dir=tune.get_trial_dir(),
        callbacks=self.extra_pl_callbacks + self.get_pl_callbacks_for_tune(),
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus if include_gpus else None,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        log_every_n_steps=hp.logs.num_steps_per_metric_log,
        log_gpu_memory=hp.logs.log_gpu_memory,
    )

    utils.hprint("About to start tune_runner's trainer.fit...")
    fit_out = trainer.fit(net, datamodule=net.dm)
    utils.hprint('Done with tune_runner._train_fn')
    return fit_out
def run(self, fast_dev_run=False, use_gpus=False):
    """Run the Ray Tune search, save the analysis and report the best trial.

    Results are synced to ``logs.output_dir`` only when that location is a
    remote URI (``s3://``, ``gs://`` or ``hdfs://``).

    Args:
        fast_dev_run: forwarded to the train function built by
            ``self._get_train_fn``.
        use_gpus: whether trials should request / use GPUs.
    """
    utils.set_seeds(self.search_params.data.seed)

    search_metric = self.search_params.opt.search_metric
    search_mode = self.search_params.opt.search_mode

    search_dict = self.search_params.to_ray_tune_search_dict()
    # see tune.utils.UtilMonitor
    search_dict['log_sys_usage'] = True

    output_str = str(self.search_params.logs.output_dir)
    if output_str.startswith(('s3://', 'gs://', 'hdfs://')):
        # Pass the same string that was just validated, not the raw
        # (possibly path-like) object.
        sync_config = tune.SyncConfig(upload_dir=output_str)
    else:
        sync_config = None

    # Ray only gives meaning to a ``min-`` prefix on checkpoint_score_attr;
    # ``max-<metric>`` would be looked up verbatim as a (missing) result key.
    if search_mode == 'min':
        checkpoint_score_attr = f'min-{search_metric}'
    else:
        checkpoint_score_attr = search_metric

    analysis = tune.run(
        run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run, include_gpus=use_gpus),
        name=self.search_params.exp.get_project_exp_name(),
        stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
        config=search_dict,
        resources_per_trial=self.get_resources_per_trial(
            self.search_params, include_gpu=use_gpus),
        num_samples=self.tune_hp.num_hp_samples,
        sync_config=sync_config,
        loggers=self.get_tune_loggers(),
        log_to_file=self.tune_hp.log_to_file and not self.tune_hp.ray_local_mode,
        keep_checkpoints_num=2,
        checkpoint_score_attr=checkpoint_score_attr,
        fail_fast=False,
        scheduler=self.get_tune_scheduler(self.search_params, self.tune_hp),
        verbose=2,
        progress_reporter=self.get_cli_reporter(),
        reuse_actors=False,
    )
    utils.hprint("done with tune.run")

    param_hash = self.search_params.get_short_hash(num_chars=8)
    analysis_file = self.search_params.logs.output_dir / f'tune_analysis_{param_hash}.cloudpickle'
    print(f"Saving {analysis_file}")
    utils.save_cloudpickle(analysis_file, analysis)

    best_trial = analysis.get_best_trial(search_metric, search_mode, "last-5-avg")
    if best_trial is None:
        # get_best_trial returns None when no trial reported the metric
        # (e.g. every trial failed, which fail_fast=False allows).
        print(f'No best trial found for search_metric={search_metric!r} '
              f'with search_mode={search_mode!r}')
        return

    utils.hprint('best_trial.last_result', do_include_pre_break_line=True)
    utils.print_dict(best_trial.last_result)
    utils.hprint('best_trial.config', do_include_pre_break_line=True)
    utils.print_dict(best_trial.config)
def run(
    net: pl.LightningModule,
    hp: TotalParams,
    fast_dev_run=False,
    do_find_lr=False,
    callbacks=None,
):
    """Seed, create a ``pl.Trainer`` from ``hp`` and either tune or fit ``net``.

    Args:
        net: the module to train; its ``dm`` attribute is used as datamodule.
        hp: parameters; ``data``, ``exp``, ``logs`` and ``opt`` are read.
        fast_dev_run: forwarded to the trainer; when truthy the trainer's
            default logger (``logger=True``) is used instead of the project
            logger.
        do_find_lr: when truthy, ``trainer.tune`` is run instead of
            ``trainer.fit`` (and ``auto_lr_find`` is enabled).
        callbacks: trainer callbacks; ``None`` selects the two default
            callbacks from ``logs_mod``.
    """
    utils.set_seeds(hp.data.seed)
    utils.set_pandas_disp()

    if callbacks is None:
        callbacks = [logs_mod.CounterTimerLrCallback(), logs_mod.VocabLengthCallback()]

    print("model run about to create trainer")
    if fast_dev_run:
        pl_logger = True
    else:
        pl_logger = logs_mod.get_pl_logger(hp.exp)

    trainer_kwargs = dict(
        logger=pl_logger,
        default_root_dir=hp.logs.output_dir,
        callbacks=callbacks,
        max_epochs=hp.opt.num_epochs,
        gpus=hp.data.num_gpus,
        weights_summary='full',
        fast_dev_run=fast_dev_run,
        accumulate_grad_batches=1,
        profiler='simple',
        deterministic=True,
        auto_lr_find=do_find_lr,
        log_every_n_steps=hp.logs.num_steps_per_metric_log,
    )
    trainer = pl.Trainer(**trainer_kwargs)
    print("model run done creating trainer")

    if not do_find_lr:
        # Regular training path.
        utils.hprint("Starting trainer.fit:")
        print(f' Dataset file: {hp.data.dataset_file}')
        trainer.fit(net, datamodule=net.dm)
    else:
        # Tuning path (auto_lr_find was enabled on the trainer above).
        utils.hprint("Starting trainer.tune:")
        tune_result = trainer.tune(net, datamodule=net.dm)
        print(f' Tune out: {tune_result}')

    utils.hprint('Done with model run fn')
# Settings for the `heads` part of `hp` (hp itself is created outside this
# fragment).  Presumably Discrete/Integer declare search ranges for the tuner
# -- confirm against the `params` module.
hp.heads.num_features = params.Discrete([32, 64, 128, 256])
hp.heads.num_layers = params.Integer(2, 5)
hp.heads.num_groups = params.Discrete([8, 16, 32, 64])
hp.heads.num_blocks_per_residual = params.Integer(1, 5)
hp.heads.num_blocks_per_dropout = params.Integer(1, 5)
hp.heads.requires_grad = True
# Tuning settings attached to the same params object.
hp.tune = TuneRunner.TuneParams()
hp.tune.asha_grace_period = 16
hp.tune.asha_reduction_factor = 2
hp.tune.num_hp_samples = 100
# A hostname ending in '.local' marks a local run; GPUs are requested below
# only when this is NOT a local run.
hostname = socket.gethostname()
is_local_run = hostname.endswith('.local')
# noinspection PyTypeChecker
tune_runner = TuneRunner(
    search_params=hp,
    factored_lightning_module_class=model_bert_trans_conv_tclass.
    ModelBertConvTransTClass,
    extra_pl_callbacks=None,
    ray_local_mode=False,
)
# Launch the search.
tune_runner.run(
    fast_dev_run=False,
    use_gpus=not is_local_run,
    log_to_file=True,
)
utils.hprint("done with tune_runner.run")
# Fixed settings for a single model run (hp itself is created outside this
# fragment).
# `trans` section of hp.
hp.trans.num_layers = 6
hp.trans.num_query_features = None
hp.trans.fc_dim_mult = 2
# `fc` section of hp.
hp.fc.num_features = 128
hp.fc.num_layers = 2
hp.fc.num_groups = 16
hp.fc.num_blocks_per_residual = 2
hp.fc.num_blocks_per_dropout = 2
hp.fc.requires_grad = True
# `heads` section of hp.
hp.heads.num_features = 128
hp.heads.num_layers = 4
hp.heads.num_groups = 16
hp.heads.num_blocks_per_residual = 2
hp.heads.num_blocks_per_dropout = 2
hp.heads.requires_grad = True
# Build the data module and the model from the same hp object.
dm = data.TablestakesDataModule(hp.data)
net = ModelBertConvTransTClass(
    hp=hp,
    data_module=dm,
    metrics_tracker=metrics_mod.ClassificationMetricsTracker(hp.metrics),
    opt=factored.OptimizersMaker(hp.opt),
)
# Print the full parameter dict before starting the run.
utils.hprint('About to start model run:')
utils.print_dict(hp.to_dict())
# `fast_dev_run` is defined outside this fragment.
run(net, dm, hp, fast_dev_run, do_find_lr=False)
}, ) hp.verbose = False tune_hp = TuneParams() tune_hp.asha_grace_period = 4 tune_hp.asha_reduction_factor = 2 tune_hp.num_hp_samples = 2 tune_hp.log_to_file = False tune_hp.ray_local_mode = False hostname = socket.gethostname() is_local_run = hostname.endswith('.local') utils.hprint('About to start model run:') utils.print_dict(hp.to_dict()) tune_runner = TuneRunner( model_hp=hp, tune_hp=tune_hp, # factored_lightning_module_class=ts_model.TablestakesBertConvTransTClassModel, extra_pl_callbacks=None, ) tune_runner.run( fast_dev_run=False, use_gpus=not is_local_run, ) utils.hprint("done with tune_runner.run")