def run_training_batch(self, batch, batch_idx, dataloader_idx):
    # track grad norms
    grad_norm_dict = {}

    # bookkeeping
    self._hiddens = None

    optimizers = list(enumerate(self.trainer.optimizers))

    # track all outputs across time and num of optimizers
    batch_outputs = [[] for _ in range(len(optimizers))]

    if batch is None:
        self.warning_cache.warn(
            "train_dataloader yielded None. If this was on purpose, ignore this warning..."
        )
        return AttributeDict(
            signal=0,
            grad_norm_dict={},
            training_step_output_for_epoch_end=batch_outputs,
        )

    # hook
    response = self.trainer.call_hook("on_batch_start")
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dict={})

    # hook
    response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dict={})

    # lightning module hook
    splits = self._tbptt_split_batch(batch)

    for split_idx, split_batch in enumerate(splits):
        self.split_idx = split_idx

        if self.trainer.lightning_module.automatic_optimization:
            for opt_idx, optimizer in self.get_active_optimizers(batch_idx):
                result = self._run_optimization(batch_idx, split_idx, split_batch, opt_idx, optimizer)
                if result:
                    batch_outputs[opt_idx].append(result.training_step_output_for_epoch_end)
                    grad_norm_dict = result.get("grad_norm_dict", {})
        else:
            # in manual optimization, there is no looping over optimizers
            result = self._run_optimization(batch_idx, split_idx, split_batch)
            if result:
                batch_outputs[0].append(result.training_step_output_for_epoch_end)

    output = AttributeDict(
        signal=0,
        # todo: Properly aggregate grad_norm across opt_idx and split_idx
        grad_norm_dict=grad_norm_dict,
        training_step_output_for_epoch_end=batch_outputs,
    )
    return output
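The snippets in this section build their return values as AttributeDict objects. A minimal sketch of that type follows, mirroring the behavior of pytorch_lightning.utilities.parsing.AttributeDict: a plain dict whose keys are also readable and writable as attributes.

# Minimal sketch of AttributeDict: dict entries double as attributes.
class AttributeDict(dict):
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as exc:
            raise AttributeError(key) from exc

    def __setattr__(self, key, value):
        self[key] = value


result = AttributeDict(signal=0, grad_norm_dict={})
assert result.signal == 0               # attribute access
assert result["grad_norm_dict"] == {}   # plain dict access still works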
def run_training_batch(self, batch, batch_idx, dataloader_idx):
    # track grad norms
    grad_norm_dic = {}

    # track all metrics for callbacks
    batch_callback_metrics = []

    # track metrics to log
    batch_log_metrics = []

    # bookkeeping
    using_results_obj = False
    self.trainer.hiddens = None

    # track all outputs across time and num of optimizers
    batch_outputs = [[] for _ in range(len(self.get_optimizers_iterable()))]

    if batch is None:
        return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

    # hook
    response = self.trainer.call_hook("on_batch_start")
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    # hook
    response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    # checks if backward or backward + optimizer step (via closure)
    accumulation_done = self._accumulated_batches_reached()
    is_final_batch = self._num_training_batches_reached()
    should_accumulate = not (accumulation_done or is_final_batch)

    # lightning module hook
    splits = self.tbptt_split_batch(batch)

    for split_idx, split_batch in enumerate(splits):
        self.trainer.split_idx = split_idx

        # in manual optimization we loop over all optimizers at once
        optimizers = self.get_optimizers_iterable()
        if not self.automatic_optimization:
            optimizers = [optimizers[0]]

        # loop over optimizers
        for opt_idx, optimizer in optimizers:
            # make sure only the gradients of the current optimizer's parameters are calculated
            # in the training step to prevent dangling gradients in multiple-optimizer setup.
            if self.automatic_optimization and len(self.trainer.optimizers) > 1:
                model = self.trainer.get_model()
                model.toggle_optimizer(optimizer, opt_idx)

            if should_accumulate:
                # For gradient accumulation

                # -------------------
                # calculate loss (train step + train step end)
                # -------------------

                # perform DDP sync only when performing optimizer_step
                with self.block_ddp_sync_behaviour():
                    self.training_step_and_backward(
                        split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                    )

                batch_outputs = self._process_closure_result(
                    batch_callback_metrics=batch_callback_metrics,
                    batch_log_metrics=batch_log_metrics,
                    batch_outputs=batch_outputs,
                    opt_idx=opt_idx,
                )

            # ------------------------------
            # BACKWARD PASS
            # ------------------------------
            # gradient update with accumulated gradients
            else:
                if self.automatic_optimization:

                    def train_step_and_backward_closure():
                        result = self.training_step_and_backward(
                            split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                        )
                        return None if result is None else result.loss

                    # optimizer step
                    self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)

                else:
                    self._curr_step_result = self.training_step(
                        split_batch, batch_idx, opt_idx, self.trainer.hiddens
                    )

                    if self._curr_step_result is None:
                        # user decided to skip optimization
                        continue

                batch_outputs = self._process_closure_result(
                    batch_callback_metrics=batch_callback_metrics,
                    batch_log_metrics=batch_log_metrics,
                    batch_outputs=batch_outputs,
                    opt_idx=opt_idx,
                )

                grad_norm_dic = self._cur_grad_norm_dict
                self._cur_grad_norm_dict = None

                # hook
                self.on_before_zero_grad(optimizer)

                # clear gradients
                self.optimizer_zero_grad(batch_idx, optimizer, opt_idx)

                accumulated_loss = self.accumulated_loss.mean()

                if accumulated_loss is not None:
                    # calculate running loss for display
                    self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches)

                # reset for next set of accumulated grads
                self.accumulated_loss.reset()

    # collapse all metrics into one dict
    batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}

    # track all metrics for callbacks
    self.trainer.logger_connector.callback_metrics.update(batch_log_metrics)
    self.trainer.logger_connector.callback_metrics.update(
        {k: v for d in batch_callback_metrics for k, v in d.items() if v is not None}
    )

    result = AttributeDict(
        signal=0,
        grad_norm_dic=grad_norm_dic,
        batch_log_metrics=batch_log_metrics,
        training_step_output_for_epoch_end=batch_outputs,
    )
    return result
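The `should_accumulate` flag above decides whether a split only accumulates gradients or also triggers an optimizer step. A small self-contained sketch of that boundary arithmetic (the helper name and numbers are illustrative, not from the source):

# Hedged sketch of the accumulation boundary: gradients accumulate until
# either `accumulate_grad_batches` batches have been seen or the epoch's
# final batch is reached.
def should_accumulate(batch_idx: int, accumulate_grad_batches: int, num_training_batches: int) -> bool:
    accumulation_done = (batch_idx + 1) % accumulate_grad_batches == 0
    is_final_batch = (batch_idx + 1) == num_training_batches
    return not (accumulation_done or is_final_batch)

# With accumulate_grad_batches=4 and 10 batches, optimizer steps happen at
# batch indices 3, 7 and 9 (the truncated final window).
steps = [i for i in range(10) if not should_accumulate(i, 4, 10)]
assert steps == [3, 7, 9]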
def main():
    exp_parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    exp_parser.add_argument('--checkpoint-path', '-c', type=str, help='which checkpoint to load')
    exp_parser.add_argument('--batch-size', '-bs', type=int, help='batch size')
    exp_parser.add_argument('--resize', '-rs', nargs=2, type=int, help='resize')
    exp_parser.add_argument('--out-dir', '-od', type=str, help='output directory for nifti counterfactuals')
    exp_parser.add_argument('--csv', '-csv', type=str, help='csv path')
    exp_parser.add_argument('-v', '--verbosity', action="count", default=0,
                            help="increase output verbosity (e.g., -vv is more than -v)")

    exp_args, other_args = exp_parser.parse_known_args()

    if exp_args.verbosity == 1:
        level = logging.getLevelName('INFO')
    elif exp_args.verbosity >= 2:
        level = logging.getLevelName('DEBUG')
    else:
        level = logging.getLevelName('WARNING')
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=level)

    checkpoint_path = exp_args.checkpoint_path
    ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    hparams = ckpt['hyper_parameters']
    logger.info(f'found hparams: {hparams}')

    exp_class = EXPERIMENT_REGISTRY[hparams['experiment']]

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser = Trainer.add_argparse_args(parser)
    parser.set_defaults(checkpoint_callback=True)
    parser._action_groups[1].title = 'lightning_options'

    args = parser.parse_args(other_args)

    if args.gpus is not None and isinstance(args.gpus, int):
        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpus)
        args.gpus = 1

    exp = hparams['model']
    model_class = MODEL_REGISTRY[exp]

    # keep only the hparams accepted by the model class or its (grand)parent __init__
    model_params = {
        k: v for k, v in hparams.items()
        if (k in inspect.signature(model_class.__init__).parameters
            or k in inspect.signature(model_class.__bases__[0].__init__).parameters
            or k in inspect.signature(model_class.__bases__[0].__bases__[0].__init__).parameters)
    }
    model_params['img_shape'] = hparams['resize'] if 'resize' in hparams else exp_args.resize

    # strip the 'pyro_model.' wrapper prefix from the checkpoint keys
    new_state_dict = OrderedDict()
    for key, value in ckpt['state_dict'].items():
        new_key = key.replace('pyro_model.', '')
        new_state_dict[new_key] = value

    loaded_model = model_class(**model_params)
    loaded_model.load_state_dict(new_state_dict)

    device = torch.device('cuda' if (torch.cuda.is_available() and args.gpus is not None) else 'cpu')
    for p in loaded_model._buffers.keys():
        if any([(b in p) for b in _buffers_to_load]):
            setattr(loaded_model, p, getattr(loaded_model, p).to(device))
    loaded_model.eval()
    loaded_model = loaded_model.to(device)

    # split parsed args back into their argparse groups
    groups = {}
    for group in parser._action_groups:
        group_dict = {a.dest: getattr(args, a.dest, None) for a in group._group_actions}
        groups[group.title] = argparse.Namespace(**group_dict)

    lightning_args = groups['lightning_options']
    trainer = Trainer.from_argparse_args(lightning_args)
    trainer.logger.experiment.log_dir = exp_args.checkpoint_path

    hparams = AttributeDict(hparams)
    hparams.test_dir = exp_args.out_dir
    hparams.test_csv = exp_args.csv
    hparams.test_batch_size = exp_args.batch_size
    experiment = exp_class.load_from_checkpoint(checkpoint_path, hparams=hparams, pyro_model=loaded_model)

    logger.info(f'Loaded {experiment.__class__}:\n{experiment}')

    trainer.test(experiment)
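The key-renaming loop above exists because Lightning checkpoints store parameters under the wrapper attribute name (here `pyro_model.`), which must be removed before loading into the bare module. A minimal sketch of that pattern (using startswith rather than replace, which is slightly safer against the prefix appearing mid-key; names are illustrative):

# Hedged sketch: strip a wrapper prefix from checkpoint state_dict keys.
from collections import OrderedDict

def strip_prefix(state_dict, prefix):
    return OrderedDict(
        (k[len(prefix):] if k.startswith(prefix) else k, v)
        for k, v in state_dict.items()
    )

ckpt_keys = OrderedDict([("pyro_model.encoder.weight", 1), ("pyro_model.fc.bias", 2)])
assert list(strip_prefix(ckpt_keys, "pyro_model.")) == ["encoder.weight", "fc.bias"]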
def __init__(self, trainer):
    self.trainer = trainer
    self.dist = AttributeDict(rank=0, device=None)
        tracking_uri=mlflow_url)
])

# Make trainer
trainer = pl.Trainer.from_argparse_args(arguments, logger=logger)

# Make data model factory
if arguments.frames is not None:
    frames = arguments.frames.split(",")
    frames = [int(x) for x in frames]
    frames = range(*frames)
else:
    frames = None
data_model_factory = KittiDataModuleFactory(frames, arguments.sequences, arguments.dataset)

# Load parameters
params = load_hparams_from_yaml(arguments.config)
params = AttributeDict(params)
print("Load model from params \n" + str(params))

data_model = data_model_factory.make_data_module_from_params(params)
model = MultiUnsupervisedDepthModelFactory().make_model(params, data_model.get_cameras_calibration())

if arguments.load_model:
    print("Load checkpoint")
    load_undeepvo_checkpoint(model, arguments.model_checkpoint)

print("Start training")
trainer.fit(model, data_model)
def load(
    path: Union[Path, str],
    old_args=AttributeDict(),
    Cls: Type[RideModule] = None,
    auto_scale_lr=False,
) -> AttributeDict:
    """Loads hparams from path

    Args:
        path (Union[Path, str]): Path to a JSON file containing hparams
        old_args (Optional[AttributeDict]): The AttributeDict to be updated with the new hparams
        Cls (Optional[RideModule]): A class whose hyperparameters can be used to select the relevant hparams to take

    Returns:
        AttributeDict: AttributeDict with updated hyperparameters
    """
    path = Path(path)
    hparams = load_structured_data(path)

    if Cls:
        hparam_names = Cls.configs().names
        hparams = {k: v for k, v in hparams.items() if k in hparam_names}

    # During hparamsearch, only a single GPU is used, but accumulate_grad_batches is set
    # to the total number of GPUs given. If we have multiple GPUs, we need to reduce
    # accumulate_grad_batches accordingly.
    num_gpus = parse_num_gpus(old_args.gpus)
    if num_gpus > 0 and "accumulate_grad_batches" in hparams:  # pragma: no cover
        hparams["accumulate_grad_batches"] = max(
            1, int(hparams["accumulate_grad_batches"]) // num_gpus
        )

    old_args = dict(old_args)
    user_passed_arg_keys = [a[2:] for a in sys.argv if a.startswith("--")]
    user_passed_args = {
        k: v for k, v in old_args.items() if k in user_passed_arg_keys
    }

    # If batch size was changed by user, automatically apply the linear scaling rule to the learning rate
    if (
        auto_scale_lr
        and "batch_size" in hparams
        and "learning_rate" in hparams
        and "batch_size" in user_passed_args
        and "learning_rate" not in user_passed_args
    ):
        old_accumulate_grad_batches = (
            hparams["accumulate_grad_batches"]
            if "accumulate_grad_batches" in hparams
            else 1
        )
        new_accumulate_grad_batches = (
            user_passed_args["accumulate_grad_batches"]
            if "accumulate_grad_batches" in user_passed_args
            else old_accumulate_grad_batches
        )
        new_tot_batch = new_accumulate_grad_batches * user_passed_args["batch_size"]
        old_tot_batch = old_accumulate_grad_batches * hparams["batch_size"]
        if new_tot_batch != old_tot_batch:
            scaling = new_tot_batch / old_tot_batch
            user_passed_args["learning_rate"] = hparams["learning_rate"] * scaling
            logger.info(
                f"🔧 `batch_size*accumulate_grad_batches` ({new_tot_batch}) differs from the hparams file ({old_tot_batch}). "
                f"Scaling learning_rate from {hparams['learning_rate']} to {user_passed_args['learning_rate']} "
                f"(= {hparams['learning_rate']} × {new_tot_batch} / {old_tot_batch})"
            )

    return AttributeDict(**{**old_args, **hparams, **user_passed_args})
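A worked example of the linear scaling rule applied above, with assumed numbers: the learning rate is scaled by the same factor as the effective batch size (batch_size × accumulate_grad_batches).

# Worked sketch of the linear scaling rule (illustrative values).
old_lr, old_batch, old_accum = 0.1, 32, 1   # from the hparams file
new_batch, new_accum = 64, 2                # passed by the user

old_tot = old_batch * old_accum             # 32
new_tot = new_batch * new_accum             # 128
new_lr = old_lr * new_tot / old_tot
assert new_lr == 0.4                        # 4x larger effective batch -> 4x lr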
def hparams(self) -> AttributeDict:
    if not hasattr(self, "_hparams"):
        self._hparams = AttributeDict()
    return self._hparams
def run_training_batch(self, batch, batch_idx, dataloader_idx):
    # track grad norms
    grad_norm_dic = {}

    # track all metrics for callbacks
    batch_callback_metrics = []

    # track metrics to log
    batch_log_metrics = []

    # bookkeeping
    using_results_obj = False
    self.trainer.hiddens = None

    # track all outputs across time and num of optimizers
    batch_outputs = [[] for _ in range(len(self.get_optimizers_iterable()))]

    if batch is None:
        return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

    # hook
    response = self.trainer.call_hook('on_batch_start')
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    # hook
    response = self.trainer.call_hook('on_train_batch_start', batch, batch_idx, dataloader_idx)
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    # lightning module hook
    splits = self.tbptt_split_batch(batch)

    for split_idx, split_batch in enumerate(splits):
        self.trainer.split_idx = split_idx

        # loop over optimizers
        for opt_idx, optimizer in self.get_optimizers_iterable():
            # make sure only the gradients of the current optimizer's parameters are calculated
            # in the training step to prevent dangling gradients in multiple-optimizer setup.
            if len(self.trainer.optimizers) > 1:
                for param in self.trainer.get_model().parameters():
                    param.requires_grad = False
                for group in optimizer.param_groups:
                    for param in group['params']:
                        param.requires_grad = True

            # -------------------
            # calculate loss (train step + train step end)
            # -------------------
            opt_closure_result = self.training_step_and_backward(
                split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens)

            using_results_obj = isinstance(opt_closure_result.training_step_output, Result)

            # log metrics
            self.log_training_step_metrics(opt_closure_result, batch_callback_metrics, batch_log_metrics)

            # track hiddens
            self.trainer.hiddens = self.process_hiddens(opt_closure_result)

            # check if loss or model weights are nan
            if self.trainer.terminate_on_nan:
                self.trainer.detect_nan_tensors(opt_closure_result.loss)

            # track total loss for logging (avoid mem leaks)
            self.accumulated_loss.append(opt_closure_result.loss)

            # track all the outputs across all steps
            batch_opt_idx = opt_idx if len(batch_outputs) > 1 else 0
            batch_outputs[batch_opt_idx].append(opt_closure_result.training_step_output_for_epoch_end)

            # ------------------------------
            # BACKWARD PASS
            # ------------------------------
            # gradient update with accumulated gradients
            accumulation_done = (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0
            is_final_batch = (self.trainer.batch_idx + 1) == self.trainer.num_training_batches
            if accumulation_done or is_final_batch:
                # hook
                grad_norm_dic = self.on_before_backward(batch_idx, optimizer)

                # wrap forward + backward pass in closure for 2nd order optimizers
                train_step_and_backward_closure = lambda: self.training_step_and_backward(
                    split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens,
                ).loss

                # optimizer step
                self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)

                # hook
                self.on_before_zero_grad(optimizer)

                # clear gradients
                self.optimizer_zero_grad(batch_idx, optimizer, opt_idx)

                # calculate running loss for display
                self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches)

                # reset for next set of accumulated grads
                self.accumulated_loss.reset()

    # collapse all metrics into one dict
    batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}

    # track all metrics for callbacks
    # TODO: is this needed?
    self.trainer.logger_connector.callback_metrics.update({
        k: v for d in batch_callback_metrics for k, v in d.items() if v is not None
    })

    result = AttributeDict(
        signal=0,
        grad_norm_dic=grad_norm_dic,
        batch_log_metrics=batch_log_metrics,
        training_step_output_for_epoch_end=batch_outputs)
    return result
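The requires_grad toggling in the loop above freezes everything and then re-enables only the active optimizer's parameter groups, so a multi-optimizer setup (e.g. a GAN) accumulates no dangling gradients. A self-contained sketch of that pattern (the helper name and the tiny model are illustrative):

# Hedged sketch of per-optimizer parameter toggling.
import torch

def toggle_optimizer(model, optimizer):
    for param in model.parameters():
        param.requires_grad = False
    for group in optimizer.param_groups:
        for param in group["params"]:
            param.requires_grad = True

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 1))
opt_head = torch.optim.SGD(model[1].parameters(), lr=0.1)
toggle_optimizer(model, opt_head)
assert not model[0].weight.requires_grad and model[1].weight.requires_grad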
def run_training_batch(self, batch, batch_idx):
    """
    :param batch: dict; contains three keys: input_ids, attention_mask, decoder_input_ids
        Example for 'batch':
        batch: {'input_ids': tensor([[0, 36, 230, ..., 8, 41, 2]]),
                'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1]]),
                'decoder_input_ids': tensor([[0, 287, 10, 2107, 111, 10468, 226, 47385, 11579, 1012,
                                              2156, 5, 5302, 47385, 281, 47385, 10003, 255, 47385, 347,
                                              111, 2107, 47385, 574, 47385, 1000, 47385, 398, 47385, 245,
                                              16, 10, 205, 1374, 12576, 479, 646, 1000, 1215, 3388,
                                              510, 742, 85, 128, 579, 65, 9, 5, 357, 3092,
                                              23, 63, 1836, 11, 5, 3555, 111, 672, 2156, 26180,
                                              47385, 642, 111, 3547, 4120, 479, 646, 1000, 1215, 3388,
                                              510, 742, 7192, 8806, 10262, 3444, 7951, 2170, 1318, 2]])}
    :param batch_idx: index of the batch
    :return:
    """
    # load tokenizer
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

    # load config for GSM
    config = yaml_load(f"{self.default_root_dir}/data/config/gsm.yaml")

    # load dict
    dictionary = Dictionary.load(datapath('dict-www-cnndm-unigram'))

    # remove [SEP]
    sep_list = [
        '[SEP_0]', '[SEP_1]', '[SEP_2]', '[SEP_3]', '[SEP_4]',
        '[SEP_5]', '[SEP_6]', '[SEP_7]', '[SEP_8]', '[SEP_9]', '<S_SEP>'
    ]

    # vocab size for topic modeling
    vocab_size = len(dictionary)

    # model
    config['hidden']['features'][0] = vocab_size

    # trainer batch
    config['trainer_batch']['test_sample'] = 1
    config = extend_config_reference(config)
    gsm_trainer = config['GSMtrainer']
    gsm_trainer['base_dir'] = f"{self.default_root_dir}/log/bart-large-cnn-finetune"
    gsm_trainer = GSMTrainer.from_config(gsm_trainer)

    # number of topics
    K = config['gsmtopic']['k']

    # yaml_dump(gsm_trainer,
    #           os.path.join(f"{self.default_root_dir}/log/bart-large-cnn-finetune", "gsm_trainer.yaml"))

    # -----------------------------------------
    # Topic Modeling - GSM
    # -----------------------------------------
    batch_size = batch['input_ids'].size()[0]

    docs = []
    for batch_num in range(batch_size):
        # extract the batch_sentence
        batch_sentence = tokenizer.decode(batch['input_ids'][batch_num].tolist(), skip_special_tokens=True)
        # split to list
        batch_sentence_list = batch_sentence.split(" ")
        # remove [SEP]
        batch_sentence_list_nosep = [item for item in batch_sentence_list if item not in sep_list]
        text = ' '.join([x for x in batch_sentence_list_nosep])
        # merge word pieces and change to lowercase
        fine_text = text.replace(' ##', '').lower()
        batch_sentence = re.sub(r'[^\w\s]', '', fine_text)
        # batch_sentence: the cleaned news text for topic modeling
        # change to training data format in topic modeling
        gsm_data_bow = dictionary.doc2bow(batch_sentence.split(" "))
        docs.append(gsm_data_bow)

    # gsm_data: data for topic modeling
    gsm_data = DataLoader(
        DocDataset(docs, len(dictionary), device='cuda'),
        batch_size=config['dataset']['batch_size'],
        drop_last=False,
        num_workers=0)

    gsm_trainer.__dict__['train_iterator'] = gsm_data

    gsm_loss, gsm_p = gsm_trainer.co_train(vocab_size, training=True)

    del gsm_data

    # track grad norms
    grad_norm_dic = {}

    # track all metrics for callbacks
    batch_callback_metrics = []

    # track metrics to log
    batch_log_metrics = []

    if batch is None:
        return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

    # Batch start events
    with self.profiler.profile('on_batch_start'):
        # callbacks
        self.on_batch_start()
        # hooks
        if self.is_function_implemented('on_batch_start'):
            response = self.get_model().on_batch_start(batch)
            if response == -1:
                return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    splits = [batch]
    if self.truncated_bptt_steps is not None:
        model_ref = self.get_model()
        with self.profiler.profile('tbptt_split_batch'):
            splits = model_ref.tbptt_split_batch(batch, self.truncated_bptt_steps)

    self.hiddens = None
    for split_idx, split_batch in enumerate(splits):
        self.split_idx = split_idx

        for opt_idx, optimizer in self._get_optimizers_iterable():
            # make sure only the gradients of the current optimizer's parameters are calculated
            # in the training step to prevent dangling gradients in multiple-optimizer setup.
            if len(self.optimizers) > 1:
                for param in self.get_model().parameters():
                    param.requires_grad = False
                for group in optimizer.param_groups:
                    for param in group['params']:
                        param.requires_grad = True

            # -------------------
            # calculate loss
            # -------------------
            beta = 0.01
            opt_closure_result = self.optimizer_closure(
                split_batch,
                batch_idx,
                opt_idx,
                optimizer,
                self.hiddens,
                gsm_p,     # topic distribution
                gsm_loss,  # loss for topic modeling
                K,         # number of topics
                beta,
            )

            # ------------------------------
            # POST forward bookkeeping
            # ------------------------------
            batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics)
            batch_log_metrics.append(opt_closure_result.training_step_output.log_metrics)
            self.add_progress_bar_metrics(opt_closure_result.training_step_output.pbar_on_batch_end)

            # track hiddens
            self.hiddens = opt_closure_result.hiddens

            # check if loss or model weights are nan
            if self.terminate_on_nan:
                self.detect_nan_tensors(opt_closure_result.loss)

            # track total loss for logging (avoid mem leaks)
            self.batch_loss_value.append(opt_closure_result.loss)

            # ------------------------------
            # BACKWARD PASS
            # ------------------------------
            # gradient update with accumulated gradients
            if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
                # backward
                grad_norm_dic = self.run_batch_backward_pass(split_batch, batch_idx, opt_idx, optimizer)

                # calculate running loss for display
                self.running_loss.append(self.batch_loss_value.mean())

                # reset for next set of accumulated grads
                self.batch_loss_value.reset()

    # Batch end events
    with self.profiler.profile('on_batch_end'):
        # callbacks
        self.on_batch_end()
        # model hooks
        if self.is_function_implemented('on_batch_end'):
            self.get_model().on_batch_end()

    # collapse all metrics into one dict
    batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}

    # track all metrics for callbacks
    self.callback_metrics.update({k: v for d in batch_callback_metrics for k, v in d.items()})

    result = AttributeDict(
        signal=0,
        grad_norm_dic=grad_norm_dic,
        batch_log_metrics=batch_log_metrics,
        training_step_output_for_epoch_end=opt_closure_result.training_step_output_for_epoch_end)
    return result
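The `tbptt_split_batch` call above slices a long sequence into windows for truncated backpropagation through time, carrying hidden state between splits. A minimal sketch of roughly what the default split does for tensor batches, assuming the time dimension is dim 1:

# Hedged sketch of TBPTT splitting along the time dimension.
import torch

def tbptt_split_batch(batch, steps):
    return [batch[:, t:t + steps] for t in range(0, batch.size(1), steps)]

seq = torch.arange(20).reshape(2, 10)   # (batch, time)
splits = tbptt_split_batch(seq, 4)
assert [s.size(1) for s in splits] == [4, 4, 2]  # last window is truncated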
def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens,
                      gsm_p, gsm_loss, K, beta=0.01):
    """
    wrap the forward step in a closure so second order methods work
    """
    # ---------------------------
    # FORWARD
    # ---------------------------
    with self.profiler.profile('model_forward'):
        if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu:
            with torch.cuda.amp.autocast():
                training_step_output = self.training_forward(
                    split_batch, batch_idx, opt_idx, hiddens, gsm_p, gsm_loss, K)
        else:
            training_step_output = self.training_forward(
                split_batch, batch_idx, opt_idx, hiddens, gsm_p, gsm_loss, K)

        # ----------------------------
        # PROCESS THE RESULT
        # ----------------------------
        # format and reduce outputs accordingly
        training_step_output_for_epoch_end = training_step_output
        training_step_output = self.process_output(training_step_output, train=True)

        training_step_output = AttributeDict(
            batch_loss=training_step_output[0],
            pbar_on_batch_end=training_step_output[1],
            log_metrics=training_step_output[2],
            callback_metrics=training_step_output[3],
            hiddens=training_step_output[4],
        )

        # if the user decides to finally reduce things in epoch_end, save raw output without graphs
        training_step_output_for_epoch_end = recursive_detach(training_step_output_for_epoch_end)

    # accumulate loss
    # (if accumulate_grad_batches = 1 no effect)
    # todo: check self.accumulate_grad_batches
    closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches

    # ----------------------------
    # Calculate total loss
    # ----------------------------
    # closure_loss = (1 - beta) * closure_loss + beta * gsm_loss

    # the loss will get scaled for amp. avoid any modifications to it
    untouched_loss = closure_loss.detach().clone()

    # backward pass
    model_ref = self.get_model()
    with self.profiler.profile('model_backward'):
        # scale loss for 16 bit
        if self.precision == 16 and not self.on_tpu:
            closure_loss = model_ref.amp_scale_loss(closure_loss, optimizer, opt_idx)

            # enter amp context
            if not NATIVE_AMP_AVALAIBLE:
                context = closure_loss
                closure_loss = closure_loss.__enter__()

        # do backward pass
        model_ref.backward(self, closure_loss, optimizer, opt_idx)

        # exit amp context
        if self.precision == 16 and not NATIVE_AMP_AVALAIBLE and not self.on_tpu:
            a, b, c = None, None, None
            error = context.__exit__(a, b, c)
            if error:
                rank_zero_warn(a, b, c)
                raise Exception('apex unscale error')

        # once backward has been applied, release graph
        closure_loss = closure_loss.detach()
        training_step_output.batch_loss = training_step_output.batch_loss.detach()

    if self.use_horovod:
        # Synchronize Horovod to ensure gradient manipulations (e.g., loss scaling) are valid
        optimizer.synchronize()

    # insert after step hook
    if self.is_function_implemented('on_after_backward'):
        model_ref = self.get_model()
        with self.profiler.profile('on_after_backward'):
            model_ref.on_after_backward()

    result = AttributeDict(
        loss=untouched_loss,
        training_step_output=training_step_output,
        training_step_output_for_epoch_end=training_step_output_for_epoch_end,
        hiddens=training_step_output.hiddens,
    )
    return result
def get_config():
    parser = argparse.ArgumentParser(
        add_help=False,
        description='multi-label learning for text classification')

    # load params from config file
    parser.add_argument('-c', '--config', help='Path to configuration file')
    args, _ = parser.parse_known_args()
    config = {}
    if args.config:
        with open(args.config) as fp:
            config = yaml.load(fp, Loader=yaml.SafeLoader)

    # path / directory
    parser.add_argument('--data_dir', default='./data/rcv1',
                        help='The directory to load data (default: %(default)s)')
    parser.add_argument('--result_dir', default='./runs',
                        help='The directory to save checkpoints and logs (default: %(default)s)')

    # data
    parser.add_argument('--data_name', default='rcv1',
                        help='Dataset name (default: %(default)s)')
    parser.add_argument('--train_path',
                        help='Path to training data (default: [data_dir]/train.txt)')
    parser.add_argument('--val_path',
                        help='Path to validation data (default: [data_dir]/valid.txt)')
    parser.add_argument('--test_path',
                        help='Path to test data (default: [data_dir]/test.txt)')
    parser.add_argument('--val_size', type=float, default=0.2,
                        help='Training-validation split: a ratio in [0, 1] or an integer for the size '
                             'of the validation set (default: %(default)s).')
    parser.add_argument('--min_vocab_freq', type=int, default=1,
                        help='The minimum frequency needed to include a token in the vocabulary (default: %(default)s)')
    parser.add_argument('--max_seq_length', type=int, default=500,
                        help='The maximum number of tokens of a sample (default: %(default)s)')
    parser.add_argument('--shuffle', type=bool, default=True,
                        help='Whether to shuffle training data before each epoch (default: %(default)s)')

    # train
    parser.add_argument('--seed', type=int,
                        help='Random seed (default: %(default)s)')
    parser.add_argument('--epochs', type=int, default=10000,
                        help='Number of epochs to train (default: %(default)s)')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Size of training batches (default: %(default)s)')
    parser.add_argument('--optimizer', default='adam', choices=['adam', 'sgd'],
                        help='Optimizer: SGD or Adam (default: %(default)s)')
    parser.add_argument('--learning_rate', type=float, default=0.0001,
                        help='Learning rate for optimizer (default: %(default)s)')
    parser.add_argument('--weight_decay', type=float, default=0,
                        help='Weight decay factor (default: %(default)s)')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='Momentum factor for SGD only (default: %(default)s)')
    parser.add_argument('--patience', type=int, default=5,
                        help='Number of epochs to wait for improvement before early stopping (default: %(default)s)')

    # model
    parser.add_argument('--model_name', default='KimCNN',
                        help='Model to be used (default: %(default)s)')
    parser.add_argument('--init_weight', default='kaiming_uniform',
                        help='Weight initialization to be used (default: %(default)s)')
    parser.add_argument('--activation', default='relu',
                        help='Activation function to be used (default: %(default)s)')
    parser.add_argument('--num_filter_per_size', type=int, default=128,
                        help='Number of filters in convolutional layers in each size (default: %(default)s)')
    parser.add_argument('--filter_sizes', type=int, nargs='+', default=[4],
                        help='Size of convolutional filters (default: %(default)s)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='Optional specification of dropout (default: %(default)s)')
    parser.add_argument('--dropout2', type=float, default=0.2,
                        help='Optional specification of the second dropout (default: %(default)s)')
    parser.add_argument('--num_pool', type=int, default=1,
                        help='Number of pool for dynamic max-pooling (default: %(default)s)')

    # eval
    parser.add_argument('--eval_batch_size', type=int, default=256,
                        help='Size of evaluating batches (default: %(default)s)')
    parser.add_argument('--metric_threshold', type=float, default=0.5,
                        help='Thresholds to monitor for metrics (default: %(default)s)')
    parser.add_argument('--monitor_metrics', nargs='+', default=['P@1', 'P@3', 'P@5'],
                        help='Metrics to monitor while validating (default: %(default)s)')
    parser.add_argument('--val_metric', default='P@1',
                        help='The metric to monitor for early stopping (default: %(default)s)')

    # pretrained vocab / embeddings
    parser.add_argument('--vocab_file', type=str,
                        help='Path to a file holding vocabularies (default: %(default)s)')
    parser.add_argument('--embed_file', type=str,
                        help='Path to a file holding pre-trained embeddings (default: %(default)s)')
    parser.add_argument('--label_file', type=str,
                        help='Path to a file holding all labels (default: %(default)s)')

    # log
    parser.add_argument('--save_k_predictions', type=int, nargs='?', const=100, default=0,
                        help='Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)')
    parser.add_argument('--predict_out_path',
                        help='Path to an output file holding top 100 label results (default: %(default)s)')

    # others
    parser.add_argument('--cpu', action='store_true', help='Disable CUDA')
    parser.add_argument('--silent', action='store_true', help='Enable silent mode')
    parser.add_argument('--data_workers', type=int, default=4,
                        help='Number of CPU cores to use for data pre-processing (default: %(default)s)')
    parser.add_argument('--embed_cache_dir', type=str,
                        help='For parameter search only: path to a directory for storing embeddings '
                             'for multiple runs. (default: %(default)s)')
    parser.add_argument('--eval', action='store_true',
                        help='Only run evaluation on the test set (default: %(default)s)')
    parser.add_argument('--checkpoint_path',
                        help='The checkpoint to warm-up with (default: %(default)s)')
    parser.add_argument('-h', '--help', action='help')

    parser.set_defaults(**config)
    args = parser.parse_args()
    config = AttributeDict(vars(args))
    return config
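The `parser.set_defaults(**config)` call above gives a clean precedence order: explicit command-line flags beat YAML values, which beat the hard-coded argparse defaults. A minimal sketch of that mechanism (values are illustrative):

# Hedged sketch of YAML-over-default, CLI-over-YAML precedence via argparse.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=16)
parser.set_defaults(**{"batch_size": 32})  # as if read from a YAML config

assert parser.parse_args([]).batch_size == 32                    # YAML beats default
assert parser.parse_args(["--batch_size", "8"]).batch_size == 8  # CLI beats YAML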
DEFAULT_MODEL_PARAMS = AttributeDict(
    **{
        "name": "PoseNet",
        "feature_extractor": AttributeDict(pretrained=True),
        "criterion": {
            "name": "SE3Criterion",
            "rotation_koef": -3.0,
            "translation_koef": -3.0,
            "use_se3_translation": True,
            "loss_type": "l2",
            "koef_requires_grad": True,
            "lr": 0.0001
        },
        "feature_dimension": 2048,
        "drop_rate": 0,
        "optimizer": AttributeDict(
            betas="0.9 0.999",
            lr=0.0001,
            weight_decay=0.0005,
        ),
        "scheduler": {
            "step_size": 20,
            "gamma": 0.5,
        },
        "bias": True,
        "activation": "tanh",
        "pretrained": True
    })
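A usage note on the structure above, as a small sketch: attribute access only reaches the levels that are themselves AttributeDicts; plain-dict members such as "criterion" and "scheduler" still need item access. This assumes the AttributeDict semantics shown earlier in this section.

# Hedged usage sketch for the nested defaults above.
params = DEFAULT_MODEL_PARAMS
assert params.optimizer.lr == 0.0001          # nested AttributeDict: attribute access
assert params.criterion["loss_type"] == "l2"  # plain dict member: item access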
def run(self, args: AttributeDict):
    """Run hyperparameter search using the `tune.schedulers.ASHAScheduler`

    Args:
        args (AttributeDict): Arguments

    Side-effects:
        Saves logs to `TUNE_LOGS_PATH / args.id`
    """
    try:
        from ray import tune
        from ray.tune.integration.pytorch_lightning import (
            TuneReportCheckpointCallback,
        )
    except ModuleNotFoundError as e:  # pragma: no cover
        logger.error(
            "To use hyperparameter search, first install Ray Tune via `pip install 'ray[tune]'` "
            "or `pip install 'ride[extras]'`"
        )
        raise e

    if not hasattr(args, "id"):
        args.id = "hparamsearch"

    module_config = (
        Configs.from_file(args.from_hparam_space_file)
        if args.from_hparam_space_file
        else self.Module.configs()
    ).tune_config()

    config = {
        **dict(args),
        **module_config,
        # pl.Trainer args:
        "gpus": args.gpus_per_trial,
        "logger": False,
        "accumulate_grad_batches": (
            (8 // args.gpus_per_trial) * args.accumulate_grad_batches
            if args.gpus_per_trial
            else args.accumulate_grad_batches
        ),
    }
    scheduler = tune.schedulers.ASHAScheduler(
        metric=f"val/{args.optimization_metric}",
        mode=self.Module.metrics()[args.optimization_metric].value,
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2,
    )
    metric_names = [f"val/{m}" for m in self.Module.metrics().keys()]
    reporter = tune.CLIReporter(
        metric_columns=[*metric_names, "training_iteration"],
    )
    tune_callbacks = [
        TuneReportCheckpointCallback(
            metrics=metric_names,
            filename="checkpoint",
            on="validation_end",
        )
    ]
    cpus_per_trial = max(
        1,
        (
            min(10 * args.gpus_per_trial, NUM_CPU - 10)
            if args.gpus_per_trial
            else min(10, NUM_CPU - 2)
        ),
    )
    analysis = tune.run(
        partial(
            Runner.static_train_and_val,
            self.Module,
            trainer_callbacks=tune_callbacks,
        ),
        name=args.id,
        local_dir=str(TUNE_LOGS_PATH),
        resources_per_trial={"cpu": cpus_per_trial, "gpu": args.gpus_per_trial},
        config=config,
        num_samples=args.trials,
        scheduler=scheduler,
        progress_reporter=reporter,
        raise_on_failed_trial=False,
    )

    best_hparams = analysis.get_best_config(
        metric=f"val/{args.optimization_metric}",
        mode=self.Module.metrics()[args.optimization_metric].value,
        scope="all",
    )
    # Select only model parameters
    if best_hparams:
        best_hparams = {
            k: best_hparams[k]
            for k in [
                *self.Module.configs().names,
                # Trainer parameters that influence model hparams:
                "accumulate_grad_batches",
                "batch_size",
                "gpus",
            ]
        }

    return best_hparams
def run_training_batch(self, batch, batch_idx):
    # track grad norms
    grad_norm_dic = {}

    # track all metrics for callbacks
    batch_callback_metrics = []

    # track metrics to log
    batch_log_metrics = []

    using_results_obj = False

    # track all outputs across time and num of optimizers
    batch_outputs = [[] for _ in range(len(self._get_optimizers_iterable()))]

    if batch is None:
        return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

    # Batch start events
    # TODO: deprecate 1.0
    with self.profiler.profile('on_batch_start'):
        # callbacks
        self.on_batch_start()
        # hooks
        if self.is_function_implemented('on_batch_start'):
            response = self.get_model().on_batch_start(batch)
            if response == -1:
                return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    with self.profiler.profile('on_train_batch_start'):
        # forward support for multiple loaders
        dataloader_idx = 0
        self.on_train_batch_start(batch, batch_idx, dataloader_idx)
        # hooks
        if self.is_function_implemented('on_train_batch_start'):
            response = self.get_model().on_train_batch_start(batch, batch_idx, dataloader_idx)
            if response == -1:
                return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    splits = [batch]
    if self.truncated_bptt_steps is not None:
        model_ref = self.get_model()
        with self.profiler.profile('tbptt_split_batch'):
            splits = model_ref.tbptt_split_batch(batch, self.truncated_bptt_steps)

    self.hiddens = None
    for split_idx, split_batch in enumerate(splits):
        self.split_idx = split_idx

        for opt_idx, optimizer in self._get_optimizers_iterable():
            # make sure only the gradients of the current optimizer's parameters are calculated
            # in the training step to prevent dangling gradients in multiple-optimizer setup.
            if len(self.optimizers) > 1:
                for param in self.get_model().parameters():
                    param.requires_grad = False
                for group in optimizer.param_groups:
                    for param in group['params']:
                        param.requires_grad = True

            # -------------------
            # calculate loss (train step + train step end)
            # -------------------
            opt_closure_result = self.optimizer_closure(
                split_batch,
                batch_idx,
                opt_idx,
                optimizer,
                self.hiddens
            )
            using_results_obj = isinstance(opt_closure_result.training_step_output, Result)

            # ------------------------------
            # POST forward bookkeeping
            # ------------------------------
            batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics)

            # add metrics to loggers
            if using_results_obj:
                metrics_to_log = opt_closure_result.training_step_output.batch_log_metrics
                step_pbar_metrics = opt_closure_result.training_step_output.batch_pbar_metrics
            else:
                metrics_to_log = opt_closure_result.training_step_output.log_metrics
                step_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end

            # track metrics
            batch_log_metrics.append(metrics_to_log)
            if len(step_pbar_metrics) > 0:
                self.add_progress_bar_metrics(step_pbar_metrics)

            # track hiddens
            self.hiddens = opt_closure_result.hiddens
            if using_results_obj:
                opt_closure_result.training_step_output_for_epoch_end.drop_hiddens()

            # check if loss or model weights are nan
            if self.terminate_on_nan:
                self.detect_nan_tensors(opt_closure_result.loss)

            # track total loss for logging (avoid mem leaks)
            self.batch_loss_value.append(opt_closure_result.loss)

            # track all the outputs across all steps
            batch_outputs[opt_idx].append(opt_closure_result.training_step_output_for_epoch_end)

            # ------------------------------
            # BACKWARD PASS
            # ------------------------------
            # gradient update with accumulated gradients
            if ((self.batch_idx + 1) % self.accumulate_grad_batches == 0
                    or (self.batch_idx + 1) == self.num_training_batches):
                # backward
                grad_norm_dic = self.run_batch_backward_pass(split_batch, batch_idx, opt_idx, optimizer)

                # calculate running loss for display
                self.running_loss.append(self.batch_loss_value.mean() * self.accumulate_grad_batches)

                # reset for next set of accumulated grads
                self.batch_loss_value.reset()

    # Batch end events
    with self.profiler.profile('on_batch_end'):
        # callbacks
        self.on_batch_end()
        # model hooks
        if self.is_function_implemented('on_batch_end'):
            self.get_model().on_batch_end()

    with self.profiler.profile('on_train_batch_end'):
        # forward support for multiple loaders
        dataloader_idx = 0
        self.on_train_batch_end(batch, batch_idx, dataloader_idx)
        # model hooks
        if self.is_function_implemented('on_train_batch_end'):
            self.get_model().on_train_batch_end(batch, batch_idx, dataloader_idx)

    # collapse all metrics into one dict
    batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}

    # track all metrics for callbacks
    if not using_results_obj:
        self.callback_metrics.update({k: v for d in batch_callback_metrics for k, v in d.items()})

    result = AttributeDict(
        signal=0,
        grad_norm_dic=grad_norm_dic,
        batch_log_metrics=batch_log_metrics,
        training_step_output_for_epoch_end=batch_outputs
    )
    return result
def __init__(self, trainer, cluster_environment=None):
    self.trainer = trainer
    self.cluster_environment = cluster_environment
    self.dist = AttributeDict(rank=0, device=None)
def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens):
    """
    wrap the forward step in a closure so second order methods work
    """
    # ---------------------------
    # FORWARD (TRAINING STEP + TRAIN STEP END)
    # ---------------------------
    with self.profiler.profile('model_forward'):
        args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)
        training_step_output = self.accelerator_backend.training_step(args)
        training_step_output = self.call_hook('training_step_end', training_step_output)

        # ----------------------------
        # PROCESS THE RESULT
        # ----------------------------
        # format and reduce outputs accordingly
        training_step_output_for_epoch_end = training_step_output
        is_result_obj = isinstance(training_step_output, Result)

        # track batch size for weighted average
        if is_result_obj:
            training_step_output.track_batch_size(len(split_batch))

        # don't allow EvalResult in the training_step
        if isinstance(training_step_output, EvalResult):
            raise MisconfigurationException('training_step cannot return EvalResult, '
                                            'use a dict or TrainResult instead')

        # handle regular dicts
        if not is_result_obj:
            training_step_output = self.process_output(training_step_output, train=True)

            training_step_output = AttributeDict(
                batch_loss=training_step_output[0],
                pbar_on_batch_end=training_step_output[1],
                log_metrics=training_step_output[2],
                callback_metrics=training_step_output[3],
                hiddens=training_step_output[4],
            )

        # if the user decides to finally reduce things in epoch_end, save raw output without graphs
        if isinstance(training_step_output_for_epoch_end, torch.Tensor):
            training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach()
        elif is_result_obj:
            training_step_output_for_epoch_end = copy(training_step_output)
            training_step_output_for_epoch_end.detach()
        else:
            training_step_output_for_epoch_end = recursive_detach(training_step_output_for_epoch_end)

    # accumulate loss
    # (if accumulate_grad_batches = 1 no effect)
    closure_loss = training_step_output.minimize if is_result_obj else training_step_output.batch_loss
    closure_loss = closure_loss / self.accumulate_grad_batches

    # the loss will get scaled for amp. avoid any modifications to it
    untouched_loss = closure_loss.detach().clone()

    # backward pass
    model_ref = self.get_model()
    with self.profiler.profile('model_backward'):
        # scale loss for 16 bit
        if self.precision == 16 and not self.on_tpu:
            closure_loss = model_ref.amp_scale_loss(closure_loss, optimizer, opt_idx,
                                                    amp_backend=self.amp_backend)

            # enter amp context
            if self.amp_backend == AMPType.APEX:
                self.dev_debugger.track_event('AMP', str(AMPType.APEX))
                context = closure_loss
                closure_loss = closure_loss.__enter__()

        # do backward pass
        model_ref.backward(self, closure_loss, optimizer, opt_idx)

        # exit amp context
        if self.precision == 16 and self.amp_backend == AMPType.APEX and not self.on_tpu:
            a, b, c = None, None, None
            error = context.__exit__(a, b, c)
            if error:
                rank_zero_warn(a, b, c)
                raise Exception('apex unscale error')

        # once backward has been applied, release graph
        closure_loss = closure_loss.detach()

        if is_result_obj:
            training_step_output.detach()
        else:
            training_step_output.batch_loss = training_step_output.batch_loss.detach()

    if self.use_horovod:
        # Synchronize Horovod to ensure gradient manipulations (e.g., loss scaling) are valid
        optimizer.synchronize()

    # insert after step hook
    if self.is_function_implemented('on_after_backward'):
        model_ref = self.get_model()
        with self.profiler.profile('on_after_backward'):
            model_ref.on_after_backward()

    # when in dev debugging track the losses
    self.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach())

    result = AttributeDict(
        loss=untouched_loss,
        training_step_output=training_step_output,
        training_step_output_for_epoch_end=training_step_output_for_epoch_end,
        hiddens=training_step_output.hiddens,
    )
    return result
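The docstring above explains why the forward step is wrapped in a closure: second-order optimizers such as torch.optim.LBFGS re-evaluate the loss several times per step, so they are handed a callable instead of a precomputed value. A minimal self-contained sketch of that mechanism (the tiny model and data are illustrative):

# Hedged sketch: LBFGS calls the closure itself, possibly multiple times.
import torch

model = torch.nn.Linear(3, 1)
optimizer = torch.optim.LBFGS(model.parameters(), lr=0.1)
x, y = torch.randn(8, 3), torch.randn(8, 1)

def closure():
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    return loss

optimizer.step(closure)  # LBFGS may re-invoke closure() internally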
    data = MedNIST(hparams)
    # embed()
    model = YourModel(hparams)
    trainer.fit(model, data)


if __name__ == "__main__":
    hparams = AttributeDict({
        'accel': None,
        'autolr': False,
        'batch_size': 2,
        'check_val_n': 1,
        'dev': False,
        'gpus': None,
        'log_path': DATADIR.joinpath('logs'),
        'lr': 0.0001,
        'lr_schedule': 'ROP',
        'max_epochs': 100,
        'num_nodes': 1,
        'num_workers': 0,
        'pl_ver': pl.__version__,
        'seed': 22117,
        'weight_decay': 1e-07
    })
    train(hparams)
def run_training_batch(self, batch, batch_idx, dataloader_idx):
    # track grad norms
    grad_norm_dic = {}

    # bookkeeping
    self.trainer.hiddens = None

    # track all outputs across time and num of optimizers
    batch_outputs = [[] for _ in range(len(self.get_optimizers_iterable()))]

    if batch is None:
        return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

    # hook
    response = self.trainer.call_hook("on_batch_start")
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    # hook
    response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
    if response == -1:
        return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

    # lightning module hook
    splits = self.tbptt_split_batch(batch)

    for split_idx, split_batch in enumerate(splits):

        # create an iterable for optimizers and loop over them
        for opt_idx, optimizer in self.prepare_optimizers():

            # toggle model params + set info to logger_connector
            self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer)

            if self.should_accumulate():
                # For gradient accumulation

                # -------------------
                # calculate loss (train step + train step end)
                # -------------------

                # automatic_optimization=True: perform DDP sync only when performing optimizer_step
                # automatic_optimization=False: don't block synchronization here
                with self.block_ddp_sync_behaviour():
                    self.training_step_and_backward(
                        split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                    )

                batch_outputs = self._process_closure_result(
                    batch_outputs=batch_outputs,
                    opt_idx=opt_idx,
                )

            # ------------------------------
            # BACKWARD PASS
            # ------------------------------
            # gradient update with accumulated gradients
            else:
                if self.automatic_optimization:

                    def train_step_and_backward_closure():
                        result = self.training_step_and_backward(
                            split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
                        )
                        return None if result is None else result.loss

                    # optimizer step
                    self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)

                else:
                    self._curr_step_result = self.training_step(
                        split_batch, batch_idx, opt_idx, self.trainer.hiddens
                    )

                    if self._curr_step_result is None:
                        # user decided to skip optimization
                        # make sure to zero grad.
                        continue

                batch_outputs = self._process_closure_result(
                    batch_outputs=batch_outputs,
                    opt_idx=opt_idx,
                )

                # todo: Properly aggregate grad_norm across opt_idx and split_idx
                grad_norm_dic = self._cur_grad_norm_dict
                self._cur_grad_norm_dict = None

                # update running loss + reset accumulated loss
                self.update_running_loss()

    result = AttributeDict(
        signal=0,
        grad_norm_dic=grad_norm_dic,
        training_step_output_for_epoch_end=batch_outputs,
    )
    return result
def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens):
    """
    wrap the forward step in a closure so second order methods work
    """
    # ---------------------------
    # FORWARD
    # ---------------------------
    with self.profiler.profile('model_forward'):
        if self.use_amp and NATIVE_AMP_AVALAIBLE:
            with torch.cuda.amp.autocast():
                training_step_output = self.training_forward(split_batch, batch_idx, opt_idx, hiddens)
        else:
            training_step_output = self.training_forward(split_batch, batch_idx, opt_idx, hiddens)

        # ----------------------------
        # PROCESS THE RESULT
        # ----------------------------
        # format and reduce outputs accordingly
        training_step_output = self.process_output(training_step_output, train=True)

        # TODO: temporary part of structured results PR
        training_step_output = AttributeDict(
            batch_loss=training_step_output[0],
            pbar_on_batch_end=training_step_output[1],
            log_metrics=training_step_output[2],
            callback_metrics=training_step_output[3],
            hiddens=training_step_output[4],
        )

        # if the user decides to finally reduce things in epoch_end, save raw output without graphs
        training_step_output_for_epoch_end = recursive_detach(training_step_output)

    # accumulate loss
    # (if accumulate_grad_batches = 1 no effect)
    closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches

    # backward pass
    model_ref = self.get_model()
    with self.profiler.profile('model_backward'):
        # scale loss for 16 bit
        if self.precision == 16 and not self.on_tpu:
            closure_loss = model_ref.amp_scale_loss(closure_loss, optimizer, opt_idx)

        # do backward pass
        model_ref.backward(self, closure_loss, optimizer, opt_idx)

        # once backward has been applied, release graph
        closure_loss = closure_loss.detach()
        training_step_output.batch_loss = training_step_output.batch_loss.detach()

    if self.use_horovod:
        # Synchronize Horovod to ensure gradient manipulations (e.g., loss scaling) are valid
        optimizer.synchronize()

    # insert after step hook
    if self.is_function_implemented('on_after_backward'):
        model_ref = self.get_model()
        with self.profiler.profile('on_after_backward'):
            model_ref.on_after_backward()

    result = AttributeDict(
        loss=closure_loss,
        training_step_output=training_step_output,
        training_step_output_for_epoch_end=training_step_output_for_epoch_end,
        hiddens=training_step_output.hiddens,
    )
    return result
def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
    with self.trainer.profiler.profile('model_forward'):
        args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)
        training_step_output = self.trainer.accelerator_backend.training_step(args)
        training_step_output = self.trainer.call_hook('training_step_end', training_step_output)

        # ----------------------------
        # PROCESS THE RESULT
        # ----------------------------
        # format and reduce outputs accordingly
        training_step_output_for_epoch_end = training_step_output
        is_result_obj = isinstance(training_step_output, Result)

        # track batch size for weighted average
        if is_result_obj:
            training_step_output.track_batch_size(len(split_batch))

        # don't allow EvalResult in the training_step
        if isinstance(training_step_output, EvalResult):
            raise MisconfigurationException('training_step cannot return EvalResult, '
                                            'use a dict or TrainResult instead')

        # handle regular dicts
        if not is_result_obj:
            training_step_output = self.trainer.process_output(training_step_output, train=True)

            training_step_output = AttributeDict(
                batch_loss=training_step_output[0],
                pbar_on_batch_end=training_step_output[1],
                log_metrics=training_step_output[2],
                callback_metrics=training_step_output[3],
                hiddens=training_step_output[4],
            )

        # if the user decides to finally reduce things in epoch_end, save raw output without graphs
        if isinstance(training_step_output_for_epoch_end, torch.Tensor):
            training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach()
        elif is_result_obj:
            training_step_output_for_epoch_end = copy(training_step_output)
            training_step_output_for_epoch_end.detach()
        else:
            training_step_output_for_epoch_end = recursive_detach(training_step_output_for_epoch_end)

    # accumulate loss
    # (if accumulate_grad_batches = 1 no effect)
    closure_loss = training_step_output.minimize if is_result_obj else training_step_output.batch_loss
    closure_loss = closure_loss / self.trainer.accumulate_grad_batches

    # the loss will get scaled for amp. avoid any modifications to it
    untouched_loss = closure_loss.detach().clone()

    # result
    result = AttributeDict(
        closure_loss=closure_loss,
        loss=untouched_loss,
        training_step_output=training_step_output,
        training_step_output_for_epoch_end=training_step_output_for_epoch_end,
        hiddens=training_step_output.hiddens,
    )
    return result
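The division of `closure_loss` by `accumulate_grad_batches` above is what makes gradient accumulation equivalent to training on the larger effective batch. A worked self-contained sketch of the arithmetic (illustrative scalar "losses"):

# Hedged sketch: scaling each micro-batch loss by 1/N makes the summed
# (accumulated) gradients equal the gradient of the mean loss over N batches.
import torch

w = torch.tensor(1.0, requires_grad=True)
losses = [w * 2.0, w * 4.0]            # two micro-batches
for loss in losses:
    (loss / len(losses)).backward()    # scaled, gradients accumulate in w.grad
assert w.grad.item() == 3.0            # == mean of the raw gradients, (2 + 4) / 2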
def main(params):
    # Display date and time
    print(datetime.datetime.now())
    print(AttributeDict(vars(params)))

    # Load training data
    train_dataset = COCO(
        directory_a=params.modified,
        directory_b=params.original,
        num_training=params.num_training,
        num_validation=params.num_validation,
        size=params.size,
        channels=params.channels,
        shuffle=params.shuffle,
        cache=params.cache,
        validation=False
    )
    train_loader = setup.load(train_dataset, params.batch_size, False)

    # Load validation data
    val_dataset = COCO(
        directory_a=params.modified,
        directory_b=params.original,
        num_training=params.num_training,
        num_validation=params.num_validation,
        size=params.size,
        channels=params.channels,
        shuffle=params.shuffle,
        cache=params.cache,
        validation=True
    )
    val_loader = setup.load(val_dataset, params.batch_size, False)

    # Create model
    model = CycleGAN(
        train_loader=train_loader,
        val_loader=val_loader,
        batch_size=params.batch_size,
        iterations=params.epochs * params.num_training,
        in_channels=params.channels,
        out_channels=params.channels,
        g_filters=params.g_filters,
        d_filters=params.d_filters,
        residual_blocks=params.residual_blocks,
        dropout=params.dropout,
        skip=params.skip,
        learning_rate=params.lr,
        beta_1=params.b1,
        beta_2=params.b2,
        init_type=params.init_type,
        init_scale=params.init_scale,
        pool_size_a=params.pool_size_a,
        pool_size_b=params.pool_size_b,
        lambda_dis_a=params.lambda_dis_a,
        lambda_dis_b=params.lambda_dis_b,
        lambda_gen_a=params.lambda_gen_a,
        lambda_gen_b=params.lambda_gen_b,
        lambda_cycle_a=params.lambda_cycle_a,
        lambda_cycle_b=params.lambda_cycle_b,
        lambda_id_a=params.lambda_id_a,
        lambda_id_b=params.lambda_id_b,
        shuffle=params.shuffle
    )

    # Set up trainer
    checkpoint = ModelCheckpoint(
        dirpath='checkpoints',
        filename=params.prefix + '_{epoch:03d}',
        save_top_k=1 if params.save_top_only else -1,
        monitor='val_loss',
        verbose=True
    )
    if setup.cuda_is_available():
        trainer = Trainer(
            accelerator='ddp',
            gpus=setup.cuda_device_count(),
            callbacks=[checkpoint],
            max_epochs=params.epochs,
            precision=params.precision
        )
    else:
        trainer = Trainer(
            callbacks=[checkpoint],
            max_epochs=params.epochs,
            precision=params.precision
        )

    # Train
    trainer.fit(model)