def objective(trial, args):
    """Optuna objective: train LitLSTM model(s) for one trial and return the validation loss."""
    params = get_trial_params(trial)
    # hidden_size and acc_grads are sampled as exponents and mapped to powers of two
    params['hidden_size'] = 2 ** params['hidden_size']
    params['acc_grads'] = 2 ** params['acc_grads']

    early_stopper = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3, mode='min')
    callbacks = [early_stopper,
                 PyTorchLightningPruningCallback(trial, monitor="val_loss")]

    if args.model_type == 'attnlstm':
        params['attn_width'] = trial.suggest_int("attn_width", 3, 64)

    if 'split' in args.val_mode:
        # Cross-validation: train one model per split and report the mean validation loss
        dataset_hour = args.data.split('_')[-1]
        logger = MLFlowLogger(
            experiment_name=f'Optuna_{dataset_hour}h_{args.val_mode[-1]}_split')
        print(f'Optuna_{dataset_hour}h_{args.val_mode[-1]}_split')
        val_losses = []
        for _split_id in range(int(args.val_mode[-1])):
            print(f"Split {_split_id} Trial {trial.number}")
            args.__dict__["split_id"] = 0
            for key in params:
                args.__dict__[str(key)] = params.get(key)
            model = LitLSTM(args)
            trainer = Trainer(
                logger=logger,
                callbacks=callbacks,
                **get_trainer_params(args),
            )
            logger.log_hyperparams(model.args)
            args.__dict__["val_mode"] = args.val_mode
            args.__dict__["split_id"] = _split_id
            model._get_data(args, data_mode='init')
            trainer.fit(model)
            trainer.test(model, test_dataloaders=model.test_dataloader())
            # logger.finalize()
            val_losses.append(model.metrics['val_loss'])

        # Log mean val loss for later retrieval of the best model
        mean_val_loss = torch.stack(val_losses).mean()
        logger.log_metrics({"mean_val_loss": mean_val_loss}, step=0)
        logger.finalize()
        return mean_val_loss

    elif args.val_mode == 'full':
        # Single run on the full training set
        logger = MLFlowLogger(experiment_name='Optuna_full')
        for key in params:
            args.__dict__[str(key)] = params.get(key)
        model = LitLSTM(args)
        trainer = Trainer(
            logger=logger,
            callbacks=callbacks,
            **get_trainer_params(args),
        )
        logger.log_hyperparams(model.args)
        trainer.fit(model)
        trainer.test(model, test_dataloaders=model.test_dataloader())
        model.save_preds_and_targets(to_disk=True)
        logger.finalize()
        return model.metrics['val_loss']
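# A minimal sketch of how this objective could be wired into an Optuna study.
# The study setup is not shown in this file, so the helper name, the pruner
# choice, and n_trials below are assumptions; the direction matches the
# val_loss minimised above and the MedianPruner pairs with the
# PyTorchLightningPruningCallback used in the callbacks list.
def run_search(args, n_trials=50):
    import functools
    import optuna

    study = optuna.create_study(
        direction='minimize',
        pruner=optuna.pruners.MedianPruner())
    # objective(trial, args) takes the trial first, so bind args via partial
    study.optimize(functools.partial(objective, args=args), n_trials=n_trials)
    print(f'Best trial {study.best_trial.number} with params {study.best_params}')
    return study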
def train(args):
    """Train a single LitLSTM model with the given arguments and return the MLflow run id."""
    seed_everything(args.seed)
    model = LitLSTM(args)
    logger = MLFlowLogger(experiment_name='Default')

    early_stop_callback = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3,
        verbose=args.verbose, mode='min') if args.early else None

    trainer = Trainer(
        log_gpu_memory='all' if args.verbose else None,
        track_grad_norm=2 if args.verbose else -1,
        logger=logger,
        weights_summary='full',
        # Only pass the early-stopping callback when it was actually created
        callbacks=[early_stop_callback] if early_stop_callback is not None else [],
        accumulate_grad_batches=args.acc_grads,
        profiler=args.verbose,
        **get_trainer_params(args),
    )
    logger.log_hyperparams(model.args)
    trainer.fit(model)
    trainer.test(model, test_dataloaders=model.test_dataloader())
    model.save_preds_and_targets(to_disk=True)
    logger.finalize()
    return logger.run_id
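# Hypothetical helper showing how train() might be driven from the command line.
# The flag names mirror the attributes read above (seed, acc_grads, early,
# verbose); the defaults and any further options LitLSTM or get_trainer_params
# expect are assumptions, not the repo's actual CLI.
def train_from_cli(argv=None):
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--acc_grads', type=int, default=1)
    parser.add_argument('--early', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    cli_args = parser.parse_args(argv)

    run_id = train(cli_args)  # returns the MLflow run id for later retrieval
    print(f'Finished MLflow run {run_id}')
    return run_id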
def main(args: DictConfig):
    # Distributed training
    torch.multiprocessing.set_sharing_strategy('file_system')
    if str(args.exp.gpus) == '-1':
        args.exp.gpus = torch.cuda.device_count()

    # Secondary data args
    args.data.setting = 'in-topic' if args.data.test_id is None else 'cross-topic'
    dataset_name = args.data.path.split('/')[1]
    args.data.path = f'{ROOT_PATH}/{args.data.path}'

    # MLflow logging
    if args.exp.logging:
        experiment_name = f'{dataset_name}/{args.setting}-{args.data.setting}/{args.exp.task_name}'
        mlf_logger = MLFlowLogger(experiment_name=experiment_name,
                                  tracking_uri=MLFLOW_URI)
        experiment = mlf_logger._mlflow_client.get_experiment_by_name(experiment_name)
        if experiment is not None:
            experiment_id = experiment.experiment_id
            if args.exp.check_exisisting_hash:
                # Skip runs whose hashed configuration has already been logged
                args.hash = calculate_hash(args)
                existing_runs = mlf_logger._mlflow_client.search_runs(
                    filter_string=f"params.hash = '{args.hash}'",
                    run_view_type=mlflow.tracking.client.ViewType.ACTIVE_ONLY,
                    experiment_ids=[experiment_id])
                if len(existing_runs) > 0:
                    logger.info('Skipping existing run.')
                    return
                else:
                    logger.info('No runs found - performing one.')
            # cpnt_path = f'{ROOT_PATH}/mlruns/{experiment_id}/{run_id}/artifacts'
    # else:
    #     cpnt_path = None

    # Load pretrained model and tokenizer
    set_seed(args)
    model = instantiate(args.lightning_module, args=args)
    logger.info(f'Run arguments: \n{args.pretty()}')

    # Early stopping & checkpointing
    early_stop_callback = EarlyStopping(
        min_delta=0.00,
        patience=args.exp.early_stopping_patience,
        verbose=False,
        mode='min')
    checkpoint_callback = CustomModelCheckpoint(
        model=model,
        verbose=True,
        mode='min',
        save_top_k=1,
        period=0 if args.exp.val_check_interval < 1.0 else 1)
    lr_logging_callback = LearningRateLogger(logging_interval='epoch')

    # Training
    trainer = Trainer(
        gpus=eval(str(args.exp.gpus)),
        logger=mlf_logger if args.exp.logging else None,
        max_epochs=args.exp.max_epochs,
        gradient_clip_val=args.optimizer.max_grad_norm,
        early_stop_callback=early_stop_callback,
        val_check_interval=args.exp.val_check_interval,
        checkpoint_callback=checkpoint_callback if args.exp.checkpoint else None,
        accumulate_grad_batches=args.exp.gradient_accumulation_steps,
        auto_lr_find=args.optimizer.auto_lr_find,
        precision=args.exp.precision,
        distributed_backend='dp',
        callbacks=[lr_logging_callback])
    trainer.fit(model)
    trainer.test(model)

    # Cleaning cache
    torch.cuda.empty_cache()

    # Ending the run
    if args.exp.logging:
        mlf_logger.finalize()
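# A minimal sketch, under assumptions, of the Hydra entry point that would
# drive main(): the DictConfig signature and the instantiate() call suggest a
# Hydra-composed config, but the config_path/config_name values below are
# placeholders, not the repo's actual layout.
import hydra


@hydra.main(config_path='conf', config_name='config')
def run(cfg: DictConfig) -> None:
    # Hydra composes cfg from the YAML files under the assumed config_path
    main(cfg)


if __name__ == '__main__':
    run()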