def test_by_default_steps_between_gradient_accumulations_is_set_to_1(self):
    with mock.patch("jiant.models.MultiTaskModel") as MockModel:
        self.args = params_from_file(resource_filename("jiant", "config/defaults.conf"))
        self.args.cuda = -1
        self.args.run_dir = self.temp_dir
        self.args.exp_dir = self.temp_dir
        model = MockModel()
        _, train_params, _, _ = build_trainer(
            self.args,
            self.args.cuda,
            ["wic"],
            model,
            self.args.run_dir,
            self.wic.val_metric_decreases,
            phase="pretrain",
        )
        self.assertEqual(train_params["accumulation_steps"], 1)
def main(cl_arguments):
    """ Train a model for multitask-training."""
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)
    train_type = args.get("train_type", "SamplingMultiTaskTrainer")
    if train_type != "SamplingMultiTaskTrainer":
        log.info("Using non-default trainer class: %s", train_type)

    # Check for deprecated arg names
    check_arg_name(args)
    args, seed = initial_setup(args, cl_args)

    # Load tasks
    log.info("Loading tasks...")
    start_time = time.time()
    pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args)
    tasks = sorted(set(pretrain_tasks + target_tasks), key=lambda x: x.name)
    log.info("\tFinished loading tasks in %.3fs", time.time() - start_time)
    log.info("\t Tasks: {}".format([task.name for task in tasks]))

    # Build model
    log.info("Building model...")
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks)
    log.info("Finished building model in %.3fs", time.time() - start_time)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard")
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    check_configurations(args, pretrain_tasks, target_tasks)

    if args.do_pretrain:
        # Train on pretrain tasks
        log.info("Training...")
        stop_metric = pretrain_tasks[0].val_metric if len(pretrain_tasks) == 1 else "macro_avg"
        should_decrease = (
            pretrain_tasks[0].val_metric_decreases if len(pretrain_tasks) == 1 else False
        )
        trainer, _, opt_params, schd_params = build_trainer(
            args,
            [],
            model,
            args.run_dir,
            should_decrease,
            phase="pretrain",
            train_type=train_type,
        )
        to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        _ = trainer.train(
            pretrain_tasks,
            stop_metric,
            args.batch_size,
            args.weighting_method,
            args.scaling_method,
            to_train,
            opt_params,
            schd_params,
            args.load_model,
            phase="pretrain",
        )

    # For checkpointing logic
    strict = not args.do_target_task_training

    if args.do_target_task_training:
        # Train on target tasks
        pre_target_train_path = setup_target_task_training(args, target_tasks, model, strict)
        target_tasks_to_train = copy.deepcopy(target_tasks)
        # Check for previous target train checkpoints
        task_to_restore, _, _ = check_for_previous_checkpoints(
            args.run_dir, target_tasks_to_train, "target_train", args.load_model
        )
        if task_to_restore is not None:
            # If there is a task to restore from, target train only on target tasks
            # including and following that task.
            last_task_index = [task.name for task in target_tasks_to_train].index(task_to_restore)
            target_tasks_to_train = target_tasks_to_train[last_task_index:]
        for task in target_tasks_to_train:
            # Skip tasks that should not be trained on.
            if task.eval_only_task:
                continue
            params_to_train = load_model_for_target_train_run(
                args, pre_target_train_path, model, strict, task
            )
            trainer, _, opt_params, schd_params = build_trainer(
                args,
                [task.name],
                model,
                args.run_dir,
                task.val_metric_decreases,
                phase="target_train",
                train_type=train_type,
            )
            _ = trainer.train(
                tasks=[task],
                stop_metric=task.val_metric,
                batch_size=args.batch_size,
                weighting_method=args.weighting_method,
                scaling_method=args.scaling_method,
                train_params=params_to_train,
                optimizer_params=opt_params,
                scheduler_params=schd_params,
                load_model=(task.name == task_to_restore),
                phase="target_train",
            )

    if args.do_full_eval:
        log.info("Evaluating...")
        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        # Evaluate on target_tasks.
        for task in target_tasks:
            # Find the task-specific best checkpoint to evaluate on.
            task_to_use = model._get_task_params(task.name).get("use_classifier", task.name)
            ckpt_path = get_best_checkpoint_path(args, "eval", task_to_use)
            assert ckpt_path is not None
            load_model_state(model, ckpt_path, args.cuda, skip_task_models=[], strict=strict)
            evaluate_and_write(args, model, [task], splits_to_write)

    if args.delete_checkpoints_when_done and not args.keep_all_checkpoints:
        log.info("Deleting all checkpoints.")
        delete_all_checkpoints(args.run_dir)

    log.info("Done!")
def test_checkpointing_does_run(self, build_trainer_params_function):
    # Check that checkpointing runs, and sanity-check that at each step it saves
    # both the most recent checkpoint and the best checkpoint correctly for the
    # pretrain and target_train stages.
    with mock.patch("jiant.models.MultiTaskModel") as MockModel:
        import torch
        import copy
        import time

        from allennlp.common.params import Params

        MockModel.return_value.eval.return_value = None
        MockModel.return_value.state_dict.return_value = {"model1": {"requires_grad": True}}

        # Build sorting keys for the bucket iterator from the padding lengths
        # of a sample validation instance.
        pad_dict = self.wic.val_data[0].get_padding_lengths()
        sorting_keys = []
        for field in pad_dict:
            for pad_field in pad_dict[field]:
                sorting_keys.append((field, pad_field))
        iterator = BucketIterator(
            sorting_keys=sorting_keys,
            max_instances_in_memory=10000,
            batch_size=4,
            biggest_batch_first=True,
        )
        opt_params = Params({"type": "adam", "lr": 1e-05})
        opt_params2 = copy.deepcopy(opt_params)
        scheduler_params = Params(
            {
                "type": "reduce_on_plateau",
                "factor": 0.05,
                "mode": "max",
                "patience": 4,
                "threshold": 0.05,
                "threshold_mode": "abs",
                "verbose": True,
            }
        )
        train_params = [
            (
                "_text_field_embedder.model.encoder.layer.9.output.dense.bias",
                torch.Tensor([0.1, 0.3, 0.4, 0.8]),
            ),
            ("sent_encoder.layer.1", torch.Tensor([0.1, 0.3, 0.4, 0.8])),
            ("type", torch.Tensor([0.1])),
        ]
        scheduler = LearningRateScheduler.from_params(
            Optimizer.from_params(train_params, opt_params2), copy.deepcopy(scheduler_params)
        )
        optimizer = Optimizer.from_params(train_params, copy.deepcopy(opt_params))
        _task_infos = {
            "wic": {
                "iterator": iterator(self.wic.val_data, num_epochs=1),
                "n_tr_batches": 1,
                "loss": 0.0,
                "tr_generator": iterator(self.wic.val_data, num_epochs=1),
                "total_batches_trained": 400,
                "n_batches_since_val": 0,
                "optimizer": optimizer,
                "scheduler": scheduler,
                "stopped": False,
                "last_log": time.time(),
            }
        }
        _metric_infos = {
            metric: {"hist": [], "stopped": False, "best": (-1, {})}
            for metric in [self.wic.val_metric]
        }
        MockModel.return_value._setup_training.return_value = _task_infos, _metric_infos

        class MockParams:
            def __init__(self, requires_grad):
                self.requires_grad = requires_grad

        MockModel.return_value.named_parameters.return_value = [("model1", MockParams(True))]
        MockModel.use_bert = 1
        model = MockModel()
        pt_trainer, _, _, _ = trainer.build_trainer(
            self.args,
            ["wic"],  # here, we use WiC twice to reduce the amount of boilerplate code
            model,
            self.args.run_dir,
            self.wic.val_metric_decreases,
            phase="pretrain",
        )
        tt_trainer, _, _, _ = trainer.build_trainer(
            self.args,
            ["wic"],
            model,
            self.args.run_dir,
            self.wic.val_metric_decreases,
            phase="target_train",
        )
        os.mkdir(os.path.join(self.temp_dir, "wic"))
        tt_trainer.task_to_metric_mapping = {self.wic.val_metric: self.wic.name}

        # Save two pretrain checkpoints, both marked as new bests.
        pt_trainer._task_infos = _task_infos
        pt_trainer._metric_infos = _metric_infos
        pt_trainer._optimizer = optimizer
        pt_trainer._scheduler = scheduler
        pt_trainer._save_checkpoint(
            {"step": 10, "validation_pass": 1, "should_stop": 0},
            tasks=[self.wic],
            phase="pretrain",
            new_best=True,
        )
        pt_trainer._save_checkpoint(
            {"step": 10, "validation_pass": 2, "should_stop": 0},
            tasks=[self.wic],
            phase="pretrain",
            new_best=True,
        )

        # Save two target_train checkpoints; only the first is a new best.
        tt_trainer._task_infos = _task_infos
        tt_trainer._metric_infos = _metric_infos
        tt_trainer._optimizer = optimizer
        tt_trainer._scheduler = scheduler
        tt_trainer._save_checkpoint(
            {"step": 10, "validation_pass": 1, "should_stop": 0},
            tasks=[self.wic],
            phase="target_train",
            new_best=True,
        )
        tt_trainer._save_checkpoint(
            {"step": 10, "validation_pass": 2, "should_stop": 0},
            tasks=[self.wic],
            phase="target_train",
            new_best=False,
        )
        assert os.path.exists(
            os.path.join(self.temp_dir, "wic", "model_state_target_train_val_1.best.th")
        )
        assert os.path.exists(
            os.path.join(self.temp_dir, "wic", "model_state_target_train_val_2.th")
        )
        assert os.path.exists(os.path.join(self.temp_dir, "model_state_pretrain_val_2.best.th"))
        assert os.path.exists(os.path.join(self.temp_dir, "model_state_pretrain_val_1.th"))

        # Assert only one best checkpoint is kept for the pretrain stage.
        pretrain_best_checkpoints = glob.glob(
            os.path.join(self.temp_dir, "model_state_pretrain_val_*.best.th")
        )
        assert len(pretrain_best_checkpoints) == 1
def main(cl_arguments):
    """ Train a model for multitask-training."""
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)

    # Check for deprecated arg names
    check_arg_name(args)
    args, seed = initial_setup(args, cl_args)

    # XXX Dylan's code: log the projection-matrix settings, if configured.
    try:
        log.info(f"\nK syn is {args.k_syn}")
        log.info(f"\nK sem is {args.k_sem}\n")
    except Exception:
        log.info("No projection matrices.")
    # XXX

    # Load tasks
    log.info("Loading tasks...")
    start_time = time.time()
    pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args)
    tasks = sorted(set(pretrain_tasks + target_tasks), key=lambda x: x.name)
    log.info("\tFinished loading tasks in %.3fs", time.time() - start_time)
    log.info("\t Tasks: {}".format([task.name for task in tasks]))

    # Initialize the records pickle that tracks this run's checkpoints and results.
    training_flag = args.do_pretrain
    if training_flag and args.records_pickle_path:
        with open(args.records_pickle_path, "wb") as f:
            records_dict = dict()
            records_dict["run_name"] = args.run_name
            records_dict["last_checkpoint"] = ""
            records_dict["training"] = dict()
            records_dict["best_val"] = dict()
            records_dict["last_val"] = dict()
            pickle.dump(records_dict, f)

    # Build model
    log.info("Building model...")
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks)
    log.info("Finished building model in %.3fs", time.time() - start_time)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard_" + str(args.run_name))
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    check_configurations(args, pretrain_tasks, target_tasks)

    if args.do_pretrain:
        # Train on pretrain tasks
        log.info("Training...")
        stop_metric = pretrain_tasks[0].val_metric if len(pretrain_tasks) == 1 else "macro_avg"
        should_decrease = (
            pretrain_tasks[0].val_metric_decreases if len(pretrain_tasks) == 1 else False
        )
        trainer, _, opt_params, schd_params = build_trainer(
            args, [], model, args.run_dir, should_decrease, phase="pretrain"
        )
        to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        _ = trainer.train(
            pretrain_tasks,
            stop_metric,
            args.batch_size,
            args.weighting_method,
            args.scaling_method,
            to_train,
            opt_params,
            schd_params,
            args.load_model,
            phase="pretrain",
            args=args,
        )

    # For checkpointing logic
    strict = not args.do_target_task_training

    if args.do_target_task_training:
        # Train on target tasks
        pre_target_train_path = setup_target_task_training(args, target_tasks, model, strict)
        target_tasks_to_train = copy.deepcopy(target_tasks)
        # Check for previous target train checkpoints
        task_to_restore, _, _ = check_for_previous_checkpoints(
            args.run_dir, target_tasks_to_train, "target_train", args.load_model
        )
        if task_to_restore is not None:
            # If there is a task to restore from, target train only on target tasks
            # including and following that task.
            last_task_index = [task.name for task in target_tasks_to_train].index(task_to_restore)
            target_tasks_to_train = target_tasks_to_train[last_task_index:]
        for task in target_tasks_to_train:
            # Skip tasks that should not be trained on.
            if task.eval_only_task:
                continue
            params_to_train = load_model_for_target_train_run(
                args, pre_target_train_path, model, strict, task
            )
            trainer, _, opt_params, schd_params = build_trainer(
                args,
                [task.name],
                model,
                args.run_dir,
                task.val_metric_decreases,
                phase="target_train",
            )
            _ = trainer.train(
                tasks=[task],
                stop_metric=task.val_metric,
                batch_size=args.batch_size,
                weighting_method=args.weighting_method,
                scaling_method=args.scaling_method,
                train_params=params_to_train,
                optimizer_params=opt_params,
                scheduler_params=schd_params,
                load_model=(task.name == task_to_restore),
                phase="target_train",
            )

    # Skip adversarial and discriminator tasks during evaluation.
    tasks_for_eval = [
        task
        for task in target_tasks
        if "adv" not in task.name and "discriminator" not in task.name
    ]

    if args.do_full_eval:
        log.info("Evaluating...")
        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        # Evaluate on target_tasks.
        for task in tasks_for_eval:
            # Find the task-specific best checkpoint to evaluate on.
            task_to_use = model._get_task_params(task.name).get("use_classifier", task.name)
            ckpt_path = get_best_checkpoint_path(args, "eval", task_to_use)
            assert ckpt_path is not None
            load_model_state(model, ckpt_path, args.cuda, skip_task_models=[], strict=strict)
            records_dict = (
                get_records_dict(args.records_pickle_path) if args.evaluate_final else None
            )
            evaluate_and_write(
                args,
                model,
                [task],
                splits_to_write,
                mode="best_val",
                do_write=(not args.evaluate_final)
                or (records_dict is not None and ckpt_path == records_dict["last_checkpoint"]),
            )

        if args.evaluate_final:
            records_dict = get_records_dict(args.records_pickle_path)
            if ckpt_path != records_dict["last_checkpoint"]:
                try:
                    load_model_state(
                        model,
                        records_dict["last_checkpoint"],
                        args.cuda,
                        skip_task_models=[],
                        strict=strict,
                    )
                    for task in tasks_for_eval:
                        evaluate_and_write(
                            args, model, [task], splits_to_write, mode="last_val", do_write=True
                        )
                except Exception:
                    log.info(
                        "Did not record last_checkpoint path properly. "
                        f"Looks like: {records_dict['last_checkpoint']}"
                    )
            else:
                records_dict["last_val"] = records_dict["best_val"]
                write_records_dict(records_dict, args.records_pickle_path)

    log.info("Done!")
def main(cl_arguments):
    """ Train a model for multitask-training."""
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)

    # Check for deprecated arg names
    check_arg_name(args)
    args, seed = initial_setup(args, cl_args)

    # Store the run description, if any.
    if FLAGS.description:
        with open(Path(args.run_dir, "description.txt"), "w") as f:
            f.write(FLAGS.description)

    # Load tasks
    log.info("Loading tasks...")
    start_time = time.time()
    # cuda_device = parse_cuda_list_arg(args.cuda)
    cuda_device = FLAGS.device_idxs
    pretrain_tasks, target_tasks, vocab, word_embs = build_tasks(args, cuda_device)
    tasks = sorted(set(pretrain_tasks + target_tasks), key=lambda x: x.name)
    log.info("\tFinished loading tasks in %.3fs", time.time() - start_time)
    log.info("\t Tasks: {}".format([task.name for task in tasks]))

    # Build model
    log.info("Building model...")
    start_time = time.time()
    model = build_model(args, vocab, word_embs, tasks, cuda_device)
    log.info("Finished building model in %.3fs", time.time() - start_time)

    # Start Tensorboard if requested
    if cl_args.tensorboard:
        tb_logdir = os.path.join(args.run_dir, "tensorboard")
        _run_background_tensorboard(tb_logdir, cl_args.tensorboard_port)

    check_configurations(args, pretrain_tasks, target_tasks)

    if args.do_pretrain:
        # Train on pretrain tasks
        log.info("Training...")
        stop_metric = pretrain_tasks[0].val_metric if len(pretrain_tasks) == 1 else "macro_avg"
        should_decrease = (
            pretrain_tasks[0].val_metric_decreases if len(pretrain_tasks) == 1 else False
        )
        trainer, _, opt_params, schd_params = build_trainer(
            args, cuda_device, [], model, args.run_dir, should_decrease, phase="pretrain"
        )
        to_train = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
        _ = trainer.train(
            pretrain_tasks,
            stop_metric,
            args.batch_size,
            args.weighting_method,
            args.scaling_method,
            to_train,
            opt_params,
            schd_params,
            args.load_model,
            phase="pretrain",
        )

    # For checkpointing logic
    strict = not args.do_target_task_training

    if args.do_target_task_training:
        # Train on target tasks
        pre_target_train_path = setup_target_task_training(args, target_tasks, model, strict)
        target_tasks_to_train = copy.deepcopy(target_tasks)
        # Check for previous target train checkpoints
        task_to_restore, _, _ = check_for_previous_checkpoints(
            args.run_dir, target_tasks_to_train, "target_train", args.load_model
        )
        if task_to_restore is not None:
            # If there is a task to restore from, target train only on target tasks
            # including and following that task.
            last_task_index = [task.name for task in target_tasks_to_train].index(task_to_restore)
            target_tasks_to_train = target_tasks_to_train[last_task_index:]
        for task in target_tasks_to_train:
            # Skip tasks that should not be trained on.
            if task.eval_only_task:
                continue
            params_to_train = load_model_for_target_train_run(
                args, pre_target_train_path, model, strict, task, cuda_device
            )
            trainer, _, opt_params, schd_params = build_trainer(
                args,
                cuda_device,
                [task.name],
                model,
                args.run_dir,
                task.val_metric_decreases,
                phase="target_train",
            )
            _ = trainer.train(
                tasks=[task],
                stop_metric=task.val_metric,
                batch_size=args.batch_size,
                weighting_method=args.weighting_method,
                scaling_method=args.scaling_method,
                train_params=params_to_train,
                optimizer_params=opt_params,
                scheduler_params=schd_params,
                load_model=(task.name == task_to_restore),
                phase="target_train",
            )

    if args.do_full_eval:
        log.info("Evaluating...")
        splits_to_write = evaluate.parse_write_preds_arg(args.write_preds)
        results_dict = {"run_name": [args.run_name]}
        # Evaluate on target_tasks.
        for task in target_tasks:
            # Find the task-specific best checkpoint to evaluate on.
            task_params = get_model_attribute(model, "_get_task_params", cuda_device)
            task_to_use = task_params(task.name).get("use_classifier", task.name)
            ckpt_path = get_best_checkpoint_path(args, "eval", task_to_use)
            assert ckpt_path is not None
            load_model_state(model, ckpt_path, cuda_device, skip_task_models=[], strict=strict)
            current_tasks_val_results = evaluate_and_write(
                args, model, [task], splits_to_write, cuda_device
            )
            results_dict = {**results_dict, **current_tasks_val_results}

        # Prepend this run's results to the shared tabular results file.
        tabular_results_csv = os.path.join(SMALL_SHARED_SERVER_DIR, "tabular_results.csv")
        existing_results_df = pd.read_csv(tabular_results_csv, index_col=False)
        new_results_df = pd.DataFrame.from_dict(results_dict)
        updated_results_df = new_results_df.append(existing_results_df, sort=False)
        with open(tabular_results_csv, "w") as f:
            log.info(f"Prepending results to {tabular_results_csv}.")
            updated_results_df.to_csv(f, header=True, index=False)

    if args.delete_checkpoints_when_done and not args.keep_all_checkpoints:
        log.info("Deleting all checkpoints.")
        delete_all_checkpoints(args.run_dir)

    log.info("Done!")