def run(ModelClass, output_dir, pipeline_options, model_options):
    """Implement the main logic of the training module.

    Instantiates the dataset and model class and sets their attributes
    according to the pipeline options received. Loads or creates a trainer
    and runs it.

    Args:
        ModelClass (Model): Python type of the model to train.
        output_dir: Directory where to save models.
        pipeline_options (Namespace): Generic training options:
            load_model: load a pre-trained predictor model.
            resume: load trainer state and resume training.
            gpu_id: set to a non-negative integer to train on a GPU.
            train_batch_size: batch size for training.
            valid_batch_size: batch size for validation.
        model_options (Namespace): Model-specific options.

    Returns:
        The trainer object.
    """
    model_name = getattr(ModelClass, "title", ModelClass.__name__)
    logger.info("Training the {} model".format(model_name))

    # FIXME: make sure all places use output_dir
    # del pipeline_options.output_dir
    pipeline_options.output_dir = None

    # Data step
    fieldset = ModelClass.fieldset(
        wmt18_format=model_options.__dict__.get("wmt18_format")
    )
    datasets = retrieve_datasets(
        fieldset, pipeline_options, model_options, output_dir
    )
    save_vocabularies_from_datasets(output_dir, *datasets)
    if pipeline_options.save_data:
        save_training_datasets(pipeline_options.save_data, *datasets)

    # Trainer step
    device_id = None
    if pipeline_options.gpu_id is not None and pipeline_options.gpu_id >= 0:
        device_id = pipeline_options.gpu_id

    vocabs = utils.fields_to_vocabs(datasets[0].fields)
    trainer = retrieve_trainer(
        ModelClass,
        pipeline_options,
        model_options,
        vocabs,
        output_dir,
        device_id,
    )

    logger.info(str(trainer.model))
    logger.info("{} parameters".format(trainer.model.num_parameters()))

    # Dataset iterators
    train_iter = build_bucket_iterator(
        datasets[0],
        batch_size=pipeline_options.train_batch_size,
        is_train=True,
        device=device_id,
    )
    valid_iter = build_bucket_iterator(
        datasets[1],
        batch_size=pipeline_options.valid_batch_size,
        is_train=False,
        device=device_id,
    )

    trainer.run(train_iter, valid_iter, epochs=pipeline_options.epochs)

    return trainer
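
def _example_usage(ModelClass):
    # A minimal invocation sketch, not part of the original module. It shows
    # only the Namespace fields that `run` and its docstring reference
    # directly (gpu_id, save_data, batch sizes, epochs, load_model, resume);
    # the helpers it calls (retrieve_datasets, retrieve_trainer) consume
    # additional options not listed here, so the exact layout is an
    # assumption rather than a confirmed API.
    from argparse import Namespace

    pipeline_options = Namespace(
        load_model=None,        # path to a pre-trained predictor, or None
        resume=False,           # load trainer state and resume training
        gpu_id=0,               # non-negative integer selects that GPU
        train_batch_size=64,
        valid_batch_size=64,
        epochs=10,
        save_data=None,         # directory to dump the processed datasets
    )
    model_options = Namespace(wmt18_format=False)
    # ModelClass is any concrete Model subclass with a `fieldset` classmethod.
    return run(ModelClass, "runs/example", pipeline_options, model_options)
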
def run(ModelClass, output_dir, pipeline_options, model_options, splits):
    """Jackknife the training data with the given model.

    Splits the training set into `splits` folds, trains a model on the
    complement of each fold and predicts the held-out fold, so every
    training example is predicted by a model that did not see it during
    training. Dev and test predictions are averaged over all folds.

    Args:
        ModelClass (Model): Python type of the model to train.
        output_dir: Directory where to save the per-fold runs and the
            final predictions.
        pipeline_options (Namespace): Generic training options.
        model_options (Namespace): Model-specific options.
        splits (int): Number of folds.

    Returns:
        The out-of-fold predictions for the training set.
    """
    model_name = getattr(ModelClass, 'title', ModelClass.__name__)
    logger.info('Jackknifing with the {} model'.format(model_name))

    # Data
    fieldset = ModelClass.fieldset(
        wmt18_format=model_options.__dict__.get('wmt18_format')
    )
    train_set, dev_set = train.retrieve_datasets(
        fieldset, pipeline_options, model_options, output_dir
    )

    # The test set is optional; skip it if its options or files are missing.
    test_set = None
    try:
        test_set = build_test_dataset(fieldset, **vars(pipeline_options))
    except (ValueError, FileNotFoundError):
        pass

    device_id = None
    if pipeline_options.gpu_id is not None and pipeline_options.gpu_id >= 0:
        device_id = pipeline_options.gpu_id

    parent_dir = output_dir
    train_predictions = defaultdict(list)
    dev_predictions = defaultdict(list)
    test_predictions = defaultdict(list)
    splitted_datasets = cross_split_dataset(train_set, splits)
    for i, (train_fold, pred_fold) in enumerate(splitted_datasets):
        run_name = 'train_split_{}'.format(i)
        output_dir = Path(parent_dir, run_name)
        output_dir.mkdir(parents=True, exist_ok=True)
        # options.output_dir = str(options.output_dir)

        # Train
        vocabs = utils.fields_to_vocabs(train_fold.fields)
        tracking_run = tracking_logger.start_nested_run(run_name=run_name)
        with tracking_run:
            train.setup(
                output_dir=output_dir,
                seed=pipeline_options.seed,
                gpu_id=pipeline_options.gpu_id,
                debug=pipeline_options.debug,
                quiet=pipeline_options.quiet,
            )
            trainer = train.retrieve_trainer(
                ModelClass,
                pipeline_options,
                model_options,
                vocabs,
                output_dir,
                device_id,
            )

            # Dataset iterators
            train_iter = build_bucket_iterator(
                train_fold,
                batch_size=pipeline_options.train_batch_size,
                is_train=True,
                device=device_id,
            )
            valid_iter = build_bucket_iterator(
                pred_fold,
                batch_size=pipeline_options.valid_batch_size,
                is_train=False,
                device=device_id,
            )

            trainer.run(train_iter, valid_iter, epochs=pipeline_options.epochs)

        # Predict with the best checkpoint of this fold's model
        predictor = load_model(trainer.checkpointer.best_model_path())
        train_predictions_i = predictor.run(
            pred_fold, batch_size=pipeline_options.valid_batch_size
        )
        dev_predictions_i = predictor.run(
            dev_set, batch_size=pipeline_options.valid_batch_size
        )
        test_predictions_i = None
        if test_set:
            test_predictions_i = predictor.run(
                test_set, batch_size=pipeline_options.valid_batch_size
            )

        torch.cuda.empty_cache()

        # Out-of-fold predictions are concatenated; dev/test predictions
        # are collected per fold and averaged afterwards.
        for output_name in train_predictions_i:
            train_predictions[output_name] += train_predictions_i[output_name]
            dev_predictions[output_name].append(
                dev_predictions_i[output_name]
            )
            if test_set:
                test_predictions[output_name].append(
                    test_predictions_i[output_name]
                )

    dev_predictions = average_all(dev_predictions)
    if test_set:
        test_predictions = average_all(test_predictions)

    save_predicted_probabilities(
        parent_dir, train_predictions, prefix=const.TRAIN
    )
    save_predicted_probabilities(parent_dir, dev_predictions, prefix=const.DEV)
    if test_set:
        save_predicted_probabilities(
            parent_dir, test_predictions, prefix=const.TEST
        )

    teardown(pipeline_options)

    return train_predictions
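
# A plausible shape for `cross_split_dataset`, sketched here for illustration
# only: the real helper operates on dataset objects with fields, while plain
# lists keep this sketch self-contained. What the loop above relies on is
# that it yields `splits` pairs of (training fold, held-out fold) whose
# held-out folds tile the whole training set, so every example is predicted
# exactly once by a model that never saw it.
def _cross_split_sketch(examples, splits):
    fold_size = len(examples) // splits
    for i in range(splits):
        start = i * fold_size
        # The last fold absorbs the remainder when the sizes do not divide.
        stop = (i + 1) * fold_size if i < splits - 1 else len(examples)
        # Complement is trained on; the held-out slice gets the predictions.
        yield examples[:start] + examples[stop:], examples[start:stop]

# e.g. list(_cross_split_sketch(list(range(10)), 5)) yields five pairs,
# each training on 8 examples and holding out 2.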