def run_training(settings, data, vocabulary_idx_to_word, vocabulary_word_to_idx,
                 logger, use_cuda):
    """Train an LSTM_basic model on the given data.

    With settings.data.folds > 1 this runs cross-validation, training one model
    per fold; otherwise a single model is trained on all sequences. The best
    model of each run is saved through the logger.
    """
    reproduction_command = 'python main.py ' + '-c ' + os.path.join(
        logger.log_dir, logger.run_name + '.ini')
    logger.shout(reproduction_command)
    logger.log('# ' + reproduction_command)
    logger.log(
        'epoch\titeration\tfold\ttrain_loss\ttrain_acc\ttrain_macro_f1\ttrain_macro_f1_main\ttrain_total\tval_loss\tval_acc\tval_macro_f1\tval_macro_f1_main\tval_total\tmodel'
    )

    input_vecs, targets = data_utils.create_input_vectors(
        data, vocabulary_idx_to_word, vocabulary_word_to_idx)

    # Compute the class weights if necessary
    if settings.training.class_weights:
        class_weights = np.bincount(targets[targets != -1],
                                    minlength=settings.model.num_entities)
        class_weights = 1.0 / (
            np.sqrt(class_weights) + 1e-6
        )  # 1e-6 for numerical stability (though the inf values wouldn't be used anyway)
        settings.training.class_weights = class_weights
    else:
        settings.training.class_weights = None

    fold_indices = range(settings.data.folds)
    if settings.data.folds > 1:
        folds = data_utils.get_cv_folds(data, settings.data, logger)
    else:
        # No cross-validation: train on all sequences, without a validation set.
        train_sequence_bounds = data_utils.get_sequence_bounds(
            data, settings.data.level)
        validation_sequence_bounds = []

    for fold_idx in fold_indices:
        # For bookkeeping (logging all folds in one file):
        logger.fold_idx = fold_idx

        # Select training and (if cross-validation) validation data:
        if settings.data.folds > 1:
            train_sequence_bounds = np.concatenate(
                tuple(folds[:fold_idx] + folds[fold_idx + 1:]))
            validation_sequence_bounds = folds[fold_idx]

        # Initialise model
        model = models.LSTM_basic(settings.model,
                                  padding_idx=data_utils.DUMMY_ENTITY_IDX)
        if use_cuda:
            model.cuda()

        # Train the model
        last_model, best_model = model_utils.train(
            model, input_vecs, targets, train_sequence_bounds,
            validation_sequence_bounds, settings.training,
            settings.training.no_shuffle, logger)

        # Save the best model through the logger
        logger.save_model(best_model)
def run_deploy(model_path, settings, data_path, vocabulary_idx_to_word,
               vocabulary_word_to_idx, entity_name_to_idx, answers_per_fold,
               no_cv, logger, use_cuda):
    """Load trained model(s) and write their predictions to CSV through the logger.

    With no_cv, all models are applied to the full data set as an ensemble;
    otherwise each model is applied to its own cross-validation fold and the
    per-fold predictions are merged. Returns the path of the written answers
    file and the keys found in the data.
    """
    data, keys_in_data = data_utils.load_data(data_path, entity_name_to_idx,
                                              logger=logger)
    input_vecs, targets = data_utils.create_input_vectors(
        data, vocabulary_idx_to_word, vocabulary_word_to_idx)

    # Load all models from model_path:
    model_list = []
    for path in model_path:
        model_fold = models.LSTM_basic(settings.model,
                                       padding_idx=data_utils.DUMMY_ENTITY_IDX)
        model_fold.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        if use_cuda:
            model_fold.cuda()
        model_list.append(model_fold)
        logger.whisper('Loaded model from ' + path)

    if no_cv:
        # Deploy all models as an ensemble on all the data
        test_sequence_bounds = data_utils.get_sequence_bounds(
            data, settings.data.level)
        collect_ensembles_preds = False  # TODO @Carina I vaguely recall this being used in the final rush before the deadline. Remove (also in model_utils)? See also the TODO in write_answers().
        predictions_zipped, _ = model_utils.get_indexed_predictions_with_targets(
            model_list, input_vecs, targets, test_sequence_bounds, use_cuda,
            collect_ensembles_preds=collect_ensembles_preds)

        # Write answers through the logger
        answers_path = logger.write_answers_csv(data_path,
                                                predictions_zipped,
                                                model_suffix="--ensemble",
                                                config=settings.orig)

        # Optionally also per individual model (i.e., each model trained on one fold):
        if answers_per_fold:
            for i, model in enumerate(model_list):
                predictions_zipped, _ = model_utils.get_indexed_predictions_with_targets(
                    model, input_vecs, targets, test_sequence_bounds, use_cuda)
                logger.write_answers_csv(data_path,
                                         predictions_zipped,
                                         model_suffix='--fold' + str(i))
    else:
        # Deploy each model on its corresponding fold of the data
        results = []
        folds = data_utils.get_cv_folds(data, settings.data, logger)
        for fold_idx in range(settings.data.folds):
            # Obtain predictions for this fold:
            predictions_zipped, _ = model_utils.get_indexed_predictions_with_targets(
                model_list[fold_idx], input_vecs, targets, folds[fold_idx],
                use_cuda)

            # Optionally write answers for this one fold
            if answers_per_fold:
                logger.write_answers_csv(
                    settings.data.dataset + "--fold" + str(fold_idx),
                    predictions_zipped,
                    model_suffix="--fold" + str(fold_idx),
                    config=settings.orig)

            # Also store them, to be merged and sorted later into the combined answers file
            results.extend(predictions_zipped)

        # Write answers merged over all folds through the logger
        results.sort()
        answers_path = logger.write_answers_csv(settings.data.dataset,
                                                results,
                                                model_suffix="--cv",
                                                config=settings.orig)

    return answers_path, keys_in_data