def train_model(model_pipeline, tokenizer, train_dataset, eval_dataset,
                output_dir: str, model_dir: str, **hyperparams: Dict):
    logger.info("Start training")
    # run the training loop, then persist the trained pipeline and tokenizer
    model.train(model_pipeline, tokenizer, train_dataset, eval_dataset,
                output_dir, **hyperparams)
    utils.write_model(model_pipeline, tokenizer, model_dir)
    logger.info("Training done")
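# A minimal sketch of what a utils.write_model helper like the one called
# above could look like, assuming a Hugging Face-style model and tokenizer
# that expose save_pretrained(). The name and behaviour of the real helper in
# this project may differ; this is only an illustration.
import os

def write_model_sketch(model, tokenizer, model_dir):
    os.makedirs(model_dir, exist_ok=True)
    model.save_pretrained(model_dir)      # serialise weights and config
    tokenizer.save_pretrained(model_dir)  # serialise vocab and tokenizer config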
def gradient_descent(pssm_train, pssm_dir, fasta_dir, tm_align_dir):
    # Build the feature matrix
    print('Building feature matrix...')
    feature_matrix = build_feature_matrix(pssm_train, pssm_dir, fasta_dir,
                                          tm_align_dir)
    w_vector = new_w_vector(feature_matrix[0])
    count = 0
    print('Training the model...')
    while not reached_top(w_vector, feature_matrix):
        count += 1
        gradient_vector = calc_gradient(w_vector, feature_matrix)
        w_vector = update_w(w_vector, gradient_vector)
    print('Gradient Descent completed in {} iterations'.format(count))
    print('with final step size {} and sample size {}'.format(
        STEP_SIZE, SAMPLE_SIZE))
    # Save the model to the file
    utils.write_model(w_vector)
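# Hypothetical sketch of the per-iteration update behind the loop above: a
# plain gradient-descent step w <- w - STEP_SIZE * gradient, plus a simple
# convergence test. The real calc_gradient, update_w and reached_top depend on
# this project's loss and feature matrix and may use different signatures;
# only the generic update rule is shown here.
import numpy as np

STEP_SIZE_SKETCH = 0.01  # placeholder learning rate

def update_w_sketch(w_vector, gradient_vector, step_size=STEP_SIZE_SKETCH):
    # move the weights a small step against the gradient
    return np.asarray(w_vector) - step_size * np.asarray(gradient_vector)

def converged_sketch(gradient_vector, tolerance=1e-6):
    # stop when the gradient is numerically close to zero
    return np.linalg.norm(gradient_vector) < tolerance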
def exec_demo(demo_params):
    """
    Execute a grid search over the param_grid defined in demo_params,
    using the data from the crowd-sourced annotations.
    :param demo_params:
    :return:
    """
    logger = log.setup_logger(__name__)  # ignore this line... It's a long story.
    feature_type = "ver1"

    # extract base parameters
    demo_id, name, train_file, dev_file, test_file, output_folder = \
        utils.extract_base_demo_params(demo_params)
    print_log(
        logger,
        "\n".join([str((key, demo_params[key])) for key in list(demo_params)]))
    train_file_extra_points = demo_params["train_file_extra_points"]
    param_grid = demo_params["param_grid"]

    # define the scoring function for the grid search
    my_scorer = sklearn.metrics.make_scorer(metrics.my_scorer.get_evaluation)

    # track some results from the grid search, used for tuning the hyperparameter delta
    fscores = {}
    epsilons_list = {}
    max_iterations_list = {}
    best_eval = {"F-score": 0}

    # pre-process the data (remove tags and other markup)
    print_log(logger, "Making datasets...")
    task1_train_data = data_parsers.make_dataset.parse_file(open(train_file))
    dev_data = data_parsers.make_dataset.parse_file(open(dev_file))
    test_data = data_parsers.make_dataset.parse_file(open(test_file))
    extra_points_train_data = data_parsers.make_dataset.parse_file(
        open(train_file_extra_points))
    train_data = task1_train_data + extra_points_train_data
    print_log(logger, "train data size: %s" % len(train_data))
    print_log(logger, "development data size: %s" % len(dev_data))
    print_log(logger, "test data size: %s" % len(test_data))

    # compute the maximum delta possible (from the length of the longest word
    # in the train and development set)
    max_delta = max(utils.find_max_len(train_data),
                    utils.find_max_len(dev_data))
    if max_delta > settings.MAX_ALLOWABLE_DELTA:
        max_delta = settings.MAX_ALLOWABLE_DELTA
    print_log(logger, "max delta: %s" % max_delta)

    # repeat the grid search for each possible value of delta
    for delta in range(1, max_delta + 1):
        os.makedirs(output_folder + "/%02d" % delta, exist_ok=True)
        print_log(logger, "Training with delta=%s" % delta)
        X_train, y_train = features.extract_features.get_features_and_labels(
            train_data, delta, feature_type)
        X_dev, y_dev = features.extract_features.get_features_and_labels(
            dev_data, delta, feature_type)
        X_test, y_test = features.extract_features.get_features_and_labels(
            test_data, delta, feature_type)
        model = utils.run_grid_search(X_train, y_train, X_dev, y_dev,
                                      param_grid, my_scorer)
        best_cv_epsilon = model.best_params_["epsilon"]
        best_cv_max_iterations = model.best_params_["max_iterations"]
        # the best CV score is what will be considered to pick the best model
        fscores[delta] = model.best_score_
        epsilons_list[delta] = best_cv_epsilon
        max_iterations_list[delta] = best_cv_max_iterations
        print_log(
            logger,
            "Best params for delta %02d: max_iterations=%d\tepsilon=%.2E" %
            (delta, best_cv_max_iterations, best_cv_epsilon))
        print_log(logger, "Best CV score: " + str(model.best_score_))

        # test the model on the test set. NOTICE: this result is NOT considered
        # for the choice of the hyperparameter delta!
        print_log(logger, "***Predict test with the grid search model:***")
        y_test_pred = model.predict(X_test)
        test_eval = metrics.evaluation.get_evaluation(feature_type, y_test,
                                                      y_test_pred)
        print_log(
            logger, "F-score on test (grid search with delta=%s): %s" %
            (delta, test_eval["F-score"]))

        # save some results from the grid search
        curpath = output_folder + "/%02d" % delta + "/" + name + "_" + "%02d" % delta
        utils.write_model(model, open(curpath + "_gridsearch.model", "wb+"))
        utils.write_predictions(feature_type, open(test_file), y_test_pred,
                                open(curpath + ".pred", "w+"))
        utils.write_evaluation(test_eval, open(curpath + ".eval", "w+"))
        utils.write_fails(open(test_file), y_test, y_test_pred,
                          open(curpath + ".fails", "w+"), feature_type)
        details.print_gridsearch_details(model, file=open(
            curpath + "_gridsearch.details", "w+"))
        print_log(logger, "#" * 50)

    print_log(logger, "-" * 50)
    # pick the delta whose grid search achieved the highest CV F-score
    max_fscore = max(fscores.values())
    max_fscore_delta = [i for i in fscores.keys()
                        if fscores[i] == max_fscore][0]
    best_model_num = max_fscore_delta
    best_epsilon = epsilons_list[best_model_num]
    best_max_iterations = max_iterations_list[best_model_num]
    freport = open(output_folder + "/report.txt", "w+")
    print_log(
        logger,
        "The best model found is the one with delta: %s" % best_model_num)
    print_log(
        logger, "With best parameters: max_iterations=%s, epsilon=%s" %
        (best_max_iterations, best_epsilon))
    print_log(logger, "CV F-score: %s" % max_fscore)
    print("The best model found is the one with delta: %s" % best_model_num,
          file=freport)
    print("With best parameters: max_iterations=%s, epsilon=%s" %
          (best_max_iterations, best_epsilon), file=freport)
    print("CV F-score: %s" % max_fscore, file=freport)

    # reload the best saved model and evaluate it on the test set
    best_model_path = output_folder + "/%02d" % best_model_num + "/" + name \
        + "_" + "%02d" % best_model_num + "_gridsearch.model"
    best_model = pickle.load(open(best_model_path, "rb"))
    X_test, y_test = features.extract_features.get_features_and_labels(
        test_data, best_model_num, feature_type)
    y_pred = best_model.predict(X_test)
    delta_evaluation = metrics.evaluation.get_evaluation(
        feature_type, y_test, y_pred)
    print_log(
        logger, "delta: %s\tF-score: %s" %
        (best_model_num, delta_evaluation["F-score"]))
    print("F-score on test set: %s" % delta_evaluation["F-score"],
          file=freport)
def save_model_checkpoint(model, tokenizer, global_step, output_dir):
    output_dir = os.path.join(output_dir, 'checkpoint-{}'.format(global_step))
    utils.write_model(model, tokenizer, output_dir)
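# Hypothetical usage of save_model_checkpoint inside a training loop: write a
# checkpoint into a step-specific sub-directory every save_steps optimisation
# steps. The loop, step counts and argument names below are placeholders for
# illustration, not names taken from this project.
def training_loop_sketch(model, tokenizer, total_steps, output_dir, save_steps=500):
    for global_step in range(1, total_steps + 1):
        # ... one forward/backward pass and optimiser step would happen here ...
        if global_step % save_steps == 0:
            save_model_checkpoint(model, tokenizer, global_step, output_dir)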
def exec_demo(demo_params):
    """
    Train the CRF with different sizes of the train set.
    Tune the hyperparameter over the development set,
    then test the best model.
    :param demo_params:
    :return:
    """
    logger = log.setup_logger(__name__)  # ignore this line... It's a long story.
    feature_type = "ver1"

    # extract base parameters
    demo_id, name, train_file, dev_file, test_file, output_folder = \
        utils.extract_base_demo_params(demo_params)
    print_log(
        logger,
        "\n".join([str((key, demo_params[key])) for key in list(demo_params)]))
    different_sizes_perc = list(range(10, 101, 10))

    # define the scoring function for the grid search
    my_scorer = sklearn.metrics.make_scorer(metrics.my_scorer.get_evaluation)

    # track some results from the search, used for tuning the hyperparameter delta
    size_evaluations = {}
    train_data_partitions = {}
    fscores = {}

    # pre-process the data (remove tags and other markup)
    print_log(logger, "Making datasets...")
    train_data = data_parsers.make_dataset.parse_file(open(train_file))
    dev_data = data_parsers.make_dataset.parse_file(open(dev_file))
    test_data = data_parsers.make_dataset.parse_file(open(test_file))

    # compute the maximum delta possible (from the length of the longest word
    # in the train and development set)
    max_delta = max(utils.find_max_len(train_data),
                    utils.find_max_len(dev_data))
    print_log(logger,
              "max delta: %s, len train set: %s" % (max_delta, len(train_data)))

    # train the model for different train sizes
    for size in different_sizes_perc:
        print_log(
            logger,
            "train the model with percentage of the train set: %02d%%" % size)
        train_data_shuffled = copy.deepcopy(train_data)
        random.shuffle(train_data_shuffled)
        current_size = round(len(train_data) * size / 100)
        print_log(logger, "current train set size: %d" % current_size)
        train_data_partition = train_data_shuffled[:current_size]
        print_log(
            logger, "train set: " +
            "; ".join(list(map(str, train_data_partition[0:5]))) + "...")
        size_evaluations[size] = {}
        train_data_partitions[size] = train_data_partition
        current_max_delta = utils.find_max_len(train_data_partition)
        print_log(logger, "current max delta: %s" % current_max_delta)
        for delta in range(1, current_max_delta + 1):
            print_log(logger, "train the model with delta: %d" % delta)
            X_train, y_train = features.extract_features.get_features_and_labels(
                train_data_partition, delta, feature_type)
            X_dev, y_dev = features.extract_features.get_features_and_labels(
                dev_data, delta, feature_type)
            X_test, y_test = features.extract_features.get_features_and_labels(
                test_data, delta, feature_type)
            crf = sklearn_crfsuite.CRF(
                algorithm='ap',
                all_possible_transitions=True,
                all_possible_states=False,
            )
            crf.fit(X_train, y_train)
            y_dev_pred = crf.predict(X_dev)
            delta_evaluation = metrics.evaluation.get_evaluation(
                feature_type, y_dev, y_dev_pred)
            print_log(
                logger,
                "F-score on development set: %s" % delta_evaluation["F-score"])
            size_evaluations[size][delta] = (delta_evaluation["Precision"],
                                             delta_evaluation["Recall"],
                                             delta_evaluation["F-score"])

    # for each train size, find the delta that yields the best F-score
    # (the evaluations are (Precision, Recall, F-score) tuples, so compare
    # on the F-score component)
    sizes = list(size_evaluations.keys())
    sizes.sort()
    deltas = []
    for size in sizes:
        max_fscore = max(v[2] for v in size_evaluations[size].values())
        max_delta_for_size = [
            i for i in size_evaluations[size]
            if size_evaluations[size][i][2] == max_fscore
        ][0]
        deltas.append(max_delta_for_size)
        print_log(
            logger, "\nBest delta=%s for train size perc=%s%%. "
            "\nOn development set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s" %
            (max_delta_for_size, size,
             size_evaluations[size][max_delta_for_size][0],
             size_evaluations[size][max_delta_for_size][1],
             size_evaluations[size][max_delta_for_size][2]))

    test_evaluations = {}
    print_log(logger, "Test models with different sizes of training set")
    for size, best_delta in zip(sizes, deltas):
        print_log(logger,
                  "Train with size: %d and delta: %s" % (size, best_delta))
        cur_train_set = train_data_partitions[size]
        print_log(
            logger, "train set: " +
            "; ".join(list(map(str, cur_train_set[0:5]))) + "...")
        X_train, y_train = features.extract_features.get_features_and_labels(
            cur_train_set, best_delta, feature_type)
        X_test, y_test = features.extract_features.get_features_and_labels(
            test_data, best_delta, feature_type)
        crf = sklearn_crfsuite.CRF(
            algorithm='ap',
            all_possible_transitions=True,
            all_possible_states=False,
        )
        crf.fit(X_train, y_train)
        y_test_pred = crf.predict(X_test)
        delta_evaluation = metrics.evaluation.get_evaluation(
            feature_type, y_test, y_test_pred)
        test_evaluations[size] = (delta_evaluation["Precision"],
                                  delta_evaluation["Recall"],
                                  delta_evaluation["F-score"])
        print_log(
            logger, "test score (delta=%s): F-score: %s" %
            (best_delta, delta_evaluation["F-score"]))

        # save some results from the tests
        curpath = output_folder + "/size_%02d_delta_%02d" % (size, best_delta)
        os.makedirs(curpath)
        curpath = curpath + "/" + name + "_" \
            + "size_%02d_delta_%02d" % (size, best_delta)
        utils.write_model(crf, open(curpath + ".model", "wb+"))
        utils.write_predictions(feature_type, open(test_file), y_test_pred,
                                open(curpath + ".pred", "w+"))
        utils.write_evaluation(delta_evaluation, open(curpath + ".eval", "w+"))
        utils.write_fails(open(test_file), y_test, y_test_pred,
                          open(curpath + ".fails", "w+"), feature_type)
        details.print_details(crf, file=open(curpath + ".details", "w+"))

    # write the final report
    freport = open(output_folder + "/report.txt", "w+")
    for size, best_delta in zip(sizes, deltas):
        print(
            "Best delta=%s for train size perc=%s%%. "
            "\nOn development set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s"
            "\nOn test set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s" %
            (best_delta, size,
             size_evaluations[size][best_delta][0],
             size_evaluations[size][best_delta][1],
             size_evaluations[size][best_delta][2],
             test_evaluations[size][0],
             test_evaluations[size][1],
             test_evaluations[size][2]) + "\n" + "-" * 50,
            file=freport)