Example #1
def train_model(model_pipeline, tokenizer, train_dataset, eval_dataset,
                output_dir: str, model_dir: str, **hyperparams):

    logger.info("Start training")

    # delegate training to the module-level `model` helper (imported elsewhere in the project)
    model.train(model_pipeline, tokenizer, train_dataset, eval_dataset,
                output_dir, **hyperparams)

    # persist the trained pipeline together with its tokenizer
    utils.write_model(model_pipeline, tokenizer, model_dir)

    logger.info("Training done")
Example #2
def gradient_descent(pssm_train, pssm_dir, fasta_dir, tm_align_dir):
    # Build the feature matrix
    print('Building feature matrix...')
    feature_matrix = build_feature_matrix(pssm_train, pssm_dir, fasta_dir,
                                          tm_align_dir)

    w_vector = new_w_vector(feature_matrix[0])
    count = 0

    print('Training the model...')
    while not reached_top(w_vector, feature_matrix):
        count += 1
        gradient_vector = calc_gradient(w_vector, feature_matrix)
        w_vector = update_w(w_vector, gradient_vector)

    print('Gradient Descent completed in {} iterations'.format(count))
    print('with final step size {} and sample size {}'.format(
        STEP_SIZE, SAMPLE_SIZE))

    # Save the trained weight vector to disk
    utils.write_model(w_vector)
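The helpers `new_w_vector`, `calc_gradient`, `update_w`, and `reached_top` are defined elsewhere in that project. A minimal NumPy sketch of the two weight-vector helpers, assuming a zero-initialized weight vector and a fixed-step update (the constant and the ascent direction are assumptions, not the project's code):

import numpy as np

STEP_SIZE = 0.01  # assumed constant; the original prints STEP_SIZE at the end

def new_w_vector(first_row):
    # start from a zero weight vector with one weight per feature (assumption)
    return np.zeros(len(first_row))

def update_w(w_vector, gradient_vector):
    # fixed-step move along the gradient; the name reached_top in the caller
    # suggests an ascent, so the gradient is added rather than subtracted
    return np.asarray(w_vector) + STEP_SIZE * np.asarray(gradient_vector)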
Example #3
def exec_demo(demo_params):
    """
    Execute grid search over the param_grid defined in demo_params,
    using the data from the crowd-sourced annotations.
    :param demo_params:
    :return:
    """
    logger = log.setup_logger(__name__)

    #ignore this line... It's a long story.
    feature_type = "ver1"

    # extract base parameters
    demo_id, name, train_file, dev_file, test_file, output_folder = utils.extract_base_demo_params(
        demo_params)
    print_log(
        logger,
        "\n".join([str((key, demo_params[key])) for key in list(demo_params)]))

    train_file_extra_points = demo_params["train_file_extra_points"]
    param_grid = demo_params["param_grid"]

    # define the scoring function for the grid search
    my_scorer = sklearn.metrics.make_scorer(metrics.my_scorer.get_evaluation)

    # track some results from the grid search, used for tuning the hyperparameter delta
    fscores = {}
    epsilons_list = {}
    max_iterations_list = {}
    best_eval = {"F-score": 0}

    # pre-processing the data (remove tags and other stuff)
    print_log(logger, "Making datasets...")
    task1_train_data = data_parsers.make_dataset.parse_file(open(train_file))
    dev_data = data_parsers.make_dataset.parse_file(open(dev_file))
    test_data = data_parsers.make_dataset.parse_file(open(test_file))
    extra_points_train_data = data_parsers.make_dataset.parse_file(
        open(train_file_extra_points))

    train_data = task1_train_data + extra_points_train_data

    print_log(logger, "train data size: %s" % len(train_data))
    print_log(logger, "development data size: %s" % len(dev_data))
    print_log(logger, "test data size: %s" % len(test_data))

    # compute the maximum delta possible (from the length of the longest word
    # in the train and development set)
    max_delta = max(utils.find_max_len(train_data),
                    utils.find_max_len(dev_data))
    if max_delta > settings.MAX_ALLOWABLE_DELTA:
        max_delta = settings.MAX_ALLOWABLE_DELTA
    print_log(logger, "max delta: %s" % max_delta)

    # repeat the grid search for each possible value of delta
    for delta in range(1, max_delta + 1):
        os.makedirs(output_folder + "/%02d" % delta, exist_ok=True)

        print_log(logger, "Training with delta=%s" % delta)
        X_train, y_train = features.extract_features.get_features_and_labels(
            train_data, delta, feature_type)
        X_dev, y_dev = features.extract_features.get_features_and_labels(
            dev_data, delta, feature_type)
        X_test, y_test = features.extract_features.get_features_and_labels(
            test_data, delta, feature_type)

        model = utils.run_grid_search(X_train, y_train, X_dev, y_dev,
                                      param_grid, my_scorer)

        best_cv_epsilon = model.best_params_["epsilon"]
        best_cv_max_iterations = model.best_params_["max_iterations"]

        # the best score will be considered in order to pick the best model
        fscores[delta] = model.best_score_
        epsilons_list[delta] = best_cv_epsilon
        max_iterations_list[delta] = best_cv_max_iterations

        print_log(
            logger,
            "Best params for delta %02d: max_iterations=%d\tepsilon=%.2E" %
            (delta, best_cv_max_iterations, best_cv_epsilon))
        print_log(logger, "Best CV score: " + str(model.best_score_))

        # test the model on the test set. NOTICE: the result will not be considered for the choice
        # of the hyperparameter delta!
        print_log(logger, "***Predict test with the grid search model:***")

        y_test_pred = model.predict(X_test)
        test_eval = metrics.evaluation.get_evaluation(feature_type, y_test,
                                                      y_test_pred)
        print_log(
            logger, "F-score on test (grid search with delta=%s): %s" %
            (delta, test_eval["F-score"]))

        # save some results from the grid search
        curpath = output_folder + "/%02d" % delta + "/" + name + "_" + "%02d" % delta
        utils.write_model(model, open(curpath + "_gridsearch.model", "wb+"))
        utils.write_predictions(feature_type, open(test_file), y_test_pred,
                                open(curpath + ".pred", "w+"))
        utils.write_evaluation(test_eval, open(curpath + ".eval", "w+"))
        utils.write_fails(open(test_file), y_test, y_test_pred,
                          open(curpath + ".fails", "w+"), feature_type)
        details.print_gridsearch_details(model,
                                         file=open(
                                             curpath + "_gridsearch.details",
                                             "w+"))

        print_log(logger, "#" * 50)

    print_log(logger, "-" * 50)
    max_fscore = max(fscores.values())
    best_model_num = max(fscores, key=fscores.get)
    best_epsilon = epsilons_list[best_model_num]
    best_max_iterations = max_iterations_list[best_model_num]

    freport = open(output_folder + "/report.txt", "w+")
    print_log(
        logger,
        "The best model found is the one with delta: %s" % best_model_num)
    print_log(
        logger, "With best parameters: max_iterations=%s, epsilon=%s" %
        (best_max_iterations, best_epsilon))
    print_log(logger, "CV F-score: %s" % max_fscore)
    print("The best model found is the one with delta: %s" % best_model_num,
          file=freport)
    print("With best parameters: max_iterations=%s, epsilon=%s" %
          (best_max_iterations, best_epsilon),
          file=freport)
    print("CV F-score: %s" % max_fscore, file=freport)

    best_model_path = output_folder + "/%02d" % best_model_num + "/" + name + "_" + "%02d" % best_model_num + "_gridsearch.model"
    best_model = pickle.load(open(best_model_path, "rb"))
    X_test, y_test = features.extract_features.get_features_and_labels(
        test_data, best_model_num, feature_type)

    y_pred = best_model.predict(X_test)
    delta_evaluation = metrics.evaluation.get_evaluation(
        feature_type, y_test, y_pred)
    print_log(
        logger, "delta: %s\tF-score: %s" %
        (best_model_num, delta_evaluation["F-score"]))
    print("F-score on test set: %s" % delta_evaluation["F-score"],
          file=freport)
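The search itself is delegated to `utils.run_grid_search`. One plausible sketch, assuming an sklearn-compatible estimator (Example #5 uses `sklearn_crfsuite.CRF` with the same `epsilon`/`max_iterations` parameters) and treating the development set as a single predefined validation fold; the project's actual helper may differ.

import sklearn_crfsuite
from sklearn.model_selection import GridSearchCV, PredefinedSplit

def run_grid_search(X_train, y_train, X_dev, y_dev, param_grid, scorer):
    # Concatenate train and dev data; -1 marks rows that are never used for
    # validation, 0 marks the single validation fold built from the dev set.
    X = list(X_train) + list(X_dev)
    y = list(y_train) + list(y_dev)
    split = PredefinedSplit([-1] * len(X_train) + [0] * len(X_dev))

    # The estimator choice is an assumption, mirroring Example #5; param_grid
    # is expected to contain "epsilon" and "max_iterations" as used above.
    crf = sklearn_crfsuite.CRF(algorithm='ap', all_possible_transitions=True)
    search = GridSearchCV(crf, param_grid, scoring=scorer, cv=split, refit=True)
    search.fit(X, y)
    return search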
Example #4
def save_model_checkpoint(model, tokenizer, global_step, output_dir):
    # write each checkpoint to its own "checkpoint-<global_step>" sub-directory
    output_dir = os.path.join(output_dir, 'checkpoint-{}'.format(global_step))
    utils.write_model(model, tokenizer, output_dir)
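A small companion sketch (not from the original project) showing how checkpoints written with this naming scheme could be located again, e.g. to resume training from the most recent one:

import os
import re

def latest_checkpoint(output_dir):
    # Sketch only: find the newest "checkpoint-<step>" sub-directory created
    # by save_model_checkpoint above; returns None if there is none yet.
    steps = []
    for name in os.listdir(output_dir):
        match = re.fullmatch(r"checkpoint-(\d+)", name)
        if match:
            steps.append(int(match.group(1)))
    if not steps:
        return None
    return os.path.join(output_dir, "checkpoint-{}".format(max(steps)))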
Example #5
def exec_demo(demo_params):
    """
    Train the crf with different size of the train set
    Tune the hyperparameter over the development set
    Then test the best model
    :param demo_params:
    :return:
    """
    logger = log.setup_logger(__name__)

    #ignore this line... It's a long story.
    feature_type = "ver1"

    # extract base parameters
    demo_id, name, train_file, dev_file, test_file, output_folder = utils.extract_base_demo_params(
        demo_params)
    print_log(
        logger,
        "\n".join([str((key, demo_params[key])) for key in list(demo_params)]))

    different_sizes_perc = list(range(10, 101, 10))

    # define the scoring function for the grid search
    my_scorer = sklearn.metrics.make_scorer(metrics.my_scorer.get_evaluation)

    # track some results from the search, used for tuning the hyperparameter delta
    size_evaluations = {}
    train_data_partitions = {}
    fscores = {}

    # pre-processing the data (remove tags and other stuff)
    print_log(logger, "Making datasets...")
    train_data = data_parsers.make_dataset.parse_file(open(train_file))
    dev_data = data_parsers.make_dataset.parse_file(open(dev_file))
    test_data = data_parsers.make_dataset.parse_file(open(test_file))

    # compute the maximum delta possible (from the length of the longest word
    # in the train and development set)
    max_delta = max(utils.find_max_len(train_data),
                    utils.find_max_len(dev_data))
    print_log(logger,
              "max delta: %s, len train set:%s" % (max_delta, len(train_data)))

    # train the model for different train sizes
    for size in different_sizes_perc:
        print_log(
            logger,
            "train the model with percentage of the train set: %02d%%" % size)

        train_data_shuffled = copy.deepcopy(train_data)
        random.shuffle(train_data_shuffled)
        current_size = round(len(train_data) * size / 100)
        print_log(logger, "current train set size: %d" % current_size)
        train_data_partition = train_data_shuffled[:current_size]
        print_log(
            logger, "train set: " +
            "; ".join(list(map(str, train_data_partition[0:5]))) + "...")

        size_evaluations[size] = {}
        train_data_partitions[size] = train_data_partition

        current_max_delta = utils.find_max_len(train_data_partition)
        print_log(logger, "current max delta: %s" % current_max_delta)

        for delta in range(1, current_max_delta + 1):
            print_log(logger, "train the model with delta: %d" % delta)

            X_train, y_train = features.extract_features.get_features_and_labels(
                train_data_partition, delta, feature_type)
            X_dev, y_dev = features.extract_features.get_features_and_labels(
                dev_data, delta, feature_type)
            X_test, y_test = features.extract_features.get_features_and_labels(
                test_data, delta, feature_type)

            crf = sklearn_crfsuite.CRF(
                algorithm='ap',
                all_possible_transitions=True,
                all_possible_states=False,
            )
            crf.fit(X_train, y_train)
            y_dev_pred = crf.predict(X_dev)
            delta_evaluation = metrics.evaluation.get_evaluation(
                feature_type, y_dev, y_dev_pred)

            print_log(
                logger,
                "F-score on development set: %s" % delta_evaluation["F-score"])
            size_evaluations[size][delta] = (delta_evaluation["Precision"],
                                             delta_evaluation["Recall"],
                                             delta_evaluation["F-score"])

    # find delta that yields best F-score
    sizes = list(size_evaluations.keys())
    sizes.sort()
    deltas = []
    for size in sizes:
        # pick the delta whose F-score (third element of the tuple) is highest on the dev set
        max_delta_for_size = max(
            size_evaluations[size],
            key=lambda d: size_evaluations[size][d][2])
        deltas.append(max_delta_for_size)
        print_log(
            logger, "\nBest delta=%s for train size perc=%s%%. "
            "\nOn development set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s" % (max_delta_for_size, size,
                                size_evaluations[size][max_delta_for_size][0],
                                size_evaluations[size][max_delta_for_size][1],
                                size_evaluations[size][max_delta_for_size][2]))

    test_evaluations = {}
    print_log(logger, "Test models with different sizes of training set")
    for size, best_delta in zip(sizes, deltas):
        print_log(logger,
                  "Train with size: %d and delta: %s" % (size, best_delta))
        cur_train_set = train_data_partitions[size]
        print_log(
            logger, "train set: " +
            "; ".join(list(map(str, cur_train_set[0:5]))) + "...")
        X_train, y_train = features.extract_features.get_features_and_labels(
            cur_train_set, best_delta, feature_type)
        X_test, y_test = features.extract_features.get_features_and_labels(
            test_data, best_delta, feature_type)

        crf = sklearn_crfsuite.CRF(
            algorithm='ap',
            all_possible_transitions=True,
            all_possible_states=False,
        )
        crf.fit(X_train, y_train)

        y_test_pred = crf.predict(X_test)
        delta_evaluation = metrics.evaluation.get_evaluation(
            feature_type, y_test, y_test_pred)

        test_evaluations[size] = (delta_evaluation["Precision"],
                                  delta_evaluation["Recall"],
                                  delta_evaluation["F-score"])
        print_log(
            logger, "F-score on test set (delta=%s): %s" %
            (best_delta, delta_evaluation["F-score"]))

        # save some results from the test run
        curpath = output_folder + "/size_%02d_delta_%02d" % (size, best_delta)
        os.makedirs(curpath)

        curpath = (curpath + "/" + name + "_" +
                   "size_%02d_delta_%02d" % (size, best_delta))
        utils.write_model(crf, open(curpath + ".model", "wb+"))
        utils.write_predictions(feature_type, open(test_file), y_test_pred,
                                open(curpath + ".pred", "w+"))
        utils.write_evaluation(delta_evaluation, open(curpath + ".eval", "w+"))
        utils.write_fails(open(test_file), y_test, y_test_pred,
                          open(curpath + ".fails", "w+"), feature_type)
        details.print_details(crf, file=open(curpath + ".details", "w+"))

    freport = open(output_folder + "/report.txt", "w+")
    for size, best_delta in zip(sizes, deltas):
        print(
            "Best delta=%s for train size perc=%s%%. "
            "\nOn development set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s"
            "\nOn test set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s" %
            (best_delta, size, size_evaluations[size][best_delta][0],
             size_evaluations[size][best_delta][1],
             size_evaluations[size][best_delta][2], test_evaluations[size][0],
             test_evaluations[size][1], test_evaluations[size][2]) + "\n" +
            "-" * 50,
            file=freport)
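Examples #3 and #5 pass an already opened binary file handle to `utils.write_model` and later read the model back with `pickle.load`, so in those projects the helper is presumably a thin pickle wrapper. A minimal sketch under that assumption:

import pickle

def write_model(model, file_handle):
    # Assumed implementation: serialize the fitted estimator into the open
    # binary file, matching the pickle.load(...) call in Example #3.
    pickle.dump(model, file_handle)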