# Shared imports for the examples below; project-specific helpers such as
# get_save_folder, save_model, load_model, estimator_definitions,
# printout_manager, score_calculation, score_submission, LABELS,
# concat_non_bleeding_features and MultiThreadingFeedForwardMLP come from the
# repository's utils modules.
import datetime
import os
from os import path

import numpy as np


def final_clf_training(Xs, ys, X_holdout, y_holdout, scorer_type, model_parent_folder):
    """
    Train the final classifier on all of the data to prepare it for predicting the FNC-1's unlabeled data
    :param model_parent_folder: Path to the folder for saving the classifier
    :param Xs: All the training data's feature vectors, split in their folds
    :param ys: All the training data's labels, split in their folds
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param scorer_type: the scorer type, e.g. MLB_base (see estimator_definitions.py in utils folder)
    :return: the final classifier
    """

    # stack all the feature vectors of all the folds
    X_train = np.vstack([Xs[i] for i in range(len(Xs))])
    y_train = np.hstack([ys[i] for i in range(len(ys))])

    # stack the holdout feature vectors on the feature vectors of all folds
    X_all = np.concatenate([X_train, X_holdout], axis=0)
    y_all = np.concatenate([y_train, y_holdout], axis=0)

    # create the new save folder for the specific classifier
    scorer_folder_name = scorer_type + "_final"
    save_folder = get_save_folder(model_parent_folder, scorer_folder_name + "_new")

    # get classifier and only pass a save folder if the classifier should be saved
    _clf = estimator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # prepare a file for classifiers that monitor their training loss
    loss_monitor_file_dir = path.join(save_folder, 'loss_results')
    os.makedirs(loss_monitor_file_dir, exist_ok=True)
    loss_filename = path.join(loss_monitor_file_dir, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") + ".txt")

    # fit the final classifier; scorer types containing 'f_ext' log their loss
    # and validate on the holdout set while training
    if 'f_ext' in scorer_type:
        append_to_loss_monitor_file("\n\nFOLD holdout and classifier: " + scorer_type + "\n", loss_filename)
        append_to_loss_monitor_file(str(datetime.datetime.now()).split('.')[0], loss_filename)
        _clf.fit(X_train, y_train, X_holdout, np.array(y_holdout), 'holdout', loss_filename)
    else:
        _clf.fit(X_all, y_all)

    # save the model
    filename = scorer_folder_name + ".sav"
    save_model(_clf, save_folder, filename)  # save model with filename to specific folder
    return _clf
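# Usage sketch (hypothetical, not part of the original module): assuming ten
# folds of precomputed feature vectors Xs/ys plus the holdout split, the final
# 'MLB_base' model (the example scorer named in the docstring) would be
# trained and saved with:
#
#   clf = final_clf_training(Xs, ys, X_holdout, y_holdout,
#                            scorer_type='MLB_base',
#                            model_parent_folder='data/fnc-1/mlp_models/')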
Example #2
def final_clf_training(Xs, ys, X_holdout, y_holdout, scorer_type, sanity_check=False):
    """
    Train the final classifier on all of the data to prepare it for predicting the FNC-1's unlabeled data
    :param Xs: All the training data's feature vectors, split in their folds
    :param ys: All the training data's labels, split in their folds
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param scorer_type: the scorer type, e.g. MLB_base (see estimator_definitions.py in utils folder)
    :param sanity_check: If true, the trained classifier predicts the labels of the data it was trained on and prints out the score
    :return: the final classifier
    """

    # stack all the feature vectors of all the folds
    X_train = np.vstack([Xs[i] for i in range(len(Xs))])
    y_train = np.hstack([ys[i] for i in range(len(ys))])

    # stack the holdout feature vectors on the feature vectors of all folds
    X_all = np.concatenate([X_train, X_holdout], axis=0)
    y_all = np.concatenate([y_train, y_holdout], axis=0)

    # define and create parent folder to save all trained classifiers into
    parent_folder = "%s/data/fnc-1/mlp_models/" % (path.dirname(path.dirname(path.abspath(__file__))))

    # create the new save folder for the specific classifier
    scorer_folder_name = scorer_type + "_final"
    save_folder = get_save_folder(parent_folder, scorer_folder_name + "_new")

    # get classifier and only pass a save folder if the classifier should be saved
    clf = estimator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # fit the final classifier
    clf.fit(X_all, y_all)

    # save the model
    filename = scorer_folder_name + ".sav"
    save_model(clf, save_folder, filename)  # save model with filename to specific folder

    # predict on the data the classifier was trained on => should give near perfect score
    if sanity_check:
        # get predicted and actual labels
        y_predicted = clf.predict(X_all)
        predicted = [LABELS[int(a)] for a in y_predicted]
        actual = [LABELS[int(a)] for a in y_all]

        # calc FNC score
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        # calc accuracy, f1 macro
        accuracy_stance = score_calculation.get_accuracy(y_predicted, y_all, stance=True)
        accuracy_related = score_calculation.get_accuracy(y_predicted, y_all, stance=False)
        f1_stance = score_calculation.get_f1score(y_predicted, y_all, stance=True)
        f1_related = score_calculation.get_f1score(y_predicted, y_all, stance=False)

        # printout results
        printout = printout_manager.get_holdout_printout(save_folder, accuracy_related, accuracy_stance, f1_related,
                                                         f1_stance, score)
        print("SANITY CHECK (predict on train data):")
        print(printout)
    return clf
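# For reference: the official FNC-1 baseline defines the label mapping used by
# score_submission as LABELS = ['agree', 'disagree', 'discuss', 'unrelated'];
# this repository's LABELS list is assumed to follow the same order, so the
# integer predictions above index into it.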
Example #3
def validate_holdout(Xs, ys, X_holdout, y_holdout, non_bleeding_features, features_dir,
                     scorer_type, feat_indices, result_string, learning_rate_string):
    """
    Trains the classifier on all of the train+test data and tests it on the holdout set
    :param Xs: All the training data's feature vectors, split in their folds
    :param ys: All the training data's labels, split in their folds
    :param X_holdout: The holdout feature vectors
    :param y_holdout: The holdout labels
    :param non_bleeding_features: The list of non-bleeding features that have to be concatenated to the existing feature vectors
    :param features_dir: the directory where the features are stored
    :param scorer_type: the scorer type, e.g. MLB_base (see estimator_definitions.py in utils folder)
    :param feat_indices: indices returned by the generate_features() method. They indicate at which index of the feature vector a specific
    feature starts and where it ends. This is used for printing out the feature importances of the RandomForest classifier
    :param result_string: The current result string in order to add the holdout results
    :param learning_rate_string: The current learning rate string in order to add information about the learning rate
    :return: the updated result_string and learning_rate_string
    """
    # define folder to save the classifier and create it if not existing
    parent_folder = "%s/data/fnc-1/mlp_models/" % (path.dirname(path.dirname(path.abspath(__file__))))

    # create the new save folder
    save_folder = get_save_folder(parent_folder, scorer_type + "_new")

    # only pass a save folder if the classifier should be saved
    best_clf = estimator_definitions.get_estimator(scorer_type, save_folder=save_folder)

    # stack all the feature vectors of all the folds
    X_train = np.vstack([Xs[i] for i in range(len(Xs))])
    y_train = np.hstack([ys[i] for i in range(len(ys))])

    # concat non-bleeding features
    X_train, X_holdout, feat_indices_holdout = concat_non_bleeding_features(
        X_train, X_holdout,
        non_bleeding_features, features_dir, 'holdout')

    # test for oversampling: fits the current classifier, oversampled with a given
    # method and checks the score on the holdout set
    use_over_sampling = False
    if use_over_sampling:
        from imblearn.over_sampling import SMOTE
        kind = ['regular', 'borderline1', 'borderline2', 'svm']
        for m in kind:
            sm = SMOTE(kind=m)
            X_res, y_res = sm.fit_sample(X_train, y_train)
            best_clf.fit(X_res, y_res)
            y_predicted = best_clf.predict(X_holdout)
            predicted = [LABELS[int(a)] for a in y_predicted]
            actual = [LABELS[int(a)] for a in y_holdout]
            fold_score, _ = score_submission(actual, predicted)
            max_fold_score, _ = score_submission(actual, actual)
            score = fold_score / max_fold_score
            print("Score " + m +  ":" + str(score))

    # fit the classifier
    best_clf.fit(X_train, y_train)

    # predict labels
    y_predicted = best_clf.predict(X_holdout)
    predicted = [LABELS[int(a)] for a in y_predicted]
    actual = [LABELS[int(a)] for a in y_holdout]

    # calc FNC score
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)
    score = fold_score / max_fold_score

    # calc accuracy for related/unrelated and stances
    accuracy_stance = score_calculation.get_accuracy(y_predicted, y_holdout, stance=True)
    accuracy_related = score_calculation.get_accuracy(y_predicted, y_holdout, stance=False)
    f1_stance = score_calculation.get_f1score(y_predicted, y_holdout, stance=True)
    f1_related = score_calculation.get_f1score(y_predicted, y_holdout, stance=False)

    # prepare printout for final results of holdout set
    printout = printout_manager.get_holdout_printout(save_folder, accuracy_related, accuracy_stance, f1_related, f1_stance, score)
    print(printout) # print holdout results
    result_string += printout # add results to string that is going to be saved into a file

    # test saving and restoring model
    filename = scorer_type + ".sav"
    save_model(best_clf, save_folder, filename)
    load_clf = load_model(parent_folder + scorer_type + "_new_0/", filename) # the 0th folder should always exist
    print_score_from_restored_model(load_clf, X_holdout, y_holdout)

    # add to special file that shows learning rate and loss of optimizer
    if isinstance(best_clf, MultiThreadingFeedForwardMLP):
        learning_rate_string += best_clf.get_learning_rates('holdout') + "\n"

    # print feature importances
    if scorer_type == 'randomforest':
        result_file_folder = path.dirname(path.dirname(path.abspath(__file__)))
        importances = best_clf.feature_importances_
        std = np.std([tree.feature_importances_ for tree in best_clf.estimators_],
                     axis=0)
        indices = np.argsort(importances)[::-1]
        feat_indices.append(feat_indices_holdout)

        feat_importance_string = str(feat_indices) + "\n"
        for i in indices:
            feat_importance_string += str(i) + ";" + str(importances[i]) + ";" + str(std[i]) + "\n"

        # save feature importances as file
        printout_manager.save_file(feat_importance_string, result_file_folder + "/feat_importance_rf.txt", "a+")

    return result_string, learning_rate_string
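# Note on the oversampling experiment above: it targets the pre-0.4 imblearn
# API, where SMOTE took a kind= argument and resampling was done via
# fit_sample. In current imblearn the variants are separate classes and the
# method is fit_resample, so an equivalent sketch of the same loop would be:
#
#   from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
#   samplers = {'regular': SMOTE(),
#               'borderline1': BorderlineSMOTE(kind='borderline-1'),
#               'borderline2': BorderlineSMOTE(kind='borderline-2'),
#               'svm': SVMSMOTE()}
#   for name, sm in samplers.items():
#       X_res, y_res = sm.fit_resample(X_train, y_train)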
Example #4
def cross_validation(fold_stances, folds, Xs, ys, non_bleeding_features, features_dir,
                     scorer_type, all_accuracies_related, all_accuracies_stance,
                     all_f1_related, all_f1_stance, all_scores, result_string, learning_rate_string):
    best_score = 0

    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        # Add BOW features to current feature vectors
        # The features are specified in BOW_feature_list
        X_train, X_test, _ = concat_non_bleeding_features(
            X_train, X_test,
            non_bleeding_features, features_dir, fold)

        # get the estimator for this loop
        clf = estimator_definitions.get_estimator(scorer_type)

        print("Begin fitting at: " + str(datetime.datetime.now()).split('.')[0] + "\n")

        # start fitting the estimator
        clf.fit(X_train, y_train)

        # predict the labels with the fitted classifier on the test data
        predicted_int = clf.predict(X_test)
        predicted = [LABELS[int(a)] for a in predicted_int]
        actual = [LABELS[int(a)] for a in y_test]

        # calculate the FNC-1 score based on the predicted and the actual labels
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        # calculates accuracy and f1-macro scores
        accuracy_stance = score_calculation.get_accuracy(predicted_int, y_test, stance=True)
        accuracy_related = score_calculation.get_accuracy(predicted_int, y_test, stance=False)
        f1_stance = score_calculation.get_f1score(predicted_int, y_test, stance=True)
        f1_related = score_calculation.get_f1score(predicted_int, y_test, stance=False)

        # add the scores to the lists holding the scores of all folds
        all_accuracies_related.append(accuracy_related)
        all_accuracies_stance.append(accuracy_stance)
        all_f1_related.append(f1_related)
        all_f1_stance.append(f1_stance)

        # get best score of all folds
        all_scores.append(score)
        if score > best_score:
            best_score = score

        # Prepare printout for fold result
        printout = printout_manager.get_foldwise_printout(fold, accuracy_related, accuracy_stance, f1_related,
                                                          f1_stance, score)
        print(printout)  # print results for this fold
        result_string += printout  # add results to final result file

        # add to special file that shows learning rate and loss of optimizer
        if isinstance(clf, MultiThreadingFeedForwardMLP):
            learning_rate_string += clf.get_learning_rates(fold) + "\n"

    # Prepare printout for final result
    printout = printout_manager.get_cross_validation_printout(
        all_accuracies_related, all_accuracies_stance, all_f1_related, all_f1_stance, all_scores, best_score)
    print(printout)  # print cross validation results
    result_string += printout  # add cross validation results to result file

    return result_string, learning_rate_string
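# For reference: score_submission implements the FNC-1 weighted metric: 0.25
# credit for classifying a pair correctly as related/unrelated, plus a further
# 0.75 for the exact stance (agree/disagree/discuss) on related pairs.
# Dividing by the score of a perfect submission, as done per fold above,
# yields the relative FNC-1 score reported by the challenge.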