def predict(m, fn):
    """ returns a list of 0s and 1s, corresponding to the lines in the specified file.

    :param m: the trained model
    :type m: BaseClassifier
    :param fn: the full path to a file in the same format as the test set
    :type fn: str
    :return: a list of 0s and 1s, corresponding to the lines in the specified file
    :rtype: list
    """
    dm_test = DataManager(fn, is_train=False, algorithm_name=m.clf_name)
    dm_test.run_first_preprocessing_flow()
    X_test, _ = dm_test.complete_preprocessing_flow()
    return m.predict(X_test)


def train_best_model():
    """ training a XGBoost classifier from scratch with it's best hyper-parameters.

    :return: a trained XGBoost classifier built with the best-performing hyper-parameters.
    :rtype: XGBoostClassifier
    """
    clf = XGBoostClassifier()
    clf.set_best_hyper_parameters()  # sets the best hyper-parameters that were found in the optimization stage.
    dm_train = DataManager('trump_train.tsv', is_train=True, algorithm_name=clf.clf_name)
    # the preprocessing flow has two stages because of the padding applied to the text features used by the NN algorithms:
    dm_train.run_first_preprocessing_flow()
    X_train, y_train = dm_train.complete_preprocessing_flow()
    # fit the classifier using all of the training data:
    clf.fit(X_train, y_train)
    return clf
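
A minimal end-to-end sketch tying the two functions above together; the test-file path is illustrative (any file in the same format as the test set works):

# Illustrative usage only: 'trump_test.tsv' is a placeholder for any
# file in the expected test-set format.
model = train_best_model()
predictions = predict(model, 'trump_test.tsv')
print(f'{sum(predictions)} of {len(predictions)} lines predicted as class 1')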
Example #3
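The hyper-parameter search loop below relies on a few names that are not defined in this snippet; a hedged setup sketch (the concrete classifier classes are assumptions based on how clf_name is used below):

# Assumed setup, inferred from the loop below; the exact classifier
# classes and constructors may differ in the original project.
classifiers = [XGBoostClassifier(), RNNClassifier()]  # RNNClassifier is hypothetical
best_hyper_parameters_solver = {}  # maps clf_name -> best hyper-parameters found
best_score_solver = {}             # maps clf_name -> best 3-fold F1 score
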
for clf in classifiers:
    print(f'Initialized classifier {clf.clf_name}')
    evaluator = Evaluator(clf)
    print('Initialized evaluator')
    dm_train = DataManager('trump_train.tsv',
                           is_train=True,
                           algorithm_name=clf.clf_name)
    dm_test = DataManager('trump_test.tsv',
                          is_train=False,
                          algorithm_name=clf.clf_name)

    print('Initialized data managers')
    dm_train.run_first_preprocessing_flow()
    dm_test.run_first_preprocessing_flow()
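    # align the maximum padded text length between the train and test data managers (inferred from fix_max_length's name and usage):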
    fix_max_length(dm_train, dm_test)
    X_train, y_train = dm_train.complete_preprocessing_flow()
    X_test, _ = dm_test.complete_preprocessing_flow()
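    # the RNN classifier needs the padded sequence length passed in explicitly: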
    if clf.clf_name == 'RNN':
        clf.sequence_length = dm_train.max_length
    print('Cleaned X, y')
    best_score = evaluator.optimize_hyper_parameters(X_train,
                                                     y_train,
                                                     cv=3,
                                                     scoring='f1')
    print('Done')
    print(
        f'The classifier {clf.clf_name} gave us a best 3-fold score of: {best_score}'
    )
    print(clf.hyper_parameters)
    best_hyper_parameters_solver[clf.clf_name] = clf.hyper_parameters
    best_score_solver[clf.clf_name] = best_score
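
Once the loop has filled both dictionaries, the best-scoring classifier can be read off; a small follow-up sketch:

# Report the classifier with the highest cross-validated F1 score.
best_clf_name = max(best_score_solver, key=best_score_solver.get)
print(f'Best classifier: {best_clf_name} '
      f'(F1={best_score_solver[best_clf_name]}, '
      f'hyper-parameters={best_hyper_parameters_solver[best_clf_name]})')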