def predict(m, fn):
    """
    Run a trained model over the lines of a test-formatted file.

    The file is loaded through a ``DataManager`` in non-training mode,
    configured for the model's algorithm (``m.clf_name``), passed through
    both preprocessing stages, and the resulting features are handed to
    ``m.predict``.

    :param m: the trained model
    :type m: BaseClassifier
    :param fn: the full path to a file in the same format as the test set
    :type fn: str
    :return: a list of 0s and 1s, corresponding to the lines in the specified file
    :rtype: list
    """
    manager = DataManager(fn, is_train=False, algorithm_name=m.clf_name)

    # Preprocessing is a two-stage flow; the first stage must run before the
    # second one is asked for the final feature matrix.
    manager.run_first_preprocessing_flow()

    # The second element of the pair is ignored here (test mode).
    features, _ignored = manager.complete_preprocessing_flow()
    return m.predict(features)
def train_best_model():
    """
    Train an XGBoost classifier from scratch with its best hyper-parameters.

    The classifier is fitted on the whole of ``trump_train.tsv``.

    :return: a trained XGBoost classifier built with the best performing
        hyper-parameters.
    :rtype: XGBoostClassifier
    """
    model = XGBoostClassifier()
    # Apply the best hyper-parameters that were found in the optimization stage.
    model.set_best_hyper_parameters()

    train_manager = DataManager(
        'trump_train.tsv',
        is_train=True,
        algorithm_name=model.clf_name,
    )

    # The preprocessing flow has two stages because of the padding of the
    # text features used in the NN algorithms.
    train_manager.run_first_preprocessing_flow()
    features, labels = train_manager.complete_preprocessing_flow()

    # Fit on all of the training data (no hold-out split here).
    model.fit(features, labels)
    return model
# For every candidate classifier: build its data managers, preprocess the
# train/test files, search for hyper-parameters, and record the outcome.
for clf in classifiers:
    print(f'Initialized classifier {clf.clf_name}')

    evaluator = Evaluator(clf)
    print('Initialized evaluator')

    dm_train = DataManager(
        'trump_train.tsv', is_train=True, algorithm_name=clf.clf_name
    )
    dm_test = DataManager(
        'trump_test.tsv', is_train=False, algorithm_name=clf.clf_name
    )
    print('Initialized Data manager')

    # Stage one runs on both managers before fix_max_length is called.
    # NOTE(review): fix_max_length presumably aligns the max sequence length
    # between the two managers - confirm against its definition.
    dm_train.run_first_preprocessing_flow()
    dm_test.run_first_preprocessing_flow()
    fix_max_length(dm_train, dm_test)

    # Stage two yields the final features (and labels for the training set).
    X_train, y_train = dm_train.complete_preprocessing_flow()
    # X_test is not read anywhere in this fragment; the call is kept as-is.
    X_test, _ = dm_test.complete_preprocessing_flow()

    # The RNN classifier takes its sequence length from the training manager.
    if clf.clf_name == 'RNN':
        clf.sequence_length = dm_train.max_length
    print('Cleaned X, y')

    # 3-fold search scored by F1, on the training data only.
    best_score = evaluator.optimize_hyper_parameters(
        X_train, y_train, cv=3, scoring='f1'
    )
    print('Done')
    print(f'The classifier {clf.clf_name} gave us a best 3-fold score of: {best_score}')
    print(clf.hyper_parameters)

    # Record the results per classifier name.
    best_hyper_parameters_solver[clf.clf_name] = clf.hyper_parameters
    best_score_solver[clf.clf_name] = best_score