def kfold_average_score(
    learner,
    files,
    dirs,
    k=5,
    min_word_len=2,
    min_freq=10,
    feature_size=160,
    weight='tfidf',
    sw=True,
):
    """Return the mean F1 score of ``learner`` across ``k`` folds.

    For each fold, a new ``Features`` object is built from that fold's
    training split only, ``learner`` is fitted on the vectorised training
    items, and its predictions for the vectorised test items are scored
    with ``f1_score``.

    Args:
        learner: Object exposing ``fit(X, y)`` and ``predict(X)``; the value
            returned by ``predict`` must provide ``.tolist()``. The same
            object is re-fitted on every fold.
        files: First argument handed to ``KFold`` (presumably the file
            names of the samples -- confirm against ``KFold``).
        dirs: Second argument handed to ``KFold`` (presumably the label /
            directory of each file -- confirm against ``KFold``).
        k: Number of folds to evaluate.
        min_word_len: Forwarded positionally to ``Features``.
        min_freq: Forwarded positionally to ``Features``.
        feature_size: Forwarded positionally to ``Features``.
        weight: Weighting name forwarded to ``Features.get_x_vector``.
        sw: Forwarded positionally to ``Features`` (presumably a stop-word
            switch -- confirm against ``Features``).

    Returns:
        The result of ``mean`` applied to the list of per-fold F1 scores.
    """

    def vectorize(feature_model, names):
        # One feature vector per item, in the order the items were given.
        return [feature_model.get_x_vector(name, weight) for name in names]

    fold_scores = [0] * k
    splitter = KFold(files, dirs, k)

    for fold in range(k):
        train_names, train_labels, test_names, test_labels = splitter.kth(fold)

        # The feature model only ever sees the training part of the fold.
        feature_model = Features(
            train_names, train_labels, min_word_len, min_freq, feature_size, sw
        )

        train_vectors = vectorize(feature_model, train_names)
        learner.fit(train_vectors, feature_model.y_transform(train_labels))

        test_vectors = vectorize(feature_model, test_names)
        expected = feature_model.y_transform(test_labels)
        predicted = learner.predict(test_vectors).tolist()
        fold_scores[fold] = f1_score(expected, predicted)

    return mean(fold_scores)
if __name__ == '__main__':
    # Script entry point: read the data directory given on the command line,
    # split it into train/test sets, fit an RBF-kernel SVM on the training
    # vectors and print the F1 score obtained on the held-out test set.

    # The text must be passed as `description`; the first positional
    # parameter of ArgumentParser is `prog`, which would replace the program
    # name in the usage line instead of describing the tool.
    parser = argparse.ArgumentParser(description="Read in data directory")
    parser.add_argument('data_dir')

    print('Reading Data Path')
    files, dirs = fe.get_file_name_and_path(parser.parse_args().data_dir)

    print('Spliting Train-Test Set ')
    # NOTE(review): 0.25 is presumably the test-set fraction -- confirm
    # against the project's train_test_split helper.
    train_x, train_y, test_x, test_y = train_test_split(files, dirs, 0.25)

    learner = svm.SVC(kernel='rbf', C=1)
    # NOTE(review): the positional arguments appear to be min_word_len=3,
    # min_freq=0, feature_size=160, following the order used when
    # kfold_average_score builds Features -- confirm against Features.
    features = Features(train_x, train_y, 3, 0, 160)

    # Fit on the vectorised training files.
    x = [features.get_x_vector(f_name, 'tfidf') for f_name in train_x]
    learner.fit(x, features.y_transform(train_y))

    # Score on the vectorised test files, using the same feature model.
    x = [features.get_x_vector(f_name, 'tfidf') for f_name in test_x]
    print('Score:', f1_score(features.y_transform(test_y), learner.predict(x).tolist()))

    # Disabled experiments, kept for reference:
    #
    # print('Test if "TFIDF" is better than "TF"')
    # print('TF:', kfold_average_score(learner, train_x, train_y, weight='tf',feature_size=2))
    # print('TFIDF:', kfold_average_score(learner, train_x, train_y, weight='tfidf',feature_size=2))
    #
    # print('Test which min length of word is best')
    # lst = []
    # for i in range(10):
    #     lst.append(kfold_average_score(learner, files, dirs, k=5, min_word_len=i))