def leave_one_out_score(data_set_name):
    # open data set
    data_set = DataSet(data_set_name)
    x, y = data_set.load_training_data()

    correct_count = 0
    for i in range(len(y)):
        x_train = np.delete(x, i, axis=0)
        y_train = np.delete(y, i)
        x_test = [x[i]]
        y_test = [y[i]]

        # train model
        model = train_model(x_train, y_train)

        if model.predict(x_test)[0] == y_test[0]:
            correct_count += 1
            print(i, 'correct,', correct_count / (i + 1))
        else:
            print(i, 'wrong,', correct_count / (i + 1))
    print('accuracy:', correct_count / len(y))
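# NOTE: train_model is defined elsewhere in the project and not listed
# here. A minimal sketch of a plausible stand-in, assuming it wraps the
# RandomForestClassifier used by the other evaluation scripts:

from sklearn.ensemble import RandomForestClassifier


def train_model(x_train, y_train):
    # hypothetical helper: fit a random forest on the feature matrix
    model = RandomForestClassifier(n_estimators=1000)
    model.fit(x_train, y_train)
    return model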
def cross_validation(data_set_name):
    # open data set
    data_set = DataSet(data_set_name)

    # get train and test set
    x, y = data_set.load_training_data()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.1, shuffle=True)

    # train model
    model = train_model(x_train, y_train)

    # get hold-out validation performance
    predict = model.predict(x_test)
    accuracy = accuracy_score(y_test, predict)
    print('Cross validation accuracy:', accuracy)
    # print_classification_matrix(predict, y_test)

    return accuracy, y_test, predict
def cross_corpus(train_set_name, test_set_name):
    # open data set and train model
    train_set = DataSet(train_set_name)
    x_train, y_train = train_set.load_training_data()
    model = train_model(x_train, y_train)

    # open test set and predict labels
    test_set = DataSet(test_set_name)
    x_test, y_test = test_set.load_training_data()
    predict = model.predict(x_test)
    print_classification_matrix(predict, y_test)

    return model
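# NOTE: print_classification_matrix is also defined elsewhere. A
# plausible stand-in built from sklearn.metrics, assuming it reports a
# confusion matrix plus per-class precision/recall:

from sklearn.metrics import classification_report, confusion_matrix


def print_classification_matrix(predict, y_test):
    # hypothetical helper: summarize classification quality per class
    print(confusion_matrix(y_test, predict))
    print(classification_report(y_test, predict))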
        doc_start = time.time()

        # read raw text and parse tree
        text = data_set.get_text(counter.count)
        label = data_set.data['labels'][counter.count]
        annotation = data_set.load_stanford_annotation(counter.count)

        # insert row to matrix
        # also, initialize feature matrix if it is None
        row = stanford_feature.get_features(annotation)
        row.append(label)
        if self.feature_matrix is None:
            self.feature_matrix = np.zeros(
                [data_set.data['count'], len(row)])
        self.feature_matrix[counter.count, :] = row

        # count and print
        counter.increment()
        print('%i, %i%% %.2f seconds (%.0f total)' % (
            counter.count - 1,
            100 * counter.count / data_set.data['count'],
            time.time() - doc_start,
            time.time() - start))
        counter.commit()


if __name__ == '__main__':
    data_set = DataSet('cepp')
    FeatureExtractor(data_set)
    start = time.time()
    while counter.count < data_set.data['count']:
        doc_start = time.time()

        # read raw text and parse tree
        text = data_set.get_text(counter.count)
        label = data_set.data['labels'][counter.count]
        annotation = data_set.load_stanford_annotation(counter.count)

        # extract features into a row array
        row = extract_features(text, annotation)
        row.append(label)

        # initialize feature matrix if it is None
        if feature_matrix is None:
            feature_matrix = np.zeros([data_set.data['count'], len(row)])

        # insert row array to matrix
        feature_matrix[counter.count, :] = row

        # count and print
        counter.increment()
        print('%i, %i%% %.2f seconds (%.0f total)' % (
            counter.count - 1,
            100 * counter.count / data_set.data['count'],
            time.time() - doc_start,
            time.time() - start))
        counter.commit()


if __name__ == '__main__':
    process_feature(DataSet('core-standard'), restart=True)
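# NOTE: the counter object used by these extraction loops (count /
# increment / commit) is not listed; it lives elsewhere in the project.
# A sketch of a file-backed checkpoint counter that would let the loop
# resume after an interrupted run; the storage format is an assumption:


class Counter:
    def __init__(self, path):
        # restore the last committed position, or start from zero
        self.path = path
        try:
            with open(path) as f:
                self.count = int(f.read())
        except (FileNotFoundError, ValueError):
            self.count = 0

    def increment(self):
        self.count += 1

    def commit(self):
        # persist progress so a restarted run can continue from here
        with open(self.path, 'w') as f:
            f.write(str(self.count))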
if __name__ == '__main__':
    # which data set to use
    train_on = 'cepp'
    test_on = 'core-standard'

    # Leave one out validation (very slow)
    # leave_one_out_score('cepp')

    # Cross validation
    accuracy, y_test, predict = cross_validation(train_on)
    print(confusion_matrix(y_test, predict))

    # 10-Fold Cross validation
    x, y = DataSet(train_on).load_training_data()
    model = RandomForestClassifier(n_estimators=1000, oob_score=True,
                                   warm_start=True)
    print('10-Fold Cross Validation Score:',
          np.mean(cross_val_score(model, x, y, cv=10)))

    # save fully trained model
    model.fit(x, y)
    DataSet(train_on).save_model(model, 'random-forest')

    # load the model and evaluate it on the train set (sanity check)
    loaded_model = DataSet(train_on).load_model('random-forest')
    predict = loaded_model.predict(x)
    print_classification_matrix(predict, y)
    sentences = annotation['sentences']
    num_sentences = len(sentences)
    num_tokens = sum(
        len(sentence['tokens']) for sentence in annotation['sentences'])
    num_stopwords = count_stopwords(annotation)

    # features
    complex_token_ratio = count_complex_tokens(sentences) / num_tokens
    number_meaningful_bigrams_percorpus = float(
        count_meaningful_bigrams(annotation)) / num_tokens
    stopword_ratio = num_stopwords / num_tokens
    stopword_per_sentence = num_stopwords / num_sentences

    # summarize features into an array
    scores = ttr_pos(annotation)
    scores.extend([
        complex_token_ratio,
        number_meaningful_bigrams_percorpus,
        stopword_ratio,
        stopword_per_sentence
    ])
    return scores


if __name__ == '__main__':
    cepp = DataSet('cepp')
    annotation = cepp.load_stanford_annotation(0)
    print(count_meaningful_bigrams(annotation))

    # test_data = """Eleven states working in conjunction with the U.S Department of Transportation (D.O.T) have agreed to implement an ordinance banning the use of electronic cigarettes in vehicles meaning if you are a resident of one of the impacted states, you will be prohibited from utilizing an electronic cigarette while inside your vehicle."""
    # print(extract_lexical_features([test_data, test_data]))
    # print(list(find_ngrams(word_tokenize(test_data), 2)))
    # print(meanigful_bigrams(find_ngrams(word_tokenize(test_data), 2)))
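# NOTE: the commented-out test code above refers to a find_ngrams helper
# that is not listed. The usual sliding-window implementation, shown as
# an assumed sketch:


def find_ngrams(tokens, n):
    # e.g. find_ngrams(['a', 'b', 'c'], 2) -> [('a', 'b'), ('b', 'c')]
    return list(zip(*[tokens[i:] for i in range(n)]))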
from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    data_set = DataSet('cepp')
    levels = ['KET', 'PET', 'FCE', 'CAE', 'CPE']
    num_articles = [64, 60, 71, 67, 69]

    labels = []
    text_id = 0
    for l in range(0, 5):
        print('working on level', l)
        for i in range(1, num_articles[l] + 1):
            print('working on text', i)
            path = '{}/_origin/{}/{}.txt'.format(data_set.path, levels[l], i)
            data_set.import_raw_text(path, text_id)
            labels.append(l)
            text_id += 1
    data_set.set_labels(labels)
from nutrition.structure.data_set import DataSet
import numpy as np

if __name__ == '__main__':
    data_set = DataSet('newsela')

    labels = np.genfromtxt('D:/master project/data/newsela/average_level.csv',
                           delimiter=',')
    data_set.set_labels(labels[:, 1].tolist())

    for i in range(0, 17027):
        path = 'D:/master project/data/newsela/text/{}.txt'.format(i + 1)
        data_set.import_raw_text(path, i)
        print(i)
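# NOTE: the DataSet class from nutrition.structure.data_set is used by
# every script here but its source is not listed. A rough sketch of the
# storage interface the importers rely on; the directory layout and the
# metadata file are guesses:

import json
import os
import shutil


class DataSet:
    def __init__(self, name, root='data'):
        self.path = os.path.join(root, name)
        os.makedirs(os.path.join(self.path, 'text'), exist_ok=True)
        self.data = {'count': 0, 'labels': []}

    def import_raw_text(self, source_path, text_id):
        # copy a raw text file into the corpus folder under its id
        shutil.copy(source_path,
                    os.path.join(self.path, 'text',
                                 '{}.txt'.format(text_id)))
        self.data['count'] = max(self.data['count'], text_id + 1)

    def set_labels(self, labels):
        # store one difficulty label per imported text
        self.data['labels'] = labels
        with open(os.path.join(self.path, 'meta.json'), 'w') as f:
            json.dump(self.data, f)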
            sys.exit()

        # pickle the result
        data_set.save_stanford_annotation(counter.count, annotation)

        # save the new count
        counter.increment()

        # print time information
        print('%i, %i%% %.2f seconds (%.0f total)' % (
            counter.count - 1,
            100 * counter.count / data_set.data['count'],
            time.time() - doc_start,
            time.time() - start))


if __name__ == '__main__':
    process_stanford(DataSet('learning-corpus'), restart=False)

    # sw = Stopwatch()
    #
    # data_set = DataSet('cepp')
    # sw.lap('test')
    # nlp = StanfordCoreNLP(STANFORD_SERVER)
    # sw.lap('test')
    # text = fix(data_set.get_text(0))
    # print(text)
    # sw.lap('test')
    # annotation = nlp.annotate(text, properties={
    #     'annotators': 'lemma,parse',
    #     'outputFormat': 'json',
    #     'coref.algorithm': 'statistical'
    # })
def eval_plot(model, data_set, cc_set, features):
    # assumed setup: the original head of this function is not shown;
    # cv_predict is taken to be cross-validated predictions on the
    # selected features of the train corpus
    x, y = data_set.load_training_data()
    x = x[:, features]
    cv_predict = cross_val_predict(model, x, y, cv=10)
    y_test = y

    # fraction of predictions within half a level of the true label
    print(sum(abs(y_test - cv_predict) < 0.5) / len(y))

    # train set performance
    model.fit(x, y)
    predict_train = model.predict(x)
    # plt.figure()
    # plt.scatter(y, predict_train)
    # plt.title('Train set')

    # cross corpus
    x_cc, y_cc = cc_set.load_training_data()
    x_cc = x_cc[:, features]
    cc_predict = model.predict(x_cc)
    plt.figure()
    plt.scatter(y_cc, cc_predict)
    plt.title('CEPP model on Newsela')
    plt.show()


if __name__ == '__main__':
    model = SVC(kernel='linear')
    data_set = DataSet('cepp')
    cc_set = DataSet('core-standard')

    # features = [6, 66, 69, 86, 112, 115, 118, 121]  # RFE
    # features = [120, 121, 66, 68, 115, 45]  # Greedy
    # features = [15, 31]  # NP/VP per sentence
    features = list(range(0, 122))

    eval_plot(model, data_set, cc_set, features)
from nutrition.structure.data_set import DataSet
import os

if __name__ == '__main__':
    data_set = DataSet('nil')

    text_id = 0
    labels = []
    for level in range(1, 4):
        folder = ('D:/master project/data/news_in_levels/'
                  'News_in_levels_level{}/articles/'.format(level))
        for filename in os.listdir(folder):
            # ignore files that are very small (< 10 bytes)
            if os.stat(folder + filename).st_size < 10:
                print('ignored {} because its size is only {} bytes'.format(
                    folder + filename, os.stat(folder + filename).st_size))
                continue

            data_set.import_raw_text(folder + filename, text_id)
            labels.append(level)
            text_id += 1
            print(text_id)

    data_set.set_labels(labels)
        text = data_set.get_text(counter.count)

        # call stanford annotate api
        annotation = nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit,pos,depparse,parse',
            'outputFormat': 'json'
        })
        if type(annotation) is str:
            print('Error returned by stanford parser:', annotation)
            sys.exit()

        # pickle the result
        data_set.save_stanford_annotation(counter.count, annotation)

        # save the new count
        counter.increment()

        # print time information
        print('%i, %i%% %.2f seconds (%.0f total)' % (
            counter.count - 1,
            100 * counter.count / data_set.data['count'],
            time.time() - doc_start,
            time.time() - start))


if __name__ == '__main__':
    data_set = DataSet('newsela')
    parse(data_set)
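# NOTE: pycorenlp returns a plain string instead of parsed JSON when the
# CoreNLP server times out or rejects a document, which is what the
# `type(annotation) is str` guard above catches. Instead of exiting, a
# wrapper could retry a few times before giving up; a hedged sketch:


def annotate_with_retry(nlp, text, properties, retries=3):
    # hypothetical wrapper around nlp.annotate with simple retries
    for attempt in range(retries):
        annotation = nlp.annotate(text, properties=properties)
        if not isinstance(annotation, str):
            return annotation
        print('CoreNLP error (attempt %d): %s' % (attempt + 1, annotation))
    return None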
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nutrition.structure.data_set import DataSet
import matplotlib.pyplot as plt

if __name__ == '__main__':
    x, y = DataSet('cepp').load_training_data()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, shuffle=True, random_state=42)

    model = RandomForestRegressor(n_estimators=1000, oob_score=True,
                                  warm_start=True)
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    # print(accuracy_score(y_test, predict))

    plt.figure()
    plt.scatter(y_test, predict)
    plt.show()
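    # NOTE: accuracy_score (commented out above) does not apply directly
    # to the continuous output of RandomForestRegressor. Two options,
    # written as a continuation of the script above: a regression
    # metric, or rounding predictions to the nearest level first:

    from sklearn.metrics import mean_squared_error
    import numpy as np

    print('MSE:', mean_squared_error(y_test, predict))
    print('rounded accuracy:', np.mean(np.round(predict) == y_test))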
import os

from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    data_set = DataSet('core-standard')
    raw_folder = 'D:/master project/dataold/core-standards-readability'
    grades = ['1', '2-3', '4-5', '6-8', '9-10', '11-CCR']
    bans = ['poetry', 'drama']

    labels = []
    text_id = 0
    for l in range(0, 6):
        print('working on level', l)
        path_level = raw_folder + '/grade ' + grades[l]
        for cat in os.listdir(path_level):
            path_cat = path_level + '/' + cat
            if any(ban in cat for ban in bans):
                print('ignoring ' + path_cat)
            else:
                for file in os.listdir(path_cat):
                    path_text = path_cat + '/' + file
                    print('reading ' + path_text)
                    data_set.import_raw_text(path_text, text_id)
                    labels.append(l)
                    text_id += 1
    data_set.set_labels(labels)
        # average node depth
        count_sum_node_depth(sentence_trees) / token_count,

        # average word length
        count_sum_word_length(sentence_trees) / token_count,

        # function words (stopwords which are not DT/determiners)
        count_function_words(tokens)
    ])
    return features


if __name__ == '__main__':
    annotation = DataSet('cepp').load_stanford_annotation(0)
    # print(count_difficult_words(annotation['sentences']))

    sentences = annotation['sentences']
    num_sentences = len(sentences)
    tokens = [token for sentence in sentences for token in sentence['tokens']]
    print(count_function_words(tokens))

    # pprint.pprint(annotation)
    #
    # sentence_trees = [Tree.fromstring(sentence['parse'])
    #                   for sentence in annotation['sentences']]
    # sentence_trees[0].pretty_print()
    #
    # for sentence in annotation['sentences']:
    #     for token in sentence['tokens']:
    #         print(token['lemma'], token['word'],
    #               token['originalText'], token['pos'])
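# NOTE: count_function_words is defined elsewhere in the project. Based
# on the comment above ("stopwords which are not DT/determiners"), an
# assumed sketch using NLTK's stopword list and the CoreNLP token fields
# seen in these scripts:

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))


def count_function_words(tokens):
    # stopwords that are not tagged as determiners
    return sum(1 for token in tokens
               if token['word'].lower() in STOPWORDS
               and token['pos'] != 'DT')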
from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    # delete one row (document 1437) from the data set
    DataSet('learning-corpus').delete_row(1437)
import os

from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    levels = [
        'elementry', 'pre_inter', 'intermediate', 'upper_inter', 'advanced'
    ]
    data_set = DataSet('learning-corpus')

    labels = []
    text_id = 0
    root_path = 'D:/master project/dataold/learning_corpus'
    for level in range(len(levels)):
        level_path = root_path + '/' + levels[level]
        for file_name in os.listdir(level_path):
            text_path = level_path + '/' + file_name

            # only keep texts between 10 and 100,000 characters
            with open(text_path, 'r', encoding='utf8') as file:
                text_length = len(file.read())
            if 10 < text_length < 100000:
                labels.append(level)
                data_set.import_raw_text(text_path, text_id)
                text_id += 1
    data_set.set_labels(labels)
def eval_cc_linear(train_data_set, test_data_set):
    # train model on train data set
    model = train_linear(train_data_set)

    # evaluate model on test data set (cross corpus)
    x, y = test_data_set.load_training_data()
    score = cross_val_score(model, x, y, scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-score))

    # plot
    predict = model.predict(x)
    plt.scatter(y, predict)
    plt.show()


if __name__ == '__main__':
    data_set = DataSet('cepp')
    x, y = data_set.load_training_data()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.4, shuffle=True, random_state=0)

    regr = linear_model.LinearRegression(normalize=True)
    # regr = linear_model.Ridge(alpha=0.001, normalize=True)
    regr.fit(x_train, y_train)
    predict = regr.predict(x_test)
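# NOTE: cross_val_score in eval_cc_linear refits the estimator on folds
# of the test corpus, so it does not score the already-trained model
# across corpora. A direct cross-corpus error could be computed instead;
# a minimal sketch (cross_corpus_mse is a hypothetical helper):

from sklearn.metrics import mean_squared_error


def cross_corpus_mse(model, test_data_set):
    # score the trained model directly on the other corpus
    x, y = test_data_set.load_training_data()
    return mean_squared_error(y, model.predict(x))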
        # c = current column id
        for c in range(0, len(x_train[0])):
            c_x_train = np.delete(x_train, c, 1)
            c_x_test = np.delete(x_test, c, 1)
            c_error = get_error(c_x_train, y_train)
            if b_error == -1 or b_error > c_error:
                b_id = c
                b_error = c_error
                b_cv_error = get_cv_error(c_x_train, c_x_test,
                                          y_train, y_test)

        x_train = np.delete(x_train, b_id, 1)
        x_test = np.delete(x_test, b_id, 1)
        features = np.delete(features, b_id, 0)
        feature_ids = np.delete(feature_ids, b_id, 0)
        print(features, feature_ids, b_error, b_cv_error, len(x_train[0]))

        plot_x_num_features.append(len(x_test[0]))
        plot_y_error.append(b_error)
        plot_y_cv_error.append(min(10, b_cv_error))
        # print(features, b_error)

    plt.scatter(plot_x_num_features, plot_y_error)
    plt.scatter(plot_x_num_features, plot_y_cv_error)
    plt.show()


if __name__ == '__main__':
    # backward_feature_selection(DataSet('cepp'), DataSet('mini-newsela'))
    forward_feature_selection(DataSet('cepp'), DataSet('mini-newsela'))
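# NOTE: get_error and get_cv_error are not listed; in the real module
# they would be defined before the selection loop above. Plausible
# stand-ins, assuming the loop minimizes cross-validated MSE on the
# train corpus while tracking error on the held-out cross-corpus split;
# the SVR estimator is an assumption:

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR


def get_error(x_train, y_train):
    # cross-validated MSE on the train corpus for this feature subset
    scores = cross_val_score(SVR(kernel='linear'), x_train, y_train,
                             scoring='neg_mean_squared_error', cv=5)
    return -np.mean(scores)


def get_cv_error(x_train, x_test, y_train, y_test):
    # error on the held-out split for the same feature subset
    model = SVR(kernel='linear')
    model.fit(x_train, y_train)
    return np.mean((model.predict(x_test) - y_test) ** 2)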
def eval_linear(data_set):
    # assumed setup: the original head of this function is not shown;
    # it fits a linear model on a train/test split of the corpus
    x, y = data_set.load_training_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4,
                                                        shuffle=True)
    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)

    # plot train performance
    predict_train = model.predict(x_train)
    plt.figure()
    plt.title('train')
    plt.scatter(y_train, predict_train)

    # plot test performance
    predict = model.predict(x_test)
    plt.figure()
    plt.title('test')
    plt.scatter(y_test, predict)
    plt.show()


def eval_cc_linear(train_data_set, test_data_set):
    # train model on train data set
    model = train_linear(train_data_set)

    # evaluate model on test data set (cross corpus)
    x, y = test_data_set.load_training_data()
    score = cross_val_score(model, x, y, scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-score))

    # plot
    predict = model.predict(x)
    plt.scatter(y, predict)
    plt.show()


if __name__ == '__main__':
    eval_linear(DataSet('cepp'))