def select_best_corpora():
    """Train one model per news-feature file and log its scores as CSV.

    Every entry listed in the processed news features directory is passed
    to ``classify`` together with a fixed eleven-feature selection.  The
    resulting model is then evaluated through ``get_meter_features`` and
    ``meter_classify``, and one ``;``-delimited row per file is written to
    ``select_best_corpora_scores_file.csv`` below ``saved_models_path``.

    The corpus flag columns are derived from the ``_``-separated tokens of
    the file name (minus its last four characters); the last token is
    parsed as the corpus size.
    """
    f_selection = [
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        'Word Lengths', 'Word Level Levenshtein', 'NE Coupling', 'NE LCS syn'
    ]
    # Tokens looked up in the file name, in header-column order.
    # NOTE(review): the header says 'msrp-a' / 'balanced' while the tokens
    # tested are 'msrpa' / 'balance' - confirm against the real file names.
    corpus_tokens = ('mscoco', 'msrp', 'msrpa', 'opinosis', 'p4p', 'quora')

    feature_files = os.listdir(slash.join(processed_news_features_path))
    scores_path = slash.join(
        saved_models_path + ['select_best_corpora_scores_file.csv'])

    with open(scores_path, 'w', newline='') as scores_file:
        writer = csv.writer(scores_file, delimiter=';')
        header = [
            'filename', 'mscoco', 'msrp', 'msrp-a', 'opinosis', 'p4p',
            'quora', 'balanced', 'corpussize', 'loss', 'accuracy',
            'loss on meter', 'accuracy on meter'
        ]
        writer.writerow(header)

        for modelnum, f_name in enumerate(feature_files):
            start = time.time()

            # Train on the news corpus, then evaluate on the meter corpus.
            model, score, _rounded, _y = classify(f_selection, 0.1, f_name)
            loss, accuracy = score
            X_train, X_test, y_train, y_test = get_meter_features(
                model, 0.1, f_selection, None)
            loss_on_meter, accuracy_on_meter = meter_classify(
                X_train, X_test, y_train, y_test)

            usedcorpora = f_name[:-4].split('_')
            row = [f_name]
            row.extend(int(token in usedcorpora) for token in corpus_tokens)
            row.append(int(usedcorpora[0] == 'balance'))
            row.append(int(usedcorpora[-1]))  # corpussize
            row.extend([loss, accuracy, loss_on_meter, accuracy_on_meter])
            writer.writerow(row)

            print(modelnum, 'done in', time.time() - start)
            # Drop the Keras backend state before the next model is built.
            K.clear_session()
def save(model, modelnum):
    """Store ``model`` in its own directory below ``saved_models_path``.

    A directory named ``modelnum`` is created (``os.makedirs`` raises if it
    already exists).  Inside it the output of ``model.to_json()`` is
    written to ``<modelnum>.json`` and ``model.save_weights`` is called
    with ``<modelnum>.h5``.

    :param model: object providing ``to_json()`` and ``save_weights(path)``.
    :param modelnum: directory and file stem; it is joined with ``slash``.
    """
    model_dir = saved_models_path + [modelnum]
    os.makedirs(slash.join(model_dir))

    # Directory and file stem share the same name: <modelnum>/<modelnum>.*
    stem = slash.join(model_dir + [modelnum])
    architecture = model.to_json()
    with open(stem + '.json', 'w') as json_file:
        json_file.write(architecture)
    model.save_weights(stem + '.h5')
def parse(file_name):
    """Read one processed meter-corpus CSV into per-column value lists.

    Rows that are empty or whose first cell is ``'source'`` (the header)
    are skipped.  For every other row, the cells from index 3 onwards are
    converted to ``float`` and appended, in order, to the columns named in
    ``feature_names``.

    :param file_name: name of a file inside ``processed_meter_corpus_path``.
    :return: ``(features, derived)``.  ``features`` maps each column name
        to the list of values read for it.  ``derived`` is the third cell
        of the last data row, or ``None`` when the file has no data rows.
    :raises ValueError: if a cell from index 3 onwards is not a float.
    """
    file_path = slash.join(processed_meter_corpus_path + [file_name])
    feature_names = [
        'TARGET_ID', 'SOURCE_ID',
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        # 'TFIDF': tfidf_X,
        'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein',
        'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
        'NE Coupling', 'NE Coupling syn', 'NE GT', 'NE GT syn', 'NE LCS',
        'NE LCS syn', 'NE Overlap', 'NE Overlap syn', 'LCSubstring',
        'Sentence Lengths', 'String Matching', 'Punctuation Overlap',
    ]
    # One empty column per name; built from the list so the two cannot
    # drift apart.
    features = {name: [] for name in feature_names}

    # Stays None for a file without data rows instead of being unbound.
    derived = None
    with open(file_path, 'r') as csv_file:
        for line in csv.reader(csv_file, delimiter=','):
            if not line or line[0] == 'source':
                continue
            derived = line[2]
            # Convert the whole row first so a bad cell raises before any
            # column has been extended for this row.
            values = [float(n) for n in line[3:]]
            for label, val in zip(feature_names, values):
                features[label].append(val)
    return features, derived
def export(self):
    """Dump ``self.similarity_dict`` as JSON into ``sim_matrix.txt``.

    The file lives at ``vand_folder_path`` joined with ``sim_matrix.txt``
    and is opened in ``'w'`` mode, so existing content is replaced.
    """
    target_path = slash.join(vand_folder_path + ['sim_matrix.txt'])
    with open(target_path, 'w') as sink:
        json.dump(self.similarity_dict, sink)
def load(name):
    """Rebuild a model from the files stored under ``saved_models_path``.

    Reads ``<name>/<name>.json``, passes its text to ``model_from_json``
    and then calls ``load_weights`` with ``<name>/<name>.h5``.

    :param name: directory and file stem of the stored model.
    :return: the object returned by ``model_from_json`` after its weights
        were loaded.
    """
    stem = slash.join(saved_models_path + [name, name])
    with open(stem + '.json', 'r') as json_file:
        architecture = json_file.read()

    restored = model_from_json(architecture)
    restored.load_weights(stem + '.h5')
    return restored
def feature_selection():
    """Rank the candidate features with recursive feature elimination.

    The first file listed in the processed news features directory is
    passed to ``classify``; NaN entries of the returned matrix are replaced
    by 0, and an ``RFE`` wrapper around ``LogisticRegression`` is fitted
    with ``n_features_to_select=1``.  Three things are printed: whether a
    NaN is still present, the names RFE marks as selected, and the eleven
    best ``(rank, name)`` pairs.

    NOTE(review): other call sites in this code base unpack a different
    number of values from ``classify``; this function needs a variant that
    returns ``(X, y)`` - see the inline comment kept from the original.
    """
    f2 = [
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        # 'TFIDF': tfidf_X,
        'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein',
        # 'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
        'NE Coupling', 'NE Coupling syn', 'NE GT', 'NE GT syn', 'NE LCS',
        'NE LCS syn', 'NE Overlap', 'NE Overlap syn', 'LCSubstring',
        'Sentence Lengths', 'String Matching', 'Punctuation Overlap'
    ]
    feature_files = os.listdir(slash.join(processed_news_features_path))

    # must be changed in method to return this
    X, y = classify(f2, 0.1, feature_files[0])

    # The estimator cannot handle NaN, so zero them out in place.
    X[numpy.isnan(X)] = 0
    # min() of an array containing NaN is NaN, so this prints False once
    # the replacement above has worked.
    print(numpy.isnan(numpy.min(X)))

    # n_features_to_select must be passed by keyword: scikit-learn declares
    # every RFE argument after the estimator as keyword-only.
    rfe = RFE(LogisticRegression(), n_features_to_select=1)
    fit = rfe.fit(X, y)

    selected = [name for name, keep in zip(f2, fit.support_) if keep]
    print(selected)

    # ranking_ holds 1 for the best feature; sort ascending by rank only,
    # keeping the original order among equal ranks.
    features_ranked_sorted = sorted(
        zip(fit.ranking_, f2), key=lambda pair: pair[0])
    print(features_ranked_sorted[:11])
def test_5_features():
    """Print how many 5-element subsets the eleven candidate features have.

    The directory listing and the lookup of entry 75 are kept from the
    original even though the looked-up name is not used afterwards.
    """
    feature_files = os.listdir(slash.join(processed_news_features_path))
    # NOTE(review): unused below; raises IndexError with fewer than 76 files.
    best_corpus = feature_files[75]

    f_selections = [
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        'Word Lengths', 'Word Level Levenshtein', 'NE Coupling', 'NE LCS syn'
    ]
    five_feature_subsets = list(itertools.combinations(f_selections, 5))
    print(len(five_feature_subsets))
def run(primary_model, test_size, feature_selection, b_mode):
    """Featurize the meter corpus and split it into train and test sets.

    Each file listed in ``processed_meter_corpus_path`` is handed to
    ``featurize``; files for which it returns ``None`` are skipped.  The
    label ``'0'`` becomes class 0, every other label becomes class 1.

    :param primary_model: forwarded to ``featurize``.
    :param test_size: forwarded to ``train_test_split``.
    :param feature_selection: forwarded to ``featurize``.
    :param b_mode: balancing strategy - ``0`` calls ``balance_both``,
        ``1`` calls ``balance_down``, ``2`` calls ``balance_up``; any other
        value leaves the data unbalanced.
    :return: ``(X_train, X_test, y_train, y_test)`` from a shuffled
        ``train_test_split``.
    """
    scores = []
    y = []
    for file_name in os.listdir(slash.join(processed_meter_corpus_path)):
        featurized = featurize(feature_selection, file_name, primary_model)
        if featurized is None:
            continue
        score, label = featurized
        scores.append(score)
        y.append(0 if label == '0' else 1)

    X = np.array([np.array(row) for row in scores])

    if b_mode == 0:
        balanced = balance_both(X, y)
    elif b_mode == 1:
        balanced = balance_down(X, y)
    elif b_mode == 2:
        balanced = balance_up(X, y)
    else:
        balanced = (X, y)
    b_X, b_y = balanced

    X_train, X_test, y_train, y_test = train_test_split(
        b_X, b_y, test_size=test_size, shuffle=True)
    return X_train, X_test, y_train, y_test
def parse(file_name):
    """Read one processed test-file CSV into per-column value lists.

    Rows that are empty or whose first cell is ``'source'`` (the header)
    are skipped.  For every other row, cells 2 and 3 are parsed as ``int``
    ids and the cells from index 4 onwards as ``float``; the values are
    appended, in order, to the columns named in ``feature_names``.

    :param file_name: name of a file inside ``processed_test_files_path``.
    :return: ``(source, target, features)``.  ``source`` and ``target`` are
        the first two cells of the last data row, or ``None`` when the file
        has no data rows.  ``features`` maps each column name to the list
        of values read for it.
    :raises ValueError: if an id or feature cell cannot be converted, or a
        data row has fewer than four cells.
    """
    feature_names = [
        'source_id', 'target_id',
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        # 'TFIDF': tfidf_X,
        'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein',
        'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
        'NE Coupling', 'NE Coupling syn', 'NE GT', 'NE GT syn', 'NE LCS',
        'NE LCS syn', 'NE Overlap', 'NE Overlap syn', 'LCSubstring',
        'Sentence Lengths', 'String Matching', 'Punctuation Overlap',
    ]
    file_path = slash.join(processed_test_files_path + [file_name])
    # One empty column per name; built from the list so the two cannot
    # drift apart.
    features = {name: [] for name in feature_names}

    # Stay None for a file without data rows instead of being unbound.
    source = target = None
    with open(file_path, 'r') as csv_file:
        for line in csv.reader(csv_file, delimiter=','):
            # csv.reader yields [] for blank lines; indexing it would fail.
            if not line or line[0] == 'source':
                continue
            source, target, source_id, target_id = line[:4]
            values = [int(source_id), int(target_id)]
            values += [float(n) for n in line[4:]]
            for label, val in zip(feature_names, values):
                features[label].append(val)
    return source, target, features
def test_different_feature_selections():
    """Compare growing feature prefixes on five hand-picked corpus files.

    For each chosen corpus file, the first 5, 6, ... 11 entries of
    ``f_selections`` are passed to ``classify``; the resulting model is
    evaluated through ``get_meter_features`` and ``meter_classify``.  One
    ``;``-delimited row per run is written to
    ``test_different_features_scores_file_new.csv`` below
    ``saved_models_path``: the corpus index, the four scores, and one 0/1
    flag per feature telling whether it was part of the run.

    The corpus files are addressed by their position in ``os.listdir``,
    so the selection depends on the directory listing order.
    """
    # Earlier selection, kept for reference:
    # ['GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
    #  'Word Lengths', 'Word Level Levenshtein', 'NE Coupling', 'NE LCS syn']
    f_selections = [
        'GST syn', 'LCS syn', 'TO syn', 'GST', 'TO', 'LCS', 'LCSubstring',
        'Word Lengths', 'Word Level Levenshtein', 'String Matching',
        'NE Coupling'
    ]
    feature_files = os.listdir(slash.join(processed_news_features_path))
    best_c_indeces = [75, 4, 61, 33, 104]
    best_corpora = [(feature_files[i], i) for i in best_c_indeces]

    scores_path = slash.join(
        saved_models_path + ['test_different_features_scores_file_new.csv'])
    with open(scores_path, 'w', newline='') as scores_file:
        writer = csv.writer(scores_file, delimiter=';')
        # The flag columns are the feature names themselves, in order.
        header = [
            'corpusnum', 'loss', 'accuracy', 'loss on meter',
            'accuracy on meter'
        ] + f_selections
        writer.writerow(header)

        modelnum = 0
        for f_name, num in best_corpora:
            for i in range(5, 12):
                used_features = f_selections[:i]
                start = time.time()

                model, score, _rounded, _y = classify(
                    used_features, 0.1, f_name)
                loss, accuracy = score
                X_train, X_test, y_train, y_test = get_meter_features(
                    model, 0.1, used_features, None)
                loss_on_meter, accuracy_on_meter = meter_classify(
                    X_train, X_test, y_train, y_test)

                row = [num, loss, accuracy, loss_on_meter, accuracy_on_meter]
                # Derive the flags from f_selections itself so a column
                # label and its membership test can never be spelled
                # differently.
                row.extend(int(name in used_features)
                           for name in f_selections)
                writer.writerow(row)

                print(modelnum, 'done in', time.time() - start)
                # Drop the Keras backend state before the next model.
                K.clear_session()
                modelnum += 1
                # NOTE(review): looks like a leftover debugging stop - it
                # only cuts the first corpus short after three models.
                # Its nesting level is assumed; confirm or remove.
                if modelnum == 3:
                    break
# iterate_all_feature_selections_and_files: for every 2-, 3- and 4-element
# combination of the five names in `best_feautures` and every corpus file
# picked by index in `nums`, runs classify / get_meter_features /
# meter_classify and writes one ';'-delimited row to
# 'multiple_features_scores_file.csv' below saved_models_path.
#
# NOTE(review): this definition is whitespace-collapsed and ends with an
# opening triple quote that is never closed in the visible text, so it is
# left untouched.  The line breaks inside the `feature_selection` literal are
# lost, which makes it impossible to tell which entries after each '#' are
# commented out.
# NOTE(review): `feature_combination` is only used for the five 0/1 flag
# columns; classify and get_meter_features both receive `feature_selection`,
# so every combination appears to be trained on the same features - confirm.
# NOTE(review): classify is unpacked into two values and get_meter_features
# is called with three arguments here, whereas select_best_corpora and
# test_different_feature_selections use four of each - confirm which
# signature is current.
# NOTE(review): the header says 'msrp-a' / 'balanced' while the file-name
# tokens tested are 'msrpa' / 'balance' - verify against real file names.
def iterate_all_feature_selections_and_files(): feature_selection = [ #'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector', # 'TFIDF': tfidf_X, #'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein', #'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix', #'NE Coupling', 'NE Coupling syn', 'NE GT', 'NE GT syn', 'NE LCS', 'NE LCS syn', 'NE Overlap', 'NE Overlap syn', 'LCSubstring', 'Sentence Lengths', 'String Matching', 'Punctuation Overlap' ] best_feautures = [ 'Sparse Vector', 'Word Level Levenshtein', 'TO', 'GST', 'NE LCS' ] best_features_all_combinations = [] for i in range(2, 5): for c in itertools.combinations(best_feautures, i): best_features_all_combinations.append(c) feature_files = os.listdir(slash.join(processed_news_features_path)) #nums = [18, 94, 9, 78, 93, 76, 40, 59, 45, 49, 15, 54, 85, 25] nums = [9, 15, 18, 49, 78, 94] best_files = [(feature_files[i], i) for i in nums] with open(slash.join(saved_models_path + ['multiple_features_scores_file.csv']), 'w', newline='') as f: writer = csv.writer(f, delimiter=';') # add featureselection #header = ['modelnum', 'usedcorpora', 'corpussize', 'loss', 'accuracy', # 'loss on meter', 'accuracy on meter', 'featureselection'] header = [ 'modelnum', 'mscoco', 'msrp', 'msrp-a', 'opinosis', 'p4p', 'quora', 'balanced', 'corpussize', 'loss', 'accuracy', 'loss on meter', 'accuracy on meter', 'Sparse Vector', 'Word Level Levenshtein', 'TO', 'GST', 'NE LCS' ] writer.writerow(header) modelnum = 0 for feature_combination in best_features_all_combinations: for feature_file_name in best_files: start = time.time() new_model_row = [0 for _ in list(range(len(header)))] #new_model_row[0] = modelnum new_model_row[0] = feature_file_name[1] #usedcorpora = feature_file_name[0][:-4].split('_') usedcorpora = feature_file_name[0][:-4].split('_') new_model_row[1] = 1 if 'mscoco' in usedcorpora else 0 new_model_row[2] = 1 if 'msrp' in usedcorpora else 0 new_model_row[3] = 1 if 'msrpa' in usedcorpora else 0 
new_model_row[4] = 1 if 'opinosis' in usedcorpora else 0 new_model_row[5] = 1 if 'p4p' in usedcorpora else 0 new_model_row[6] = 1 if 'quora' in usedcorpora else 0 new_model_row[7] = 1 if usedcorpora[0] == 'balance' else 0 new_model_row[8] = int(usedcorpora[-1]) # corpussize model, score = classify(feature_selection, 0.1, feature_file_name[0]) #model, score = classify(feature_selection, 0.1, feature_file_name) loss, accuracy = score # meterstuff X_train, X_test, y_train, y_test = get_meter_features( model, 0.1, feature_selection) loss_on_meter, accuracy_on_meter = meter_classify( X_train, X_test, y_train, y_test) new_model_row[9] = loss new_model_row[10] = accuracy new_model_row[11] = loss_on_meter new_model_row[12] = accuracy_on_meter new_model_row[ 13] = 1 if 'Sparse Vector' in feature_combination else 0 new_model_row[ 14] = 1 if 'Word Level Levenshtein' in feature_combination else 0 new_model_row[15] = 1 if 'TO' in feature_combination else 0 new_model_row[16] = 1 if 'GST' in feature_combination else 0 new_model_row[17] = 1 if 'NE LCS' in feature_combination else 0 writer.writerow(new_model_row) print(modelnum, 'done in', time.time() - start) K.clear_session() modelnum += 1 """for feature_file_name in feature_files:
def parse(file_name):
    """Return two placeholder strings and the path of a news feature file.

    The path is ``processed_news_features_path`` joined with ``file_name``.

    NOTE(review): the function returns ``('s', 't', file_path)`` right
    after building the path, so the CSV-reading code that follows is never
    executed.  It is kept because it holds the actual parsing logic, which
    would return ``(features, labels)`` - confirm which of the two return
    shapes callers rely on before removing either part.

    :return: ``('s', 't', file_path)``
    """
    feature_names = [
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        # 'TFIDF': tfidf_X,
        'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein',
        'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
        'NE Coupling', 'NE Coupling syn', 'NE GT', 'NE GT syn', 'NE LCS',
        'NE LCS syn', 'NE Overlap', 'NE Overlap syn', 'LCSubstring',
        'Sentence Lengths', 'String Matching', 'Punctuation Overlap'
    ]
    file_path = slash.join(processed_news_features_path + [file_name])
    return 's', 't', file_path

    # ---- unreachable from here on (see NOTE in the docstring) ----------
    phrase_pairs, labels, origins = [], [], []
    features = {name: [] for name in feature_names}
    with open(file_path, 'r') as csv_file:
        for line in csv.reader(csv_file, delimiter=','):
            # Skip blank rows and the header row.
            if len(line) == 0 or line[0] == 'source':
                continue
            phrase_pairs.append(line[:2])
            labels.append(int(line[2]))
            origins.append(line[3])
            values = [float(n) for n in line[4:]]
            for label, val in zip(feature_names, values):
                features[label].append(val)
    return features, labels