import os
import pickle
import random
import time
from random import shuffle

import lda
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer

import svmtopics
# Assumed project-local imports: DE, _test_LDA, readfile1, split_two, jaccard,
# l2normalize, cut_position, divide_train_test, plus the globals `bounds`,
# `bound`, and `max_fitness`, all defined elsewhere in this repository.


def main(*x, **r):
    # DE fitness function: `x` is one candidate (K, alpha, beta); `r` carries
    # the experiment context. Returns (stability score, F-score).
    start_time = time.time()
    base = '/share/aagrawa8/Data/SE/'
    # base = '/home/amrit/GITHUB/LDAClassification/results/SE/'
    path = os.path.join(base, 'jaccard_tune_grow_oracle', r['file'], str(r['term']))
    # path = os.path.join(base, 'untuned_svm_topics_smote', r['file'], str(r['term']))
    if not os.path.exists(path):
        os.makedirs(path)
    l = np.asarray(x)
    b = int(l[0])
    path1 = path + "/K_" + str(b) + "_a_" + str(l[1]) + "_b_" + str(l[2]) + ".txt"
    # create (or empty) the per-candidate log file
    with open(path1, "w") as f:
        f.truncate()
    topics, tops, word, corpus, tar, log = _test_LDA(
        l, path1, file=r['file'], data_samples=r['data_samples'], target=r['target'])
    fscore = svmtopics.main(data=tops, file=r['file'], target=tar, tune=r['tune'])
    top = [str(i.encode('ascii', 'ignore')) for i in topics]
    a = jaccard(b, score_topics=top, term=r['term'])
    with open(path1, 'a+') as fo:
        fo.write("\nScore: " + str(a))
        fo.write("\nRuntime: --- %s seconds ---\n" % (time.time() - start_time))
    return a, fscore
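# `main` can also be evaluated outside DE for a single candidate; a minimal
# sketch, assuming an already-loaded corpus -- the parameter values (K=10,
# alpha=beta=0.5), the dataset name 'SE0', and the helper name below are
# illustrative only:
def _demo_fitness_call(data_samples, labellist):
    stability, f2 = main(10, 0.5, 0.5, file='SE0', term=7,
                         data_samples=data_samples, target=labellist, tune='no')
    return stability, f2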
# Variant of _topics for the jaccard_tune_grow_oracle experiment
# (results pickled to dump/DE_jaccard_tune_grow_oracle<res>.pickle).
def _topics(res=''):
    # fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    # fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    # filepath = '/Users/amrit/GITHUB/LDAClassification/dataset/SE/'
    random.seed(1)
    global bounds
    cross_tune = 'no'
    grow_oracle = 'yes'
    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    split = split_two(corpus=data_samples, label=labellist)
    pos = np.array(split['pos'])
    neg = np.array(split['neg'])
    cut_pos = int(len(pos) * 80 / 100)
    cut_neg = int(len(neg) * 80 / 100)
    # lists of F2 scores
    tunedlis = []
    untunedlis = []
    # per fold: [bestone, runtime for one run, tuned F2, untuned F2]
    cross = {}
    # per file: [cross, score lists, full runtime]
    file = {}
    for folds in range(5):
        start_time1 = time.time()
        pos_shuffle = list(range(len(pos)))
        neg_shuffle = list(range(len(neg)))
        shuffle(pos_shuffle)
        shuffle(neg_shuffle)
        pos = pos[pos_shuffle]
        neg = neg[neg_shuffle]
        data_train = list(pos)[:cut_pos] + list(neg)[:cut_neg]
        train_label = ['pos'] * cut_pos + ['neg'] * cut_neg
        data_test = list(pos)[cut_pos:] + list(neg)[cut_neg:]
        test_label = ['pos'] * (len(pos) - cut_pos) + ['neg'] * (len(neg) - cut_neg)
        # final_para_dic: final parameters and scores, [file][lab] = [[k, a, b], [r, f1]]
        de = DE(F=0.7, CR=0.3, x='rand')
        global max_fitness
        max_fitness = 0
        pop = [[random.randint(bounds[0][0], bounds[0][1]),
                random.uniform(bounds[1][0], bounds[1][1]),
                random.uniform(bounds[2][0], bounds[2][1])] for _ in range(10)]
        v, score, final_para_dic = de.solve(main, pop, iterations=3, file=res, term=7,
                                            data_samples=data_train, target=train_label,
                                            tune='yes')
        # score is a list: [jaccard, fscore]
        bestone = [v, score]
        l = bestone
        # rerun LDA on the full corpus with the tuned parameters
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        tf = tf_vectorizer.fit_transform(data_train + data_test)
        lda1 = lda.LDA(n_topics=int(l[0][0]), alpha=l[0][1], eta=l[0][2], n_iter=200)
        lda1.fit_transform(tf)
        tops = lda1.doc_topic_
        tops = csr_matrix(tops)
        tops = l2normalize(tops).toarray()
        f2 = svmtopics.main(data=tops, file=res, target=train_label + test_label, tune='no')
        tunedlis.append(f2)
        # untuned experiment: default LDA parameters
        lda2 = lda.LDA(n_topics=20, alpha=0.1, eta=0.01, n_iter=200)
        lda2.fit_transform(tf)
        tops1 = lda2.doc_topic_
        tops1 = csr_matrix(tops1)
        tops1 = l2normalize(tops1).toarray()
        untuned_f2 = svmtopics.main(data=tops1, file=res,
                                    target=train_label + test_label, tune='no')
        untunedlis.append(untuned_f2)
        time2 = time.time() - start_time1
        cross[folds] = [bestone, time2, f2, untuned_f2]
        print("\nRuntime for 1 loop of DE termination: --- %s seconds ---\n" % (time2))
    time1 = time.time() - start_time
    print(tunedlis)
    print(untunedlis)
    file[res] = [cross, tunedlis, untunedlis, time1]
    print("\nTotal Runtime: --- %s seconds ---\n" % (time.time() - start_time))
    with open('dump/DE_jaccard_tune_grow_oracle' + res + '.pickle', 'wb') as handle:
        pickle.dump(file, handle)
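# `l2normalize` is a repo helper; a minimal sketch of the behavior relied on
# above (row-wise L2 normalization of the sparse document-topic matrix),
# assuming scikit-learn is available -- the `l2normalize_sketch` name is
# illustrative:
from sklearn.preprocessing import normalize

def l2normalize_sketch(mat):
    # Scale each row (one document's topic vector) to unit L2 norm;
    # a sparse input yields a sparse output, so .toarray() still applies.
    return normalize(mat, norm='l2', axis=1)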
# Variant of _topics for the DE_class_topics experiment with an 80/20
# train/test split (results pickled to dump/DE_class_topics_<res>.pickle).
def _topics(res=''):
    # fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    # fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    # filepath = '/Users/amrit/GITHUB/LDAClassification/dataset/SE/'
    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    split = split_two(corpus=data_samples, label=labellist)
    pos = split['pos']
    neg = split['neg']
    cut_pos = int(len(pos) * 80 / 100)
    cut_neg = int(len(neg) * 80 / 100)
    data_train = pos[:cut_pos] + neg[:cut_neg]
    train_label = ['pos'] * cut_pos + ['neg'] * cut_neg
    data_test = pos[cut_pos:] + neg[cut_neg:]
    test_label = ['pos'] * (len(pos) - cut_pos) + ['neg'] * (len(neg) - cut_neg)
    labels = [7]  # [1, 2, 3, 4, 5, 6, 7, 8, 9]
    random.seed(1)
    global bounds
    # stability scores, format: [file][lab] = score
    result = {}
    # parameter variations (k, a, b), format: [file][lab] = [[k, a, b], Rn score, fscore]
    final_para_dic = {}
    # best parameters and scores, format: [file][lab] = [[k, a, b], [r, f1]]
    bestone = {}
    de = DE(F=0.7, CR=0.3, x='rand')
    temp1 = {}
    temp2 = {}
    temp3 = {}
    for lab in labels:
        global max_fitness
        max_fitness = 0
        # print(res + '\t' + str(lab))
        pop = [[random.randint(bounds[0][0], bounds[0][1]),
                random.uniform(bounds[1][0], bounds[1][1]),
                random.uniform(bounds[2][0], bounds[2][1])] for _ in range(10)]
        v, score, l = de.solve(main, pop, iterations=5, file=res, term=lab,
                               data_samples=data_train, target=train_label, tune='yes')
        temp1[lab] = l
        # score is a list: [jaccard, fscore]
        print(v, '->', score)
        temp3[lab] = score
        temp2[lab] = [v, score]
    result[res] = temp3
    final_para_dic[res] = temp1
    bestone[res] = temp2
    print(result)
    print(bestone)
    print(final_para_dic)
    # runtime, format: [file] = runtime in secs
    time1 = {}
    time1[res] = time.time() - start_time
    l = bestone[res][7]
    print(l)
    # rerun LDA on the full corpus with the tuned parameters
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_train + data_test)
    lda1 = lda.LDA(n_topics=int(l[0][0]), alpha=l[0][1], eta=l[0][2], n_iter=200)
    lda1.fit_transform(tf)
    tops = lda1.doc_topic_
    fscore = {}
    fscore[res] = svmtopics.main(data=tops, file=res,
                                 target=train_label + test_label, tune='no')
    print(fscore)
    temp = [result, final_para_dic, bestone, time1, fscore]
    with open('dump/DE_class_topics_' + res + '.pickle', 'wb') as handle:
        pickle.dump(temp, handle)
    print("\nTotal Runtime: --- %s seconds ---\n" % (time1[res]))
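# `readfile1` and `split_two` are repo utilities. Minimal sketches of the
# contracts used above; the on-disk layout (class label in the first
# whitespace-separated field of each line) is an assumption, and the
# *_sketch names are illustrative:
def readfile1_sketch(path):
    data, labels = [], []
    with open(path) as f:
        for line in f:
            label, _, text = line.strip().partition(' ')
            labels.append(label)
            data.append(text)
    return data, labels

def split_two_sketch(corpus=None, label=None):
    # Partition documents into the two classes used throughout: 'pos' and 'neg'.
    return {'pos': [c for c, l in zip(corpus, label) if l == 'pos'],
            'neg': [c for c, l in zip(corpus, label) if l == 'neg']}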
# Variant of _topics for the DE_magic_weights experiment: a second DE searches
# for per-topic feature weights on a grow set held out from the test data
# (results pickled to dump/DE_magic_weights_<res>.pickle).
def _topics(res=''):
    # fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    # fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    # filepath = '/Users/amrit/GITHUB/LDAClassification/dataset/SE/'
    random.seed(1)
    global bounds
    cross_tune = 'no'
    grow_oracle = 'yes'
    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    split = split_two(corpus=data_samples, label=labellist)
    pos = np.array(split['pos'])
    neg = np.array(split['neg'])
    cut_pos, cut_neg = cut_position(pos, neg, percentage=20)
    # lists of F2 scores
    untuned_lis = []
    tuned_lis = []
    # per fold: [bestone, bestone1, untuned F2, tuned F2]
    cross = {}
    # per file: [cross, score lists, full runtime]
    file = {}
    for folds in range(5):
        start_time1 = time.time()
        pos_shuffle = list(range(len(pos)))
        neg_shuffle = list(range(len(neg)))
        shuffle(pos_shuffle)
        shuffle(neg_shuffle)
        pos = pos[pos_shuffle]
        neg = neg[neg_shuffle]
        data_train, train_label, data_test, test_label = divide_train_test(
            pos, neg, cut_pos, cut_neg)
        de = DE(F=0.7, CR=0.3, x='rand')
        global max_fitness
        max_fitness = 0
        pop = [[random.randint(bounds[0][0], bounds[0][1]),
                random.uniform(bounds[1][0], bounds[1][1]),
                random.uniform(bounds[2][0], bounds[2][1])] for _ in range(10)]
        v, score, final_para_dic = de.solve(main, pop, iterations=3, bounds=bounds,
                                            file=res, term=7, data_samples=data_train,
                                            target=train_label, tune='yes')
        # score is a list: [jaccard, fscore]
        bestone = [v, score]
        l = bestone
        # rerun LDA on the full corpus with the tuned parameters
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        tf = tf_vectorizer.fit_transform(data_train + data_test)
        lda1 = lda.LDA(n_topics=int(l[0][0]), alpha=l[0][1], eta=l[0][2], n_iter=200)
        lda1.fit_transform(tf)
        tops = lda1.doc_topic_
        # re-split the document-topic vectors into train/test
        split = split_two(corpus=tops, label=np.array(train_label + test_label))
        pos1 = np.array(split['pos'])
        neg1 = np.array(split['neg'])
        data_train, train_label, data_test, test_label = divide_train_test(
            pos1, neg1, cut_pos, cut_neg)
        # run with default (all-ones) feature weights
        perc = len(train_label) * 100 / len(train_label + test_label)
        weight_length = int(l[0][0])
        new_bounds = bound * weight_length
        pop1 = [1.0 for _ in range(weight_length)]
        f21 = svmtopics.main(*pop1, data=data_train + data_test,
                             target=train_label + test_label, tune='no', percentage=perc)
        untuned_lis.append(f21)
        time2 = time.time() - start_time1
        bestone.append(time2)
        # hold out half of the test data as a grow set
        split1 = split_two(corpus=data_test, label=np.array(test_label))
        pos1 = np.array(split1['pos'])
        neg1 = np.array(split1['neg'])
        cut_pos1, cut_neg1 = cut_position(pos1, neg1, percentage=50)
        data_grow, grow_label, data_test, test_label = divide_train_test(
            pos1, neg1, cut_pos1, cut_neg1)
        start_time2 = time.time()
        # another DE to find the magic weights
        max_fitness = 0
        pop = [[random.uniform(bound[0][0], bound[0][1]) for _ in range(weight_length)]
               for _ in range(10)]
        perc1 = (len(train_label) + len(grow_label) / 2) * 100 / len(train_label + grow_label)
        v, score, final_para_dic = de.solve(svmtopics.main, pop, iterations=6,
                                            bounds=new_bounds,
                                            data=data_train + data_grow,
                                            target=train_label + grow_label,
                                            tune='no', percentage=perc1)
        bestone1 = [v, score]
        perc1 = (len(train_label) + len(grow_label)) * 100 / len(train_label + grow_label + test_label)
        # testing the modified features
        f22 = svmtopics.main(*v, data=data_train + data_grow + data_test,
                             target=train_label + grow_label + test_label,
                             tune='no', percentage=perc1)
        time3 = time.time() - start_time2
        bestone1.append(time3)
        tuned_lis.append(f22)
        cross[folds] = [bestone, bestone1, f21, f22]
        print("\nRuntime for 1 loop of DE termination: --- %s seconds ---\n" % (time2 + time3))
    time1 = time.time() - start_time
    file[res] = [cross, untuned_lis, tuned_lis, time1]
    print(file[res])
    print("\nTotal Runtime: --- %s seconds ---\n" % (time.time() - start_time))
    with open('dump/DE_magic_weights_' + res + '.pickle', 'wb') as handle:
        pickle.dump(file, handle)
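# `cut_position` and `divide_train_test` are repo helpers; minimal sketches
# inferred from the inline version of the same split in the first `_topics`
# variant above (the *_sketch names are illustrative):
def cut_position_sketch(pos, neg, percentage=20):
    # Keep (100 - percentage)% of each class for training.
    return (int(len(pos) * (100 - percentage) / 100),
            int(len(neg) * (100 - percentage) / 100))

def divide_train_test_sketch(pos, neg, cut_pos, cut_neg):
    data_train = list(pos)[:cut_pos] + list(neg)[:cut_neg]
    train_label = ['pos'] * cut_pos + ['neg'] * cut_neg
    data_test = list(pos)[cut_pos:] + list(neg)[cut_neg:]
    test_label = ['pos'] * (len(pos) - cut_pos) + ['neg'] * (len(neg) - cut_neg)
    return data_train, train_label, data_test, test_label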
# Earlier variant of _topics for the DE_class_topics experiment, tuned on the
# whole corpus with no train/test split (results pickled to
# dump/DE_class_topics_<res>.pickle).
def _topics(res=''):
    # fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    # fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    # filepath = '/home/amrit/GITHUB/LDAClassification/dataset/SE/'
    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    labels = [7]  # [1, 2, 3, 4, 5, 6, 7, 8, 9]
    random.seed(1)
    global bounds
    # stability scores, format: [file][lab] = score
    result = {}
    # parameter variations (k, a, b), format: [file][lab], each score = k, a, b
    final_para_dic = {}
    # final generation, format: [file][lab] = parameter, score
    final_current_dic = {}
    de = DE(F=0.7, CR=0.3, x='rand')
    temp1 = {}
    temp2 = {}
    temp3 = {}
    for lab in labels:
        global max_fitness
        max_fitness = 0
        # print(res + '\t' + str(lab))
        pop = [[random.randint(bounds[0][0], bounds[0][1]),
                random.uniform(bounds[1][0], bounds[1][1]),
                random.uniform(bounds[2][0], bounds[2][1])] for _ in range(10)]
        v, score, para_dict, gen = de.solve(main, pop, iterations=3, file=res, term=lab,
                                            data_samples=data_samples, target=labellist)
        temp1[lab] = para_dict
        # temp2[lab] = gen
        # print(v, '->', score)
        temp3[lab] = score
    result[res] = temp3
    final_para_dic[res] = temp1
    # final_current_dic[res] = temp2
    print(result)
    # print(final_current_dic)
    print(final_para_dic)
    time1 = {}
    # rerun LDA with the best-scoring parameters
    l = final_para_dic[res][7][result[res][7]]
    print(l)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda1 = lda.LDA(n_topics=int(l[0][0]), alpha=l[0][1], eta=l[0][2], n_iter=100)
    lda1.fit_transform(tf)
    tops = lda1.doc_topic_
    fscore = {}
    fscore[res] = svmtopics.main(data=tops, file=res, target=labellist)
    # runtime, format: [file] = runtime in secs
    time1[res] = time.time() - start_time
    temp = [result, final_para_dic, time1, fscore]
    with open('dump/DE_class_topics_' + res + '.pickle', 'wb') as handle:
        pickle.dump(temp, handle)
    print("\nTotal Runtime: --- %s seconds ---\n" % (time1[res]))
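# A hypothetical driver, using the dataset names from the commented-out
# `fileB` lists above; each `_topics` variant originally lived in its own
# script, so only the last definition kept in one module would run:
if __name__ == '__main__':
    for dataset in ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']:
        _topics(res=dataset)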