# Assumes `df` (with a 'Sentence' column) and `df_label` are loaded elsewhere.
import snap
from scipy import spatial
from sent2vec.vectorizer import Vectorizer


def findDistribution(N=0.3, length=1000, sampled=400):
    sentences = []
    for s in df['Sentence']:
        sentences.append(s)
        if len(sentences) == length:
            break
    print("[INFO] No of sentences= " + str(len(sentences)))

    # embed the sentences with BERT via sent2vec
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vectors_bert = vectorizer.vectors

    # pairwise cosine distances; indices are kept 0-based so they match the
    # graph node ids and the df_label rows
    data = []
    for i in range(length):
        for j in range(i + 1, length):
            dist = spatial.distance.cosine(vectors_bert[i], vectors_bert[j])
            data.append([i, j, dist])
        if ((i + 1) / length * 100) % 10 == 0:
            print(str((i + 1) / length * 100) + " % done")

    # sort the pairs by distance (the original used an undefined key `l2`,
    # presumably the distance column)
    data_sorted = sorted(data, key=lambda row: row[2], reverse=True)

    # build an undirected graph and connect the top N*length pairs
    G = snap.TUNGraph.New()
    for i in range(length):
        G.AddNode(i)
    val = int(length * N)
    for i in range(val):
        G.AddEdge(data_sorted[i][0], data_sorted[i][1])

    PRankH = G.GetPageRank()
    adj = dict()
    for i in G.Nodes():
        adj[i.GetId()] = []
    for id in G.Nodes():
        i = id.GetId()
        for w in id.GetOutEdges():
            adj[i].append(w)
    pagerank = dict()
    for item in PRankH:
        pagerank[item] = PRankH[item]

    # greedily sample nodes by PageRank, damping the scores of each picked
    # node's neighbours (the original used undefined makelist/sort helpers;
    # taking the current maximum is the equivalent step)
    final = []
    while len(final) < sampled:
        val = max(pagerank, key=pagerank.get)
        for u in adj[val]:
            if u in pagerank:
                pagerank[u] *= 0.8
        pagerank.pop(val)
        final.append(val)

    # count how the sampled sentences distribute over the 7 labels
    counts = dict()
    for i in range(7):
        counts[i] = 0
    for i in final:
        counts[df_label.iloc[i, 1]] += 1
    return counts
from sent2vec.vectorizer import Vectorizer
import pandas as pd
import time

# setup
sentence_data = pd.read_csv("./data/tasks/sentence_correction/task_data.csv")
whole_sentences = []
if __debug__:
    print(sentence_data.columns)

start_time = time.time()

# each "row" contains its "values" as list item
# save corrected sentences to "whole_sentences"
for row, values in sentence_data.iterrows():
    whole_sentences.append(values[2].format(values[3].strip("{}")))
sentence_data["sentence_corpus"] = whole_sentences

# create vectorized items and save them as list
vectorizer = Vectorizer()
vectorizer.bert(sentence_data["sentence_corpus"])
sentence_data["sentence_vectors"] = vectorizer.vectors.tolist()

if __debug__:
    print(sentence_data.index)

end_time = time.time() - start_time
print(end_time)

sentence_data.to_pickle("pickled_sentences")
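# A minimal usage sketch, assuming the "pickled_sentences" file written above:
# reload the DataFrame and compare two of its stored BERT vectors with cosine
# distance. The row indices here are only illustrative.
import pandas as pd
from scipy import spatial

reloaded = pd.read_pickle("pickled_sentences")
vec_a = reloaded["sentence_vectors"].iloc[0]
vec_b = reloaded["sentence_vectors"].iloc[1]
print(spatial.distance.cosine(vec_a, vec_b))  # 0.0 means identical direction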
# `group`, `pea_pick`, `data`, and the imports come from elsewhere in the
# script; the snippet starts mid-function, and the def line below is presumed
# from the call to chunky() further down.
def chunky(d):
    stat = group(d.split(' '), 4)
    f = []
    for s in stat:
        f.extend(pea_pick(s))
    f = [ff for ff in f if len(ff) > 0]
    pairs = group(f, 2)
    pairs = [[' '.join(pp) for pp in p] for p in pairs]
    return pairs


for sent in data:
    sentences = chunky(sent)
    dist = []
    for p in sentences:
        print(p)
        if len(p) > 1:
            vectorizer = Vectorizer()
            vectorizer.bert(p)
            vectors_bert = vectorizer.vectors
            dist.append(
                spatial.distance.cosine(vectors_bert[0], vectors_bert[1]))
            print(dist[-1])
    avg = int(np.average(dist) * 1000000)
    print("Sentence: " + sent)
    print(avg)
    print()
# Change this to `for x in range(1):` to rapidly test on the first query only
for x in range(49):  # iterate queries again
    print("starting query...")
    '''
    docidarray = []  # one for each query
    for i in range(1000):  # again the top 1000 results
        docid = results[i][0]
        docidarray.append(docid)
    '''
    # do the BERT encoding
    docVectors = encodeBERT(firstResultsList[x], Documents)  # -> [['bert processed tweet', doc id], ...]
    # print(numpy.array(docVectors[1][0]))

    # encode the current query
    vectorizer = Vectorizer()
    vectorizer.bert(queriesLst[x][1])
    queryVect = vectorizer.vectors

    # score each document by cosine distance to the query vector
    newRank = []
    for i in range(len(docVectors)):
        dist = spatial.distance.cosine(queryVect[0], numpy.array(docVectors[i][0]))
        newRank.append([dist, docVectors[i][1]])  # -> appends [similarity distance, doc id]

    # 3. rank the docs again based on scores (use sorted() function)
    sortedNewRank = sorted(newRank)
    newDocRankingList.append(sortedNewRank)
    print(newDocRankingList)

# 4. write to results file
def sent2vec_feature(utterances):
    vectorizer = Vectorizer()
    vectorizer.bert(utterances)
    return vectorizer.vectors
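# A minimal usage sketch for sent2vec_feature() with two made-up utterances:
# it embeds both with BERT and compares them with cosine distance (the
# Vectorizer import is assumed where the helper is defined).
from scipy import spatial

vectors = sent2vec_feature(["how do I reset my password",
                            "I forgot my login credentials"])
print(spatial.distance.cosine(vectors[0], vectors[1]))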
# `file_operation`, `tokenizer`, and `bm25_dict` are project-specific objects
# defined elsewhere.
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor

import nltk
from sent2vec.vectorizer import Vectorizer

topic_info_dict: Dict[str, str] = file_operation.extract_file()
result_dict = {}
tokens_dict: Dict[str, List[str]] = tokenizer.tokenize_bert(topic_info_dict)
train_query, test_query = file_operation.extract_queries_for_bert()

nltk.download('punkt')
# split every training query into sentences
for topic_id in train_query:
    result_dict[topic_id] = {}
    train_query[topic_id] = nltk.tokenize.sent_tokenize(
        train_query[topic_id])

vectorizer = Vectorizer()
for topic_id in train_query:
    topic_id = str(topic_id)
    executor = ThreadPoolExecutor(len(tokens_dict))
    num_of_sentence_query = len(train_query[topic_id])
    i = 0
    ths = []
    # take the 100 documents with the highest BM25 score for this topic
    temp_bm25_dict = bm25_dict[topic_id]
    sorted_bm25 = list(
        dict(
            sorted(temp_bm25_dict.items(),
                   key=lambda item: item[1],
                   reverse=True)).keys())[:100]
    for doc_id in sorted_bm25:
# Django view: clusters the uploaded file's sentences into 4 groups with a
# simple k-means over their BERT vectors.  `get_sentence_from_file` is a
# project helper defined elsewhere.
import os
import json

from django.shortcuts import render
from scipy import spatial
from sent2vec.vectorizer import Vectorizer


def add(request):
    if request.FILES:
        uploadedFile = request.FILES['fname']
        sepsent = get_sentence_from_file(uploadedFile)
        print(sepsent)
        Svectors = []   # list of all bert vectors
        centroids = {}  # list of centers of clusters
        # for each cluster we defined separate distance and vector list
        Dist1 = []
        Dist2 = []
        Dist3 = []
        Dist4 = []
        Cluster1 = []
        Cluster2 = []
        Cluster3 = []
        Cluster4 = []
        # showing the progress information on the display
        i = 0
        # we take each sentence from the list and calculate its bert representation in vector
        for x in sepsent:
            progress_percent = round(((i * 100) / len(sepsent)), 2)
            remained_time_h = int(((7 * len(sepsent)) - (i * 7)) / 3600)
            remained_time_m = (((7 * len(sepsent)) - (i * 7)) % 3600) // 60
            print(' ---------------- progress :' + str(progress_percent) +
                  '% ---------remaining time(hh:mm): ' + str(remained_time_h) +
                  ':' + str(remained_time_m) + ' ------', end='\r')
            i = i + 1
            vectorizer = Vectorizer()
            vectorizer.bert(x)
            vectors_bert = vectorizer.vectors
            Svectors.append(vectors_bert[0])

        # we took 4 random centers for k means algorithm
        if os.path.isfile('center_json_data.json'):
            pfc = open('center_json_data.json')
            jcenter = json.load(pfc)
            pfc.close()
            centroid1 = jcenter['center1']
            centroid2 = jcenter['center2']
            centroid3 = jcenter['center3']
            centroid4 = jcenter['center4']
        else:
            centroid1v = sum(Svectors) / len(Svectors)
            centroid2v = sum(Svectors) / (len(Svectors) / 2)
            centroid3v = sum(Svectors) / (len(Svectors) / 10)
            centroid4v = sum(Svectors) / (len(Svectors) / 28)
            centroid1 = centroid1v.tolist()
            centroid2 = centroid2v.tolist()
            centroid3 = centroid3v.tolist()
            centroid4 = centroid4v.tolist()
            print(centroid1)
            # creating json format for them to save them later

        lock1 = 0
        lock2 = 0
        lock3 = 0
        lock4 = 0
        loop_no = 0
        while True:
            print('---cluster:---')
            print(len(Cluster1))
            print(len(Cluster2))
            print(len(Cluster3))
            print(len(Cluster4))
            print('----------------')
            print('#######################')
            # recompute each centroid; lock it once it stops moving
            if len(Cluster1) > 0:
                if (centroid1 != (sum(Cluster1) / len(Cluster1))).all():
                    centroidiv1 = sum(Cluster1) / len(Cluster1)
                    centroid1 = centroidiv1.tolist()
                else:
                    lock1 = 1
            else:
                if loop_no > 100:
                    lock1 = 1
            if len(Cluster2) > 0:
                if (centroid2 != (sum(Cluster2) / len(Cluster2))).all():
                    centroidiv2 = sum(Cluster2) / len(Cluster2)
                    centroid2 = centroidiv2.tolist()
                else:
                    lock2 = 1
            else:
                if loop_no > 100:
                    lock2 = 1
            if len(Cluster3) > 0:
                if (centroid3 != (sum(Cluster3) / len(Cluster3))).all():
                    centroidiv3 = sum(Cluster3) / len(Cluster3)
                    centroid3 = centroidiv3.tolist()
                else:
                    lock3 = 1
            else:
                if loop_no > 100:
                    lock3 = 1
            if len(Cluster4) > 0:
                if (centroid4 != (sum(Cluster4) / len(Cluster4))).all():
                    centroidiv4 = sum(Cluster4) / len(Cluster4)
                    centroid4 = centroidiv4.tolist()
                else:
                    lock4 = 1
            else:
                if loop_no > 100:
                    lock4 = 1

            # reassign every vector to its nearest centroid
            Dist1.clear()
            Cluster1.clear()
            Dist2.clear()
            Cluster2.clear()
            Dist3.clear()
            Cluster3.clear()
            Dist4.clear()
            Cluster4.clear()
            for x in Svectors:
                Tdist1 = spatial.distance.cosine(centroid1, x)
                Tdist2 = spatial.distance.cosine(centroid2, x)
                Tdist3 = spatial.distance.cosine(centroid3, x)
                Tdist4 = spatial.distance.cosine(centroid4, x)
                if Tdist1 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist1.append(Tdist1)
                    Cluster1.append(x)
                elif Tdist2 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist2.append(Tdist2)
                    Cluster2.append(x)
                elif Tdist3 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist3.append(Tdist3)
                    Cluster3.append(x)
                elif Tdist4 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist4.append(Tdist4)
                    Cluster4.append(x)
            print('---lock---')
            print(lock1)
            print(lock2)
            print(lock3)
            print(lock4)
            loop_no = loop_no + 1
            if lock1 == 1 and lock2 == 1 and lock3 == 1 and lock4 == 1:
                print('break')
                break

        # persist the final centroids
        json_center = {
            'center1': centroid1,
            'center2': centroid2,
            'center3': centroid3,
            'center4': centroid4,
        }
        with open('center_json_data.json', 'w') as fc:
            json.dump(json_center, fc)

        # fold the previously stored mean distances (if any) into the new averages
        if os.path.isfile('meanDistance_json_data.json'):
            pfd = open('meanDistance_json_data.json')
            jdist = json.load(pfd)
            previous_dist1 = jdist['dist1']
            previous_dist2 = jdist['dist2']
            previous_dist3 = jdist['dist3']
            previous_dist4 = jdist['dist4']
            if previous_dist1 != 0:
                Dist1.append(previous_dist1)
            if previous_dist2 != 0:
                Dist2.append(previous_dist2)
            if previous_dist3 != 0:
                Dist3.append(previous_dist3)
            if previous_dist4 != 0:
                Dist4.append(previous_dist4)

        MeanDist1 = sum(Dist1) / len(Dist1) if len(Dist1) > 0 else 0
        MeanDist2 = sum(Dist2) / len(Dist2) if len(Dist2) > 0 else 0
        MeanDist3 = sum(Dist3) / len(Dist3) if len(Dist3) > 0 else 0
        MeanDist4 = sum(Dist4) / len(Dist4) if len(Dist4) > 0 else 0

        json_MeanDist = {
            'dist1': MeanDist1,
            'dist2': MeanDist2,
            'dist3': MeanDist3,
            'dist4': MeanDist4,
        }
        with open('meanDistance_json_data.json', 'w') as fd:
            json.dump(json_MeanDist, fd)

        context = {'center': 'centroi', 'dist': 'MeanDist'}
    else:
        context = {'filename': '', 'dist': ''}
    return render(request, 'ndex.html', context)