def evaluate(request):
    fc = open('center_json_data.json')
    jcenter = json.load(fc)
    center1 = jcenter['center1']
    center2 = jcenter['center2']
    center3 = jcenter['center3']
    center4 = jcenter['center4']

    fd = open('meanDistance_json_data.json')
    jdist = json.load(fd)
    dist1 = jdist['dist1']
    dist2 = jdist['dist2']
    dist3 = jdist['dist3']
    dist4 = jdist['dist4']

    text = request.GET['text']
    vectorizer = Vectorizer()
    vectorizer.bert(text)
    vectors_bert = vectorizer.vectors

    Tdist1 = spatial.distance.cosine(center1, vectors_bert[0])
    Tdist2 = spatial.distance.cosine(center2, vectors_bert[0])
    Tdist3 = spatial.distance.cosine(center3, vectors_bert[0])
    Tdist4 = spatial.distance.cosine(center4, vectors_bert[0])

    result = ''
    if Tdist1 < dist1:
        result = 'hatespeech'
    elif Tdist2 < dist2:
        result = 'hatespeech'
    elif Tdist3 < dist3:
        result = 'hatespeech'
    elif Tdist4 < dist4:
        result = 'hatespeech'
    else:
        result = 'not hatespeech'

    context = {'title': 'evaluating', 'result': result}
    return render(request, 'evaluate.html', context)
def test_bert_03():
    sentences = ["401k retirement accounts", "401k retirement accounts"]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    dist = spatial.distance.cosine(vectorizer.vectors[0], vectorizer.vectors[1])
    assert dist == 0
def test_bert_01():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding or encoding, or vectorizing.",
    ]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    assert len(vectorizer.vectors[0, :]) == 768
def compare_two_sentences(sentence_1, sentence_2):
    sentences = [sentence_1, sentence_2]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vec_1, vec_2 = vectorizer.vectors
    dist = spatial.distance.cosine(vec_1, vec_2)
    return dist
def test_bert_04():
    sentences = ["401k retirement accounts"]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vec_1 = vectorizer.vectors[0]
    vectorizer.bert(sentences)
    vec_2 = vectorizer.vectors[0]
    dist = spatial.distance.cosine(vec_1, vec_2)
    assert dist == 0
def test_bert_02():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    dist_1 = spatial.distance.cosine(vectorizer.vectors[0], vectorizer.vectors[1])
    dist_2 = spatial.distance.cosine(vectorizer.vectors[0], vectorizer.vectors[2])
    print('dist_1: {0}, dist_2: {1}'.format(dist_1, dist_2))
    assert dist_1 < dist_2
def bert(query, tweetList):
    print("start of BERT")  ## Added for A2 part 1.
    vectorizer = Vectorizer()
    queryString = ""
    for word in query:
        queryString = queryString + " " + word
    queryString = [queryString]
    queryString.extend(tweetList)
    print("Number of strings being processed " + str(len(queryString)))
    vectorizer.bert(queryString)
    vectors = vectorizer.vectors
    print("end of BERT")
    return vectors
def sen_similarity(sen1: str, sen2: str):
    """
    Returns the cosine distance between two input sentences.

    :param sen1: first sentence
    :param sen2: second sentence
    :return: cosine distance between the two sentence embeddings; the closer
        to 0, the more similar the sentences are.
    """
    # vectorize the sentences
    vectorizer = Vectorizer()
    vectorizer.bert([sen1, sen2])
    vectors_bert = vectorizer.vectors
    similarity = spatial.distance.cosine(vectors_bert[0], vectors_bert[1])
    return similarity
class SentenceSimilarity_BERT(SentenceSimilarity_abstract):

    def __init__(self):
        self.vectorizer = Vectorizer()

    # This function computes the similarity between two sentences; the more
    # similar the two sentences are, the lower the computed score is.
    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):
        sentences = [sentenceA, sentenceB]
        self.vectorizer.bert(sentences)
        vectors = self.vectorizer.vectors
        embeddingOf_sentenceA = vectors[0]
        embeddingOf_sentenceB = vectors[1]
        distance = spatial.distance.cosine(embeddingOf_sentenceA, embeddingOf_sentenceB)
        return distance
def encodeBERT(docIDArray, Documents):
    # isolating the 1000 chosen documents for a query
    chosenTweetList = []
    tweets = []
    for d in docIDArray:
        tweets.append(Documents[d][0])  # actual tweets

    # running BERT
    vectorizer = Vectorizer()
    vectorizer.bert(tweets)
    vectors = vectorizer.vectors
    # print(vectors[0])

    for i in range(len(docIDArray)):
        chosenTweetList.append([vectors[i].tolist(), docIDArray[i]])  # -> ['berted tweet', doc id]
    return chosenTweetList
def test_complete():
    sentences = [
        "Alice is in the Wonderland.",
        "Alice is not in the Wonderland.",
    ]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vectors_bert = vectorizer.vectors
    dist_bert = spatial.distance.cosine(vectors_bert[0], vectors_bert[1])

    splitter = Splitter()
    splitter.sent2words(sentences=sentences, remove_stop_words=['not'], add_stop_words=[])
    vectorizer.word2vec(splitter.words, pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    vectors_w2v = vectorizer.vectors
    dist_w2v = spatial.distance.cosine(vectors_w2v[0], vectors_w2v[1])

    print('dist_bert: {0}, dist_w2v: {1}'.format(dist_bert, dist_w2v))
    assert dist_w2v > dist_bert
class SentenceSimilarity_translationBased(SentenceSimilarity_abstract):

    def __init__(self):
        self.vectorizer = Vectorizer()
        self.translationModel = EasyNMT('opus-mt')
        self.targetLanguage = "en"

    # This function computes the similarity between two sentences; the more
    # similar the two sentences are, the lower the computed score is.
    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):
        sourceLanguageA = self.translationModel.language_detection(sentenceA)
        translationsA = self.translationModel.translate(
            [sentenceA], source_lang=sourceLanguageA, target_lang=self.targetLanguage)

        sourceLanguageB = self.translationModel.language_detection(sentenceB)
        translationsB = self.translationModel.translate(
            [sentenceB], source_lang=sourceLanguageB, target_lang=self.targetLanguage)

        sentences = [translationsA[0], translationsB[0]]
        self.vectorizer.bert(sentences)
        vectors = self.vectorizer.vectors
        embeddingOf_sentenceA = vectors[0]
        embeddingOf_sentenceB = vectors[1]

        print("\nsentenceA \"" + sentenceA + "\" --- sourceLanguageA=" + sourceLanguageA + " --- translation = " + translationsA[0])
        print("sentenceB \"" + sentenceB + "\" --- sourceLanguageB=" + sourceLanguageB + " --- translation = " + translationsB[0])

        distance = spatial.distance.cosine(embeddingOf_sentenceA, embeddingOf_sentenceB)
        return distance
def k_mean(request):
    message = ''
    c_l_s = []
    sentence = request.POST['sentences']
    k = request.POST['k']
    sepsent = re.split('\n', sentence)
    print(sepsent)

    vectorizer = Vectorizer()
    vectorizer.bert(sepsent)
    vectors_bert = vectorizer.vectors

    # map the element sum of each vector back to its sentence so clustered
    # vectors can be translated back into the original sentences
    vecTosen_dictionary = {}
    counter = 0
    for v in vectors_bert:
        vecTosen_dictionary[sum(v)] = sepsent[counter]
        counter = counter + 1

    [clusters, x, y] = group_similer_vectors(vectors_bert, k)
    if len(clusters) != 0:
        for cluster in clusters:
            print('################################')
            c = []
            # use a local name that does not shadow the y returned above
            for vec in cluster.listelement:
                c.append(vecTosen_dictionary[sum(vec)])
            c_l_s.append(c)
    else:
        message += 'wrong cluster number; it should be less than the number of sentences'

    context = {
        'clustered_sentences': c_l_s,
        'k': len(c_l_s),
        'msg': message,
        'x': x,
        'y': y,
    }
    return render(request, 'main.html', context)
def sent2vec_feature(utterances):
    vectorizer = Vectorizer()
    vectorizer.bert(utterances)
    return vectorizer.vectors
for x in range(49):  # iterate queries again
    print("starting query...")
    '''
    docidarray = []  # one for each query
    for i in range(1000):  # again the top 1000 results
        docid = results[i][0]
        docidarray.append(docid)
    '''
    # do the BERT encoding
    docVectors = encodeBERT(firstResultsList[x], Documents)  # -> [['bert processed tweet', doc id], ...]
    # print(numpy.array(docVectors[1][0]))

    # encode query
    vectorizer = Vectorizer()
    vectorizer.bert(queriesLst[x][1])  # current query
    queryVect = vectorizer.vectors

    newRank = []
    for i in range(len(docVectors)):
        # calculate the cosine distance between the query vector and this document vector
        dist = spatial.distance.cosine(queryVect[0], numpy.array(docVectors[i][0]))
        newRank.append([dist, docVectors[i][1]])  # -> appends [similarity distance, doc id]

    # 3. rank the docs again based on scores (use sorted() function)
    sortedNewRank = sorted(newRank)
    newDocRankingList.append(sortedNewRank)
    print(newDocRankingList)

    # 4. write to results file
    print(querycount)
# NOTE: the lines above `return pairs` are the tail of a chunking helper whose
# definition is not included in this snippet.
    stat = group(d.split(' '), 4)
    f = []
    for s in stat:
        f.extend(pea_pick(s))
    f = [ff for ff in f if len(ff) > 0]
    pairs = group(f, 2)
    pairs = [[' '.join(pp) for pp in p] for p in pairs]
    return pairs

for sent in data:
    sentences = chunky(sent)
    dist = []
    for p in sentences:
        print(p)
        if len(p) > 1:
            vectorizer = Vectorizer()
            vectorizer.bert(p)
            vectors_bert = vectorizer.vectors
            dist.append(
                spatial.distance.cosine(vectors_bert[0], vectors_bert[1]))
            print(dist[-1])
    avg = int(np.average(dist) * 1000000)
    print("Sentence: " + sent)
    print(avg)
    print()
from sent2vec.vectorizer import Vectorizer
from scipy import spatial
from array import *
import numpy

sentences = [
    "This is an awesome book to learn NLP.",
    "DistilBERT is an amazing NLP model.",
    "We can interchangeably use embedding, encoding, or vectorizing.",
]

# encoding sentences using the BERT language model
vectorizer = Vectorizer()
vectorizer.bert(sentences)
vectors = vectorizer.vectors

newv = []
for i in vectors:
    newv.append(i.tolist())
print(newv[1])

# computing cosine distance between vectors. Smaller distance -> greater similarity
dist_1 = spatial.distance.cosine(numpy.array(newv[0]), numpy.array(newv[1]))
dist_2 = spatial.distance.cosine(numpy.array(newv[0]), numpy.array(newv[2]))
print('dist_1: {0}, dist_2: {1}'.format(dist_1, dist_2))
assert dist_1 < dist_2
# dist_1: 0.043, dist_2: 0.192
def add(request):
    if request.FILES:
        uploadedFile = request.FILES['fname']
        fs = FileSystemStorage()
        name = fs.save(uploadedFile.name, uploadedFile)
        f = open(fs.path(name), 'rt')
        sentenceRR = f.read()
        sentenceR = re.sub('@', '', sentenceRR)
        sentencee = re.sub('[uU][Ss][Ee][Rr]', '', sentenceR)
        sentenceee = re.sub('_', ' ', sentencee)
        sentenceeee = re.sub('-', ' ', sentenceee)
        sentenceeeee = re.sub('=', ' ', sentenceeee)
        sentence = re.sub('%', ' ', sentenceeeee)
        sepsent = re.findall('\d,+(.*)\n', sentence)
        print(sepsent)
        i = 1
        centroids = []

        # for each cluster we defined separate distance and vector lists
        Dist1 = []
        Dist2 = []
        Dist3 = []
        Dist4 = []
        Cluster1 = []
        Cluster2 = []
        Cluster3 = []
        Cluster4 = []

        # showing the progress information on the display
        i = 0
        # we take each sentence from the list and calculate its BERT vector representation
        for x in sepsent:
            progress_percent = round(((i * 100) / len(sepsent)), 2)
            remained_time_h = int(((7 * len(sepsent)) - (i * 7)) / 3600)
            remained_time_m = ((7 * len(sepsent)) - (i * 7)) % 3600
            print(' ---------------- progress :' + str(progress_percent) + '% ---------remaining time(hh:mm): ' + str(remained_time_h) + ':' + str(remained_time_m) + ' ------', end='\r')
            i = i + 1
            vectorizer = Vectorizer()
            vectorizer.bert(x)
            vectors_bert = vectorizer.vectors
            centroids.append(vectors_bert[0])
        print

        # we took 4 random centers for the k-means algorithm
        if os.path.isfile('center_json_data.json'):
            pfc = open('center_json_data.json')
            jcenter = json.load(pfc)
            pfc.close()
            centroid1 = jcenter['center1']
            centroid2 = jcenter['center2']
            centroid3 = jcenter['center3']
            centroid4 = jcenter['center4']
        else:
            centroid1v = sum(centroids) / len(centroids)
            centroid2v = sum(centroids) / (len(centroids) / 2)
            centroid3v = sum(centroids) / (len(centroids) / 10)
            centroid4v = sum(centroids) / (len(centroids) / 28)
            centroid1 = centroid1v.tolist()
            centroid2 = centroid2v.tolist()
            centroid3 = centroid3v.tolist()
            centroid4 = centroid4v.tolist()
        print(centroid1)

        # creating json format for them to save them later
        lock1 = 0
        lock2 = 0
        lock3 = 0
        lock4 = 0
        loop_no = 0
        while True:
            print('---cluster:---')
            print(len(Cluster1))
            print(len(Cluster2))
            print(len(Cluster3))
            print(len(Cluster4))
            print('----------------')
            print('#######################')
            if len(Cluster1) > 0:
                if (centroid1 != (sum(Cluster1) / len(Cluster1))).all():
                    centroidiv1 = sum(Cluster1) / len(Cluster1)
                    centroid1 = centroidiv1.tolist()
                else:
                    lock1 = 1
            else:
                if loop_no > 100:
                    lock1 = 1
            if len(Cluster2) > 0:
                if (centroid2 != (sum(Cluster2) / len(Cluster2))).all():
                    centroidiv2 = sum(Cluster2) / len(Cluster2)
                    centroid2 = centroidiv2.tolist()
                else:
                    lock2 = 1
            else:
                if loop_no > 100:
                    lock2 = 1
            if len(Cluster3) > 0:
                if (centroid3 != (sum(Cluster3) / len(Cluster3))).all():
                    centroidiv3 = sum(Cluster3) / len(Cluster3)
                    centroid3 = centroidiv3.tolist()
                else:
                    lock3 = 1
            else:
                if loop_no > 100:
                    lock3 = 1
            if len(Cluster4) > 0:
                if (centroid4 != (sum(Cluster4) / len(Cluster4))).all():
                    centroidiv4 = sum(Cluster4) / len(Cluster4)
                    centroid4 = centroidiv4.tolist()
                else:
                    lock4 = 1
            else:
                if loop_no > 100:
                    lock4 = 1

            Dist1.clear()
            Cluster1.clear()
            Dist2.clear()
            Cluster2.clear()
            Dist3.clear()
            Cluster3.clear()
            Dist4.clear()
            Cluster4.clear()

            for x in centroids:
                Tdist1 = spatial.distance.cosine(centroid1, x)
                Tdist2 = spatial.distance.cosine(centroid2, x)
                Tdist3 = spatial.distance.cosine(centroid3, x)
                Tdist4 = spatial.distance.cosine(centroid4, x)
                if Tdist1 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist1.append(Tdist1)
                    Cluster1.append(x)
                elif Tdist2 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist2.append(Tdist2)
                    Cluster2.append(x)
                elif Tdist3 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist3.append(Tdist3)
                    Cluster3.append(x)
                elif Tdist4 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist4.append(Tdist4)
                    Cluster4.append(x)

            print('---lock---')
            print(lock1)
            print(lock2)
            print(lock3)
            print(lock4)
            loop_no = loop_no + 1
            if lock1 == 1 and lock2 == 1 and lock3 == 1 and lock4 == 1:
                print('break')
                break

        json_center = {
            'center1': centroid1,
            'center2': centroid2,
            'center3': centroid3,
            'center4': centroid4,
        }
        with open('center_json_data.json', 'w') as fc:
            json.dump(json_center, fc)
        fc.close()

        if os.path.isfile('meanDistance_json_data.json'):
            pfd = open('meanDistance_json_data.json')
            jdist = json.load(pfd)
            previous_dist1 = jdist['dist1']
            previous_dist2 = jdist['dist2']
            previous_dist3 = jdist['dist3']
            previous_dist4 = jdist['dist4']
            if previous_dist1 != 0:
                Dist1.append(previous_dist1)
            if previous_dist2 != 0:
                Dist2.append(previous_dist2)
            if previous_dist3 != 0:
                Dist3.append(previous_dist3)
            if previous_dist4 != 0:
                Dist4.append(previous_dist4)

        if len(Dist1) > 0:
            MeanDist1 = sum(Dist1) / len(Dist1)
        else:
            MeanDist1 = 0
        if len(Dist2) > 0:
            MeanDist2 = sum(Dist2) / len(Dist2)
        else:
            MeanDist2 = 0
        if len(Dist3) > 0:
            MeanDist3 = sum(Dist3) / len(Dist3)
        else:
            MeanDist3 = 0
        if len(Dist4) > 0:
            MeanDist4 = sum(Dist4) / len(Dist4)
        else:
            MeanDist4 = 0

        json_MeanDist = {
            'dist1': MeanDist1,
            'dist2': MeanDist2,
            'dist3': MeanDist3,
            'dist4': MeanDist4,
        }
        with open('meanDistance_json_data.json', 'w') as fd:
            json.dump(json_MeanDist, fd)
        fd.close()

        f.close()
        fs.delete(name)
        context = {'center': 'centroi', 'dist': 'MeanDist'}
    else:
        context = {'filename': '', 'dist': ''}
    return render(request, 'ndex.html', context)
def findDistribution(N=0.3, length=1000, sampled=400):
    sentences = []
    for s in df['Sentence']:
        sentences.append(s)
        if len(sentences) == length:
            break
    print("[INFO] No of sentences= " + str(len(sentences)))

    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vectors_bert = vectorizer.vectors

    data = []
    for i in range(length):
        for j in range(i + 1, length):
            dist = spatial.distance.cosine(vectors_bert[i], vectors_bert[j])
            # keep node ids 0-based so they match the nodes added to the graph
            # below and the positional lookup into df_label at the end
            data.append([i, j, dist])
        if ((i + 1) / length * 100) % 10 == 0:
            print(str((i + 1) / length * 100) + " % done")
    data_sorted = sorted(data, key=l2, reverse=True)

    G = snap.TUNGraph.New()
    for i in range(length):
        G.AddNode(i)
    val = int(length * N)
    for i in range(val):
        G.AddEdge(data_sorted[i][0], data_sorted[i][1])
    PRankH = G.GetPageRank()

    adj = dict()
    for i in G.Nodes():
        adj[i.GetId()] = []
    for id in G.Nodes():
        i = id.GetId()
        for w in id.GetOutEdges():
            adj[i].append(w)

    pagerank = dict()
    for item in PRankH:
        pagerank[item] = PRankH[item]

    final = []
    while len(final) < sampled:
        pr_list = makelist(pagerank)
        pr_list = sort(pr_list)
        val = pr_list[0][0]
        for u in adj[val]:
            if u in pagerank:
                pagerank[u] *= 0.8
        pagerank.pop(val)
        final.append(val)

    counts = dict()
    for i in range(7):
        counts[i] = 0
    for i in final:
        counts[df_label.iloc[i, 1]] += 1
    return counts
# NOTE: excerpt from the body of a loop over manga_data; the loop header and
# the condition guarding this `continue` are not part of the snippet.
        continue

    # create a compounded text description
    str_raw = manga.title + ". " + manga.description
    if "description" in manga.external_al:
        str_raw = str_raw + manga.external_al["description"]

    # loop through all the related entries and append those descriptions also
    for related in manga.related:
        if related["id"] in data2mangadexid:
            str_raw = str_raw + manga_data[data2mangadexid[related["id"]]].description
            break

    # only append if the description is long enough
    if len(manga.description) > min_desc_chars:

        # encode the description
        vectorizer = Vectorizer()
        vectorizer.bert([manga_utils.clean_string(str_raw, False)])

        # append encoding to the corpus
        corpus.append((idx, vectorizer.vectors[0]))
        id2corups[idx] = counter
        corups2id[counter] = idx
        counter = counter + 1

    # print out how far we are
    if idx % 10 == 0:
        print(str(round(100 * float(idx) / len(manga_data), 2)) + "% -> encoding completed")

# write the BERT vectors to file
file = open(bert_corpus_file, 'wb')
data = {
    "corpus": corpus,
from sent2vec.vectorizer import Vectorizer
import pandas as pd
import time

# setup
sentence_data = pd.read_csv("./data/tasks/sentence_correction/task_data.csv")
whole_sentences = []
if __debug__:
    print(sentence_data.columns)

start_time = time.time()

# each "row" contains its "values" as a list item;
# save corrected sentences to "whole_sentences"
for row, values in sentence_data.iterrows():
    whole_sentences.append(values[2].format(values[3].strip("{}")))
sentence_data["sentence_corpus"] = whole_sentences

# create vectorized items and save them as a list
vectorizer = Vectorizer()
vectorizer.bert(sentence_data["sentence_corpus"])
sentence_data["sentence_vectors"] = vectorizer.vectors.tolist()

if __debug__:
    print(sentence_data.index)

end_time = time.time() - start_time
print(end_time)

sentence_data.to_pickle("pickled_sentences")