Example #1
def evaluate(request):
    # load the cluster centers and mean distances produced by the training view
    with open('center_json_data.json') as fc:
        jcenter = json.load(fc)
    center1 = jcenter['center1']
    center2 = jcenter['center2']
    center3 = jcenter['center3']
    center4 = jcenter['center4']
    with open('meanDistance_json_data.json') as fd:
        jdist = json.load(fd)
    dist1 = jdist['dist1']
    dist2 = jdist['dist2']
    dist3 = jdist['dist3']
    dist4 = jdist['dist4']

    # embed the submitted text with BERT; bert() expects a list of sentences
    text = request.GET['text']
    vectorizer = Vectorizer()
    vectorizer.bert([text])
    vectors_bert = vectorizer.vectors

    # cosine distance between the text embedding and each cluster center
    Tdist1 = spatial.distance.cosine(center1, vectors_bert[0])
    Tdist2 = spatial.distance.cosine(center2, vectors_bert[0])
    Tdist3 = spatial.distance.cosine(center3, vectors_bert[0])
    Tdist4 = spatial.distance.cosine(center4, vectors_bert[0])

    # the text is flagged as hate speech if it falls within any cluster's mean radius
    if Tdist1 < dist1 or Tdist2 < dist2 or Tdist3 < dist3 or Tdist4 < dist4:
        result = 'hatespeech'
    else:
        result = 'not hatespeech'
    context = {'title': 'evaluating', 'result': result}
    return render(request, 'evaluate.html', context)
Example #2
def test_bert_03():
    sentences = ["401k retirement accounts", "401k retirement accounts"]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    dist = spatial.distance.cosine(vectorizer.vectors[0],
                                   vectorizer.vectors[1])
    # identical sentences should produce (numerically) identical embeddings
    assert abs(dist) < 1e-6
Example #3
def test_bert_01():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding or encoding, or vectorizing.",
    ]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    # DistilBERT sentence embeddings are 768-dimensional
    assert len(vectorizer.vectors[0, :]) == 768
Example #4
def compare_two_sentences(sentence_1, sentence_2):
    sentences = [sentence_1, sentence_2]

    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vec_1, vec_2 = vectorizer.vectors

    dist = spatial.distance.cosine(vec_1, vec_2)
    return dist
Example #5
def test_bert_04():
    sentences = ["401k retirement accounts"]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vec_1 = vectorizer.vectors[0]
    vectorizer.bert(sentences)
    vec_2 = vectorizer.vectors[0]
    dist = spatial.distance.cosine(vec_1, vec_2)
    # re-encoding the same sentence should give (numerically) the same embedding
    assert abs(dist) < 1e-6
Example #6
def test_bert_02():
    sentences = [
        "This is an awesome book to learn NLP.",
        "DistilBERT is an amazing NLP model.",
        "We can interchangeably use embedding, encoding, or vectorizing.",
    ]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    dist_1 = spatial.distance.cosine(vectorizer.vectors[0],
                                     vectorizer.vectors[1])
    dist_2 = spatial.distance.cosine(vectorizer.vectors[0],
                                     vectorizer.vectors[2])
    print('dist_1: {0}, dist_2: {1}'.format(dist_1, dist_2))
    assert dist_1 < dist_2
Example #7
def bert(query, tweetList):
    print("start of BERT")
    # Added for A2 part 1: embed the query followed by every tweet in a single pass.
    vectorizer = Vectorizer()
    queryString = " ".join(query)
    sentences = [queryString]
    sentences.extend(tweetList)
    print("Number of strings being processed " + str(len(sentences)))
    vectorizer.bert(sentences)
    vectors = vectorizer.vectors
    print("end of BERT")
    return vectors
Example #8
def sen_similarity(sen1: str, sen2: str):
    """ Returns the cosine distance between two input sentences.

    :param sen1: first sentence
    :param sen2: second sentence

    :return: distance score; the closer to 0, the more similar the sentences.
    """

    # vectorize the sentences
    vectorizer = Vectorizer()
    vectorizer.bert([sen1, sen2])
    vectors_bert = vectorizer.vectors

    similarity = spatial.distance.cosine(vectors_bert[0], vectors_bert[1])

    return similarity
    
Example #9
class SentenceSimilarity_BERT(SentenceSimilarity_abstract):
    def __init__(self):
        self.vectorizer = Vectorizer()

    # This function computes the similarity between two sentences; the more similar
    # the two sentences are, the lower the computed score.
    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):
        sentences = [sentenceA, sentenceB]

        self.vectorizer.bert(sentences)
        vectors = self.vectorizer.vectors

        embeddingOf_sentenceA = vectors[0]
        embeddingOf_sentenceB = vectors[1]

        distance = spatial.distance.cosine(embeddingOf_sentenceA,
                                           embeddingOf_sentenceB)

        return distance
Example #10
def encodeBERT(docIDArray, Documents):
    # isolating the 1000 chosen documents for a query
    chosenTweetList = []
    tweets = []

    for d in docIDArray:
        tweets.append(Documents[d][0])  # actual tweets

    # running BERT over all tweets in one batch
    vectorizer = Vectorizer()
    vectorizer.bert(tweets)
    vectors = vectorizer.vectors
    # print(vectors[0])

    for i in range(len(docIDArray)):
        chosenTweetList.append([vectors[i].tolist(), docIDArray[i]])  # -> ['berted tweet', doc id]

    return chosenTweetList
Example #11
def test_complete():
    sentences = [
        "Alice is in the Wonderland.",
        "Alice is not in the Wonderland.",
    ]
    vectorizer = Vectorizer()
    vectorizer.bert(sentences)
    vectors_bert = vectorizer.vectors
    dist_bert = spatial.distance.cosine(vectors_bert[0], vectors_bert[1])

    splitter = Splitter()
    splitter.sent2words(sentences=sentences,
                        remove_stop_words=['not'],
                        add_stop_words=[])
    vectorizer.word2vec(splitter.words,
                        pretrained_vectors_path=PRETRAINED_VECTORS_PATH_WIKI)
    vectors_w2v = vectorizer.vectors
    dist_w2v = spatial.distance.cosine(vectors_w2v[0], vectors_w2v[1])

    print('dist_bert: {0}, dist_w2v: {1}'.format(dist_bert, dist_w2v))
    assert dist_w2v > dist_bert
Example #12
class SentenceSimilarity_translationBased(SentenceSimilarity_abstract):
    def __init__(self):
        self.vectorizer = Vectorizer()
        self.translationModel = EasyNMT('opus-mt')
        self.targetLanguage = "en"

    # This function computes the similarity between two sentences; the more similar
    # the two sentences are, the lower the computed score.
    def compute_SentenceToSentence_similarity(self, sentenceA, sentenceB):

        sourceLanguageA = self.translationModel.language_detection(sentenceA)
        translationsA = self.translationModel.translate(
            [sentenceA],
            source_lang=sourceLanguageA,
            target_lang=self.targetLanguage)

        sourceLanguageB = self.translationModel.language_detection(sentenceB)
        translationsB = self.translationModel.translate(
            [sentenceB],
            source_lang=sourceLanguageB,
            target_lang=self.targetLanguage)

        sentences = [translationsA[0], translationsB[0]]

        self.vectorizer.bert(sentences)
        vectors = self.vectorizer.vectors

        embeddingOf_sentenceA = vectors[0]
        embeddingOf_sentenceB = vectors[1]

        print("\nsentenceA \"" + sentenceA + "\" --- sourceLanguageA=" +
              sourceLanguageA + " --- translation = " + translationsA[0])
        print("sentenceB \"" + sentenceB + "\" --- sourceLanguageB=" +
              sourceLanguageB + " --- translation = " + translationsB[0])

        distance = spatial.distance.cosine(embeddingOf_sentenceA,
                                           embeddingOf_sentenceB)

        return distance
Example #13
def k_mean(request):
    message = ''
    c_l_s = []
    sentence = request.POST['sentences']
    k = int(request.POST['k'])  # number of clusters (POST values arrive as strings)
    sepsent = re.split('\n', sentence)
    print(sepsent)

    # embed every sentence with BERT
    vectorizer = Vectorizer()
    vectorizer.bert(sepsent)
    vectors_bert = vectorizer.vectors

    # map each vector back to its sentence, keyed by the sum of its components
    vecTosen_dictionary = {}
    for counter, v in enumerate(vectors_bert):
        vecTosen_dictionary[sum(v)] = sepsent[counter]

    [clusters, x, y] = group_similer_vectors(vectors_bert, k)
    if len(clusters) != 0:
        for cluster in clusters:
            print('################################')
            c = []
            # map each vector in the cluster back to its original sentence
            for vec in cluster.listelement:
                c.append(vecTosen_dictionary[sum(vec)])
            c_l_s.append(c)
    else:
        message += 'Wrong cluster count; it should be less than the number of sentences.'

    context = {
        'clustered_sentences': c_l_s,
        'k': len(c_l_s),
        'msg': message,
        'x': x,
        'y': y,
    }
    return render(request, 'main.html', context)
Example #14
def sent2vec_feature(utterances):
    vectorizer = Vectorizer()
    vectorizer.bert(utterances)
    return vectorizer.vectors
Example #15
for x in range(49): #iterate queries again
    print("starting query...")
    '''
    docidarray = [] #one for each query
    for i in range (1000): #again the top 1000 results
        docid = results[i][0]
        docidarray.append(docid)
    '''
    
    #do the bert encoding
    docVectors = encodeBERT(firstResultsList[x], Documents) #-> [['bert processed tweet', doc id],..]

    #print (numpy.array(docVectors[1][0]))
    #encode query 
    vectorizer = Vectorizer()
    vectorizer.bert(queriesLst[x][1]) #current query
    queryVect = vectorizer.vectors
    newRank = []
    
    for i in range(len(docVectors)):  # compute the cosine distance between the query and each document

        dist = spatial.distance.cosine(queryVect[0], numpy.array(docVectors[i][0]))
        newRank.append([dist, docVectors[i][1]])  # -> appends [cosine distance, doc id]

    #3.rank the docs again based on scores (use sorted() function)
    sortedNewRank = sorted(newRank) 
    newDocRankingList.append(sortedNewRank)
    print(newDocRankingList)

    #4. write to results file
    print(querycount)
Example #16
def chunky(d):
    # group() and pea_pick() are helper functions defined elsewhere in this script
    stat = group(d.split(' '), 4)
    f = []
    for s in stat:
        f.extend(pea_pick(s))
    f = [ff for ff in f if len(ff) > 0]
    pairs = group(f, 2)
    pairs = [[' '.join(pp) for pp in p] for p in pairs]
    return pairs


for sent in data:

    sentences = chunky(sent)

    dist = []
    for p in sentences:
        print(p)
        if len(p) > 1:
            vectorizer = Vectorizer()
            vectorizer.bert(p)
            vectors_bert = vectorizer.vectors

            dist.append(
                spatial.distance.cosine(vectors_bert[0], vectors_bert[1]))
            print(dist[-1])

    avg = int(np.average(dist) * 1000000)

    print("Sentence: " + sent)
    print(avg)
    print()
Example #17
from sent2vec.vectorizer import Vectorizer
from scipy import spatial
import numpy

sentences = [
    "This is an awesome book to learn NLP.",
    "DistilBERT is an amazing NLP model.",
    "We can interchangeably use embedding, encoding, or vectorizing.",
]
#encoding sentences using BERT language model 
vectorizer = Vectorizer()
vectorizer.bert(sentences)
vectors = vectorizer.vectors
newv = []
for i in vectors:
    newv.append(i.tolist())

print(newv[1])

#computing cosine distance vectors. Smaller distance -> greater similarity
dist_1 = spatial.distance.cosine(numpy.array(newv[0]), numpy.array(newv[1]))
dist_2 = spatial.distance.cosine(numpy.array(newv[0]), numpy.array(newv[2]))
print('dist_1: {0}, dist_2: {1}'.format(dist_1, dist_2))
assert dist_1 < dist_2
# dist_1: 0.043, dist_2: 0.192
Example #18
def add(request):

    if request.FILES:
        uploadedFile = request.FILES['fname']
        fs = FileSystemStorage()
        name = fs.save(uploadedFile.name, uploadedFile)
        f = open(fs.path(name), 'rt')
        sentence = f.read()
        # strip '@' signs and 'user' mentions, and normalise separator characters
        sentence = re.sub('@', '', sentence)
        sentence = re.sub('[uU][Ss][Ee][Rr]', '', sentence)
        sentence = re.sub(r'[_\-=%]', ' ', sentence)

        # keep only the text that follows "<digit>," on each line
        sepsent = re.findall(r'\d,+(.*)\n', sentence)
        print(sepsent)
        centroids = []

        # for each cluster we defined separate distance and vector list
        Dist1 = []
        Dist2 = []
        Dist3 = []
        Dist4 = []
        Cluster1 = []
        Cluster2 = []
        Cluster3 = []
        Cluster4 = []

        # showing the progress information on the display
        i = 0

        # take each sentence from the list and calculate its BERT vector
        for x in sepsent:
            progress_percent = round(((i * 100) / len(sepsent)), 2)
            # rough ETA, assuming roughly 7 seconds per sentence
            remained_time_h = int(((7 * len(sepsent)) - (i * 7)) / 3600)
            remained_time_m = int((((7 * len(sepsent)) - (i * 7)) % 3600) / 60)
            print(' ----------------  progress :' + str(progress_percent) +
                  '% ---------remaining time(hh:mm): ' + str(remained_time_h) +
                  ':' + str(remained_time_m) + ' ------',
                  end='\r')
            i = i + 1

            vectorizer = Vectorizer()
            vectorizer.bert([x])  # bert() expects a list of sentences
            vectors_bert = vectorizer.vectors
            centroids.append(vectors_bert[0])
        # initialise 4 centers for the k-means algorithm (reuse saved centers if available)
        if os.path.isfile('center_json_data.json'):
            pfc = open('center_json_data.json')
            jcenter = json.load(pfc)
            pfc.close()
            centroid1 = jcenter['center1']
            centroid2 = jcenter['center2']
            centroid3 = jcenter['center3']
            centroid4 = jcenter['center4']
        else:
            centroid1v = sum(centroids) / len(centroids)
            centroid2v = sum(centroids) / (len(centroids) / 2)
            centroid3v = sum(centroids) / (len(centroids) / 10)
            centroid4v = sum(centroids) / (len(centroids) / 28)
            centroid1 = centroid1v.tolist()
            centroid2 = centroid2v.tolist()
            centroid3 = centroid3v.tolist()
            centroid4 = centroid4v.tolist()

        print(centroid1)

        # convergence flags for each of the 4 centroids
        lock1 = 0
        lock2 = 0
        lock3 = 0
        lock4 = 0

        loop_no = 0

        while True:
            print('---cluster:---')
            print(len(Cluster1))
            print(len(Cluster2))
            print(len(Cluster3))
            print(len(Cluster4))

            print('----------------')

            print('#######################')
            # recompute each centroid from its cluster members; lock a centroid once it stops changing
            if len(Cluster1) > 0:
                if (centroid1 != (sum(Cluster1) / len(Cluster1))).any():
                    centroidiv1 = sum(Cluster1) / len(Cluster1)
                    centroid1 = centroidiv1.tolist()
                else:
                    lock1 = 1
            else:
                if loop_no > 100:
                    lock1 = 1
            if len(Cluster2) > 0:
                if (centroid2 != (sum(Cluster2) / len(Cluster2))).any():
                    centroidiv2 = sum(Cluster2) / len(Cluster2)
                    centroid2 = centroidiv2.tolist()
                else:
                    lock2 = 1
            else:
                if loop_no > 100:
                    lock2 = 1
            if len(Cluster3) > 0:
                if (centroid3 != (sum(Cluster3) / len(Cluster3))).any():
                    centroidiv3 = sum(Cluster3) / len(Cluster3)
                    centroid3 = centroidiv3.tolist()
                else:
                    lock3 = 1
            else:
                if loop_no > 100:
                    lock3 = 1
            if len(Cluster4) > 0:
                if (centroid4 != (sum(Cluster4) / len(Cluster4))).any():
                    centroidiv4 = sum(Cluster4) / len(Cluster4)
                    centroid4 = centroidiv4.tolist()
                else:
                    lock4 = 1
            else:
                if loop_no > 100:
                    lock4 = 1
            Dist1.clear()
            Cluster1.clear()
            Dist2.clear()
            Cluster2.clear()
            Dist3.clear()
            Cluster3.clear()
            Dist4.clear()
            Cluster4.clear()
            for x in centroids:

                Tdist1 = spatial.distance.cosine(centroid1, x)
                Tdist2 = spatial.distance.cosine(centroid2, x)
                Tdist3 = spatial.distance.cosine(centroid3, x)
                Tdist4 = spatial.distance.cosine(centroid4, x)

                if Tdist1 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist1.append(Tdist1)
                    Cluster1.append(x)
                elif Tdist2 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist2.append(Tdist2)
                    Cluster2.append(x)
                elif Tdist3 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist3.append(Tdist3)
                    Cluster3.append(x)
                elif Tdist4 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist4.append(Tdist4)
                    Cluster4.append(x)
            print('---lock---')
            print(lock1)
            print(lock2)
            print(lock3)
            print(lock4)
            loop_no = loop_no + 1
            if lock1 == 1 and lock2 == 1 and lock3 == 1 and lock4 == 1:
                print('break')
                break

        json_center = {
            'center1': centroid1,
            'center2': centroid2,
            'center3': centroid3,
            'center4': centroid4,
        }

        with open('center_json_data.json', 'w') as fc:
            json.dump(json_center, fc)

        if os.path.isfile('meanDistance_json_data.json'):
            with open('meanDistance_json_data.json') as pfd:
                jdist = json.load(pfd)
            previous_dist1 = jdist['dist1']
            previous_dist2 = jdist['dist2']
            previous_dist3 = jdist['dist3']
            previous_dist4 = jdist['dist4']
            if previous_dist1 != 0:
                Dist1.append(previous_dist1)
            if previous_dist2 != 0:
                Dist2.append(previous_dist2)
            if previous_dist3 != 0:
                Dist3.append(previous_dist3)
            if previous_dist4 != 0:
                Dist4.append(previous_dist4)

        if len(Dist1) > 0:
            MeanDist1 = sum(Dist1) / len(Dist1)
        else:
            MeanDist1 = 0
        if len(Dist2) > 0:
            MeanDist2 = sum(Dist2) / len(Dist2)
        else:
            MeanDist2 = 0
        if len(Dist3) > 0:
            MeanDist3 = sum(Dist3) / len(Dist3)
        else:
            MeanDist3 = 0
        if len(Dist4) > 0:
            MeanDist4 = sum(Dist4) / len(Dist4)
        else:
            MeanDist4 = 0

        json_MeanDist = {
            'dist1': MeanDist1,
            'dist2': MeanDist2,
            'dist3': MeanDist3,
            'dist4': MeanDist4,
        }
        with open('meanDistance_json_data.json', 'w') as fd:
            json.dump(json_MeanDist, fd)

        f.close()
        fs.delete(name)
        context = {'center': 'centroi', 'dist': 'MeanDist'}
    else:
        context = {'filename': '', 'dist': ''}
    return render(request, 'ndex.html', context)
Example #19
def findDistribution(N=0.3, length=1000, sampled=400):
  sentences = []
  for s in df['Sentence']:
    sentences.append(s)
    if(len(sentences)==length):
      break

  print("[INFO] No of sentences= "+str(len(sentences)))
  vectorizer = Vectorizer()
  vectorizer.bert(sentences)
  vectors_bert = vectorizer.vectors
  data=[]

  # pairwise cosine distances between all pairs of sentence embeddings
  for i in range(length):
    for j in range(i+1,length):
      dist = spatial.distance.cosine(vectors_bert[i], vectors_bert[j])
      data.append([i, j, dist])  # node ids kept 0-based to match the graph built below
    if(((i+1)/length * 100)%10==0):
      print(str((i+1)/length * 100)+" % done")
  data_sorted=sorted(data,key=l2,reverse=True)  # l2 is a sort-key helper defined elsewhere

  G = snap.TUNGraph.New()
  for i in range(length):
    G.AddNode(i)

  val=int(length*N)
  for i in range (val):
    G.AddEdge(data_sorted[i][0],data_sorted[i][1]) 


  PRankH = G.GetPageRank()

  adj=dict()
  for i in G.Nodes():
    adj[i.GetId()]=[]

  for id in G.Nodes():
    i=id.GetId()
    for w in id.GetOutEdges():
      adj[i].append(w)

  pagerank=dict()
  for item in PRankH:
      pagerank[item]= PRankH[item]

  final=[]
  while(len(final)<sampled):
    pr_list=makelist(pagerank)
    pr_list=sort(pr_list)
    val=pr_list[0][0]
    for u in adj[val]:
      if u in pagerank:
        pagerank[u]*=0.8
    pagerank.pop(val)
    final.append(val) 

  counts=dict()
  for i in range(7):
    counts[i]=0
  for i in final:
    counts[df_label.iloc[i,1]]+=1

  return counts
Example #20
        continue
    # create a compounded text description
    str_raw = manga.title + ". " + manga.description
    if "description" in manga.external_al:
        str_raw = str_raw + manga.external_al["description"]
    # loop through all the related and append those descriptions also
    for related in manga.related:
        if related["id"] in data2mangadexid:
            str_raw = str_raw + manga_data[data2mangadexid[
                related["id"]]].description
            break
    # only encode if the description is long enough
    if len(manga.description) > min_desc_chars:
        # encode the description
        vectorizer = Vectorizer()
        vectorizer.bert([manga_utils.clean_string(str_raw, False)])
        # append encoding to the corpus
        corpus.append((idx, vectorizer.vectors[0]))
        id2corups[idx] = counter
        corups2id[counter] = idx
        counter = counter + 1
    # print out how far we are
    if idx % 10 == 0:
        print(
            str(round(100 * float(idx) / len(manga_data), 2)) +
            "% -> encoding completed")

# write the BERT vectors to file
file = open(bert_corpus_file, 'wb')
data = {
    "corpus": corpus,
Example #21
from sent2vec.vectorizer import Vectorizer
import pandas as pd
import time

# setup
sentence_data = pd.read_csv("./data/tasks/sentence_correction/task_data.csv")
whole_sentences = []

if __debug__:
    print(sentence_data.columns)
    start_time = time.time()

# each "row" contains its "values" as list item
# save corrected sentences to "whole_sentences"
for row, values in sentence_data.iterrows():
    whole_sentences.append(values.iloc[2].format(values.iloc[3].strip("{}")))

sentence_data["sentence_corpus"] = whole_sentences

# create vectorized items and save them as list
vectorizer = Vectorizer()
vectorizer.bert(sentence_data["sentence_corpus"])
sentence_data["sentence_vectors"] = vectorizer.vectors.tolist()

if __debug__:
    print(sentence_data.index)
    end_time = time.time() - start_time
    print(end_time)

sentence_data.to_pickle("pickled_sentences")