Example #1
def run(sub_task=1):
    documents = [
        "Éste texto no tiene nada que ver con los demás",
        "La plata fue entregada en camiones color plata",
        "El cargamento de oro llegó en un camión. El cargamento de oro llegó en un camión. El cargamento de oro llegó en un camión",
        "Cargamentos de oro dañados por el fuego",
        "El cargamento de oro llegó en un camión"
    ]
    
    query = ["oro plata camión"]
    if sub_task <= 2:
        text_vectorizer, text_vector = vectorizer.vectorize(documents)
        query_vector = text_vectorizer.transform(query)
        
        if sub_task == 1:
            distances = np.array([np.linalg.norm(text_vector[i].toarray() - query_vector.toarray()) for i in range(text_vector.shape[0])])
        elif sub_task == 2:
            distances = np.array([cosine_distance(text_vector[i].toarray()[0], query_vector.toarray()[0]) for i in range(text_vector.shape[0])])
    elif sub_task >= 3:
        if sub_task == 3:
            text_vectorizer, text_vector = vectorizer.vectorize(documents, stop_words=stopwords.spanish)
        elif sub_task == 4:
            text_vectorizer, text_vector = vectorizer.vectorize(documents, stop_words=stopwords.spanish, tokenizer=SpanishTokenizer())
        elif sub_task == 5:
            text_vectorizer, text_vector = vectorizer.tf_idf_vectorize(documents, stop_words=stopwords.spanish, tokenizer=SpanishTokenizer())
            
        query_vector = text_vectorizer.transform(query)
        
        distances = np.array([cosine_distance(text_vector[i].toarray()[0], query_vector.toarray()[0]) for i in range(text_vector.shape[0])])
    
    min_distance = np.argmin(distances)
    
    print("Documento mas parecido: {0}.\nDistancia: {1}\nTexto del documento:\n{2}".format(min_distance, np.amin(distances), documents[min_distance]))
Example #2
def Clustering(orig, minclusters, maxclusters):
    '''returns (distortion score, number of clusters, cluster assignment)'''

    # perform clustering
    clusterer = GAAClusterer()
    clusterer.cluster(orig)
    vrc = []

    # calculate distortions
    wb = len(orig)
    centroid = numpy.mean(orig, axis=0)
    for vector in orig:
        wb -= cosine_distance(vector, centroid)
    lowerbound = max(minclusters, 2)
    for k in range(lowerbound, maxclusters + 1):
        clusterer.update_clusters(k)
        gaac = []
        ww = len(orig)
        for vector in orig:
            maxcos = None
            for j in range(k):
                clust = clusterer._centroids[j]
                cdist = cosine_distance(vector, clust)
                if not maxcos or cdist > maxcos[0]:
                    maxcos = (cdist, j)
            ww -= maxcos[0]
            gaac.append(maxcos[1])
        vrc.append(((wb / (k - 1)) / (ww / (len(orig) - k)), k, gaac))
    khat = (float("inf"), vrc[0][1], vrc[0][2])
    for k in range(1, len(vrc) - 1):
        dist = (vrc[k + 1][0] - vrc[k][0]) - (vrc[k][0] - vrc[k - 1][0])
        if dist < khat[0]:
            khat = (dist, vrc[k][1], vrc[k][2])

    return khat
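A minimal, hypothetical driver for Clustering above, assuming GAAClusterer and cosine_distance are imported from nltk.cluster; the toy vectors are illustrative only:

import numpy
from nltk.cluster import GAAClusterer
from nltk.cluster.util import cosine_distance

# search k in [2, 4] over six toy 2-D vectors and return the knee of the VRC curve
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
score, k, assignment = Clustering(vectors, 2, 4)
print(score, k, assignment)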
Example #3
File: gaac.py  Project: DrDub/nltk
    def classify_vectorspace(self, vector):
        best = None
        for i in range(self._num_clusters):
            centroid = self._centroids[i]
            dist = cosine_distance(vector, centroid)
            if not best or dist < best[0]:
                best = (dist, i)
        return best[1]
Example #4
    def sentence_similarity(sent1, sent2, stopwords):

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)
Example #5
def sentence_similarity(s1, s2, stopwords=None):
    if stopwords is None:
        stopwords = []

    s1 = [a.lower() for a in s1]
    s2 = [a.lower() for a in s2]

    all_words = list(set(s1 + s2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    for w in s1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1

    for w in s2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)
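Examples #4 and #5 (and several near-identical snippets below) share the same bag-of-words pattern; a small worked call with illustrative token lists:

tokens1 = ["the", "cat", "sat", "on", "the", "mat"]
tokens2 = ["a", "cat", "lay", "on", "a", "mat"]
# only "cat" and "mat" overlap among the three counted words per sentence,
# so the cosine similarity is 2/3, roughly 0.67
print(sentence_similarity(tokens1, tokens2, stopwords=["the", "a", "on"]))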
Example #6
def __sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1

    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)
Example #7
def finding_similarity(line1, line2, stop=None):
    if stop is None:
        stop = list()

    line1 = [word.lower() for word in line1]
    line2 = [word.lower() for word in line2]

    all_words = list(set(line1 + line2))

    check1 = [0] * len(all_words)
    check2 = [0] * len(all_words)

    for word in line1:
        if word in stop:
            continue
        check1[all_words.index(word)] += 1

    for word in line2:
        if word in stop:
            continue
        check2[all_words.index(word)] += 1

    return 1 - cosine_distance(check1, check2)
Example #8
    def sentence_similarity(self,
                            vector1,
                            vector2,
                            id_1,
                            id_2,
                            text_words_count,
                            stopwords=None):
        r = vector1.shape[1]

        vector1 = np.array(np.reshape(vector1, (r, 1)))
        vector2 = np.array(np.reshape(vector2, (r, 1)))

        v1 = [vector1[i][0] for i in range(r)]
        v2 = [vector2[i][0] for i in range(r)]

        wc = text_words_count[-1]
        # print(sum(v1), sum(v2))
        if sum(v1) == 0. or sum(v2) == 0.:
            return abs(text_words_count[id_2] -
                       text_words_count[id_1]) * 0.25 / wc
        else:
            return (1 - cosine_distance(v1, v2)) * 0.75 + abs(
                text_words_count[id_2] - text_words_count[id_1]) * 0.25 / wc
Example #9
    def similar_sentence(self, sent1, sent2):
        if self.stop_words is None:
            self.stop_words = list()

        sent1 = list(map(lambda d: d.lower(), sent1))
        sent2 = list(map(lambda m: m.lower(), sent2))

        total_sentence = list(set(sent1 + sent2))

        vect1 = [0] * len(total_sentence)
        vect2 = [0] * len(total_sentence)

        for e in sent1:
            if e in self.stop_words:
                continue
            vect1[total_sentence.index(e)] += 1

        for m in sent2:
            if m in self.stop_words:
                continue
            vect2[total_sentence.index(m)] += 1

        return 1 - cosine_distance(vect1, vect2)
Example #10
def sentence_similarity(sent1, sent2):
    """
    Compute the similarity between two sentences.
    :param sent1:
    :param sent2:
    :return:
    """
    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    for word in sent1:
        vector1[all_words.index(word)] += 1

    for word in sent2:
        vector2[all_words.index(word)] += 1
    # print(sent1)
    # print(sent2)
    # print('vector1:{}'.format(vector1))
    # print('vector2:{}'.format(vector2))
    # a larger cosine_distance means less similar
    return 1 - cosine_distance(vector1, vector2)
Example #11
File: gaac.py  Project: DrDub/nltk
    def cluster_vectorspace(self, vectors, trace=False):
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1]*N
        cluster_count = N
        index_map = numpy.arange(N)

        # construct the similarity matrix
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i+1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i]+cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j+1:] -= 1
            index_map[j] = N

        self.update_clusters(self._num_clusters)
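The method above is part of NLTK's GAAClusterer (group-average agglomerative clustering); a short sketch of how the clusterer is typically driven, with toy vectors:

import numpy
from nltk.cluster import GAAClusterer

vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
clusterer = GAAClusterer(2)
assignments = clusterer.cluster(vectors, assign_clusters=True)
print(assignments)               # one cluster index per input vector
print(clusterer.num_clusters())  # 2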
Example #12
    def cluster_vectorspace(self, vectors, trace=False):
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1] * N
        cluster_count = N
        index_map = numpy.arange(N)

        # construct the similarity matrix
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i + 1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i] + cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j + 1 :] -= 1
            index_map[j] = N

        self.update_clusters(self._num_clusters)
Example #13
def sentence_similarity(sentence1, sentence2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sentence1 = [word.lower() for word in sentence1]
    sentence2 = [word.lower() for word in sentence2]

    all_words = list(set(sentence1 + sentence2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # building vector for the first sentence
    for word in sentence1:
        if word in stopwords:
            continue
        vector1[all_words.index(word)] += 1

    # building vector for the second sentence
    for word in sentence2:
        if word in stopwords:
            continue
        vector2[all_words.index(word)] += 1

    return 1 - cosine_distance(vector1, vector2)
Example #14
    def get_sentence_similarity(s1, s2):
        stop_words = stopwords.words('english')

        s1 = [word.lower() for word in s1]
        s2 = [word.lower() for word in s2]

        all_words = list(set(s1 + s2))

        v1 = [0] * len(all_words)
        v2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in s1:
            if w in stop_words:
                continue
            v1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in s2:
            if w in stop_words:
                continue
            v2[all_words.index(w)] += 1

        return 1 - cosine_distance(v1, v2)
Example #15
def sentence_similarity(sent1, sent2, stopwords=None):

    #use empty list if stopwords aren't present for given language
    if stopwords is None:
        stopwords = []

    #convert the words to lowercase so they match the nltk stopword list
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    #remove redundant words
    all_words = list(set(sent1 + sent2))

    #initialize vector for sent1 and sent2
    #a numpy array can also be used by making further alterations in the code
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # build the vector for the first sentence
    for w in sent1:

        #remove stopwords
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1

    # build the vector for the second sentence
    for w in sent2:

        #remove stopwords
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    #cosine distance calculated according to above formula
    return 1 - cosine_distance(vector1, vector2)
Example #16
def findSimilarity():
	import os
	import numpy as np
	from gensim import corpora

	list_of_files = [] 
	for (dirpath, dirnames, filenames) in os.walk(categoryPath):
		for filename in filenames:
			target_file = os.path.join(dirpath, filename)

			list_of_files.append(target_file)

	for i in range(0, 50):
		ref = list_of_files[i]
		refDoc = preprocess(ref)
		refDict = buildDict(refDoc)

		for j in range(i + 1, 50, 1):
			candidate = list_of_files[j]
			candidateDoc = preprocess(candidate)
			candidateDict = buildDict(candidateDoc)

			combineDict = refDict.copy()
			combineDict.update(candidateDict)
			refWordList = getWordCountList(combineDict.keys(), refDict)
			candidateWordList = getWordCountList(combineDict.keys(), candidateDict)

			refArray = np.asarray(refWordList, dtype=int).reshape(-1)
			candidateArray = np.asarray(candidateWordList, dtype=int).reshape(-1)

			sim = cosine_distance(refArray, candidateArray)

			outputfile = getOutputFileName(ref, candidate)

			with open(outputfile, 'w') as writer:
				writer.write(str(sim.item()))
Example #17
    def sentence_similarity(self, sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []
        stemmer = PorterStemmer()
        sent1 = [stemmer.stem(w.lower()) for w in sent1]
        sent2 = [stemmer.stem(w.lower()) for w in sent2]
        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)
Example #18
    def sentence_similarity(self, sent1, sent2):
        '''
        Calculate cosine similarity between two sentences
        '''
        sent1 = sent1.split(' ')
        sent2 = sent2.split(' ')

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)
Example #19
def sent_similarity_calculation(s1, s2):

    #     print(s1,s2)
    s1_tokens = nltk.word_tokenize(s1)
    s2_tokens = nltk.word_tokenize(s2)
    s1 = ' '.join(
        [w.lower() for w in s1_tokens if re.fullmatch(r'[a-zA-Z]*', w)])
    s2 = ' '.join(
        [w.lower() for w in s2_tokens if re.fullmatch(r'[a-zA-Z]*', w)])

    # tokenization
    s1_tokens = list(filter(remove_stopwords, nltk.word_tokenize(s1)))
    s2_tokens = list(filter(remove_stopwords, nltk.word_tokenize(s2)))
    #     print(s1_tokens,s2_tokens)
    all_words = list(set(s1_tokens + s2_tokens))
    #print(all_words)
    v1 = [0] * len(all_words)
    v2 = [0] * len(all_words)
    for x in s1_tokens:
        v1[all_words.index(x)] += 1
    for x in s2_tokens:
        v2[all_words.index(x)] += 1
    #print(v1,v2)
    return 1 - cosine_distance(v1, v2)
Example #20
def sentence_similarity(sent1, sent2):
    wakati = MeCab.Tagger('-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    node1, node2 = wakati.parseToNode(sent1), wakati.parseToNode(sent2)
    sent1, sent2 = set(), set()
    
    # Exclude blanks and words with specific parts of speech (adverbs, particles, conjunctions, auxiliary verbs)
    while node1:
        word = node1.surface
        hinshi = node1.feature.split(",")[0]
        if word == " " or hinshi in ["副詞", "助詞", "接続詞", "助動詞"]:
            node1 = node1.next
            continue
        sent1.add(word)
        node1 = node1.next
    
    while node2:
        word = node2.surface
        hinshi = node2.feature.split(",")[0]
        if word == " " or hinshi in ["副詞", "助詞", "接続詞", "助動詞"]:
            node2 = node2.next
            continue
        sent2.add(word)
        node2 = node2.next

    # Bag of words
    all_words = list(sent1 | sent2)
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    for word in sent1:
        vector1[all_words.index(word)] += 1
    for word in sent2:
        vector2[all_words.index(word)] += 1

    # cosine similarity equals 1 - cosine distance
    return 1 - cosine_distance(vector1, vector2)
Example #21
    def sentence_similarity_calculator(self, vectorized_sentence_1,
                                       vectorized_sentence_2):
        return 1 - cosine_distance(vectorized_sentence_1,
                                   vectorized_sentence_2)
Example #22
def similarity(s1, s2):
    s1_av = sentence_vector(tokenize_sentence(s1))
    s2_av = sentence_vector(tokenize_sentence(s2))

    return 1 - cosine_distance(s1_av, s2_av)
Example #23
        for w in lstWordsSent1:
            vecWordCount1[lstWordsInSents.index(w)] += 1
     
        # build word count vector for the second sentence
        for w in lstWordsSent2:
            vecWordCount2[lstWordsInSents.index(w)] += 1

        #print(vecWordCount1)
        #print(vecWordCount2)
        #print(type(vecWordCount1))
        #print(type(vecWordCount2))
        #print(vecWordCount1.shape)
        #print(vecWordCount2.shape)

        # cosine distance
        similarity_matrix[idx1][idx2] = 1 - cosine_distance(vecWordCount1, vecWordCount2)
        print(similarity_matrix[idx1][idx2])

        #time.sleep(2)

#print(similarity_matrix)
#print(similarity_matrix.shape)
#print(similarity_matrix.shape[0])
#print(similarity_matrix.shape[1])
#print(type(similarity_matrix))

#for idx1 in range(intListLength):
#    for idx2 in range(intListLength):
#        print(similarity_matrix[idx1][idx2])

#print(similarity_matrix[10][10])
Example #24
        if len(tweet_embeds) == 0:
            tweet_embeds = embeds
        else:
            tweet_embeds = np.vstack([tweet_embeds, embeds])

    return tweet_embeds


tweet_embeds = get_samples()

print(tweet_embeds.shape)
print(NUM_CLUSTERS)
kclusterer = KMeansClusterer(NUM_CLUSTERS,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=1,
                             rng=RNG)

print('clustering...')
assigned_clusters = kclusterer.cluster(tweet_embeds, assign_clusters=True)

means = np.array(kclusterer.means())

print('calculating sum of distances...')
sum_dists = []
for i, c in enumerate(assigned_clusters):
    sum_dists.append(cosine_distance(means[c], tweet_embeds[i]))
print(np.mean(sum_dists))  # the smaller, the better

print('saving...')
np.save(OUTPUT_DIR + '/cluster_' + str(NUM_CLUSTERS) + '_means.npy', means)
Example #25
def compute_text_similarity(text1, text2, text1tags, text2tags):
    """ Compute text similarity using cosine
    """
    # lemmatization reduces inflected (or sometimes derived) words to their base or dictionary form
    stemmer = nltk.stem.WordNetLemmatizer()
    sentences_text1 = split_sentences(text1)
    sentences_text2 = split_sentences(text2)
    tokens_text1 = []
    tokens_text2 = []
    #print("sentence 1",sentences_text1)
    #print("sentence 2",sentences_text2)

    for element in text1tags:
        tokens_text1.extend(split_into_tokens(element))
    for element in text2tags:
        tokens_text2.extend(split_into_tokens(element))

    for sentence in sentences_text1:
        tokenstemp = split_into_tokens(sentence.lower())
        tokens_text1.extend(tokenstemp)

    for sentence in sentences_text2:
        tokenstemp = split_into_tokens(sentence.lower())
        tokens_text2.extend(tokenstemp)

    if (len(text1tags) > 0):
        tokens_text1.extend(text1tags)
    if (len(text2tags) > 0):
        tokens_text2.extend(text2tags)

    tokens1Filtered = [
        stemmer.lemmatize(x) for x in tokens_text1 if x not in stopWords
    ]

    tokens2Filtered = [
        stemmer.lemmatize(x) for x in tokens_text2 if x not in stopWords
    ]

    #  remove duplicate tokens
    tokens1Filtered = set(tokens1Filtered)
    tokens2Filtered = set(tokens2Filtered)
    print("final tokens1 ", tokens_text1)

    print("final tokens2 ", tokens_text2)
    tokensList = []

    text1vector = []
    text2vector = []

    if len(tokens1Filtered) < len(tokens2Filtered):
        tokensList = tokens1Filtered
    else:
        tokensList = tokens2Filtered

    for token in tokensList:
        if token in tokens1Filtered:
            text1vector.append(1)
        else:
            text1vector.append(0)
        if token in tokens2Filtered:
            text2vector.append(1)
        else:
            text2vector.append(0)

    cosine_similarity = 1 - cosine_distance(text1vector, text2vector)
    if numpy.isnan(cosine_similarity):
        cosine_similarity = 0

    return cosine_similarity
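A hypothetical invocation of compute_text_similarity; split_sentences, split_into_tokens, and stopWords are module-level helpers this snippet assumes, so the arguments here are purely illustrative:

sim = compute_text_similarity(
    "The gold shipment arrived in a truck.",
    "Gold cargo was delivered by truck.",
    ["gold", "truck"],    # text1tags
    ["gold", "cargo"],    # text2tags
)
print(sim)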
Example #26
def dist_bw_sent_doc_cos(vec1, vec2):
    dist_arr = {}
    for num, sent in vec1.items():
        dist = cosine_distance(vec2, sent)
        dist_arr[num] = dist
    return dist_arr
Example #27
def compute_text_similarity(text1, text2, text1tags, text2tags):
    """ Compute text similarity using cosine
    """
    # stemming is the process of reducing inflected (or sometimes derived) words to their stem, base, or root form
    tokens_text1 = []
    tokens_text2 = []
    stemmer = nltk.stem.porter.PorterStemmer()  # .WordNetLemmatizer()
    '''
    sentences_text1 = split_sentences(text1)
    sentences_text2 = split_sentences(text2)

    #print("sentence 1",sentences_text1)
    #print("sentence 2",sentences_text2)
    
    #for tags in text1tags:
        #pass

    
    for element in text1tags:
        tokens_text1.extend(split_into_tokens(element))
    for element in text2tags:
        tokens_text2.extend(split_into_tokens(element))
    
    for sentence in sentences_text1:
        tokenstemp = split_into_tokens(sentence.lower())
        tokens_text1.extend(tokenstemp)
    
    for sentence in sentences_text2:
        tokenstemp = split_into_tokens(sentence.lower())
        tokens_text2.extend(tokenstemp)
    if (len(text1tags) > 0):  
        tokens_text1.extend(text1tags)
    if (len(text2tags) > 0):    
        tokens_text2.extend(text2tags)
        '''
    for element in text1tags:
        tokens_text1.extend(split_into_tokens(element))
    for element in text2tags:
        tokens_text2.extend(split_into_tokens(element))

    tokens1Filtered = [stemmer.stem(x) for x in tokens_text1 if x not in stopWords]
    tokens2Filtered = [stemmer.stem(x) for x in tokens_text2 if x not in stopWords]

    # remove duplicate tokens
    tokens1Filtered = set(tokens1Filtered)
    tokens2Filtered = set(tokens2Filtered)

    tokensList = []

    text1vector = []
    text2vector = []

    if len(tokens1Filtered) < len(tokens2Filtered):
        tokensList = tokens1Filtered
    else:
        tokensList = tokens2Filtered

    for token in tokensList:
        if token in tokens1Filtered:
            text1vector.append(1)
        else:
            text1vector.append(0)
        if token in tokens2Filtered:
            text2vector.append(1)
        else:
            text2vector.append(0)

    cosine_similarity = 1 - cosine_distance(text1vector, text2vector)
    if numpy.isnan(cosine_similarity):
        cosine_similarity = 0
    '''
    with open(Path+"data/cosinesimilarity.txt","a") as fp:
        fp.write(str(tokens1Filtered))
        fp.write("\n")
        fp.write("                           -------vs---------                 ")
        fp.write("\n")
        fp.write(str(tokens2Filtered))
        fp.write("\n")
        fp.write(str(cosine_similarity))
        fp.write("\n")
        fp.write("\n")'''
        
    return cosine_similarity
Example #28
def sentence_similarity(sent1, sent2, method, stop_words):
    if method == "glove":
        full_vect_1 = []
        full_vect_2 = []
        for word in preprocess_sentence(sent1, stop_words):
            try:
                full_vect_1 += [model_glove[word]]
            except KeyError:
                print(word)
        for word in preprocess_sentence(sent2, stop_words):
            try:
                full_vect_2 += [model_glove[word]]
            except KeyError:
                print(word)
        vector_1 = np.mean(full_vect_1, axis=0)
        vector_2 = np.mean(full_vect_2, axis=0)
        return 1 - cosine_distance(vector_1, vector_2)
    elif method == "word2vec":
        full_vect_1 = []
        full_vect_2 = []
        for word in preprocess_sentence(sent1, stop_words):
            try:
                full_vect_1 += [model_word2vec[word]]
            except KeyError:
                print(word)
        for word in preprocess_sentence(sent2, stop_words):
            try:
                full_vect_2 += [model_word2vec[word]]
            except KeyError:
                print(word)
        vector_1 = np.mean(full_vect_1, axis=0)
        vector_2 = np.mean(full_vect_2, axis=0)
        return 1 - cosine_distance(vector_1, vector_2)
    elif method == 'countvectorizer':
        preprocessed_sent1 = preprocess_sentence(sent1, stop_words)
        preprocessed_sent2 = preprocess_sentence(sent2, stop_words)
        all_words = list(set(preprocessed_sent1 + preprocessed_sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence (approach: count vectorizer)
        for w in preprocessed_sent1:
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in preprocessed_sent2:
            vector2[all_words.index(w)] += 1
        return 1 - cosine_distance(vector1, vector2)
    elif method == 'tfidfvectorizer':
        preprocessed_sent1 = ' '.join(preprocess_sentence(sent1, stop_words))
        preprocessed_sent2 = ' '.join(preprocess_sentence(sent2, stop_words))

        tfidfvectorizer = TfidfVectorizer()
        X = tfidfvectorizer.fit_transform(
            [preprocessed_sent1, preprocessed_sent2]).toarray()
        #        print(tfidfvectorizer.get_feature_names())

        vector1 = X[0]
        vector2 = X[1]

        return 1 - cosine_distance(vector1, vector2)
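A hypothetical call to the function above; preprocess_sentence, model_glove, and model_word2vec are project-level helpers assumed by the snippet, so the dependency-light 'countvectorizer' branch is the easiest one to demo:

score = sentence_similarity("the gold arrived in a truck",
                            "a gold shipment came by truck",
                            method="countvectorizer",
                            stop_words=["the", "a", "in", "by"])
print(score)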
Example #29
def similar_sent(s1, s2):  # compute the similarity between two sentences via cosine distance
    return 1 - cosine_distance(s1, s2)
Example #30
	sentences = raw.splitlines()	

#get unique words
unique = list(set(words))

#vectorize the sentences
vectorized = []
n = len(sentences)
for i in range(n):
	vector = []
	for j in range(len(unique)):
		if unique[j] in sentences[i]:
			vector.append(1)
		else:
			vector.append(0)
	vectorized.append(vector)

#create cosine similarity matrix
dist = np.zeros(n**2).reshape((n, n))
for i in range(n):
	for j in range(i):
		dist[i][j] = cosine_distance(np.asarray(vectorized[i]), np.asarray(vectorized[j]))
		dist[j][i] = dist[i][j]

#plot it
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(dist, interpolation='nearest')
fig.colorbar(cax)
plt.show()
Example #31
def getDsim(similarities, pairWiseSimilarityMatrix, index):
    for i in range(0, pairWiseSimilarityMatrix.shape[0]):
        similarities[index][i] = 1 - cosine_distance(
            pairWiseSimilarityMatrix[index], pairWiseSimilarityMatrix[i])
Example #32
def sentence_similarity(dfRow1, dfRow2):
    v1 = dfRow1.values[0]
    v2 = dfRow2.values[0]
    return 1 - cosine_distance(v1, v2)
Example #33
    print("K-Medias: {0} s".format(time.time()- t))
    
    pickle.dump(clusters, open("clusters_test.pickle", "wb"))
else:
    clusters = pickle.load(open("clusters_test.pickle", "rb"))

i = 1
print("Porcentaje de grupos similares entre los documentos")
for query in newsgroups[:10]:
    query_vector = text_vectorizer.transform([query])
    
    query_cluster = clusters.predict(query_vector)
    
    documents = text_vector[np.where(query_cluster == clusters.labels_)[0], :]
    
    distances = np.array([[j, cosine_distance(documents[j].toarray()[0], query_vector.toarray()[0])] for j in range(documents.shape[0])])
    distances = distances[distances[:,-1].argsort()][:5]

    groups = [newsgroups_obj.target[int(j[0])] for j in distances]
    
    groups, unique_counts = np.unique(groups, return_counts=True)
    
    percentages = [j / np.sum(unique_counts) for j in unique_counts]
    
    print("Documento {0}, Grupo mas frecuente: {1}-{2}%".format(i, groups[np.argmax(percentages)], np.amax(percentages)*100))
    i += 1

i = 1
print("Porcentaje de grupos similares al de la consulta")
for query in newsgroups[:10]:
    query_vector = text_vectorizer.transform([query])
Example #34
def sentence_simmilarity(s1, s2):
    return 1 - cosine_distance(s1, s2)
Example #35
    def sentence_similarity(self, vector1, vector2):
        return abs(1 - cosine_distance(vector1, vector2))