def cos_similarity(self, first, second):
    # cos_distance is assumed to be scipy.spatial.distance.cosine
    # returns the cosine similarity of the two stored word vectors, or None if either word is missing
    v1 = self.get_vector(first)
    v2 = self.get_vector(second)
    if v1 is not None and v2 is not None:
        # .A converts a numpy matrix to a plain ndarray before computing the distance
        return 1 - cos_distance(v1.A, v2.A)
    else:
        return None
def get_best_doc_num(query, sentences):
    # get the index of the most likely document (sentence vector) for the query
    # start from infinity rather than 1: cosine distance can reach 2.0 for opposed vectors
    min_dist = float("inf")
    index = 0
    for i in range(sentences.shape[0]):
        dist = cos_distance(query, sentences[i])
        if dist < min_dist:
            min_dist = dist
            index = i
    return index
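# Minimal usage sketch for get_best_doc_num (not from the original source): assumes the function
# above is in scope, that sentences is a 2-D NumPy array of sentence vectors, and that
# cos_distance is scipy's cosine distance. The vectors below are invented for illustration.
import numpy as np
from scipy.spatial.distance import cosine as cos_distance

sentences = np.array([[0.1, 0.9],
                      [0.8, 0.2],
                      [0.5, 0.5]])
query = np.array([0.9, 0.1])

print(get_best_doc_num(query, sentences))  # 1: the second row points the same way as the query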
def cos_similarity(ref_counts, gen_counts):
    """
    Computes cosine similarity between dictionaries of the form {name: count}.
    Elements missing from one dictionary are treated as zero:
        sim = <r, g> / ||r|| / ||g||
    """
    if len(ref_counts) == 0 or len(gen_counts) == 0:
        return np.nan
    keys = np.unique(list(ref_counts.keys()) + list(gen_counts.keys()))
    ref_vec = np.array([ref_counts.get(k, 0) for k in keys])
    gen_vec = np.array([gen_counts.get(k, 0) for k in keys])
    return 1 - cos_distance(ref_vec, gen_vec)
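# Minimal usage sketch for the dictionary-based cos_similarity above (not from the original source):
# assumes np is NumPy and cos_distance is scipy's cosine distance; the count dictionaries are made up.
import numpy as np
from scipy.spatial.distance import cosine as cos_distance

ref = {"cat": 3, "dog": 1, "fish": 2}
gen = {"cat": 2, "dog": 4}

print(cos_similarity(ref, gen))  # roughly 0.60 for these counts
print(cos_similarity({}, gen))   # nan: an empty dictionary has no direction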
def scoreDocuments(query, matrix, reference):
    # score every document vector against the query; returns a list of [reference name, cosine distance]
    results = []
    for name, row in zip(reference, matrix):
        score = cos_distance(query, row)
        results.append([name, score])
    return results
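# Short usage sketch for scoreDocuments (not from the original source): assumes the function above
# is in scope and cos_distance is scipy's cosine distance; the document vectors and names are invented.
# The commented-out sort in the original suggests the caller ranks by score, which this does explicitly.
import numpy as np
from scipy.spatial.distance import cosine as cos_distance

docs = np.array([[1.0, 0.0, 1.0],
                 [0.0, 1.0, 1.0],
                 [1.0, 1.0, 0.0]])
names = ["doc_a", "doc_b", "doc_c"]
query = np.array([1.0, 0.0, 0.5])

ranked = sorted(scoreDocuments(query, docs, names), key=lambda x: x[1])
print(ranked[0])  # the best match has the smallest cosine distance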
texts.append(get_BOW(paras))

# create word-paragraph frequency matrix
vectorizer = DictVectorizer()
brownMatrix = vectorizer.fit_transform(texts).transpose()

# get dense vectors of length 500 using truncated SVD
svd = TruncatedSVD(n_components=500)
brownMatrixSVD = svd.fit_transform(brownMatrix)

# create dictionary of word-pair/cosine-similarity mappings for the filtered word pairs using the LSA vectors
cosineSimilarityDict = {}
for word1, word2 in wordSimDict:
    word1Index = vectorizer.feature_names_.index(word1)
    word2Index = vectorizer.feature_names_.index(word2)
    cosSim = 1 - cos_distance(brownMatrixSVD[word1Index, :], brownMatrixSVD[word2Index, :])
    cosineSimilarityDict[word1, word2] = cosSim
print(cosineSimilarityDict)

# create dictionary of word-pair/word2vec-similarity mappings for the filtered word pairs,
# trained on sentences from the Brown corpus
# (note: gensim >= 4.0 renames size -> vector_size and iter -> epochs)
brownSentences = nltk.corpus.brown.sents()
model = Word2Vec(brownSentences, min_count=5, size=500, iter=50)
word2vecSimilarityDict = {}
for word1, word2 in wordSimDict:
    word2vecSimilarityDict[word1, word2] = model.wv.similarity(word1, word2)
print(word2vecSimilarityDict)

# compare similarities with the gold standard using the Pearson correlation coefficient
wordSimGoldStanardList = list(wordSimDict.values())
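# The final comment above announces a Pearson-correlation comparison, but the snippet stops after
# collecting the gold-standard scores. A minimal sketch of that step (not from the original source):
# assumes scipy.stats.pearsonr and that both similarity dictionaries are keyed by the same
# word pairs, in the same order, as wordSimDict.
from scipy.stats import pearsonr

lsaScores = [cosineSimilarityDict[pair] for pair in wordSimDict]
w2vScores = [word2vecSimilarityDict[pair] for pair in wordSimDict]
goldScores = list(wordSimDict.values())

print("LSA vs gold:     ", pearsonr(goldScores, lsaScores)[0])
print("word2vec vs gold:", pearsonr(goldScores, w2vScores)[0])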
def cosine_distance(x, y):
    '''Calculates the cosine distance between x and y.'''
    return cos_distance(x, y)