def text_similarity(self, t1, t2):
    # Note: the original 'tfidf' and non-'tfidf' branches were identical,
    # so the branch on self._model is collapsed into a single path.
    t1_vec = matutils.any2sparse(self.text2model(t1))
    t2_vec = matutils.any2sparse(self.text2model(t2))
    return matutils.cossim(t1_vec, t2_vec)
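For reference, a minimal standalone sketch (not taken from any of the projects below) of the inputs gensim's matutils.cossim expects: two sparse vectors given as lists of (token_id, weight) pairs, such as the output of Dictionary.doc2bow or of a TfidfModel.

# Self-contained sketch; the toy texts are made up for illustration.
from gensim import corpora, matutils

texts = [["cosine", "similarity", "of", "sparse", "vectors"],
         ["cosine", "similarity", "example"]]
dictionary = corpora.Dictionary(texts)
vec1 = dictionary.doc2bow(texts[0])   # list of (token_id, count) pairs
vec2 = dictionary.doc2bow(texts[1])
print(matutils.cossim(vec1, vec2))    # in [0, 1] for non-negative weights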
def make_scores_for_sample():
    doc2vec_model = doc2vec.Doc2Vec.load('doc2vec_weigths')
    logging.info('doc2vec loaded')
    tfidf_unigram_model = TfidfModel.load('tfidf_unigram')
    logging.info('tfidf unigram loaded')
    tfidf_bigram_model = TfidfModel.load('tfidf_bigram')
    logging.info('tfidf bigram loaded')
    d1 = corpora.Dictionary.load('./dict_1.gensim')
    logging.info('dict1 loaded')
    d2 = corpora.Dictionary.load('./dict_2.gensim')
    logging.info('dict2 loaded')

    queries = pd.read_csv('./queries_norm.tsv', sep='\t', header=None, index_col=0)
    sample = pd.read_csv('./sample.csv', sep=',').sort_values(by=['DocumentId'])

    with open('./submission.csv', 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['QueryId', 'DocumentId', 'Score'])
        for idx, row in tqdm(sample.iterrows()):
            query_id = row['QueryId']
            doc_id = row['DocumentId']
            doc2vec_score = doc2vec_model.docvecs.similarity('DOC_%d' % doc_id, 'QUERY_%d' % query_id)

            doc = get_doc(doc_id)
            query = str(queries.loc[query_id])
            doc_title = str(doc[1])
            doc_content = str(doc[2])

            doc_title_words = doc_title.split()
            doc_content_words = doc_content.split()
            query_words = query.split()

            doc_title_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_title_words[:-1], doc_title_words[1:]))))
            doc_content_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_content_words[:-1], doc_content_words[1:]))))
            query_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(query_words[:-1], query_words[1:]))))

            doc_title_words = d1.doc2bow(doc_title_words)
            doc_content_words = d1.doc2bow(doc_content_words)
            query_words = d1.doc2bow(query_words)

            doc_title_words = tfidf_unigram_model[doc_title_words]
            doc_content_words = tfidf_unigram_model[doc_content_words]
            query_words = tfidf_unigram_model[query_words]

            doc_title_bigrams = tfidf_bigram_model[doc_title_bigrams]
            doc_content_bigrams = tfidf_bigram_model[doc_content_bigrams]
            query_bigrams = tfidf_bigram_model[query_bigrams]

            tfidf_title_score_uni = matutils.cossim(doc_title_words, query_words)
            tfidf_content_score_uni = matutils.cossim(doc_content_words, query_words)
            tfidf_title_score_bi = matutils.cossim(doc_title_bigrams, query_bigrams)
            tfidf_content_score_bi = matutils.cossim(doc_content_bigrams, query_bigrams)

            score = (2 * tfidf_content_score_bi + 2 * tfidf_title_score_uni +
                     tfidf_content_score_uni + 0.5 * doc2vec_score) / 5.5
            writer.writerow([query_id, doc_id, score])
def similarity(self, t, extraction_pattern):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None:
        bef = cossim(t.bef_vector, extraction_pattern.centroid_bef)
    if t.bet_vector is not None:
        bet = cossim(t.bet_vector, extraction_pattern.centroid_bet)
    if t.aft_vector is not None:
        aft = cossim(t.aft_vector, extraction_pattern.centroid_aft)
    return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft
def similarity(self, t, extraction_pattern):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and extraction_pattern.centroid_bef is not None:
        bef = cossim(t.bef_vector, extraction_pattern.centroid_bef)
    if t.bet_vector is not None and extraction_pattern.centroid_bet is not None:
        bet = cossim(t.bet_vector, extraction_pattern.centroid_bet)
    if t.aft_vector is not None and extraction_pattern.centroid_aft is not None:
        aft = cossim(t.aft_vector, extraction_pattern.centroid_aft)
    return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft
def test(file_name):
    dictionary = corpora.Dictionary.load('./temp_tfidf/temp_dict')
    corpus = corpora.MmCorpus('./temp_tfidf/temp_mm')
    tfidf = models.TfidfModel.load('./temp_tfidf/tfidf_value')
    tags = load_tags('./temp_tfidf/tags')
    with open(file_name, 'r') as f, open('./result/tfidf_res', 'w') as outf:
        corpus_tfidf = tfidf[corpus]
        for line in f:
            items = line.decode('gbk').strip().split('\t')
            if len(items) != 2:
                raise Exception('error')
            qes = items[1].split(' ')
            new_vec = dictionary.doc2bow(qes)
            new_tfidf = tfidf[new_vec]
            h = []
            k = 10
            cnt = 0
            for dic in corpus_tfidf:
                s = matutils.cossim(new_tfidf, dic)
                heapq.heappush(h, (s, cnt))
                if len(h) > k:
                    heapq.heappop(h)
                cnt += 1
            candidate = '&&'.join(['%s:%s' % (tags[i], s) for (s, i) in h])
            outf.write('%s\t%s\n' % (items[0].encode('gbk'), candidate.encode('gbk')))
def score(query, profile, data=None):
    if not len(profile.description):
        return [-1]

    vectorspace = VectorSpace([])

    tokenized_description = LowerTokenizer.tokenize(profile.description)
    description_vector = vectorspace.vector_for_document(
        tokenized_document=tokenized_description, update=True)

    ddg_description = DuckDuckDescription.query(query.lower())
    ddg_vector = []
    if ddg_description:
        ddg_text = ddg_description['description']['text']
        ddg_tokenized = LowerTokenizer.tokenize(ddg_text)
        ddg_vector = vectorspace.vector_for_document(
            tokenized_document=ddg_tokenized, update=True)

    if not len(ddg_vector):
        return [-1]

    return [cossim(description_vector, ddg_vector)]
def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
    headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
    headline_tfidf = self.tfidf_model[headline_bow]

    scored_sentences = []
    # Replace newlines with blanks, since the punkt tokenizer does not recognize ".[newline]"
    # sentences = sentences.replace('\n', ' ')
    for sentence in self.tokenizer.tokenize(sentences):
        sentence_tfidf = self.vocab.doc2bow(sent2stokens_wostop(sentence))
        sim = cossim(headline_tfidf, sentence_tfidf)
        # print(str(sim))
        scored_sentences.append([sentence, sim])
    sorted_sentences = sorted(scored_sentences,
                              key=lambda scored_sentences: scored_sentences[1],
                              reverse=True)
    # for sentence in sorted_sentences:
    #     print(str(sentence))
    # return sorted_sentences

    sentences_string = ""
    current_sentence_number = 0
    for sentence in sorted_sentences:
        current_sentence_number += 1
        sentences_string += sentence[0] + ' '
        if current_sentence_number == number_of_sentences:
            break
    # print("Ranked: \n " + sentences_string)
    return sentences_string
def score(tweet, webpage):
    lda = ldamodel.get_lda()
    dictionary = ldamodel.get_dictionary()
    tweet_vec = lda[dictionary.doc2bow(tweet['terms'])]
    news_vec = cached_news_vector(webpage["content"].encode("utf-8"))
    score = matutils.cossim(news_vec, tweet_vec)
    return score
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform small corpus to lsi bow->log_ent->fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]

    # compute pairwise similarity matrix and extract upper triangular
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[np.triu_indices(len(corpus), 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s", cor)
    self.assertTrue(cor > 0.6)
def ComputerSimilarity(model, corpus, blocks):
    # Python 2 code
    print blocks
    print(s)
    vectors = []
    # There are 2 blocks, the one before the sentence and the one after
    for block in blocks:
        block_lda = model[corpus.dictionary.doc2bow(corpus.proc(block))]
        topics = np.asarray(block_lda)
        totalWeight = topics.sum(axis=0)[1]
        print(topics)
        print('---')
        # Generate words, a dictionary that represents a vector that is the
        # normalized combination of all the topics
        words = {}
        for row in topics:
            weight = row[1] / totalWeight
            topicID = int(row[0])
            term_list = model.get_topic_terms(topicID, T)
            for word_weight in term_list:
                word_id = word_weight[0]
                word_n = word_weight[1]
                if words.has_key(word_id):
                    words[word_id] = words[word_id] + word_n * weight
                else:
                    words[word_id] = word_n * weight
        aggregate_vector = words.items()
        vectors.append(aggregate_vector)

    dot_product = matutils.cossim(vectors[0], vectors[1])
    return dot_product
def sim_matrix(self, _topics2cousines):
    """
    Return two similarity matrices.
    @params:
        _topics2cousines - Required : list of topics vectors for list of cuisines (list of list of floats)
    """
    _cuisine_matrix_e = []
    _cuisine_matrix_c = []
    for i, doc_a in enumerate(_topics2cousines):
        doc_a = doc_a[1]
        sim_vecs_e = []
        sim_vecs_c = []
        for j, doc_b in enumerate(_topics2cousines):
            doc_b = doc_b[1]
            w_sum_cs = matutils.cossim(doc_a, doc_b)
            w_sum_ed = 1 - self.euclidean_distance(list(doc_a), list(doc_b))
            if w_sum_ed < 0:
                w_sum_ed = -1 * w_sum_ed
            sim_vecs_e.append(w_sum_ed)
            sim_vecs_c.append(w_sum_cs)
        _cuisine_matrix_e.append([_topics2cousines[i][0], sim_vecs_e])
        _cuisine_matrix_c.append([_topics2cousines[i][0], sim_vecs_c])
    return _cuisine_matrix_e, _cuisine_matrix_c
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform small corpus to lsi bow->log_ent->fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]

    # compute pairwise similarity matrix and extract upper triangular
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[matutils.triu_indices(len(corpus), 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s" % cor)
    self.assertTrue(cor > 0.6)
def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
    """
    This function calculates the indirect cosine measure. Given context vectors
    u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*), the
    indirect cosine measure is computed as the cosine similarity between u and w.

    Args:
    ----
    topics : Topics obtained from the trained topic model.
    segmented_topics : Output from the segmentation module of the segmented topics.
                       Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module.
                         Is a dictionary of the posting list of all topics.
    measure : String. Direct confirmation measure to be used. Supported values are
              "nlr" (normalized log ratio).
    gamma : Gamma value for computing W', W* vectors.
    num_docs : Total number of documents in corresponding corpus.
    """
    if measure == 'nlr':
        measure = direct_confirmation_measure.normalized_log_ratio_measure
    else:
        raise ValueError("The direct confirmation measure you entered is not currently supported.")

    backtrack = {}
    s_cos_sim = []
    for top_words, s_i in zip(topics, segmented_topics):
        for w_prime, w_star in s_i:
            w_prime_context_vectors, backtrack_i = _make_seg(
                w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(backtrack_i)
            w_star_context_vectors, backtrack_i = _make_seg(
                w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(backtrack_i)
            s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items())
            s_cos_sim.append(s_cos_sim_i)

    return s_cos_sim
def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
    headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
    headline_tfidf = self.tfidf_model[headline_bow]

    scored_sentences = []
    # Replace newlines with blanks, since the punkt tokenizer does not recognize ".[newline]"
    # sentences = sentences.replace('\n', ' ')
    for sentence in self.tokenizer.tokenize(sentences):
        sentence_tfidf = self.vocab.doc2bow(sent2stokens_wostop(sentence))
        sim = cossim(headline_tfidf, sentence_tfidf)
        # print(str(sim))
        scored_sentences.append([sentence, sim])
    sorted_sentences = sorted(
        scored_sentences,
        key=lambda scored_sentences: scored_sentences[1],
        reverse=True)
    # for sentence in sorted_sentences:
    #     print(str(sentence))
    # return sorted_sentences

    sentences_string = ""
    current_sentence_number = 0
    for sentence in sorted_sentences:
        current_sentence_number += 1
        sentences_string += sentence[0] + ' '
        if current_sentence_number == number_of_sentences:
            break
    # print("Ranked: \n " + sentences_string)
    return sentences_string
def calculateCoherence(config):
    lsaModel = models.LsiModel.load(config['LSA']['modelFileLocation'])
    dictionary = corpora.Dictionary.load(config['corpus']['corpusFolderLocation'] + 'corpus.dict')
    corpus = corpora.MmCorpus(config['corpus']['corpusFolderLocation'] + 'corpus.mm')
    tfidf = models.TfidfModel(corpus)

    for dataset in config['corpus']['datasets']:
        for transcriptPath in os.listdir(dataset['path']):
            document = loadFileIntoList(dataset['path'] + "/" + transcriptPath,
                                        dataset['transcriptColumn'],
                                        dataset['delimiter'],
                                        dataset['rowsToSkip'],
                                        config['corpus']['sentenceSplitting'],
                                        config['corpus']['stopwords'])
            with open(config['coherence']['outputFolderLocation'] + transcriptPath +
                      "_results_lsa.tsv", 'w') as outputFile:
                # write header
                outputFile.write("coherence to previous sentence(s)\tpreprocessed sentence\t"
                                 "full sentence\tcorresponding turn\n")
                lastSentencesLSAList = []
                for sentence in document:
                    if " ".join(sentence[0]):
                        sentenceBow = dictionary.doc2bow(sentence[0])
                        weightIndex = 1
                        simLSA = 0
                        weightNormalizer = 0
                        for el in lastSentencesLSAList:
                            simLSA += 1 / weightIndex * matutils.cossim(lsaModel[tfidf[sentenceBow]], el)
                            weightNormalizer += 1 / weightIndex
                            weightIndex += 1
                        if weightNormalizer > 0:
                            simLSA /= weightNormalizer
                        lastSentencesLSAList.insert(0, lsaModel[tfidf[sentenceBow]])
                        if len(lastSentencesLSAList) > config['coherence']['slidingWindow']:
                            del lastSentencesLSAList[-1]
                        outputFile.write(str(simLSA) + "\t")
                        outputFile.write(" ".join(sentence[0]) + "\t")
                        outputFile.write(" ".join(sentence[1]) + "\t")
                        outputFile.write(sentence[2] + "\n")
def calc_similarity_score(tokens1, tokens2):
    '''
    Calculate a similarity score comparing tokens1 and tokens2
    using cosine similarity.
    '''
    corpus1 = get_corpus(tokens1)
    corpus2 = get_corpus(tokens2)
    return matutils.cossim(corpus1, corpus2)
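get_corpus is defined elsewhere in that project; a hypothetical minimal version, assuming a shared gensim Dictionary, might look like the following (the names _dictionary and get_corpus below are illustrative, not the original code).

# Hypothetical helper, not from the original project: maps a token list to a
# sparse bag-of-words vector that matutils.cossim can consume.
from gensim import corpora

_dictionary = corpora.Dictionary()  # assumed to be built or loaded elsewhere

def get_corpus(tokens):
    # allow_update=True grows the vocabulary so unseen tokens still receive ids
    return _dictionary.doc2bow(tokens, allow_update=True)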
def _update_pairwise_similarity(self, topic_id, content, date):
    """ updates similarity data within the corpus """
    bow = self.dictionary.doc2bow(content)
    for tid, data in self.data.items():
        day_delta = (int(date) - int(data['date'])) / NUM_SECONDS_PER_DAY
        time_factor = math.pow(self.time_decay, day_delta)
        if tid == topic_id:
            continue
        bow1 = self.dictionary.doc2bow(data['body'])
        sim = matutils.cossim(bow, bow1)
        sim_1 = sim * min(1.0, 1 / time_factor)
        sim_2 = sim * min(1.0, time_factor)
        if self.irrelevant_thresh <= sim_1 <= self.duplicate_thresh:
            del_id = insert(data['sim_list'], topic_id, sim_1, self.max_recoms)
            if del_id is not None:
                self.data[topic_id]['appears_in'].append(tid)
                self.data[tid]['updated'] = True
                if del_id != '':
                    remove(self.data[del_id]['appears_in'], tid)
        if self.irrelevant_thresh <= sim_2 <= self.duplicate_thresh:
            del_id = insert(self.data[topic_id]['sim_list'], tid, sim_2, self.max_recoms)
            if del_id is not None:
                self.data[tid]['appears_in'].append(topic_id)
                if del_id != '':
                    remove(self.data[del_id]['appears_in'], topic_id)
def calculate_similarity(vec_sentence1, vec_sentence2, network_type):
    embeddings = ['d2v', 'gd2v', 'fastT', 'gloVe', 's2v']
    if network_type == 'tfidf':
        return matutils.cossim(vec_sentence1, vec_sentence2)
    # if network_type == 'd2v' or network_type == 'gd2v':
    if network_type in embeddings:
        return 1 - spatial.distance.cosine(vec_sentence1, vec_sentence2)
def get_similarity_for_article_and_cluster(self, article, cluster):
    article.pre_process(self.postagger, self.stop_words, self.n_keywords)
    word_tfidfs = article.title_content_effective_word_tfidfs
    similarity = np.mean(
        [matutils.cossim(article.title_content_effective_word_tfidfs, word_tfidfs)
         for article in cluster.articles])
    return similarity
def mapper(self, _, line):
    read_line = list(next(csv.reader([line], delimiter="\t")))
    user = read_line[0]
    sim_user = read_line[1]
    user_df = self.df[self.df["user"] == user]
    user_rest = list(user_df["rest"].values)
    sim_user_df = self.df[(self.df["user"] == sim_user) & (~self.df["rest"].isin(user_rest))]
    for i in range(len(user_df)):
        rest = user_df.iloc[i]["rest"]
        la = float(user_df.iloc[i]["la"])
        lon = float(user_df.iloc[i]["lon"])
        # strip unneeded symbols
        vec = ast.literal_eval(user_df.iloc[i]["vec"])
        lsi1 = self.lsi[vec]
        for j in range(len(sim_user_df)):
            sim_rest = sim_user_df.iloc[j]["rest"]
            sim_la = float(sim_user_df.iloc[j]["la"])
            sim_lon = float(sim_user_df.iloc[j]["lon"])
            sim_vec = ast.literal_eval(sim_user_df.iloc[j]["vec"])
            lsi2 = self.lsi[sim_vec]
            sim_score = cossim(lsi1, lsi2)
            dist = haversine_distance((la, lon), (sim_la, sim_lon))
            yield None, (user, sim_user, rest, sim_rest, sim_score, dist)
def calculate_lsi_requested_sim_in_df(model, dictionary, doc1ID, doc2ID, doc_data,
                                      IDfield, contentField, stoplist):
    """
    Expects a dataframe that actually holds the text associated with each doc
    (which is just passed in as strings that denote the id).
    Built in some flexibility for different field naming conventions by making
    the IDfield and contentField variables.
    """
    # sim_output = []
    # for pair in pairs:
    #     low_id = pair[0]
    #     high_id = pair[1]
    #     idea1 = lemmatize_an_idea(all_ideas[all_ideas.ideaID == low_id].idea.values[0])
    #     idea2 = lemmatize_an_idea(all_ideas[all_ideas.ideaID == high_id].idea.values[0])
    #     vec1 = model[dictionary.doc2bow(idea1)]
    #     vec2 = model[dictionary.doc2bow(idea2)]
    #     sim_output.append((low_id, high_id, cossim(vec1, vec2)))
    text1 = lemmatize_an_idea(doc_data[doc_data[IDfield] == doc1ID][contentField].values[0], stoplist)
    text2 = lemmatize_an_idea(doc_data[doc_data[IDfield] == doc2ID][contentField].values[0], stoplist)
    vec1 = model[dictionary.doc2bow(text1)]
    vec2 = model[dictionary.doc2bow(text2)]
    return cossim(vec1, vec2)
def cossim_pairs(topic_models, num_topics=20):
    topic_cos_map = {}
    num_topics = 20
    for i, m in enumerate(topic_models):
        cur_date = temp_dates[i]
        month = int(cur_date.split("-")[1])
        if month < 9:
            break
        if month != 9:
            continue
        for u in range(num_topics):
            for j, n in enumerate(topic_models):
                if i == j:
                    continue
                top_cs = -1
                top_topic = ""
                for v in range(num_topics):
                    cs = cossim(m.show_topic(u), n.show_topic(v))
                    if cs > top_cs:
                        top_cs = cs
                        top_topic = "{}:{}_{}:{}".format(i, u, j, v)
                topic_cos_map[top_topic] = top_cs
    return topic_cos_map
def tfidf_sim(self, train_data, body_dict, threshold):
    '''
    :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID: 'bodyText'}
        threshold : used to distinguish between similar and not similar
    '''
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

    vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
    tfidf_model = models.TfidfModel(corporaBody_bow)

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))
        headlines_tfidf = tfidf_model[headline_bow]
        corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]
        sim = cossim(headlines_tfidf, corporaBody_tfidf)
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def predict_score(self, user_id, item_id):
    ratings = Rating.objects.filter(user_id=user_id)
    rated_movies = {r['movie_id']: r['rating'] for r in ratings.values()}
    md = MovieDescriptions.objects.filter(imdb_id=item_id).first()
    rated_movies_desc = MovieDescriptions.objects.filter(imdb_id__in=rated_movies.keys())

    if md is None:
        return 0
    if rated_movies_desc is None:
        return 0
    if rated_movies_desc.count() == 0:
        return 0

    top = 0.0
    bottom = 0.0
    for rm in rated_movies_desc:
        lda_vector = self.corpus[int(md.lda_vector)]
        lda_vector_sim = self.corpus[int(rm.lda_vector)]
        sim = matutils.cossim(lda_vector, lda_vector_sim)
        rating = rated_movies[rm.imdb_id]
        top += sim * float(rating)
        bottom += sim

    return top / bottom
def assign_article_topics(self, article_id, heading, process_all = False):
    """ Assign the appropriate topics to the given article in the database """
    if self._dictionary is None:
        self.load_dictionary()
    if self._tfidf is None:
        self.load_tfidf_model()
    if self._model is None:
        self.load_lsi_model()
    if self._topics is None:
        self.load_topics()
    with SessionContext(commit = True) as session:
        q = session.query(Word.stem, Word.cat, Word.cnt) \
            .filter(Word.article_id == article_id).all()
        wlist = []
        for stem, cat, cnt in q:
            # Convert stem to lowercase and replace spaces with underscores
            w = w_from_stem(stem, cat)
            if cnt == 1:
                wlist.append(w)
            else:
                wlist.extend([w] * cnt)
        topics = []
        article_vector = []
        if self._topics and wlist:
            bag = self._dictionary.doc2bow(wlist)
            tfidf = self._tfidf[bag]
            article_vector = self._model[tfidf]
            topic_names = []
            if self._verbose:
                print("{0} : {1}".format(article_id, heading))
            for topic_id, topic_info in self._topics.items():
                topic_name = topic_info["name"]
                topic_vector = topic_info["vector"]
                topic_threshold = topic_info["threshold"]
                # Calculate the cosine similarity between the article and the topic
                similarity = matutils.cossim(article_vector, topic_vector)
                if self._verbose:
                    print(" Similarity to topic {0} is {1:.3f}".format(topic_name, similarity))
                if similarity >= topic_threshold:
                    # Similar enough: this is a topic of the article
                    topics.append(topic_id)
                    topic_names.append((topic_name, similarity))
            if topic_names and not process_all:
                print("Article '{0}':\n topics {1}".format(heading, topic_names))
        # Topics found (if any): delete previous ones (if any)
        session.execute(ArticleTopic.table().delete().where(ArticleTopic.article_id == article_id))
        # ...and add the new ones
        for topic_id in topics:
            session.add(ArticleTopic(article_id = article_id, topic_id = topic_id))
        # Update the indexed timestamp and the article topic vector
        a = session.query(Article).filter(Article.id == article_id).one_or_none()
        if a is not None:
            a.indexed = datetime.utcnow()
            if article_vector:
                # Store a pure list of floats
                topic_vector = [ t[1] for t in article_vector ]
                a.topic_vector = json.dumps(topic_vector)
            else:
                a.topic_vector = None
def getComparable(source_lsi_doc, target_lsi_corpus):
    sims = []
    for i in range(len(target_lsi_corpus)):
        sims.append(matutils.cossim(source_lsi_doc, target_lsi_corpus[i]))
    sortedSims = sorted(enumerate(sims), key=lambda item: -item[1])
    topIndex = sortedSims[0][0]
    topSim = sortedSims[0][1]
    return sortedSims[0]
def title_similarity(t1, t2):
    if not t1 or not t2:
        return -2
    else:
        return cossim(tokens_to_lsi(t1), tokens_to_lsi(t2))
def description_similarity(d1, d2):
    if not d1 or not d2:
        return -2
    else:
        return cossim(tokens_to_lsi(d1), tokens_to_lsi(d2))
def getComparable(source_lsi_doc, target_lsi_corpus):
    sims = []
    for i in range(len(target_lsi_corpus)):
        sims.append(matutils.cossim(source_lsi_doc, target_lsi_corpus[i]))
    sortedSims = sorted(enumerate(sims), key=lambda item: -item[1])
    topIndex = sortedSims[0][0]
    topSim = sortedSims[0][1]
    return sortedSims[0]
def compute_similarity(self, doc1, doc2):
    """Compute the cosine similarity between two documents.

    :doc1: a list of strings, representing the first document.
    :doc2: a list of strings, representing the second document.
    :returns: a number between -1 and 1, representing the similarity
              between the two documents.
    """
    return cossim(self.get_vector(doc1), self.get_vector(doc2))
def assign_article_topics(self, article_id, heading):
    """ Assign the appropriate topics to the given article in the database """
    if self._dictionary is None:
        self.load_dictionary()
    if self._tfidf is None:
        self.load_tfidf_model()
    if self._model is None:
        self.load_lda_model()
    if self._topics is None:
        self.load_topics()
    with SessionContext(commit=True) as session:
        q = session.query(Word.stem, Word.cat, Word.cnt) \
            .filter(Word.article_id == article_id).all()
        wlist = []
        for stem, cat, cnt in q:
            # Convert stem to lowercase and replace spaces with underscores
            w = stem.lower().replace(" ", "_") + "/" + cat
            if cnt == 1:
                wlist.append(w)
            else:
                wlist.extend([w] * cnt)
        topics = []
        if self._topics and wlist:
            bag = self._dictionary.doc2bow(wlist)
            tfidf = self._tfidf[bag]
            article_vector = self._model[tfidf]
            topic_names = []
            if self._verbose:
                print("{0} : {1}".format(article_id, heading))
            for topic_id, topic_info in self._topics.items():
                topic_name = topic_info["name"]
                topic_vector = topic_info["vector"]
                topic_threshold = topic_info["threshold"]
                # Calculate the cosine similarity between the article and the topic
                similarity = matutils.cossim(article_vector, topic_vector)
                if self._verbose:
                    print(" Similarity to topic {0} is {1:.3f}".format(topic_name, similarity))
                if similarity >= topic_threshold:
                    # Similar enough: this is a topic of the article
                    topics.append(topic_id)
                    topic_names.append((topic_name, similarity))
            if topic_names:
                print("Article '{0}': topics {1}".format(heading, topic_names))
        # Topics found (if any): delete previous ones (if any)
        session.execute(ArticleTopic.table().delete().where(
            ArticleTopic.article_id == article_id))
        # ...and add the new ones
        for topic_id in topics:
            session.add(ArticleTopic(article_id=article_id, topic_id=topic_id))
        # Update the indexed timestamp
        a = session.query(Article).filter(Article.id == article_id).one_or_none()
        if a:
            a.indexed = datetime.utcnow()
def getMaxSimilarity(dictTopic, vector):
    maxValue = 0
    maxIndex = -1
    for k, cluster in dictTopic.items():
        oneSimilarity = np.mean([matutils.cossim(vector, v) for v in cluster])
        if oneSimilarity > maxValue:
            maxValue = oneSimilarity
            maxIndex = k
    return maxIndex, maxValue
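For context, a hedged sketch of how a helper like getMaxSimilarity is typically driven in single-pass clustering; the single_pass wrapper and the 0.3 threshold below are illustrative assumptions, not code from the original project.

# Illustrative single-pass clustering loop built on getMaxSimilarity above;
# dictTopic maps cluster id -> list of sparse vectors, threshold is made up.
def single_pass(vectors, threshold=0.3):
    dictTopic = {}
    for doc_id, vector in enumerate(vectors):
        best_cluster, best_sim = getMaxSimilarity(dictTopic, vector)
        if best_sim > threshold:
            dictTopic[best_cluster].append(vector)   # join the closest cluster
        else:
            dictTopic[len(dictTopic)] = [vector]     # start a new cluster
    return dictTopic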
def wordsim(left, right):
    leftvec = unidict.doc2bow(left.lower().split())
    rightvec = unidict.doc2bow(right.lower().split())
    leftlsi = unilsi[leftvec]
    rightlsi = unilsi[rightvec]
    # leftlda = unilda[leftvec]   # matutils.sparse2full(..., 300)
    # rightlda = unilda[rightvec]
    return {"lsi": matutils.cossim(leftlsi, rightlsi)}
def vsm_dist(song_A, song_B):
    # try:
    tif = models.TfidfModel(a_corps)
    a_tif = tif[song_A['tokenized_comments']]
    b_tif = tif[song_B['tokenized_comments']]
    dist = matutils.cossim(a_tif, b_tif)
    if dist == 0:
        dist = 0.0000001  # avoid the div by 0
    return dist
def get_max_similarity(self, article):
    title_content_word_tfidfs = article.title_content_effective_word_tfidfs
    title_content_max_sim = 0
    title_content_max_sim_cluster_id = -1

    content_word_tfidfs = article.content_effective_word_tfidfs
    content_max_sim = 0
    content_max_sim_cluster_id = -1

    title_word_tfidfs = article.title_effective_word_tfidfs
    title_max_sim = 0
    title_max_sim_cluster_id = -1

    for i in np.arange(len(self.clusters)):
        cluster = self.clusters[i]

        # title_content
        title_content_similarity = np.mean([
            matutils.cossim(article.title_content_effective_word_tfidfs, title_content_word_tfidfs)
            for article in cluster.articles
        ])
        if title_content_similarity > title_content_max_sim:
            title_content_max_sim = title_content_similarity
            title_content_max_sim_cluster_id = i

        content_similarity = np.mean([
            matutils.cossim(article.content_effective_word_tfidfs, content_word_tfidfs)
            for article in cluster.articles
        ])
        if content_similarity > content_max_sim:
            content_max_sim = content_similarity
            content_max_sim_cluster_id = i

        title_similarity = np.mean([
            matutils.cossim(article.title_effective_word_tfidfs, title_word_tfidfs)
            for article in cluster.articles
        ])
        if title_similarity > title_max_sim:
            title_max_sim = title_similarity
            title_max_sim_cluster_id = i

    return (title_content_max_sim, title_content_max_sim_cluster_id,
            content_max_sim, content_max_sim_cluster_id,
            title_max_sim, title_max_sim_cluster_id)
def _get_doc_similarity(self, doc1_tk, doc2_tk):
    """
    :param doc1_tk: Preprocessed documents as tokens
    :param doc2_tk: Preprocessed documents as tokens
    :return:
    """
    dis1 = self.get_topic_distrb(doc1_tk)
    dis2 = self.get_topic_distrb(doc2_tk)
    # return 1 - matutils.hellinger(dis1, dis2)
    return matutils.cossim(dis1, dis2)
def similarity_lsi(self, text1, text2):
    # convert text into bag-of-words model
    text1_bow = self.__dictionary.doc2bow(self.__preprocess_text_document(text1))
    text2_bow = self.__dictionary.doc2bow(self.__preprocess_text_document(text2))
    # transform text into the model's domain
    text1_model = self.__model_lsi[text1_bow]
    text2_model = self.__model_lsi[text2_bow]
    return cossim(text1_model, text2_model)
def reducer(self, key, value):
    lst_of_rvws = list(value)[0]
    rvws1 = lst_of_rvws[0]
    rvws2 = lst_of_rvws[1]
    # print(rvws1)
    sim = cossim(self.dct.doc2bow(rvws1), self.dct.doc2bow(rvws2))
    # similarity.append((biz[i], biz[j], sim))
    # print('reducer')
    join_key = str(key[0]) + '\t' + str(key[1])
    yield join_key, str(sim)
def calculate_lsi_requested_sim(model, dictionary, text1, text2, stoplist):
    """
    Base version that just takes in two strings and spits out a similarity
    in the provided gensim model space.
    """
    text1_lemm = lemmatize_an_idea(text1, stoplist)
    text2_lemm = lemmatize_an_idea(text2, stoplist)
    vec1 = model[dictionary.doc2bow(text1_lemm)]
    vec2 = model[dictionary.doc2bow(text2_lemm)]
    return cossim(vec1, vec2)
def get_abstract_similarity(row):
    if pd.notnull(row['abstract_lr']) and pd.notnull(row['abstract_cp']):
        lr_doc = preprocess_doc(row['abstract_lr'])
        cp_doc = preprocess_doc(row['abstract_cp'])
        lr_bow = abstract_dict.doc2bow(lr_doc)
        cp_bow = abstract_dict.doc2bow(cp_doc)
        lr_lsi = abstract_lsi[lr_bow]
        cp_lsi = abstract_lsi[cp_bow]
        return matutils.cossim(lr_lsi, cp_lsi)
    else:
        return 0
def calculate_best_score(candidate_id, reference_ids):
    # translate candidate to LSI vector
    vec_lsi = id2lsi_vec(candidate_id)
    # determine similarities for each of the references
    scores = [matutils.cossim(vec_lsi, id2lsi_vec(reference_id))
              for reference_id in reference_ids]
    return max(scores)
def getMaxSimilarity(self, dictTopic, vector):
    # Compute the similarity between the incoming document and the existing documents;
    # cosine similarity is used here.
    maxValue = 0
    maxIndex = -1
    for k, cluster in dictTopic.items():
        oneSimilarity = np.mean([matutils.cossim(vector, v) for v in cluster])
        if oneSimilarity > maxValue:
            maxValue = oneSimilarity
            maxIndex = k
    return maxIndex, maxValue
def compare_strings(self, s1, s2):
    # Build vector for doc1
    vec_bow1 = self.dictionary.doc2bow(s1.lower().split())
    vec_lsi1 = self.lsi[vec_bow1]  # convert the query to LSI space

    # Build vector for doc2
    vec_bow2 = self.dictionary.doc2bow(s2.lower().split())
    vec_lsi2 = self.lsi[vec_bow2]

    # Calculate cosine similarity
    sim = matutils.cossim(vec_lsi1, vec_lsi2)
    return sim
def get_link_scores(self, source, target):
    """
    :param source: record whose 'tokens' field holds a preprocessed document
    :param target: record whose 'tokens' field holds a preprocessed document
    :return: cosine similarity of the two topic distributions
    """
    doc1_tk = source['tokens'].split()
    doc2_tk = target['tokens'].split()
    dis1 = self.get_topic_distrb(doc1_tk)
    dis2 = self.get_topic_distrb(doc2_tk)
    # return 1 - matutils.hellinger(dis1, dis2)
    return matutils.cossim(dis1, dis2)
def loadDocuments():
    documents = []
    for info in LIST_OF_TRANSCRIPTS_INFO:
        for transcriptPath in os.listdir(info[0]):
            document = loadFileIntoList(info[0] + "/" + transcriptPath, info[1], info[2], info[3])
            documents.append(document)

    for doc in documents:
        last_sentence_lsi = []
        last_sentence_lda = []
        for sentence in doc:
            sentence_bow = dictionary.doc2bow(sentence.split(" "))
            print(sentence)
            # print(lsi[tfidf[sentence_bow]])
            # print(lda[sentence_bow])
            sim_lsi = matutils.cossim(lsi[tfidf[sentence_bow]], last_sentence_lsi)
            sim_lda = matutils.cossim(lda[sentence_bow], last_sentence_lda)
            print(sim_lsi)
            print(sim_lda)
            last_sentence_lsi = lsi[tfidf[sentence_bow]]
            last_sentence_lda = lda[sentence_bow]
def ComputerSimilarityOld(model, corpus, blocks):
    # Python 2 code
    # print blocks
    # print(s)
    vectors = []
    # There are 2 blocks, the one before the sentence and the one after
    for block in blocks:
        # print(block)
        words = corpus.proc(block)
        topic_distribution = {}
        words_culled = []
        for word in words:
            block_lda = model[corpus.dictionary.doc2bow(corpus.proc(word))]
            topic_id = 0
            prob = 0
            for t in block_lda:
                if t[1] > prob:
                    topic_id = t[0]
                    prob = t[1]
            if prob < .1:
                continue
            ppp = 0
            word_id = corpus.dictionary.token2id[word]
            if WordProbThreshold:
                terms = model.get_topic_terms(topic_id, 100000)
                for term in terms:
                    if term[0] == word_id:
                        ppp = term[1]
            if ppp < X and WordProbThreshold:
                print("CUT " + word + " " + str(ppp))
                continue
            print(word + " " + str(ppp))
            words_culled.append(word)
            if topic_distribution.has_key(topic_id):
                topic_distribution[topic_id] = topic_distribution[topic_id] + 1
            else:
                topic_distribution[topic_id] = 1
        print("-------------")
        # print((topic_id, prob))
        # print(model[corpus.dictionary.doc2bow(words_culled)])
        # print(topic_distribution)
        # print("bleh")
        vectors.append(topic_distribution.items())

    dot_product = matutils.cossim(vectors[0], vectors[1])
    return dot_product
def score(query, profile, data=None):
    if not len(profile.description):
        return [-1]

    vectorspace = VectorSpace([])

    tokenized_query = LowerTokenizer.tokenize(query)
    tokenized_description = LowerTokenizer.tokenize(profile.description)

    query_vector = vectorspace.vector_for_document(
        tokenized_document=tokenized_query, update=True)
    description_vector = vectorspace.vector_for_document(
        tokenized_document=tokenized_description, update=True)

    return [cossim(description_vector, query_vector)]
def test_get_similarities(self, mock_q_simmx_file_path, mock_a_simmx_file_path,
                          mock_dict_file_path, mock_md_file_path):
    mock_md_file_path.return_value = self.test_md_file_path
    mock_dict_file_path.return_value = self.test_dict_file_path
    mock_q_simmx_file_path.return_value = self.test_q_simmx_file_path
    mock_a_simmx_file_path.return_value = self.test_a_simmx_file_path

    model_struct = TfIdfModelStruct.get_model(data_store=self.data_store)
    query_doc = "Is brocolli tasty to eat?"
    compare_docs = self.data_store.doc_set
    sims = model_struct.get_similarities(query_doc, compare_docs)
    for idx, sim in enumerate(sims):
        expected_sim = cossim(
            model_struct.get_tfidf_vec(query_doc),
            model_struct.get_tfidf_vec(compare_docs[idx])
        )
        self.assertAlmostEqual(sim[1], expected_sim)
def get_query_entity_document_score(query_document_tfidf_vector, candidate_entity_id):
    try:
        entity_document_tfidf_vector = temp_entity_id_content_vec_dic[candidate_entity_id]
    except KeyError as ke:
        content_text = ""
        fr = open(entity_documents_file_path + candidate_entity_id, 'r')
        line = fr.readline().decode(coding)
        while line:
            line = line.strip()
            if line != '':
                content_text += line + " "
            line = fr.readline().decode(coding)
        fr.close()
        entity_document_tfidf_vector = tfidf_model[tfidf_dictionary.doc2bow(content_text.strip().lower().split())]
        temp_entity_id_content_vec_dic[candidate_entity_id] = entity_document_tfidf_vector
    score = matutils.cossim(query_document_tfidf_vector, entity_document_tfidf_vector)
    return score
def tfidf_distance(corpora, tfidf, tfidf_web, mean_vec, web_2, loss_weight):
    """compute the distance (as a function of cosine similarity) between two websites
    using the tfidf model"""
    if len(mean_vec) == 0:
        return loss_weight
    try:
        indx_2 = tfidf_web.values().index(web_2)
    except ValueError:
        return loss_weight
    doc_num_2 = tfidf_web.keys()[indx_2]
    bow_2 = corpora[doc_num_2]
    tf_rap_2 = matutils.unitvec(tfidf[bow_2])  # get its tfidf representation
    cosine_sim = min(matutils.cossim(mean_vec, tf_rap_2), 1.0)
    return sqrt(2.0 * (1.0 - cosine_sim)) / 2.0  # return the distance of the two vectors
def value_for_text(self, t, rp=default_rp):
    space = rp.lsa_space()
    num_topics = space.num_topics

    tokens = rp.tokens(t)
    tokens = [[token.lower() for token in sentence] for sentence in tokens]

    if len(tokens) < 2:
        return 0

    spans = np.zeros(len(tokens) - 1)
    for i in range(1, len(tokens)):
        past_sentences = tokens[:i]
        span_dim = len(past_sentences)
        if span_dim > num_topics - 1:
            # It's not clear, from the papers I read, what should be done
            # in this case. I did what seemed to not imply in loosing
            # information.
            beginning = past_sentences[0:span_dim - num_topics]
            past_sentences[0] = list(chain.from_iterable(beginning))
        past_vectors = [sparse2full(space.get_vector(sent), num_topics)
                        for sent in past_sentences]
        curr_vector = sparse2full(space.get_vector(tokens[i]), num_topics)
        curr_array = np.array(curr_vector).reshape(num_topics, 1)

        A = np.array(past_vectors).transpose()
        projection_matrix = dot(dot(A, pinv(dot(A.transpose(), A))), A.transpose())
        projection = dot(projection_matrix, curr_array).ravel()

        spans[i - 1] = cossim(full2sparse(curr_vector), full2sparse(projection))

    return self.get_value(spans)
def cosine_by_tfidf(list1, list2, feature):
    cosine_value = 0.0
    intersection_set = set(list1) & set(list2)
    if intersection_set:
        union_set = set(list1) | set(list2)
        feature_count = {}
        for item in union_set:
            feature_count[item] = reference_dict[feature][item]
            # feature_count[item] = collection.find({feature: {'$in': [item]}}).count()
        list1_dict = dict(Counter(list1))
        list2_dict = dict(Counter(list2))
        # pprint(list1_dict)
        # pprint(list2_dict)
        num1 = len(list1)
        num2 = len(list2)
        list1_tfidf_dict = {}
        for k, v in list1_dict.items():
            list1_tfidf_dict[k] = (v / num1) * math.log(movie_count / feature_count[k])
        list2_tfidf_dict = {}
        for k, v in list2_dict.items():
            list2_tfidf_dict[k] = (v / num2) * math.log(movie_count / feature_count[k])
        # pprint(list1_tfidf_dict)
        # pprint(list2_tfidf_dict)
        cosine_value = matutils.cossim(list1_tfidf_dict, list2_tfidf_dict)
        # print cosine_value
    return cosine_value
def query(self, query, sort=False):
    """Given a search query, returns a list of document IDs and their cosine
    similarity to that query.

    If sort is True, results are returned sorted in descending order of
    cosine similarity. Otherwise, results are returned ordered by document ID."""
    # The flag is named 'sort' so it does not shadow the builtin sorted() used below.
    tokenized = self.tokenizer.tokenize(query)
    query_vector = self.vectorspace.vector_for_document(tokenized, update=True)

    results = []
    for index, document in enumerate(self.documents):
        tokenized = self.tokenizer.tokenize(document)
        document_vector = self.vectorspace.vector_for_document(tokenized)
        similarity = cossim(query_vector, document_vector)
        results.append((index, similarity))

    if sort:
        return sorted(results, key=lambda item: -item[1])
    return results
def predict(self, X_test, acronym):
    labels = []
    confidences = []
    for test_lda in X_test:
        similarities = map(lambda article_lda: cossim(article_lda, test_lda),
                           self.X_train_lda)
        similarityByLabel = sorted(zip(similarities, self.y_train_labels),
                                   key=lambda item: item[0], reverse=True)
        chosen_label = similarityByLabel[0][1]
        max_similarity = similarityByLabel[0][0]
        similarityByLabelForNotChosen = [item for item in similarityByLabel
                                         if item[1] != chosen_label]
        confidence = min_confidence
        if similarityByLabelForNotChosen:
            confidence = max_similarity - similarityByLabelForNotChosen[0][0]
        labels.append(chosen_label)
        confidences.append(confidence)
    return labels, confidences
def cossineSim(self, N):
    # Python 2 code
    print "Cossine sim"
    simMatrixCos = []
    for x in range(0, 50):
        topicMatrixCos = []
        for y in range(0, 50):
            model = self.model
            vec1 = model.get_topic_terms(x, topn=model.num_terms)
            # ldaVec1 = sorted(model.get_topic_terms(x, topn=model.num_terms))
            # ldaVec2 = sorted(model.get_topic_terms(y, topn=model.num_terms))
            ldaVec1 = model.get_topic_terms(x, topn=N)
            ldaVec2 = model.get_topic_terms(y, topn=N)
            # dense1 = gensim.matutils.sparse2full(ldaVec1, model.num_terms)
            # dense2 = gensim.matutils.sparse2full(ldaVec2, model.num_terms)
            sim = matutils.cossim(ldaVec1, ldaVec2)
            # simDict = (x, y, sim, self.model.show_topic(y))
            simDict = (x, y, sim)
            topicMatrixCos.append(simDict)
        simMatrixCos.append(sorted(topicMatrixCos, key=itemgetter(2), reverse=True))
    # for element in simMatrix:
    #     print simMatrix
    # topic1Sorted = sorted(simMatrixCos[0], key=itemgetter(2))
    # x = topic1Sorted
    return simMatrixCos
def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
    """
    This function calculates the indirect cosine measure. Given context vectors
    u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*), the
    indirect cosine measure is computed as the cosine similarity between u and w.

    The formula used is:

        m_{sim(m, gamma)}(W', W*) = s_{sim}(V_{m,gamma}(W'), V_{m,gamma}(W*))

    where each context vector is

        V_{m,gamma}(W') = { sum_{w_i in W'} m(w_i, w_j)^gamma }_{j = 1,...,|W|}

    Args:
    ----
    topics : Topics obtained from the trained topic model.
    segmented_topics : Output from the segmentation module of the segmented topics.
                       Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module.
                         Is a dictionary of the posting list of all topics.
    measure : String. Direct confirmation measure to be used. Supported values are
              "nlr" (normalized log ratio).
    gamma : Gamma value for computing W', W* vectors.
    num_docs : Total number of documents in corresponding corpus.

    Returns:
    -------
    s_cos_sim : array of cosine similarity of the context vectors for each segmentation
    """
    if measure == 'nlr':
        # make normalized log ratio measure tuple
        measure = (direct_confirmation_measure.log_ratio_measure, True)
    else:
        raise ValueError("The direct confirmation measure you entered is not currently supported.")

    # Backtracking dictionary for storing measure values of topic id tuples, e.g. (1, 2).
    backtrack = {}

    # For backtracking context vectors, we will create a list called w_backtrack to store
    # (w_prime, w) or (w_star, w) tuples and a corresponding list context_vector_backtrack
    # which will create a mapping of (w_prime or w_star, w) ---> context_vector.
    w_backtrack = []
    context_vector_backtrack = []

    s_cos_sim = []
    for top_words, s_i in zip(topics, segmented_topics):
        for w_prime, w_star in s_i:
            # Step 1. Check if (w_prime, top_words) tuple is in w_backtrack.
            # Step 2. If yes, return the corresponding context vector.
            w_prime_index = _present(w_prime, top_words, w_backtrack)
            if w_backtrack and w_prime_index != -1:
                w_prime_context_vectors = context_vector_backtrack[w_prime_index]
            else:
                w_prime_context_vectors, backtrack_i = _make_seg(
                    w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
                backtrack.update(backtrack_i)
                # Update backtracking lists
                w_backtrack.append((w_prime, top_words))
                context_vector_backtrack.append(w_prime_context_vectors)

            # Step 1. Check if (w_star, top_words) tuple is in w_backtrack.
            # Step 2. If yes, check if the corresponding w is the same.
            w_star_index = _present(w_star, top_words, w_backtrack)
            if w_backtrack and w_star_index != -1:
                w_star_context_vectors = context_vector_backtrack[w_star_index]
            else:
                w_star_context_vectors, backtrack_i = _make_seg(
                    w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
                backtrack.update(backtrack_i)
                # Update all backtracking lists
                w_backtrack.append((w_star, top_words))
                context_vector_backtrack.append(w_star_context_vectors)

            s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items())
            s_cos_sim.append(s_cos_sim_i)

    return s_cos_sim