def jensen_shannon_div(P, Q):
    """
    Jensen-Shannon divergence between two equal-length probability
    distributions.

    The result is the average of the ``kullback_leibler`` values of ``P``
    and of ``Q`` measured against their midpoint ``0.5 * (P + Q)``.
    -----
    :param P: Probability distribution that sums to 1
    :param Q: Probability distribution that sums to 1
    :return: float
    """
    # assumes P and Q support element-wise ``+`` (e.g. numpy arrays) -- TODO confirm
    midpoint = 0.5 * (P + Q)
    kl_p_to_mid = kullback_leibler(P, midpoint)
    kl_q_to_mid = kullback_leibler(Q, midpoint)
    return 0.5 * (kl_p_to_mid + kl_q_to_mid)
def rank_documents(model, model_name, type, query):
    """Score every document in the collection against ``query``.

    The function relies on the module-level ``dictionary``, ``corpus``,
    ``corpus_tfidf`` and ``new_docs`` objects.

    :param model: trained model; indexing it with a bag-of-words vector
        projects that vector into the model space.
    :param model_name: ``"LSI"`` selects similarity ranking through a
        ``MatrixSimilarity`` index; any other value selects the LDA branch.
    :param type: for LSI only, ``"bow"`` or ``"tfidf"``; picks the corpus the
        similarity index is built from. (The name shadows the builtin but is
        kept for caller compatibility.)
    :param query: raw query string.
    :return: list of ``(doc_id, np.float64(score))`` tuples, best score first.
        LSI scores are the values returned by the similarity index; LDA
        scores are *negative* Kullback-Leibler divergences, so that a higher
        score always means a closer document. ``None`` is returned for an
        unrecognised LSI ``type``.
    """
    processed_query = read_ap.process_text(query)
    print(processed_query)

    # bag-of-words representation of the query, shared by all branches
    vec_bow = dictionary.doc2bow(processed_query)
    doc_ids = list(new_docs.keys())

    if model_name == "LSI":
        if type == "bow":
            lsi_corpus = model[corpus]
        elif type == "tfidf":
            # NOTE(review): the index is built from the TF-IDF corpus, but the
            # query below is projected from raw bag-of-words counts; the
            # original comment said "via tf-idf" -- confirm whether a TF-IDF
            # transform of the query is missing here.
            lsi_corpus = model[corpus_tfidf]
        else:
            # Unknown weighting scheme: the original code fell off the end of
            # the function here, which callers observe as None.
            return None

        index = gensim.similarities.MatrixSimilarity(lsi_corpus)
        sims = index[model[vec_bow]]
        ranked = sorted(enumerate(sims), key=lambda item: -item[1])
        return [(doc_ids[i], np.float64(s)) for i, s in ranked]

    # LDA: rank by negative KL divergence between query and document.
    # presumably ``[0]`` selects the document-topic distribution of a model
    # built with per_word_topics=True -- TODO confirm against the model setup.
    vec_lda_query = model[vec_bow][0]

    # The divergence is negated so that sorting in descending order puts the
    # least divergent (most similar) documents first; previously the raw
    # divergence was sorted descending, ranking the worst matches on top.
    neg_kl = [
        -kullback_leibler(vec_lda_query, model[text][0])
        for text in corpus
    ]
    ranked = sorted(enumerate(neg_kl), key=lambda item: -item[1])
    return [(doc_ids[i], np.float64(s)) for i, s in ranked]
def get_most_similar_documents(query, corpus, dictionary, k=10):
    """Return the positions of the ``k`` corpus entries closest to ``query``.

    Each entry of ``corpus`` is compared with ``query`` through
    ``kullback_leibler`` (called with ``num_features=len(dictionary)``); the
    positions of the ``k`` smallest values are returned, smallest first.

    :param query: query vector handed to ``kullback_leibler``.
    :param corpus: iterable of document vectors.
    :param dictionary: sized object; its length is used as ``num_features``.
    :param k: number of positions to return.
    :return: numpy array of at most ``k`` indices into ``corpus``.
    """
    divergences = [
        kullback_leibler(query, doc, num_features=len(dictionary))
        for doc in corpus
    ]
    ascending = np.array(divergences).argsort()
    return ascending[:k]
def test_distributions(self):
    # Exercises matutils.kullback_leibler with several input container types.

    # bag-of-words inputs (shared by the next two checks)
    bow_wide = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
    bow_narrow = [(1, 0.1), (3, 0.8), (4, 0.1)]
    self.assertAlmostEqual(
        0.55451775, matutils.kullback_leibler(bow_narrow, bow_wide, 8))

    # KL is not symmetric; with the arguments swapped the computation
    # contains log of zeros and the result is infinite
    swapped = matutils.kullback_leibler(bow_wide, bow_narrow, 8)
    self.assertTrue(math.isinf(swapped))

    # ndarray against csr_matrix
    dense = numpy.array([[1, 0.3], [0, 0.4], [2, 0.3]])
    sparse = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
    self.assertAlmostEqual(
        0.0894502, matutils.kullback_leibler(dense, sparse, 3))

    # ndarray against plain list
    as_array = numpy.array([0.6, 0.1, 0.1, 0.2])
    as_list = [0.2, 0.2, 0.1, 0.5]
    self.assertAlmostEqual(
        0.40659450877, matutils.kullback_leibler(as_array, as_list))

    # LDA distribution vectors (seeded so that training is repeatable)
    numpy.random.seed(0)
    model = self.class_(
        self.corpus, id2word=dictionary, num_topics=2, passes=100)
    topics_a = model[[(1, 2), (2, 3)]]
    topics_b = model[[(2, 2), (1, 3)]]
    self.assertAlmostEqual(
        4.283407e-12, matutils.kullback_leibler(topics_a, topics_b))
def test_inputs(self):
    # matutils.kullback_leibler must yield 0.0 for empty inputs, whichever
    # container type the first argument uses.
    empty_pairs = (
        ([], []),                # two plain lists
        (np.array([]), []),      # np array and list
        (csr_matrix([]), []),    # scipy csr matrix and list
    )
    for vec_1, vec_2 in empty_pairs:
        self.assertEqual(0.0, matutils.kullback_leibler(vec_1, vec_2))
def test_inputs(self):
    # matutils.kullback_leibler must yield 0.0 for empty inputs, whichever
    # container type the first argument uses.
    empty_pairs = (
        ([], []),                   # two plain lists
        (numpy.array([]), []),      # numpy array and list
        (csr_matrix([]), []),       # scipy csr matrix and list
    )
    for vec_1, vec_2 in empty_pairs:
        self.assertEqual(0.0, matutils.kullback_leibler(vec_1, vec_2))
def test_distributions(self):
    # Exercises matutils.kullback_leibler with several input container types.

    # bag-of-words inputs (shared by the next two checks)
    bow_wide = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
    bow_narrow = [(1, 0.1), (3, 0.8), (4, 0.1)]
    self.assertAlmostEqual(
        0.55451775, matutils.kullback_leibler(bow_narrow, bow_wide, 8))

    # KL is not symmetric; with the arguments swapped the computation
    # contains log of zeros and the result is infinite
    swapped = matutils.kullback_leibler(bow_wide, bow_narrow, 8)
    self.assertTrue(math.isinf(swapped))

    # ndarray against csr_matrix
    dense = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
    sparse = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
    self.assertAlmostEqual(
        0.0894502, matutils.kullback_leibler(dense, sparse, 3))

    # ndarray against plain list
    as_array = np.array([0.6, 0.1, 0.1, 0.2])
    as_list = [0.2, 0.2, 0.1, 0.5]
    self.assertAlmostEqual(
        0.40659450877, matutils.kullback_leibler(as_array, as_list))

    # LDA distribution vectors (seeded so that training is repeatable)
    np.random.seed(0)
    model = self.class_(
        self.corpus, id2word=dictionary, num_topics=2, passes=100)
    topics_a = model[[(1, 2), (2, 3)]]
    topics_b = model[[(2, 2), (1, 3)]]
    self.assertAlmostEqual(
        4.283407e-12, matutils.kullback_leibler(topics_a, topics_b))
def search(self, query):
    """Rank all documents against ``query`` by negative KL divergence.

    The query is tokenised with ``read_ap.process_text``, converted to a
    bag-of-words vector with ``self.corpus.dictionary``, projected through
    ``self.model`` and densified to ``self.num_topics`` entries. Each entry
    of ``self.lda_corpus_pers`` is then paired with the matching id in
    ``self.corpus.doc_ids`` and scored.

    :param query: raw query string.
    :return: list of ``(doc_id, score)`` tuples sorted by descending score,
        where ``score`` is the *negated* ``kullback_leibler`` value, so the
        least divergent documents come first.
    """
    query_repr = read_ap.process_text(query)
    vec_query = self.corpus.dictionary.doc2bow(query_repr)
    lda_query = sparse2full(self.model[vec_query], self.num_topics)

    # KL is a divergence (smaller = closer). It is negated so that the
    # descending sort below ranks the closest documents first; sorting the
    # raw divergence in descending order put the worst matches on top.
    results = {
        doc_id: -kullback_leibler(lda_query, lda_doc_repr)
        for doc_id, lda_doc_repr in zip(self.corpus.doc_ids,
                                        self.lda_corpus_pers)
    }
    return sorted(results.items(), key=lambda item: item[1], reverse=True)
def ranking_LDA(query, model, model_docs, num_topics=10):
    """Rank documents for ``query`` by negative KL divergence.

    Relies on the module-level ``dictionary``, ``corpus`` and ``i2str``.

    :param query: raw query string.
    :param model: model used to project the query's bag-of-words vector.
    :param model_docs: indexable collection of document vectors; position
        ``i`` is compared with the query for every ``i`` below
        ``len(corpus)``.
    :param num_topics: length of the dense query vector.
    :return: list of ``(i2str[i], score)`` tuples, highest score first,
        where ``score`` is ``-1 * kullback_leibler(query, doc)`` as a float.
    """
    # Bring the query into the dense form used for the KL comparison
    tokens = read_ap.process_text(query)
    bow = dictionary.doc2bow(tokens)
    query_topics = gensim.matutils.sparse2full(model[bow], num_topics)

    # One (document name, negative divergence) pair per corpus position
    ranked = []
    for idx in range(len(corpus)):
        divergence = kullback_leibler(query_topics, model_docs[idx])
        ranked.append((i2str[idx], float(-1 * divergence)))

    # Highest score (smallest divergence) first
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
def get_sims(model, query, corpus_full, dictionary, n_topics):
    '''
    Get the ranking for a single query.

    :param model: model used to project the query's bag-of-words vector.
    :param query: raw query string.
    :param corpus_full: iterable of document vectors; they are NOT modified.
        (assumes each entry supports ``doc + float``, e.g. a numpy array --
        TODO confirm against the caller)
    :param dictionary: object providing ``doc2bow``.
    :param n_topics: length of the dense query vector.
    :return: list of ``(position, score)`` tuples sorted by descending
        score, where ``score`` is ``-1 * kullback_leibler(query, doc)``.
    '''
    # small constant added to every entry to avoid division by 0
    eps = 1e-8

    # process query
    query_processed = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(query_processed)
    q_lda = sparse2full(model[query_bow], n_topics) + eps

    # ``doc + eps`` builds a temporary instead of the former ``doc += eps``,
    # which altered the caller's corpus in place and therefore added another
    # eps to every document on every query.
    sims = [-1 * kullback_leibler(q_lda, doc + eps) for doc in corpus_full]

    return sorted(enumerate(sims), key=lambda item: -1 * item[1])
def rank_docs(query, model, doc_ids, dictionary, corpus_modelspace,
              tfidf_model=None, index=None):
    """Rank documents for ``query`` with an LSI or LDA model.

    :param query: raw query string.
    :param model: ``LsiModel`` or ``LdaModel`` instance.
    :param doc_ids: document ids, aligned with the rows of ``index`` (LSI)
        or with ``corpus_modelspace`` (LDA).
    :param dictionary: object providing ``doc2bow``.
    :param corpus_modelspace: documents already projected into the model
        space; only read in the LDA branch.
    :param tfidf_model: optional transform applied to the query's
        bag-of-words vector before it is projected through ``model``.
    :param index: similarity index queried in the LSI branch.
    :return: LSI -- list of ``(doc_id, score)`` tuples, highest score first;
        LDA -- dict mapping ``doc_id`` to negative KL divergence, inserted in
        descending score order. (The two return types differ; kept as-is for
        caller compatibility.)
    :raises TypeError: if ``model`` is neither an ``LsiModel`` nor an
        ``LdaModel``.
    """
    query_prepro = read_ap.process_text(query)

    # transform query to bow vector space
    q_cspace = dictionary.doc2bow(query_prepro)
    if tfidf_model is not None:
        # transform query to tfidf vector space
        q_cspace = tfidf_model[q_cspace]
    q_modelspace = model[q_cspace]

    if isinstance(model, LsiModel):
        scores = index[q_modelspace]
        # going through a dict keeps one score per doc id (the last one seen)
        per_doc = dict(zip(doc_ids, scores))
        return sorted(per_doc.items(), key=lambda pair: -pair[1])

    if isinstance(model, LdaModel):
        doc_ids = list(doc_ids)
        # documents are scored one at a time; the original author noted that
        # kullback_leibler has problems otherwise
        scores = [
            float(-kullback_leibler(q_modelspace, d))
            for d in corpus_modelspace
        ]
        # torch is used for the descending argsort, as in the original
        order = torch.Tensor(scores).argsort(descending=True).numpy()
        return dict((doc_ids[i], scores[i]) for i in order)

    # Previously this path died with UnboundLocalError on ``return results``.
    raise TypeError(
        "rank_docs supports LsiModel and LdaModel, got {}".format(
            model.__class__.__name__))
for word in file: filecontent = filecontent + word + ' ' documents.append(filecontent) stoplist = set(stopwords.words('english')) texts = [[ word for word in document.lower().split() if word not in stoplist ] for document in documents] basetext = [] for list in texts: for item in list: basetext.append(item) bow_1 = lda.id2word.doc2bow(basetext) lda_1 = lda[bow_1] print("******************", filename) print("hellinger", hellinger(lda_1, lda_2)) print("kullback_leibler", kullback_leibler(lda_1, lda_2)) print("jaccard", jaccard(lda_1, lda_2)) file.close() #dictionary = corpora.Dictionary(texts) #corpus = [dictionary.doc2bow(text) for text in texts] #lda1 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, update_every=1, chunksize=10000, passes=5) #print(lda1) #print(texts) """ basetext=[] for list in texts: for item in list: basetext.append(item) #print(len(basetext)) #print(basetext)
lda_bow_water = model[bow_water] lda_bow_finance = model[bow_finance] lda_bow_bank = model[bow_bank] tfidf_bow_water = tfidf[bow_water] tfidf_bow_finance = tfidf[bow_finance] tfidf_bow_bank = tfidf[bow_bank] from gensim.matutils import kullback_leibler, jaccard, hellinger hellinger(lda_bow_water, lda_bow_finance) hellinger(lda_bow_finance, lda_bow_bank) hellinger(lda_bow_bank, lda_bow_water) hellinger(lda_bow_finance, lda_bow_water) kullback_leibler(lda_bow_water, lda_bow_bank) kullback_leibler(lda_bow_bank, lda_bow_water) jaccard(bow_water, bow_bank) jaccard(doc_water, doc_bank) jaccard(['word'], ['word']) def make_topics_bow(topic): # takes the string returned by model.show_topics() # split on strings to get topics and the probabilities topic = topic.split('+') # list to store topic bows topic_bow = [] for word in topic: # split probability and word
# Report the coherence score on stdout and start a fresh evaluation file
evaluation_path = 'data/' + model_no + '/evaluation.txt'
coherence_line = 'Coherence: {}\n'.format(c_mean)
print(coherence_line)
with open(evaluation_path, 'w') as f:
    f.write(coherence_line)

#%%
# Compute and plot the Kullback-Leibler distance between topics

# Original note: for each topic, the (posterior) probability that a sentence
# containing a word of the dictionary is classified into that topic
t = model.state.get_lambda()

# Distance between the distributions of every ordered pair of distinct
# topics (the farther apart, the better)
ds = [
    kullback_leibler(t[i], t[j])
    for i in range(model.num_topics)
    for j in range(model.num_topics)
    if i != j
]

# Save the histogram
plt.title('KL-divergence')
plt.hist(ds)
plt.savefig('data/' + model_no + '/kl-divergence.png')

# Append the mean value to the evaluation text file
d_mean = mean([d.astype(float) for d in ds])
divergence_line = 'KL-divergence: {}\n'.format(d_mean)
print(divergence_line)
with open(evaluation_path, 'a') as f:
    f.write(divergence_line)
###############################################################################
# Makes sense, right? In the first example, Document 1 and Document 2 are hardly similar, so we get a value of roughly 0.5.
#
# In the second case, the documents are a lot more similar, semantically. Compared using the trained model, they yield a much smaller distance value.
#

###############################################################################
# Kullback–Leibler
# ----------------
#
# Let's run the same kind of examples with Kullback–Leibler divergence.
#
from gensim.matutils import kullback_leibler

print(kullback_leibler(lda_bow_water, lda_bow_bank))
print(kullback_leibler(lda_bow_finance, lda_bow_bank))

###############################################################################
# .. important::
#    KL is not a Distance Metric in the mathematical sense, and hence is not
#    symmetrical. This means that ``kullback_leibler(lda_bow_finance,
#    lda_bow_bank)`` is not equal to ``kullback_leibler(lda_bow_bank,
#    lda_bow_finance)``.
#
# As you can see, the values are not equal. We'll get more into the details of
# this later on in the notebook.
#
print(kullback_leibler(lda_bow_bank, lda_bow_finance))

###############################################################################
def simlarity_kullback_leibler(lda_vec1, lda_vec2):
    """Return ``kullback_leibler(lda_vec1, lda_vec2)``.

    Thin wrapper: both arguments are forwarded unchanged and in the same
    order, so the result is whatever ``kullback_leibler`` yields for them
    (presumably gensim's implementation -- verify against the file imports).
    """
    divergence = kullback_leibler(lda_vec1, lda_vec2)
    return divergence
def kldivergence_distance(self, x, y):
    """Return the KL-divergence between two lists.

    Every entry equal to 0 in ``x`` or ``y`` is replaced by a small constant
    (0.000001) before both lists are handed to ``kullback_leibler``.

    :param x: first iterable of numbers.
    :param y: second iterable of numbers.
    :return: the value returned by ``kullback_leibler`` for the two
        zero-replaced lists.
    """
    # The description above used to sit below this assignment, where Python
    # treats it as a discarded string expression rather than a docstring.
    replaceZero = 0.000001
    smoothed_x = [replaceZero if e == 0 else e for e in x]
    smoothed_y = [replaceZero if e == 0 else e for e in y]
    return kullback_leibler(smoothed_x, smoothed_y)