Пример #1
0
def jensen_shannon_div(P, Q):
    """
    Return the Jensen-Shannon divergence of two equal-length probability distributions.

    The result is the mean of the Kullback-Leibler divergences of ``P`` and of
    ``Q`` from their midpoint distribution ``0.5 * (P + Q)``.

    -----
        :param P: Probability distribution that sums to 1
        :param Q: Probability distribution that sums to 1
        :return: float
    """
    # Midpoint ("mixture") distribution both inputs are compared against.
    M = 0.5 * (P + Q)
    divergence_p = kullback_leibler(P, M)
    divergence_q = kullback_leibler(Q, M)
    return 0.5 * (divergence_p + divergence_q)
Пример #2
0
def _rank_by_score(scores):
    """Pair every score with its document id, ordered from highest to lowest score."""
    # Positions in ``scores`` are mapped to ids through the key order of the
    # module-level ``new_docs`` mapping.
    doc_ids = list(new_docs.keys())
    ranked = sorted(enumerate(scores), key=lambda item: -item[1])
    return [(doc_ids[i], np.float64(s)) for i, s in ranked]


def rank_documents(model, model_name, type, query):
    """
    Rank every document of the corpus against ``query``.

    Relies on the module-level names ``dictionary``, ``corpus``,
    ``corpus_tfidf`` and ``new_docs``.

    :param model: trained model that is indexed with bag-of-words vectors
    :param model_name: ``"LSI"`` selects cosine-similarity ranking; any other
        value selects the Kullback-Leibler branch
    :param type: ``"bow"`` or ``"tfidf"``; only consulted when ``model_name``
        is ``"LSI"`` (the name shadows the builtin but is kept for callers)
    :param query: raw query text
    :return: list of ``(doc_id, np.float64 score)`` tuples, best match first
    :raises ValueError: if ``model_name`` is ``"LSI"`` and ``type`` is neither
        ``"bow"`` nor ``"tfidf"``
    """
    processed_query = read_ap.process_text(query)
    print(processed_query)

    # Bag-of-words representation of the query, shared by all branches.
    vec_bow = dictionary.doc2bow(processed_query)

    if model_name == "LSI":
        if type == "bow":
            # Index the bag-of-words corpus projected through the model.
            index = gensim.similarities.MatrixSimilarity(model[corpus])
        elif type == "tfidf":
            # Index the tf-idf corpus projected through the model.
            index = gensim.similarities.MatrixSimilarity(model[corpus_tfidf])
        else:
            # Previously this fell through and silently returned None.
            raise ValueError(
                "type must be 'bow' or 'tfidf' for LSI, got {!r}".format(type))

        # NOTE(review): the query is projected from its raw bag-of-words
        # vector in both cases; if the model was trained on tf-idf weights the
        # query probably needs the same tf-idf transform first. No tf-idf
        # model is visible from here, so this is left unchanged -- confirm.
        vec_lsi = model[vec_bow]
        return _rank_by_score(index[vec_lsi])

    # Kullback-Leibler branch.
    # NOTE(review): ``model[...][0]`` assumes the model returns a tuple whose
    # first element is the topic distribution -- confirm against the model
    # configuration.
    vec_lda_query = model[vec_bow][0]

    # KL divergence is a distance (smaller means more similar), so the score
    # is its negative: sorting scores from high to low then puts the closest
    # documents first instead of the most divergent ones.
    neg_kl = [
        -kullback_leibler(vec_lda_query, model[text][0]) for text in corpus
    ]
    return _rank_by_score(neg_kl)
Пример #3
0
def get_most_similar_documents(query, corpus, dictionary, k=10):
    """Return the positions of the ``k`` entries of ``corpus`` with the smallest
    Kullback-Leibler divergence from ``query``.

    ``len(dictionary)`` is passed to ``kullback_leibler`` as ``num_features``.
    """
    distances = [
        kullback_leibler(query, doc, num_features=len(dictionary))
        for doc in corpus
    ]
    # Ascending argsort: the first k positions are the closest documents.
    return np.argsort(np.array(distances))[:k]
Пример #4
0
    def test_distributions(self):
        """Check ``matutils.kullback_leibler`` on bag-of-words, dense, sparse and LDA inputs."""
        # Bag-of-words vectors as inputs.
        bow_wide = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        bow_narrow = [(1, 0.1), (3, 0.8), (4, 0.1)]
        self.assertAlmostEqual(
            0.55451775, matutils.kullback_leibler(bow_narrow, bow_wide, 8))

        # KL is not symmetric: with the arguments swapped the computation
        # contains log of zeros and the result is infinity.
        self.assertTrue(
            math.isinf(matutils.kullback_leibler(bow_wide, bow_narrow, 8)))

        # An ndarray compared with a csr_matrix.
        dense_pairs = numpy.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        sparse_pairs = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        self.assertAlmostEqual(
            0.0894502, matutils.kullback_leibler(dense_pairs, sparse_pairs, 3))

        # An ndarray compared with a plain list.
        dense_dist = numpy.array([0.6, 0.1, 0.1, 0.2])
        list_dist = [0.2, 0.2, 0.1, 0.5]
        self.assertAlmostEqual(
            0.40659450877, matutils.kullback_leibler(dense_dist, list_dist))

        # LDA distribution vectors; the seed is fixed before training.
        numpy.random.seed(0)
        lda = self.class_(
            self.corpus, id2word=dictionary, num_topics=2, passes=100)
        topics_a = lda[[(1, 2), (2, 3)]]
        topics_b = lda[[(2, 2), (1, 3)]]
        self.assertAlmostEqual(
            4.283407e-12, matutils.kullback_leibler(topics_a, topics_b))
    def test_inputs(self):
        """Empty inputs of each supported container type give a divergence of 0.0."""
        # Two empty lists.
        self.assertEqual(0.0, matutils.kullback_leibler([], []))

        # Empty np array against an empty list.
        empty_array = np.array([])
        self.assertEqual(0.0, matutils.kullback_leibler(empty_array, []))

        # Empty scipy csr matrix against an empty list.
        empty_sparse = csr_matrix([])
        self.assertEqual(0.0, matutils.kullback_leibler(empty_sparse, []))
Пример #6
0
    def test_inputs(self):
        """Empty inputs of each supported container type give a divergence of 0.0."""
        # Two empty lists.
        self.assertEqual(0.0, matutils.kullback_leibler([], []))

        # Empty numpy array against an empty list.
        empty_array = numpy.array([])
        self.assertEqual(0.0, matutils.kullback_leibler(empty_array, []))

        # Empty scipy csr matrix against an empty list.
        empty_sparse = csr_matrix([])
        self.assertEqual(0.0, matutils.kullback_leibler(empty_sparse, []))
Пример #7
0
    def test_distributions(self):
        """Check ``matutils.kullback_leibler`` on bag-of-words, dense, sparse and LDA inputs."""
        # Bag-of-words vectors as inputs.
        bow_wide = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
        bow_narrow = [(1, 0.1), (3, 0.8), (4, 0.1)]
        self.assertAlmostEqual(
            0.55451775, matutils.kullback_leibler(bow_narrow, bow_wide, 8))

        # KL is not symmetric: with the arguments swapped the computation
        # contains log of zeros and the result is infinity.
        self.assertTrue(
            math.isinf(matutils.kullback_leibler(bow_wide, bow_narrow, 8)))

        # An ndarray compared with a csr_matrix.
        dense_pairs = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
        sparse_pairs = csr_matrix([[1, 0.4], [0, 0.2], [2, 0.2]])
        self.assertAlmostEqual(
            0.0894502, matutils.kullback_leibler(dense_pairs, sparse_pairs, 3))

        # An ndarray compared with a plain list.
        dense_dist = np.array([0.6, 0.1, 0.1, 0.2])
        list_dist = [0.2, 0.2, 0.1, 0.5]
        self.assertAlmostEqual(
            0.40659450877, matutils.kullback_leibler(dense_dist, list_dist))

        # LDA distribution vectors; the seed is fixed before training.
        np.random.seed(0)
        lda = self.class_(
            self.corpus, id2word=dictionary, num_topics=2, passes=100)
        topics_a = lda[[(1, 2), (2, 3)]]
        topics_b = lda[[(2, 2), (1, 3)]]
        self.assertAlmostEqual(
            4.283407e-12, matutils.kullback_leibler(topics_a, topics_b))
Пример #8
0
    def search(self, query):
        """Rank the documents of ``self.corpus`` against ``query``.

        The query is processed with ``read_ap.process_text``, converted to a
        bag-of-words vector, projected through ``self.model`` and expanded to
        a dense vector of length ``self.num_topics``. Each entry of
        ``self.lda_corpus_pers`` is then compared with it via
        ``kullback_leibler``.

        :param query: raw query text
        :return: list of ``(doc_id, score)`` tuples sorted from highest to
            lowest score, where the score is the negative KL divergence
        """
        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)
        lda_query = sparse2full(self.model[vec_query], self.num_topics)

        # KL divergence is a distance (smaller means more similar). Negating
        # it makes "higher score = better match", so the descending sort below
        # returns the closest documents first rather than the most divergent.
        scores = {
            doc_id: -kullback_leibler(lda_query, lda_doc_repr)
            for doc_id, lda_doc_repr in zip(self.corpus.doc_ids,
                                            self.lda_corpus_pers)
        }
        return sorted(scores.items(), key=lambda item: item[1], reverse=True)
Пример #9
0
def ranking_LDA(query, model, model_docs, num_topics=10):
    """Score documents against ``query`` by negative Kullback-Leibler divergence.

    Relies on the module-level names ``dictionary``, ``corpus`` and ``i2str``.

    :param query: raw query text
    :param model: model indexed with the query's bag-of-words vector
    :param model_docs: per-document vectors, indexed by position
    :param num_topics: length of the dense query vector
    :return: list of ``(i2str[i], negative KL)`` tuples, highest score first
    """
    # Bring the query into the dense form that kullback_leibler is given.
    tokens = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(tokens)
    query_dense = gensim.matutils.sparse2full(model[query_bow], num_topics)

    # One negative-KL score per position in the corpus.
    scores = []
    for idx in range(len(corpus)):
        divergence = kullback_leibler(query_dense, model_docs[idx])
        scores.append((i2str[idx], float(-1 * divergence)))

    # Order by the score, i.e. the second element of each tuple.
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores
Пример #10
0
def get_sims(model, query, corpus_full, dictionary, n_topics):
    """Get the ranking for a single query.

    :param model: model indexed with the query's bag-of-words vector
    :param query: raw query text
    :param corpus_full: iterable of per-document vectors; it is NOT modified
    :param dictionary: object providing ``doc2bow``
    :param n_topics: length of the dense query vector
    :return: list of ``(position, negative KL)`` tuples, highest score first
    """
    # Small constant added to every entry to avoid division by 0.
    eps = 1e-8

    # Process the query into a dense, smoothed vector.
    query_processed = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(query_processed)
    q_lda = sparse2full(model[query_bow], n_topics) + eps

    # ``doc + eps`` builds a smoothed copy. The previous in-place ``doc += eps``
    # modified the caller's corpus, so eps accumulated on every call.
    sims = [-1 * kullback_leibler(q_lda, doc + eps) for doc in corpus_full]

    return sorted(enumerate(sims), key=lambda item: -1 * item[1])
Пример #11
0
def rank_docs(query, model, doc_ids, dictionary, corpus_modelspace, tfidf_model=None, index=None):
    """Rank documents against ``query`` with an LSI or LDA model.

    :param query: raw query text
    :param model: ``LsiModel`` or ``LdaModel`` instance
    :param doc_ids: document ids, aligned with ``index`` / ``corpus_modelspace``
    :param dictionary: object providing ``doc2bow``
    :param corpus_modelspace: per-document vectors (used by the LDA branch)
    :param tfidf_model: optional model applied to the query before ``model``
    :param index: similarity index queried by the LSI branch
    :return: for LSI, a list of ``(doc_id, score)`` tuples sorted best first;
        for LDA, a dict mapping doc_id to negative KL divergence, inserted
        best first (the differing return types are kept for existing callers)
    :raises TypeError: if ``model`` is neither an ``LsiModel`` nor an ``LdaModel``
    """
    query_prepro = read_ap.process_text(query)

    # transform query to bow vector space
    q_cspace = dictionary.doc2bow(query_prepro)

    if tfidf_model is not None:
        # transform query to tfidf vector space
        q_cspace = tfidf_model[q_cspace]

    q_modelspace = model[q_cspace]

    if isinstance(model, LsiModel):
        scores = index[q_modelspace]
        # Going through a dict keeps one entry per doc id (the last score wins).
        results = dict(zip(doc_ids, scores))
        return sorted(results.items(), key=lambda item: -item[1])

    if isinstance(model, LdaModel):
        doc_ids = list(doc_ids)
        # Documents are scored one at a time; the original noted that
        # kullback_leibler has problems otherwise.
        scores = [
            float(-kullback_leibler(q_modelspace, d)) for d in corpus_modelspace
        ]
        # Built-in stable sort on the full-precision scores; no tensor
        # round-trip (which also downcast the scores to float32) is needed.
        order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        return dict((doc_ids[i], scores[i]) for i in order)

    # Previously this path failed with UnboundLocalError on ``results``.
    raise TypeError(
        "model must be an LsiModel or LdaModel, got {}".format(
            model.__class__.__name__))
Пример #12
0
    for word in file:
        filecontent = filecontent + word + ' '
        documents.append(filecontent)
    stoplist = set(stopwords.words('english'))
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    basetext = []
    for list in texts:
        for item in list:
            basetext.append(item)
    bow_1 = lda.id2word.doc2bow(basetext)
    lda_1 = lda[bow_1]
    print("******************", filename)
    print("hellinger", hellinger(lda_1, lda_2))
    print("kullback_leibler", kullback_leibler(lda_1, lda_2))
    print("jaccard", jaccard(lda_1, lda_2))
    file.close()
    #dictionary = corpora.Dictionary(texts)
    #corpus = [dictionary.doc2bow(text) for text in texts]
    #lda1 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, update_every=1, chunksize=10000, passes=5)
#print(lda1)
#print(texts)
"""
basetext=[]
for list in texts:
    for item in list:
        basetext.append(item)
#print(len(basetext))
#print(basetext)
Пример #13
0
# Project each bag-of-words document through ``model``.
# NOTE(review): the ``lda_`` prefix suggests ``model`` is an LDA model -- confirm.
lda_bow_water = model[bow_water]
lda_bow_finance = model[bow_finance]
lda_bow_bank = model[bow_bank]

# The same three documents projected through ``tfidf``.
tfidf_bow_water = tfidf[bow_water]
tfidf_bow_finance = tfidf[bow_finance]
tfidf_bow_bank = tfidf[bow_bank]

from gensim.matutils import kullback_leibler, jaccard, hellinger

# Pairwise Hellinger values between the three projected documents.
# These are bare expressions: the values are only visible in an interactive session.
hellinger(lda_bow_water, lda_bow_finance)
hellinger(lda_bow_finance, lda_bow_bank)
hellinger(lda_bow_bank, lda_bow_water)

# Hellinger with the arguments of the first call swapped, followed by
# Kullback-Leibler evaluated in both argument orders.
hellinger(lda_bow_finance, lda_bow_water)
kullback_leibler(lda_bow_water, lda_bow_bank)
kullback_leibler(lda_bow_bank, lda_bow_water)


# Jaccard on bag-of-words vectors, on ``doc_*`` objects (presumably the
# tokenised documents -- confirm) and on two identical one-word lists.
jaccard(bow_water, bow_bank)
jaccard(doc_water, doc_bank)
jaccard(['word'], ['word'])

def make_topics_bow(topic):
    # takes the string returned by model.show_topics()
    # split on strings to get topics and the probabilities
    topic = topic.split('+')
    # list to store topic bows
    topic_bow = []
    for word in topic:
        # split probability and word
Пример #14
0
# Report the coherence value and start a fresh evaluation.txt with it
# ('w' mode truncates any existing file).
print('Coherence: {}\n'.format(c_mean))
with open('data/' + model_no + '/evaluation.txt', 'w') as f:
    f.write('Coherence: {}\n'.format(c_mean))

#%%

# Compute and plot the Kullback-Leibler distance between topics

# For each topic, the (posterior) probability that a sentence containing a word
# from the dictionary is assigned to that topic
t = model.state.get_lambda()

# Distances between the probability distributions of every ordered pair of
# distinct topics (the further apart, the better)
ds = []
for i in range(model.num_topics):
    for j in range(model.num_topics):
        if i != j:
            kl = kullback_leibler(t[i], t[j])
            # print('{:02}-{:02}: {}'.format(i, j, kl))
            ds.append(kl)

# Save the histogram
plt.title('KL-divergence')
plt.hist(ds)
plt.savefig('data/' + model_no + '/kl-divergence.png')

# Save the mean value as text (appended after the coherence line)
d_mean = mean([d.astype(float) for d in ds])
print('KL-divergence: {}\n'.format(d_mean))
with open('data/' + model_no + '/evaluation.txt', 'a') as f:
    f.write('KL-divergence: {}\n'.format(d_mean))
Пример #15
0
###############################################################################
# Makes sense, right? In the first example, Document 1 and Document 2 are hardly similar, so we get a value of roughly 0.5.
#
# In the second case, the documents are a lot more similar semantically. Under the trained model, they give a much smaller distance value.
#

###############################################################################
# Kullback–Leibler
# ----------------
#
# Let's run similar examples with Kullback–Leibler.
#
from gensim.matutils import kullback_leibler

print(kullback_leibler(lda_bow_water, lda_bow_bank))
print(kullback_leibler(lda_bow_finance, lda_bow_bank))

###############################################################################
# .. important::
#   KL is not a distance metric in the mathematical sense, and hence is not
#   symmetrical.  This means that ``kullback_leibler(lda_bow_finance,
#   lda_bow_bank)`` is not equal to  ``kullback_leibler(lda_bow_bank,
#   lda_bow_finance)``.
#
# As you can see, the values are not equal. We'll get more into the details of
# this later on in the notebook.
#
print(kullback_leibler(lda_bow_bank, lda_bow_finance))

###############################################################################
def simlarity_kullback_leibler(lda_vec1, lda_vec2):
    """Return ``kullback_leibler(lda_vec1, lda_vec2)``.

    Thin wrapper: both vectors are passed through unchanged and in the same
    order, so swapping the arguments can give a different value.
    """
    divergence = kullback_leibler(lda_vec1, lda_vec2)
    return divergence
Пример #17
0
 def kldivergence_distance(self, x, y):
     """Return the KL-divergence between two lists.

     Every entry equal to 0 in ``x`` or ``y`` is replaced by a small constant
     (1e-6) before the lists are handed to ``kullback_leibler``.

     :param x: first list of values
     :param y: second list of values
     :return: the value returned by ``kullback_leibler``
     """
     # The docstring must be the first statement of the function; it used to
     # follow this assignment and was therefore a no-op string expression.
     replace_zero = 0.000001
     return kullback_leibler([replace_zero if e == 0 else e for e in x],
                             [replace_zero if e == 0 else e for e in y])