def bnExtractDocSimilarity(doc1, doc2, similarity):
    """Measure the semantic similarity between two documents with textacy.

    Dispatches on ``similarity`` to one of textacy's similarity functions,
    prints the score together with both document names, and returns it.

    Args:
        doc1: First document; passed straight through to textacy.
        doc2: Second document; passed straight through to textacy.
        similarity: Name of the method to use. ``'cosine'``, ``'Euclidian'``
            and ``'Manhattan'`` select ``textacy.similarity.word_movers``
            with the matching metric; ``'word2vec'`` selects
            ``textacy.similarity.word2vec``.

    Returns:
        The score rounded to 5 decimal places, or ``0`` when ``similarity``
        is not one of the supported names.
    """
    # Resolve the requested method first. ``metric`` is the word_movers
    # metric (None means "use word2vec"); ``label`` is the word used in the
    # printed message.
    if similarity == 'cosine':
        metric, label = 'cosine', 'Cosine'
    elif similarity == 'Euclidian':
        # The selector keeps its historical spelling for callers, but the
        # metric name handed to textacy must be spelled 'euclidean'.
        metric, label = 'euclidean', 'Euclidian'
    elif similarity == 'Manhattan':
        metric, label = 'manhattan', 'Manhattan'
    elif similarity == 'word2vec':
        metric, label = None, 'Semantic'
    else:
        # Unsupported similarity method
        return 0

    # Import under an alias: a plain ``from textacy import similarity`` would
    # rebind the ``similarity`` parameter to the module, so no method name
    # could ever match.
    from textacy import similarity as textacy_similarity

    if metric is None:
        s = textacy_similarity.word2vec(doc1, doc2)
    else:
        s = textacy_similarity.word_movers(doc1, doc2, metric=metric)

    print(" {} Similarity between docs {} and {} is: {}".format(
        label, bnGetDocName(doc1), bnGetDocName(doc2), s))

    return round(s, 5)
예제 #2
0
    def calculate(self):
        """
        Calculate similarity using Word2Vec.

        Every entry of ``self.new_docs`` is compared against every entry of
        ``self.all_docs``; each entry is unpacked as an ``(id, text)`` pair.
        A pair is kept only when its score, rounded to 2 decimals, lies
        strictly between ``self.threshold`` and 1.

        :returns: dict in the shape of {id: {(similar post id, similarity score)}}
                  (the values are sets of tuples)
        """
        similarity = {}
        pairs = product(self.new_docs, self.all_docs)
        for (ref_id, ref_text), (comp_id, comp_text) in pairs:
            score = round(float(word2vec(ref_text, comp_text)), 2)
            # A score of exactly 1 is excluded -- presumably to skip a
            # document matching itself; confirm against the caller.
            if self.threshold < score < 1:
                similarity.setdefault(ref_id, set()).add((comp_id, score))

        return similarity
예제 #3
0
def test_word2vec_identity(doc1, doc2):
    """Check that a document compared with itself scores approximately 1.0.

    The comparison allows a relative tolerance of 1e-3. ``doc2`` is accepted
    but not used by this check.
    """
    self_score = similarity.word2vec(doc1, doc1)
    expected = pytest.approx(1.0, rel=1e-3)
    assert self_score == expected
예제 #4
0
def test_word2vec(doc1, doc2):
    """Check that word2vec similarity stays within the closed range [0, 1].

    Two pairs are scored: the two inputs as given, and the ``[-2:]`` slice
    of each (presumably their last two tokens -- confirm against the
    fixtures).
    """
    tail1, tail2 = doc1[-2:], doc2[-2:]
    for left, right in ((doc1, doc2), (tail1, tail2)):
        score = similarity.word2vec(left, right)
        assert 0.0 <= score <= 1.0
예제 #5
0
 def test_identity(self, doc_pairs):
     """Check that each document in every pair scores ~1.0 against itself.

     The comparison allows a relative tolerance of 1e-3.
     """
     for doc1, doc2 in doc_pairs:
         # Same order as before: the first document, then the second.
         for doc in (doc1, doc2):
             assert similarity.word2vec(doc, doc) == pytest.approx(1.0, rel=1e-3)
예제 #6
0
 def test_default(self, doc_pairs):
     """Check that word2vec similarity of each pair lies within [0, 1]."""
     for doc1, doc2 in doc_pairs:
         score = similarity.word2vec(doc1, doc2)
         assert 0.0 <= score <= 1.0