Example #1
from gensim import corpora
from gensim.similarities import (
    SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex)
from gensim.utils import simple_preprocess


def soft_score_summarizer(summarizer, percent):
    # Compare the summarizer's full text against its condensed version.
    documents = [summarizer.fullText, summarizer.condense(percent)]

    dictionary = corpora.Dictionary(
        [simple_preprocess(doc) for doc in documents])

    # create_model() is project-specific; it returns a trained word-embedding
    # model whose .wv attribute holds the KeyedVectors used below.
    model = create_model()
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)

    original_doc = dictionary.doc2bow(simple_preprocess(documents[0]))
    condensed_doc = dictionary.doc2bow(simple_preprocess(documents[1]))

    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)
    docsim_index = SoftCosineSimilarity([original_doc], similarity_matrix)

    # Soft cosine similarity of the condensed text to the original.
    return docsim_index[condensed_doc][0]
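
For reference, a minimal standalone sketch of the same soft-cosine pipeline: the toy documents and the tiny Word2Vec model below stand in for the summarizer and its create_model() helper (both are illustrative assumptions, not part of the original project), and gensim 4.x parameter names are assumed.

from gensim import corpora
from gensim.models import Word2Vec
from gensim.similarities import (
    SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex)
from gensim.utils import simple_preprocess

# Toy data: an "original" text and its "condensed" version (illustrative only).
documents = [
    "the quick brown fox jumps over the lazy dog in the sunny park",
    "a quick fox jumps over a dog",
]
tokenized = [simple_preprocess(doc) for doc in documents]

# Tiny Word2Vec model trained on the two documents themselves (gensim 4.x API).
model = Word2Vec(sentences=tokenized, vector_size=50, min_count=1, epochs=50)

dictionary = corpora.Dictionary(tokenized)
bow = [dictionary.doc2bow(tokens) for tokens in tokenized]
original_doc, condensed_doc = bow

termsim_index = WordEmbeddingSimilarityIndex(model.wv)
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

# Index the original document and query it with the condensed one.
docsim_index = SoftCosineSimilarity([original_doc], similarity_matrix)
print(docsim_index[condensed_doc][0])  # soft cosine similarity of the two texts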
Example #2
def softcos(defns, return_centers=False):
    # Build pairwise soft-cosine affinities between definitions, then
    # graph-cluster them; the helper functions here are project-specific.
    keys = list(defns.keys())
    if len(defns) == 1:
        return unclusterable_default(keys, return_centers=return_centers)
    dictionary, bow_corpus = mk_dictionary_bow_corpus(defns.values())
    if len(dictionary) == 0:
        return unclusterable_default(keys, return_centers=return_centers)

    # vecs.get_en() supplies English word embeddings (a KeyedVectors object).
    similarity_index = WordEmbeddingSimilarityIndex(vecs.get_en())
    similarity_matrix = SparseTermSimilarityMatrix(similarity_index,
                                                   dictionary)
    index = SoftCosineSimilarity(bow_corpus, similarity_matrix)
    affinities = np.zeros((len(defns), len(defns)))

    # Iterating over the index yields one row of similarities per document.
    for row, similarities in enumerate(index):
        affinities[row] = similarities

    return graph_clust_grouped(affinities, keys, return_centers)
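
The loop above relies on the fact that iterating over a SoftCosineSimilarity index queries each indexed document in turn, yielding one row of similarities against the whole corpus. Below is a minimal sketch of that pairwise-affinity construction with toy definitions and toy embeddings (both are illustrative assumptions; gensim 4.x API).

import numpy as np
from gensim import corpora
from gensim.models import Word2Vec
from gensim.similarities import (
    SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex)
from gensim.utils import simple_preprocess

defns = {
    "bank_1": "sloping land beside a river or lake",
    "bank_2": "a financial institution that accepts deposits",
    "bank_3": "an organization that lends and safeguards money",
}
tokenized = [simple_preprocess(text) for text in defns.values()]

# Toy embeddings trained on the definitions themselves (illustrative only).
model = Word2Vec(sentences=tokenized, vector_size=50, min_count=1, epochs=50)

dictionary = corpora.Dictionary(tokenized)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

similarity_index = WordEmbeddingSimilarityIndex(model.wv)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)
index = SoftCosineSimilarity(bow_corpus, similarity_matrix)

# One row per indexed document: its soft cosine similarity to every document.
affinities = np.zeros((len(defns), len(defns)))
for row, similarities in enumerate(index):
    affinities[row] = similarities
print(affinities)  # square matrix; diagonal entries are self-similarities (~1.0)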
Example #3
    def test_most_similar(self):
        """Test most_similar returns expected results."""

        # check the handling of out-of-dictionary terms
        index = WordEmbeddingSimilarityIndex(self.vectors)
        self.assertLess(0, len(list(index.most_similar(u"holiday", topn=10))))
        self.assertEqual(
            0, len(list(index.most_similar(u"out-of-dictionary term",
                                           topn=10))))

        # check that the topn works as expected
        index = WordEmbeddingSimilarityIndex(self.vectors)
        results = list(index.most_similar(u"holiday", topn=10))
        self.assertLess(0, len(results))
        self.assertGreaterEqual(10, len(results))
        results = list(index.most_similar(u"holiday", topn=20))
        self.assertLess(10, len(results))
        self.assertGreaterEqual(20, len(results))

        # check that the term itself is not returned
        index = WordEmbeddingSimilarityIndex(self.vectors)
        terms = [
            term for term, similarity in index.most_similar(
                u"holiday", topn=len(self.vectors.vocab))
        ]
        self.assertFalse(u"holiday" in terms)

        # check that the threshold works as expected
        index = WordEmbeddingSimilarityIndex(self.vectors, threshold=0.0)
        results = list(index.most_similar(u"holiday", topn=10))
        self.assertLess(0, len(results))
        self.assertGreaterEqual(10, len(results))

        index = WordEmbeddingSimilarityIndex(self.vectors, threshold=1.0)
        results = list(index.most_similar(u"holiday", topn=10))
        self.assertEqual(0, len(results))

        # check that the exponent works as expected
        index = WordEmbeddingSimilarityIndex(self.vectors, exponent=1.0)
        first_similarities = np.array([
            similarity
            for term, similarity in index.most_similar(u"holiday", topn=10)
        ])
        index = WordEmbeddingSimilarityIndex(self.vectors, exponent=2.0)
        second_similarities = np.array([
            similarity
            for term, similarity in index.most_similar(u"holiday", topn=10)
        ])
        self.assertTrue(
            np.allclose(first_similarities**2.0, second_similarities))
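
Outside a test suite, the same topn, threshold, and exponent behaviour can be inspected directly on a WordEmbeddingSimilarityIndex built from any KeyedVectors instance. A minimal sketch with toy vectors follows; the training sentences are an illustrative assumption (gensim 4.x API), standing in for the pre-loaded self.vectors used in the test above.

from gensim.models import Word2Vec
from gensim.similarities import WordEmbeddingSimilarityIndex

sentences = [
    ["holiday", "vacation", "travel", "beach", "sun"],
    ["holiday", "trip", "travel", "flight", "hotel"],
    ["work", "office", "meeting", "deadline", "email"],
]
# Toy KeyedVectors; the tests above use a pre-loaded word-embedding model instead.
vectors = Word2Vec(sentences=sentences, vector_size=20, min_count=1, epochs=200).wv

# topn bounds the number of neighbours, threshold drops low similarities,
# and each returned similarity is raised to the given exponent.
index = WordEmbeddingSimilarityIndex(vectors, threshold=0.0, exponent=2.0)
for term, similarity in index.most_similar("holiday", topn=5):
    print(term, float(similarity))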