def soft_score_summarizer(summarizer, percent):
    """Return the soft-cosine similarity between a summarizer's full text
    and its condensation at the given percentage.

    The score uses a word-embedding-backed term similarity matrix, so
    near-synonyms in the condensed text still count toward similarity
    with the original document.
    """
    full_text = summarizer.fullText
    condensed_text = summarizer.condense(percent)
    # Tokenize each document once and build a shared vocabulary over both.
    full_tokens = simple_preprocess(full_text)
    condensed_tokens = simple_preprocess(condensed_text)
    dictionary = corpora.Dictionary([full_tokens, condensed_tokens])
    # Term-level similarities come from the word-embedding model's vectors.
    model = create_model()
    termsim_index = WordEmbeddingSimilarityIndex(model.wv)
    similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)
    original_bow = dictionary.doc2bow(full_tokens)
    condensed_bow = dictionary.doc2bow(condensed_tokens)
    # The index holds only the original document; query it with the condensation.
    docsim_index = SoftCosineSimilarity([original_bow], similarity_matrix)
    return docsim_index[condensed_bow][0]
def softcos(defns, return_centers=False):
    """Cluster definitions by pairwise soft-cosine similarity.

    Builds a dense affinity matrix over all definitions using a
    word-embedding term similarity matrix, then hands it to the graph
    clustering routine.  Degenerate inputs (a single definition, or an
    empty vocabulary after preprocessing) fall back to the
    unclusterable default.
    """
    keys = list(defns.keys())
    # One definition can never form a meaningful cluster structure.
    if len(defns) == 1:
        return unclusterable_default(keys, return_centers=return_centers)
    dictionary, bow_corpus = mk_dictionary_bow_corpus(defns.values())
    # An empty dictionary means no usable tokens survived preprocessing.
    if not len(dictionary):
        return unclusterable_default(keys, return_centers=return_centers)
    # Soft-cosine index over the whole corpus, backed by English vectors.
    embedding_index = WordEmbeddingSimilarityIndex(vecs.get_en())
    term_matrix = SparseTermSimilarityMatrix(embedding_index, dictionary)
    softcos_index = SoftCosineSimilarity(bow_corpus, term_matrix)
    # Fill the square affinity matrix one row (one query document) at a time.
    n = len(defns)
    affinities = np.zeros((n, n))
    for doc_idx, row_similarities in enumerate(softcos_index):
        affinities[doc_idx] = row_similarities
    return graph_clust_grouped(affinities, keys, return_centers)
def test_most_similar(self):
    """Test most_similar returns expected results."""
    # Known terms yield neighbours; out-of-dictionary terms yield none.
    index = WordEmbeddingSimilarityIndex(self.vectors)
    self.assertGreater(len(list(index.most_similar(u"holiday", topn=10))), 0)
    self.assertEqual(
        len(list(index.most_similar(u"out-of-dictionary term", topn=10))), 0)

    # topn caps the number of neighbours returned.
    index = WordEmbeddingSimilarityIndex(self.vectors)
    for cap, floor in ((10, 0), (20, 10)):
        neighbours = list(index.most_similar(u"holiday", topn=cap))
        self.assertGreater(len(neighbours), floor)
        self.assertLessEqual(len(neighbours), cap)

    # The query term itself must not appear among its own neighbours.
    index = WordEmbeddingSimilarityIndex(self.vectors)
    all_neighbours = index.most_similar(u"holiday", topn=len(self.vectors.vocab))
    self.assertNotIn(u"holiday", [term for term, _ in all_neighbours])

    # threshold=0.0 keeps results; threshold=1.0 filters everything out.
    index = WordEmbeddingSimilarityIndex(self.vectors, threshold=0.0)
    neighbours = list(index.most_similar(u"holiday", topn=10))
    self.assertGreater(len(neighbours), 0)
    self.assertLessEqual(len(neighbours), 10)
    index = WordEmbeddingSimilarityIndex(self.vectors, threshold=1.0)
    self.assertEqual(len(list(index.most_similar(u"holiday", topn=10))), 0)

    # Doubling the exponent squares the reported similarities.
    sims = {}
    for exponent in (1.0, 2.0):
        index = WordEmbeddingSimilarityIndex(self.vectors, exponent=exponent)
        sims[exponent] = np.array([
            similarity
            for _, similarity in index.most_similar(u"holiday", topn=10)
        ])
    self.assertTrue(np.allclose(sims[1.0] ** 2.0, sims[2.0]))
def test_most_similar(self):
    """Test most_similar returns expected results."""
    # An in-vocabulary query returns neighbours, an unknown one returns none.
    index = WordEmbeddingSimilarityIndex(self.vectors)
    known = list(index.most_similar(u"holiday", topn=10))
    unknown = list(index.most_similar(u"out-of-dictionary term", topn=10))
    self.assertGreater(len(known), 0)
    self.assertEqual(len(unknown), 0)

    # topn bounds the result count from above.
    index = WordEmbeddingSimilarityIndex(self.vectors)
    top_ten = list(index.most_similar(u"holiday", topn=10))
    self.assertGreater(len(top_ten), 0)
    self.assertLessEqual(len(top_ten), 10)
    top_twenty = list(index.most_similar(u"holiday", topn=20))
    self.assertGreater(len(top_twenty), 10)
    self.assertLessEqual(len(top_twenty), 20)

    # The query term is excluded from its own neighbour list.
    index = WordEmbeddingSimilarityIndex(self.vectors)
    vocab_size = len(self.vectors.vocab)
    returned_terms = [t for t, _ in index.most_similar(u"holiday", topn=vocab_size)]
    self.assertNotIn(u"holiday", returned_terms)

    # A zero threshold keeps results; a threshold of one rejects them all.
    index = WordEmbeddingSimilarityIndex(self.vectors, threshold=0.0)
    kept = list(index.most_similar(u"holiday", topn=10))
    self.assertGreater(len(kept), 0)
    self.assertLessEqual(len(kept), 10)
    index = WordEmbeddingSimilarityIndex(self.vectors, threshold=1.0)
    rejected = list(index.most_similar(u"holiday", topn=10))
    self.assertEqual(len(rejected), 0)

    # Raising the exponent from 1.0 to 2.0 squares each similarity score.
    index = WordEmbeddingSimilarityIndex(self.vectors, exponent=1.0)
    base_scores = np.array(
        [score for _, score in index.most_similar(u"holiday", topn=10)])
    index = WordEmbeddingSimilarityIndex(self.vectors, exponent=2.0)
    squared_scores = np.array(
        [score for _, score in index.most_similar(u"holiday", topn=10)])
    self.assertTrue(np.allclose(base_scores ** 2.0, squared_scores))