예제 #1
0
 def test_new_corpus(self):
     """
     Check that the bag-of-words built for a new corpus is consistent.

     Two batches of documents are converted with ``common_dictionary.doc2bow``.
     The first document of each batch must come out as the same three
     ``(token_id, count)`` pairs, even though the second batch also carries
     the extra tokens 'hardware' and 'administrator' (named "unseen" by this
     test; presumably absent from ``common_dictionary`` -- confirm).
     """
     # Batch whose variable name in the original test marks it as free of unseen words.
     seen_only_texts = [
         ["computer", "time", "graph"],
         ["survey", "response", "eps"],
         ["human", "system", "computer"],
     ]
     seen_only_corpus = [common_dictionary.doc2bow(tokens) for tokens in seen_only_texts]
     self.assertEqual(seen_only_corpus[0], [(0, 1), (6, 1), (10, 1)])

     # Batch that additionally contains 'hardware' and 'administrator'.
     mixed_texts = [
         ["computer", "graph", "hardware", "time"],
         ["survey", "response", "eps", "administrator"],
     ]
     mixed_corpus = [common_dictionary.doc2bow(tokens) for tokens in mixed_texts]
     self.assertEqual(mixed_corpus[0], [(0, 1), (6, 1), (10, 1)])
예제 #2
0
    def testEmptyDocument(self):
        """Smoke-test model construction on a corpus extended with a one-token document.

        A document holding a single token that occurs nowhere else is appended
        to ``common_texts`` and ``filter_extremes(no_below=2)`` is applied to
        the dictionary before the corpus is built (presumably leaving that
        last document empty -- confirm against gensim's ``filter_extremes``).
        The test makes no assertion; it only requires that constructing the
        model does not raise.
        """
        lonely_document = ['only_occurs_once_in_corpus_and_alone_in_doc']
        local_texts = common_texts + [lonely_document]

        dictionary = Dictionary(local_texts)
        dictionary.filter_extremes(no_below=2)
        corpus = [dictionary.doc2bow(tokens) for tokens in local_texts]

        # Attribute the appended (last) document to a dedicated author.
        last_doc_index = len(local_texts) - 1
        a2d = author2doc.copy()
        a2d['joaquin'] = [last_doc_index]

        self.class_(corpus, author2doc=a2d, id2word=dictionary, num_topics=2)
예제 #3
0
 def setUp(self):
     """Prepare the class under test, a dictionary, a corpus and a term similarity matrix.

     The similarity matrix starts as a 12x12 identity and gets a symmetric
     0.5 entry linking the tokens "user" and "human"; it is stored in CSC
     format on ``self.similarity_matrix``.
     """
     self.cls = similarities.SoftCosineSimilarity
     self.dictionary = Dictionary(texts)
     # NOTE(review): the module-level `dictionary` (not `self.dictionary`) is
     # used from here on -- presumably both are built from `texts`; confirm.
     self.corpus = [dictionary.doc2bow(tokens) for tokens in texts]

     user_id = dictionary.token2id["user"]
     human_id = dictionary.token2id["human"]

     # NOTE(review): 12 is presumably the vocabulary size -- confirm.
     # LIL format is used while individual entries are assigned.
     term_similarities = scipy.sparse.identity(12, format="lil")
     term_similarities[user_id, human_id] = 0.5
     term_similarities[human_id, user_id] = 0.5
     self.similarity_matrix = term_similarities.tocsc()
예제 #4
0
    def testEmptyDocument(self):
        """Build the model on a corpus whose last document holds one unique token.

        The extra document is appended to ``common_texts``; the dictionary is
        then filtered with ``no_below=2`` before ``doc2bow`` is applied
        (presumably emptying that document -- confirm). No assertion is made:
        the test passes as long as model construction does not raise.
        """
        extra_document = ['only_occurs_once_in_corpus_and_alone_in_doc']
        local_texts = common_texts + [extra_document]

        vocabulary = Dictionary(local_texts)
        vocabulary.filter_extremes(no_below=2)
        bow_corpus = [vocabulary.doc2bow(tokens) for tokens in local_texts]

        # The new author owns only the appended document, i.e. the last index.
        authors = author2doc.copy()
        authors['joaquin'] = [len(local_texts) - 1]

        self.class_(bow_corpus, author2doc=authors, id2word=vocabulary, num_topics=2)
예제 #5
0
 def test_lda_update_2(self):
     """
     Update the model with unseen text that contains new words.

     The words 'hardware' and 'administrator' are added to the documents for
     this test. After ``self.lda.update`` runs, the model's
     ``id2word.token2id`` mapping must equal that of a deep copy taken
     before the update.
     """
     new_documents = [
         ["computer", "graph", "hardware", "time"],
         ["survey", "response", "eps", "administrator"],
     ]
     new_corpus = [common_dictionary.doc2bow(tokens) for tokens in new_documents]

     # Snapshot taken before the update, for the comparison below.
     snapshot = copy.deepcopy(self.lda)
     self.lda.update(new_corpus)

     self.assertEqual(self.lda.id2word.token2id, snapshot.id2word.token2id)
예제 #6
0
    def testNonIncreasing(self):
        """Check that similarities are non-increasing when `num_best` is not `None`.

        The index is queried with the bag-of-words of ``texts[0]`` and
        ``num_best=5``; the similarity column of the result must never
        increase from one entry to the next.
        """
        # NOTE: this could be implemented for other similarities as well (i.e. in _TestSimilarityABC).

        index = self.cls(corpus, self.similarity_matrix, num_best=5)
        query = dictionary.doc2bow(texts[0])
        sims = index[query]
        sims2 = numpy.asarray(sims)[:, 1]  # Just the similarities themselves.

        # Non-increasing means every adjacent difference is <= 0. The
        # comparison is deliberately not strict: two results with equal
        # similarity are still in a valid order.
        self.assertTrue(numpy.all(numpy.diff(sims2) <= 0))
예제 #7
0
    def testNonIncreasing(self):
        """Verify that results are ordered by non-increasing similarity when `num_best` is set."""
        # NOTE: a candidate for the shared base tests too (i.e. _TestSimilarityABC).

        ranked_index = self.cls(corpus, self.similarity_matrix, num_best=5)
        hits = ranked_index[dictionary.doc2bow(texts[0])]

        # Keep only column 1 of the result: the similarity values.
        scores = numpy.asarray(hits)[:, 1]

        # Every step between neighbours must be less than or equal to zero,
        # so the number of such steps has to match the number of neighbour pairs.
        steps = numpy.diff(scores)
        non_positive_steps = sum(steps <= 0)
        self.assertTrue(non_positive_steps == len(scores) - 1)
예제 #8
0
 def test_lda_update_1(self):
     """
     Update the model with unseen text that doesn't contain new words.

     After the in-place ``self.lda.update`` the model must no longer compare
     equal to a deep copy taken beforehand, while its ``id2word.token2id``
     mapping must still equal the copy's.
     """
     new_documents = [
         ["computer", "time", "graph"],
         ["survey", "response", "eps"],
         ["human", "system", "computer"],
     ]
     new_corpus = [common_dictionary.doc2bow(tokens) for tokens in new_documents]

     # Snapshot taken before the update; `update` modifies self.lda in place.
     snapshot = copy.deepcopy(self.lda)
     self.lda.update(new_corpus)

     self.assertNotEqual(self.lda, snapshot)
     self.assertEqual(self.lda.id2word.token2id, snapshot.id2word.token2id)
예제 #9
0
    def testChunking(self):
        """Query the index with a chunk of three documents, with and without `num_best`.

        Without `num_best` the result is indexed as ``sims[i, i]`` and each
        diagonal entry must be exactly 1.0. With ``num_best = 5`` the first
        entry of each per-document result must be ``(i, 1.0)`` to two decimal
        places.
        """
        # Override testChunking.

        index = self.cls(corpus, self.similarity_matrix)
        query = [dictionary.doc2bow(document) for document in texts[:3]]
        sims = index[query]

        for i in range(3):
            # numpy.all replaces numpy.alltrue, which was removed in NumPy 2.0.
            self.assertTrue(numpy.all(sims[i, i] == 1.0))  # Similarity of a document with itself is 1.0.

        # test the same thing but with num_best
        index.num_best = 5
        sims = index[query]
        for i, chunk in enumerate(sims):
            top_hit = chunk[0]
            # The best match for document i is expected to be document i itself ...
            self.assertAlmostEqual(i, top_hit[0], places=2)
            # ... with a similarity of 1.0.
            self.assertAlmostEqual(1.0, top_hit[1], places=2)
예제 #10
0
    def testChunking(self):
        """Query the index with a chunk of three documents, with and without `num_best`.

        Without `num_best` the result is indexed as ``sims[i, i]`` and each
        diagonal entry must be exactly 1.0. With ``num_best = 5`` the first
        entry of each per-document result must be ``(i, 1.0)`` to two decimal
        places.
        """
        # Override testChunking.

        index = self.cls(corpus, self.similarity_matrix)
        query = [dictionary.doc2bow(document) for document in texts[:3]]
        sims = index[query]

        for i in range(3):
            # numpy.all replaces numpy.alltrue, which was removed in NumPy 2.0.
            self.assertTrue(numpy.all(sims[i, i] == 1.0))  # Similarity of a document with itself is 1.0.

        # test the same thing but with num_best
        index.num_best = 5
        sims = index[query]
        for i, chunk in enumerate(sims):
            top_hit = chunk[0]
            # The best match for document i is expected to be document i itself ...
            self.assertAlmostEqual(i, top_hit[0], places=2)
            # ... with a similarity of 1.0.
            self.assertAlmostEqual(1.0, top_hit[1], places=2)
예제 #11
0
    def testFull(self, num_best=None):
        """Check similarity values for a single query and for whole corpora.

        :param num_best: forwarded to the index constructor. When it is not
            None the results are iterated as ``(id, similarity)`` pairs and
            only the range [0, 1] is checked; otherwise the results are
            indexed like arrays and the self-similarity is also checked.
        """
        # Override testFull.

        # Single query
        index = self.cls(corpus, self.similarity_matrix, num_best=num_best)
        query = dictionary.doc2bow(texts[0])
        sims = index[query]
        if num_best is not None:
            # Sparse array.
            for _, sim in sims:
                # numpy.all replaces numpy.alltrue, which was removed in NumPy 2.0.
                self.assertTrue(numpy.all(sim <= 1.0))
                self.assertTrue(numpy.all(sim >= 0.0))
        else:
            self.assertAlmostEqual(
                1.0, sims[0])  # Similarity of a document with itself is 1.0.
            self.assertTrue(numpy.all(sims[1:] >= 0.0))
            self.assertTrue(numpy.all(sims[1:] < 1.0))
            # Reference value for the sum of all similarities to texts[0].
            expected = 2.1889350195476758
            self.assertAlmostEqual(expected, numpy.sum(sims))

        # Corpora
        for query in (
                corpus,  # Basic text corpus.
                self.tfidf[corpus]
        ):  # Transformed corpus without slicing support.
            # Each corpus is both indexed and used as the query.
            index = self.cls(query, self.similarity_matrix, num_best=num_best)
            sims = index[query]
            if num_best is not None:
                # Sparse array.
                for result in sims:
                    for _, sim in result:
                        self.assertTrue(numpy.all(sim <= 1.0))
                        self.assertTrue(numpy.all(sim >= 0.0))
            else:
                for i, result in enumerate(sims):
                    self.assertAlmostEqual(
                        1.0, result[i]
                    )  # Similarity of a document with itself is 1.0.
                    # Every other entry must lie in [0, 1).
                    self.assertTrue(numpy.all(result[:i] >= 0.0))
                    self.assertTrue(numpy.all(result[:i] < 1.0))
                    self.assertTrue(numpy.all(result[i + 1:] >= 0.0))
                    self.assertTrue(numpy.all(result[i + 1:] < 1.0))
예제 #12
0
    def testFull(self, num_best=None):
        """Check similarity values for a single query and for whole corpora.

        :param num_best: forwarded to the index constructor. When it is not
            None the results are iterated as ``(id, similarity)`` pairs and
            only the range [0, 1] is checked; otherwise the results are
            indexed like arrays and the self-similarity is also checked.
        """
        # Override testFull.

        # Single query
        index = self.cls(corpus, self.similarity_matrix, num_best=num_best)
        query = dictionary.doc2bow(texts[0])
        sims = index[query]
        if num_best is not None:
            # Sparse array.
            for _, sim in sims:
                # numpy.all replaces numpy.alltrue, which was removed in NumPy 2.0.
                self.assertTrue(numpy.all(sim <= 1.0))
                self.assertTrue(numpy.all(sim >= 0.0))
        else:
            self.assertAlmostEqual(1.0, sims[0])  # Similarity of a document with itself is 1.0.
            self.assertTrue(numpy.all(sims[1:] >= 0.0))
            self.assertTrue(numpy.all(sims[1:] < 1.0))
            # Reference value for the sum of all similarities to texts[0].
            expected = 2.1889350195476758
            self.assertAlmostEqual(expected, numpy.sum(sims))

        # Corpora
        for query in (
                corpus,  # Basic text corpus.
                self.tfidf[corpus]):  # Transformed corpus without slicing support.
            # Each corpus is both indexed and used as the query.
            index = self.cls(query, self.similarity_matrix, num_best=num_best)
            sims = index[query]
            if num_best is not None:
                # Sparse array.
                for result in sims:
                    for _, sim in result:
                        self.assertTrue(numpy.all(sim <= 1.0))
                        self.assertTrue(numpy.all(sim >= 0.0))
            else:
                for i, result in enumerate(sims):
                    self.assertAlmostEqual(1.0, result[i])  # Similarity of a document with itself is 1.0.
                    # Every other entry must lie in [0, 1).
                    self.assertTrue(numpy.all(result[:i] >= 0.0))
                    self.assertTrue(numpy.all(result[:i] < 1.0))
                    self.assertTrue(numpy.all(result[i + 1:] >= 0.0))
                    self.assertTrue(numpy.all(result[i + 1:] < 1.0))