예제 #1
0
    def _create_tfidf_matrix(self, document_set, dictionary):
        """
        Build a TF-IDF matrix of shape |unique words| x |sentences|.

        For summarization every sentence is treated as its own document.
        Rows are addressed through ``dictionary`` (normalized word -> row
        index) and columns follow the order of ``document_set.sentences``.
        A cell holds the value ``compute_tf`` reports for the word in that
        sentence multiplied by the value ``compute_idf`` reports for the
        word. Cells for words that are absent from a sentence, and words
        that are not keys of ``dictionary``, are left at zero.
        """
        sentences = document_set.sentences
        column_count = len(sentences)
        normalized_sentences = [
            self._normalize_words(sentence.words) for sentence in sentences
        ]
        tf_per_sentence = compute_tf(normalized_sentences)
        idf_by_word = compute_idf(normalized_sentences)

        # one row per dictionary entry, one column per sentence, zero-filled
        tfidf = numpy.zeros((len(dictionary), column_count))
        for col, sentence in enumerate(sentences):
            # NOTE(review): the words are normalized a second time here;
            # reusing ``normalized_sentences`` is only safe if
            # ``_normalize_words`` returns a re-iterable sequence -- confirm.
            for term in self._normalize_words(sentence.words):
                if term not in dictionary:
                    continue
                row = dictionary[term]
                tfidf[row, col] = tf_per_sentence[col][term] * idf_by_word[term]
        return tfidf
예제 #2
0
    def test_compute_tf_idf(self):
        """
        Check ``compute_tf`` and ``compute_idf`` on three tiny documents.

        Every word occurs once in its document, so the expected term
        frequency is 1 / (document length). "test" is the only word shared
        by two documents; every other word appears in exactly one.
        """
        first_doc = ("this", "is", "a", "example")
        second_doc = ("just", "for", "test")
        third_doc = ("test", "tf", "and", "idf")
        documents = (first_doc, second_doc, third_doc)

        tf_metrics = compute_tf(documents)
        idf_metrics = compute_idf(documents)

        expected_tf = [
            dict.fromkeys(first_doc, 1 / 4),
            dict.fromkeys(second_doc, 1 / 3),
            dict.fromkeys(third_doc, 1 / 4),
        ]

        # words found in a single document all share the same idf value
        single_doc_words = (
            "this", "is", "a", "example", "just", "for", "tf", "and", "idf",
        )
        expected_idf = dict.fromkeys(single_doc_words, math.log(3 / 2))
        # "test" occurs in two of the three documents
        expected_idf["test"] = math.log(3 / 3)

        self.assertEqual(tf_metrics, expected_tf)
        self.assertEqual(idf_metrics, expected_idf)