Example #1
 def test_sentence_vector(self):
     s = self.topics.get(1)[1].sens[1]  # s is a Sentence object
     # s text: 'He loves playing so he liked to run around with the other dogs playing fetch.'
     id_of_playing = WordMap.id_of('playing')
     self.assertEqual(s.vector.getcol(id_of_playing).sum(), 1)
     for word in s.tokens:
         id_of_word = WordMap.id_of(word)
         self.assertGreater(s.vector.getcol(id_of_word).sum(), 0)
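
The tests above assume a WordMap class that assigns every known word a stable integer id and returns None for unseen words. A minimal sketch of that contract (the attribute names word_set and word_to_id come from the tests elsewhere on this page; the real implementation may differ):

class WordMapSketch:
    word_set = set()   # vocabulary collected during document loading
    word_to_id = {}    # word -> integer column index

    @classmethod
    def create_mapping(cls):
        # Assign each unique word a stable integer id.
        cls.word_to_id = {w: i for i, w in enumerate(sorted(cls.word_set))}

    @classmethod
    def id_of(cls, word):
        # Return the word's id, or None for words outside the vocabulary.
        return cls.word_to_id.get(word)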
Example #2
    def create_term_doc_freq(self, topics):
        """
        create term freq on each doc over each topic
        :param topics:
        :return: set tdf to topic_list of [doc_list of (wordid, freq)]
                e.g.,[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (6, 5),  (9, 2), (10, 1), (11, 1)...],[...]]

        """
        for cluster in topics.values():
            for document in cluster:
                term_doc_freq_dict = {}
                for sentence in document.sens:
                    for word in sentence.tokenized():
                        word_id = WordMap.id_of(word)
                        if word_id is None:
                            warnings.warn('Word \'' + word + '\' not in WordMap', Warning)
                            continue
                        if word_id not in term_doc_freq_dict:
                            term_doc_freq_dict[word_id] = 0
                        term_doc_freq_dict[word_id] += 1
                term_doc_freq_list = []
                for word_id in sorted(term_doc_freq_dict):
                    term_doc_freq_list.append((word_id, term_doc_freq_dict[word_id]))

                document.set_tdf(term_doc_freq_list)
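
The per-document counting above is an ordinary term-frequency count; a minimal standalone sketch of the same (word_id, freq) output using collections.Counter (the ids are hypothetical, and None lookups are assumed already filtered out):

from collections import Counter

def term_freq_list(word_ids):
    # word_ids: iterable of integer ids for one document's tokens
    return sorted(Counter(word_ids).items())  # [(word_id, freq), ...] by id

# term_freq_list([0, 1, 1, 6, 6, 6, 6, 6]) -> [(0, 1), (1, 2), (6, 5)]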
Example #3
 def create_freq_vectors(self, topics):
     """
     Creates a frequency vector for each sentence in each document in each topic;
     stores each sentence's vector on its Sentence object and each per-document
     matrix on its Document object.
     :param topics: Dictionary {topic -> list of Documents}
     :return: None
     pre: WordMap.create_mapping has been called (should happen in run_summarization document loading)
     """
     for cluster in topics.values():
         for document in cluster:
             doc_vectors = dok_matrix((0, self.num_unique_words))
             for sentence in document.sens:
                 sentence_vector = dok_matrix((1, self.num_unique_words))
                 for word in sentence.tokenized():  # maybe check that sentence.tokenized() is the right thing here
                     word_id = WordMap.id_of(word)
                     if word_id is None:
                         warnings.warn('Word \'' + word + '\' not in WordMap', Warning)
                         warnings.warn('Sentence:' + sentence.raw_sentence, Warning)
                     else:
                         sentence_vector[0, word_id] += 1
                 # assign vector to sentence object
                 sentence.set_vector(sentence_vector)
                 # add sentence vector to document matrix
                 doc_vectors = vstack([doc_vectors, sentence_vector])
             # assign matrix to document
             document.set_vectors(doc_vectors)
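
One note on the loop above: rebuilding doc_vectors with vstack on every iteration copies the matrix each time. A behavior-preserving variant (a sketch, not the project's code) collects the row vectors and stacks them once:

from scipy.sparse import dok_matrix, vstack

def build_doc_matrix(sentence_vectors, num_unique_words):
    # sentence_vectors: list of 1 x V sparse rows built per sentence
    if not sentence_vectors:
        return dok_matrix((0, num_unique_words))
    return vstack(sentence_vectors)  # one stacking pass instead of N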
Example #4
    def test_term_topics(self):
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

        puppy_topics = lda_model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
        war_topics = lda_model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
        puppy_dist = [prob[1] for prob in puppy_topics]
        war_dist = [prob[1] for prob in war_topics]

        puppy_war = puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]
        war_puppy = puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]

        self.assertTrue(puppy_war or war_puppy)
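
The final assertion only checks that 'puppy' and 'war' peak on different LDA topics, whichever order the topics come out in (the indexing assumes args.lda_topics is 2). The same check in isolation, with hypothetical probability lists shaped like the get_term_topics output:

def peaks_on_different_topics(dist_a, dist_b):
    # dist_a, dist_b: per-topic probabilities over the same two topics
    a_first = dist_a[0] > dist_b[0] and dist_a[1] < dist_b[1]
    b_first = dist_a[0] < dist_b[0] and dist_a[1] > dist_b[1]
    return a_first or b_first

# peaks_on_different_topics([0.9, 0.1], [0.2, 0.8]) -> True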
Example #5
    def get_idf_array(self):
        """
        Use external corpus to get IDF scores
        for cluster centroid calculations
        :return: numpy array of idf values
        """
        corpus = brown
        if self.args.corpus == 'R':
            corpus = reuters
        num_words = Vectors().num_unique_words
        n = len(corpus.fileids())  # number of documents in corpus
        docs_word_matrix = np.zeros([n, num_words])
        for doc_idx, doc_id in enumerate(corpus.fileids()):
            sentences = list(corpus.sents(doc_id))
            words_in_doc = set()
            for s in sentences:
                s = ' '.join(s)
                proc_s = Preprocessor.get_processed_tokens(Preprocessor.get_processed_sentence(s))
                if proc_s:
                    words_in_doc = words_in_doc.union(proc_s)
            for word in words_in_doc:
                word_idx = WordMap.id_of(word)
                if word_idx is not None:  # id 0 is a valid index, so test against None
                    docs_word_matrix[doc_idx, word_idx] = 1

        docs_per_word = np.sum(docs_word_matrix, axis=0)
        self.idf_array = np.log10(np.divide(n, docs_per_word + 1))  # add one to avoid divide by zero error

        return self.idf_array
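
The formula implemented above is idf(w) = log10(N / (df(w) + 1)), where df(w) is the number of corpus documents containing w. A small numeric check (N = 500 matches the Brown corpus used by default; the df values are hypothetical, chosen to reproduce two scores from the test in Example #7):

import numpy as np

N = 500
df = np.array([0, 77])            # documents containing each word
idf = np.log10(N / (df + 1))
# df = 0  -> log10(500/1)  ~= 2.69897  (word never seen in the corpus)
# df = 77 -> log10(500/78) ~= 0.80688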
Example #6
    def get_centroid_score(self, sentence, centroid):
        """
        Get the centroid score for this sentence
        :param sentence: Sentence whose tokens are scored
        :param centroid: per-word centroid values, indexed by word id
        :return: float
        """
        centroid_score = 0
        for word in sentence.tokens:
            word_id = WordMap.id_of(word)  # 'word_id' avoids shadowing the id() builtin
            centroid_score += centroid[word_id] if word_id is not None else 0

        # return centroid_score/(sentence.word_count() + 1)
        return centroid_score
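
In other words, the centroid score is the sum of the sentence's per-word centroid values, with out-of-vocabulary words contributing nothing. A hypothetical usage (the ids dict stands in for WordMap.id_of):

import numpy as np

centroid = np.array([0.0, 2.5, 1.2])         # indexed by word id
ids = {'cake': 1, 'eat': 2}                  # stand-in for WordMap.id_of
tokens = ['cake', 'eat', 'zzz']              # 'zzz' is out of vocabulary
score = sum(centroid[ids[t]] for t in tokens if t in ids)  # 2.5 + 1.2 = 3.7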
Example #7
    def test_get_idf_array(self):
        words = [
            "i", "eat", "cake", "is", "delicious", "puppies", "are", "cute",
            "cats", "furry", "bank", "company", "sugar", "dollar", "however",
            "say"
        ]
        # Must override WordMap dictionary for test
        WordMap.word_to_id = {
            'delicious': 0,
            'eat': 1,
            'furry': 2,
            'puppies': 3,
            'i': 4,
            'cats': 5,
            'are': 6,
            'is': 7,
            'cute': 8,
            'cake': 9,
            'bank': 10,
            'company': 11,
            'sugar': 12,
            'dollar': 13,
            'however': 14,
            'say': 15
        }

        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        scores = []
        for word in words:
            curr_score = idf[WordMap.id_of(word)]
            scores.append("{:.5f}".format(curr_score))

        expected_scores = [
            '2.69897', '0.80688', '1.49485', '2.69897', '2.69897', '2.69897',
            '2.69897', '1.92082', '2.69897', '2.69897', '1.04576', '0.65365',
            '1.44370', '0.98297', '0.24718', '0.10018'
        ]

        self.assertListEqual(scores, expected_scores)
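
Given the idf formula from Example #5, each expected score can be inverted back to the document frequency it implies, assuming N = 500 (the Brown corpus size):

def implied_df(idf_score, n=500):
    # invert idf = log10(n / (df + 1)) to recover df
    return round(n / 10 ** idf_score) - 1

# implied_df(2.69897) -> 0    ('delicious' never appears in the corpus)
# implied_df(0.10018) -> 396  ('say' appears in 396 of the 500 documents)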
Example #8
    def create_term_sen_freq(self, sen):
        """
        Create term frequencies for a tokenized sentence.
        :param sen: list of tokens
        :return: list of (word_id, freq) tuples sorted by word_id
        """
        term_doc_freq_dict = {}
        for tok in sen:
            word_id = WordMap.id_of(tok)
            if word_id is None:
                warnings.warn('Word \'' + tok + '\' not in WordMap', Warning)
                continue
            if word_id not in term_doc_freq_dict:
                term_doc_freq_dict[word_id] = 0
            term_doc_freq_dict[word_id] += 1

        term_doc_freq_list = []
        for word_id in sorted(term_doc_freq_dict):
            term_doc_freq_list.append((word_id, term_doc_freq_dict[word_id]))

        return term_doc_freq_list
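
A hypothetical call, assuming the Vectors instance from the earlier examples and a WordMap that already maps these tokens:

tdf = Vectors().create_term_sen_freq(['dogs', 'love', 'playing', 'playing'])
# e.g. [(3, 1), (7, 2), (12, 1)] -- the exact ids depend on WordMap's mapping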