Пример #1
0
    def test_select_content(self):
        """
        Check LeadSentenceSelector.select_content against the two expected
        Sentence objects (order is not significant)
        :return:
        """
        first_doc_id = 'TST_ENG_20190101.0001'
        second_doc_id = 'TST_ENG_20190101.0002'
        first_text = 'In a park somewhere, a bunch of puppies played fetch with their owners today.'
        second_text = 'I took my small puppy to the dog park today.'

        lead_selector = LeadSentenceSelector()
        docs = [Document(first_doc_id), Document(second_doc_id)]
        expected = [
            Sentence(first_text, 1, first_doc_id),
            Sentence(second_text, 1, second_doc_id),
        ]

        lead_selector.select_content(docs, [])

        self.assertCountEqual(expected, lead_selector.selected_content)
Пример #2
0
    def test_generate_summary(self):
        """
        Check that MeadSummaryGenerator.generate_summary returns something
        other than None for each test topic
        :return:
        """
        topics = {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }
        WordMap.create_mapping()
        Vectors().create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        # Only the document lists are used; the topic ids are not needed here.
        for documents in topics.values():
            summarizer = MeadSummaryGenerator(documents, MeadContentSelector(),
                                              self.args)
            self.assertIsNotNone(summarizer.generate_summary(idf))
Пример #3
0
    def test_create_mapping(self):
        """
        Check that WordMap.create_mapping maps every word in self.word_set
        and that no two words share an id
        :return:
        """
        Preprocessor.load_models()

        # Reset the shared WordMap state before loading the test documents.
        WordMap.word_set = set()
        WordMap.word_to_id = {}

        Document("TST_ENG_20190101.0001")
        Document("TST_ENG_20190101.0002")

        WordMap.create_mapping()
        mapping = WordMap.get_mapping()

        # each word in word_set got added to the dictionary
        self.assertCountEqual(self.word_set, mapping.keys())
        # Each id value in the dict is unique. This must look at values():
        # items() pairs are always distinct because the keys are, so comparing
        # against set(mapping.items()) could never fail.
        self.assertEqual(len(mapping), len(set(mapping.values())))
Пример #4
0
 def test_parse_doc_id2(self):
     """
     Check the attributes Document exposes for an id that has no '_ENG' part
     :return:
     """
     parsed = Document("TST20190201.0001")
     expected_fields = (
         ('src', 'TST'),
         ('lang', '_ENG'),
         ('date', '20190201'),
         ('art_id', '0001'),
         ('docid', 'TST20190201.0001'),
     )
     for attribute, expected_value in expected_fields:
         self.assertEqual(getattr(parsed, attribute), expected_value)
Пример #5
0
    def test_realize_content(self):
        """
        Check the text produced by LeadSummaryGenerator.realize_content for
        four test documents: one expected sentence per line
        :return:
        """
        doc_ids = (
            'TST_ENG_20190101.0001',
            'TST_ENG_20190101.0002',
            'TST20190201.0001',
            'TST20190201.0002',
        )
        docs = [Document(doc_id) for doc_id in doc_ids]
        expected_lines = [
            "In a park somewhere, a bunch of puppies played fetch with their owners today.",
            "I took my small puppy to the dog park today.",
            "Puppies are cute because many of them are small.",
            "Puppies love to play with toys.",
        ]
        expected_content = "\n".join(expected_lines)

        lead_generator = LeadSummaryGenerator(docs, LeadSentenceSelector(), [])
        lead_generator.select_content()
        lead_generator.order_information()

        self.assertEqual(expected_content, lead_generator.realize_content())
Пример #6
0
    def test_order_information(self):
        """
        Check the order of the selected Sentences after
        LeadSummaryGenerator.order_information has run
        :return:
        """
        park_doc_id = 'TST_ENG_20190101.0001'
        cute_doc_id = 'TST20190201.0001'
        park_text = 'In a park somewhere, a bunch of puppies played fetch with their owners today.'
        cute_text = 'Puppies are cute because many of them are small.'
        expected_order = [
            Sentence(park_text, 1, park_doc_id),
            Sentence(cute_text, 1, cute_doc_id),
        ]

        docs = [Document(park_doc_id), Document(cute_doc_id)]
        lead_generator = LeadSummaryGenerator(docs, LeadSentenceSelector(), [])
        lead_generator.select_content()
        lead_generator.order_information()

        ordered = lead_generator.content_selector.selected_content
        self.assertListEqual(expected_order, ordered)
Пример #7
0
def load_documents(topic):
    """
    Build a Document for every 'doc' element found in the given topic
    :param topic: object providing find_all(); each element it returns is
                  indexed with 'id' to get the document id
    :return: list of Documents, in the order find_all returned the elements
    """
    return [Document(entry['id']) for entry in topic.find_all('doc')]
Пример #8
0
class VectorsTests(unittest.TestCase):
    """
    Tests for Vectors
    """

    # Shared fixtures; these statements run once, when the class body executes.
    Preprocessor.load_models()
    topic_one = [
        Document('TST_ENG_20190101.0001'),
        Document('TST_ENG_20190101.0002')
    ]  # list of Documents
    topics = {1: topic_one}
    WordMap.create_mapping()
    mapping = WordMap.get_mapping()

    def test_create_freq_vectors(self):
        """
        Check that each document matrix has the expected number of rows
        (one vector per sentence)
        :return:
        """
        Vectors().create_freq_vectors(self.topics)
        expected_rows = 3
        every_doc = (doc for docs in self.topics.values() for doc in docs)
        for doc in every_doc:
            row_count = doc.vectors.get_shape()[0]
            self.assertEqual(row_count, expected_rows)

    def test_sentence_vector(self):
        """
        Check the vector of one Sentence: the 'playing' column sums to 1 and
        every token of the sentence has a positive column sum
        :return:
        """
        # sentence text: 'He loves playing so he liked to run around with the other dogs playing fetch.'
        sentence = self.topic_one[1].sens[1]
        playing_id = WordMap.id_of('playing')
        self.assertEqual(sentence.vector.getcol(playing_id).sum(), 1)
        for token in sentence.tokens:
            token_column = sentence.vector.getcol(WordMap.id_of(token))
            self.assertGreater(token_column.sum(), 0)

    def test_get_topic_matrix(self):
        """
        Make sure all sentences from all topic docs make it into the matrix
        :return:
        """
        expected_num_sentences = 6
        matrix = Vectors().get_topic_matrix(self.topic_one)
        actual_num_sentences = matrix.get_shape()[0]
        self.assertEqual(expected_num_sentences, actual_num_sentences)
Пример #9
0
 def test_get_documents_for_topics(self):
     """
     Check the topics returned by load_documents_for_topics for the test
     topics file
     :return:
     """
     soup = make_soup('test_data/test_topics.xml')
     doc_ids_by_topic = (
         ('PUP1A', ('TST_ENG_20190101.0001', 'TST_ENG_20190101.0002',
                    'TST20190201.0001', 'TST20190201.0002')),
         ('WAR2A', ('TST_ENG_20190301.0001', 'TST_ENG_20190301.0002',
                    'TST20190401.0001', 'TST20190401.0002')),
     )
     expected_topics = {
         topic_id: [Document(doc_id) for doc_id in doc_ids]
         for topic_id, doc_ids in doc_ids_by_topic
     }
     loaded_topics = load_documents_for_topics(soup)
     # assertCountEqual iterates its arguments, and iterating a dict yields
     # its keys, so this compares the topic ids only (not the Documents).
     self.assertCountEqual(loaded_topics, expected_topics)
Пример #10
0
    def test_mead_summary_length(self):
        """
        Test length of summary is at most 100 words
        :return:
        """
        topics = {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }
        WordMap.create_mapping()
        Vectors().create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()
        max_length = 100

        # Only the document lists are used; the topic ids are not needed here.
        for documents in topics.values():
            generator = MeadSummaryGenerator(documents, MeadContentSelector(),
                                             self.args)
            generator.select_content(idf)
            generator.order_information()
            realized_content = generator.realize_content()
            # split() without an argument splits on any run of whitespace
            # (spaces and newlines) and never yields empty strings. The
            # earlier filter `if not " "` was always False, which emptied the
            # list and made the length check impossible to fail.
            content_length = len(realized_content.split())
            self.assertLessEqual(content_length, max_length)
Пример #11
0
 def test_document_headline2(self):
     """
     Check the headline of document TST_ENG_20190101.0002
     :return:
     """
     expected_headline = "Playing in the dog park"
     headline = Document("TST_ENG_20190101.0002").headline
     self.assertEqual(headline, expected_headline)
Пример #12
0
 def test_document_headline(self):
     """
     Check the headline of document TST_ENG_20190101.0001
     :return:
     """
     expected_headline = "Puppies play fetch in the park"
     headline = Document("TST_ENG_20190101.0001").headline
     self.assertEqual(headline, expected_headline)
Пример #13
0
class MeldaContentSelectorTests(unittest.TestCase):
    """
    Tests for MeldaContentSelector
    """

    # variables used in multiple tests
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    topics = {'PUPWAR': doc_list}

    w_set = {'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their', 'owner', 'today', 'they', 'all', 'run',
             'around', 'their', 'tail', 'wag', 'tongue', 'hang', 'out', 'have', 'load', 'fun', 'sun', 'love', 'our',
             'country', 'go', 'war', 'soldier', 'go', 'fight', 'travel', 'wherever', 'fight', 'enemy', 'try', 'kill',
             'before', 'get', 'kill', 'themselves', '-PRON-', 'playing'}

    idf = [4.032940937780854, 2.420157081061118, 1.3730247377110034,
           2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
           3.25478968739721, 2.7107216430469343, 3.7319109421168726,
           4.032940937780854, 3.3339709334448346, 4.032940937780854,
           1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
           2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
           1.5660733174267443, 2.024340766018936, 1.2476111027700865,
           4.032940937780854, 0.9959130580250786, 3.7319109421168726,
           2.5415792439465807, 1.7107216430469343, 4.032940937780854,
           3.4308809464528913, 4.032940937780854, 3.4308809464528913,
           3.5558196830611912, 3.5558196830611912, 4.032940937780854,
           1.734087861371147, 3.0786984283415286, 0.9055121599292547,
           3.5558196830611912, 3.5558196830611912, 1.9876179589941962]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def _build_lda_model(self, with_freq_vectors=False):
        """
        Run the setup shared by every test: load w_set into WordMap, build
        the vectors for self.topics and build an LDA model for self.doc_list
        :param with_freq_vectors: also call Vectors.create_freq_vectors
                                  before create_term_doc_freq
        :return: tuple of (MeldaContentSelector, lda model)
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        if with_freq_vectors:
            Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
        return selector, lda_model

    def test_document_topics(self):
        """
        Check that the model returns one probability per LDA topic for a
        token list and that the probabilities sum to about 1
        :return:
        """
        _, lda_model = self._build_lda_model()
        testtok = ['puppy', 'soldier', 'war', 'fetch']
        testsen = Vectors().create_term_sen_freq(testtok)
        document_topics = lda_model.get_document_topics(testsen, minimum_probability=0)
        topic_dist = [prob[1] for prob in document_topics]

        self.assertEqual(len(topic_dist), self.args.lda_topics)
        # assertAlmostEquals is a deprecated alias that newer Python versions
        # no longer provide; assertAlmostEqual is the supported name.
        self.assertAlmostEqual(sum(topic_dist), 1, 2)

    def test_term_topics(self):
        """
        Check that 'puppy' and 'war' lean towards different topics: each one
        has the higher probability in exactly one of the two topics
        :return:
        """
        _, lda_model = self._build_lda_model()

        puppy_topics = lda_model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
        war_topics = lda_model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
        puppy_dist = [prob[1] for prob in puppy_topics]
        war_dist = [prob[1] for prob in war_topics]

        puppy_war = puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]
        war_puppy = puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]

        self.assertTrue(puppy_war or war_puppy)

    def test_get_lda_scores(self):
        """
        Check that calculate_lda_scores gives a Sentence one score per LDA
        topic and that the scores sum to about 1
        :return:
        """
        selector, lda_model = self._build_lda_model()

        sentence = self.doc_list[0].sens[0]
        selector.calculate_lda_scores([sentence], lda_model)
        lda_scores = sentence.lda_scores

        self.assertEqual(len(lda_scores), self.args.lda_topics)
        self.assertAlmostEqual(sum(lda_scores), 1, 2)

    def test_get_melda_scores(self):
        """
        Check that calculate_melda_scores gives a Sentence one score per LDA
        topic
        :return:
        """
        selector, lda_model = self._build_lda_model(with_freq_vectors=True)

        sentence = self.doc_list[0].sens[0]
        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        selector.calculate_lda_scores(sentences, lda_model)
        selector.calculate_melda_scores(sentences)
        melda_scores = sentence.melda_scores

        self.assertEqual(len(melda_scores), self.args.lda_topics)

    def test_get_top_n(self):
        """
        Check that select_top_n called with n=1 leaves one selected item per
        LDA topic
        :return:
        """
        selector, lda_model = self._build_lda_model(with_freq_vectors=True)

        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        sentences = selector.calculate_lda_scores(sentences, lda_model)
        sentences = selector.calculate_melda_scores(sentences)
        selector.select_top_n(sentences, self.args.lda_topics, 1)

        self.assertEqual(len(selector.selected_content), self.args.lda_topics)
Пример #14
0
class MeadSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeadSummaryGenerator
    """

    # variables used in multiple tests
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}
    w_set = {
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog',
        'fun', 'toy', 'tongue', 'take', 'ran', 'in', 'sun', 'love',
        'somewhere', 'many', 'together', 'around', 'puppy', 'today', 'load',
        'fight', 'small', "n't", '-PRON-', 'wag', 'hang', 'loads', 'bunch',
        'get', 'playing', 'they', 'like', 'tail', 'run', 'there'
    }

    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    WordMap.reset()

    @staticmethod
    def _make_topics():
        """
        Build the two-topic fixture used by the summary tests
        :return: dict of topic id to a list of newly created Documents
        """
        return {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }

    def test_order_information(self):
        """
        Test ordering Sentences by MEAD score
        :return:
        """
        doc_id_1 = 'TST_ENG_20190101.0001'
        sentence_1 = 'Puppies love playing fetch.'
        sentence_2 = 'They all ran around with their tails wagging ' \
                     'and their tongues hanging out having loads of fun in the sun.'
        sentence_3 = "He loves playing so he liked to run around with the other dogs playing fetch."
        expected_info = [
            Sentence(sentence_1, 1, doc_id_1),
            Sentence(sentence_3, 3, doc_id_1),
            Sentence(sentence_2, 2, doc_id_1)
        ]

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()

        first_sentences = generator.content_selector.selected_content[:3]

        self.assertListEqual(expected_info, first_sentences)

    def test_realize_content(self):
        """
        Test applying redundancy penalty during realize_content
        :return:
        """
        expected_content = "I took my small puppy to the dog park today.\n" \
                           "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                           "There were many bigger puppies but he didn't get in a fight with any of them, " \
                           "they just played together with their toys and chased each other.\n" \
                           "They all ran around with their tails wagging and their tongues hanging out having " \
                           "loads of fun in the sun.\n" \
                           "He loves playing so he liked to run around with the other dogs playing fetch.\n" \
                           "Puppies love playing fetch."

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()
        realized_content = generator.realize_content()
        self.assertEqual(expected_content, realized_content)

    def test_get_idf_array(self):
        """
        Check the IDF values returned by get_idf_array for a fixed word list,
        compared after formatting to five decimal places
        :return:
        """
        words = [
            "i", "eat", "cake", "is", "delicious", "puppies", "are", "cute",
            "cats", "furry", "bank", "company", "sugar", "dollar", "however",
            "say"
        ]
        # Must override WordMap dictionary for test
        WordMap.word_to_id = {
            'delicious': 0,
            'eat': 1,
            'furry': 2,
            'puppies': 3,
            'i': 4,
            'cats': 5,
            'are': 6,
            'is': 7,
            'cute': 8,
            'cake': 9,
            'bank': 10,
            'company': 11,
            'sugar': 12,
            'dollar': 13,
            'however': 14,
            'say': 15
        }

        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        scores = ["{:.5f}".format(idf[WordMap.id_of(word)]) for word in words]

        expected_scores = [
            '2.69897', '0.80688', '1.49485', '2.69897', '2.69897', '2.69897',
            '2.69897', '1.92082', '2.69897', '2.69897', '1.04576', '0.65365',
            '1.44370', '0.98297', '0.24718', '0.10018'
        ]

        # The third positional argument of assertListEqual is the failure
        # message, not a precision, so the stray 5 is no longer passed.
        self.assertListEqual(scores, expected_scores)

    def test_mead_summary_length(self):
        """
        Test length of summary is at most 100 words
        :return:
        """
        topics = self._make_topics()
        WordMap.create_mapping()
        Vectors().create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()
        max_length = 100

        for documents in topics.values():
            generator = MeadSummaryGenerator(documents, MeadContentSelector(),
                                             self.args)
            generator.select_content(idf)
            generator.order_information()
            realized_content = generator.realize_content()
            # split() without an argument splits on any run of whitespace
            # (spaces and newlines) and never yields empty strings. The
            # earlier filter `if not " "` was always False, which emptied the
            # list and made the length check impossible to fail.
            content_length = len(realized_content.split())
            self.assertLessEqual(content_length, max_length)

    def test_generate_summary(self):
        """
        Check that generate_summary returns something other than None for
        each test topic
        :return:
        """
        topics = self._make_topics()
        WordMap.create_mapping()
        Vectors().create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        for documents in topics.values():
            summarizer = MeadSummaryGenerator(documents, MeadContentSelector(),
                                              self.args)
            self.assertIsNotNone(summarizer.generate_summary(idf))
Пример #15
0
    def test_lead_summary_length(self):
        """
        Check that the lead summary built from 32 documents (the same four
        ids repeated eight times) has at most 100 space-separated tokens
        :return:
        """
        doc_ids = (
            'TST_ENG_20190101.0001',
            'TST_ENG_20190101.0002',
            'TST20190201.0001',
            'TST20190201.0002',
        )
        repetitions = 8
        documents = [
            Document(doc_id)
            for _ in range(repetitions)
            for doc_id in doc_ids
        ]
        max_length = 100

        lead_generator = LeadSummaryGenerator(documents, LeadSentenceSelector(), [])
        lead_generator.select_content()
        lead_generator.order_information()
        summary_text = lead_generator.realize_content()

        token_count = len(summary_text.split(" "))
        self.assertLessEqual(token_count, max_length)
Пример #16
0
class MeadContentSelectorTests(unittest.TestCase):
    """
    Tests for MeadContentSelector
    """

    # variables used in multiple tests
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}

    # Vocabulary in id order: the first word gets id 1, the last gets id 39.
    _vocabulary = (
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog',
        'fun', 'toy', 'tongue', 'take', 'ran', 'in', 'sun', 'love',
        'somewhere', 'many', 'together', 'around', 'puppy', 'today', 'load',
        'fight', 'small', "n't", '-PRON-', 'wag', 'hang', 'loads', 'bunch',
        'get', 'playing', 'they', 'like', 'tail', 'run', 'there'
    )
    w_set = set(_vocabulary)
    w_map = {word: word_id for word_id, word in enumerate(_vocabulary, start=1)}

    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.c_threshold = 'min'

    def test_get_sentence_position(self):
        """
        Check the position score of two Sentences against a total of 100
        :return:
        """
        selector = MeadContentSelector()
        total = 100
        first_sentence = Sentence("Here is a test sentence.", 0)
        middle_sentence = Sentence("Here is another one.", 50)

        first_score = selector.get_sentence_position(first_sentence, total)
        middle_score = selector.get_sentence_position(middle_sentence, total)

        self.assertEqual(1, first_score)
        self.assertEqual(50 / 100, middle_score)

    def test_get_cluster_centroid(self):
        """
        Check how many entries of the cluster centroid are non-zero
        :return:
        """
        selector = MeadContentSelector()
        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 self.args.c_threshold)

        should_be_non_zero = 29
        self.assertEqual(np.count_nonzero(centroid), should_be_non_zero)

    def test_get_centroid_score(self):
        """
        Check the centroid score of a single Sentence, to one decimal place
        :return:
        """
        selector = MeadContentSelector()
        probe = Sentence("Puppies love playing fetch.", 0)
        # args is a class attribute shared by every test in this class, so
        # this assignment stays in effect for tests that run afterwards.
        # NOTE(review): confirm the other tests are meant to see 'mean'.
        self.args.c_threshold = 'mean'

        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 self.args.c_threshold)

        expected_centroid_score = 6.3
        actual_centroid_score = selector.get_centroid_score(probe, centroid)

        self.assertAlmostEqual(expected_centroid_score, actual_centroid_score, 1)

    def test_apply_redundancy_penalty(self):
        """
        Test the function to apply the redundancy penalty
        :return:
        """
        selector = MeadContentSelector()

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        ranked = selector.select_content(self.doc_list, self.args, self.idf)
        selector.apply_redundancy_penalty(ranked[0],
                                          selector.selected_content)
        actual_scores = [
            sentence.mead_score for sentence in selector.selected_content
        ]
        expected_scores = [
            1.9003829413846463, 1.6243717975775935, 0.6522065176000799,
            2.3571461578060453, 1.532600545620478, 1.7661796758000055
        ]

        self.assertEqual(actual_scores, expected_scores)

    def test_select_content(self):
        """
        Check the text and the MEAD score (five decimal places) of the first
        Sentence returned by select_content
        :return:
        """
        selector = MeadContentSelector()
        Vectors().create_freq_vectors(self.topics)
        ranked = selector.select_content(self.topics['PUP1A'], self.args,
                                         self.idf)
        best = ranked[0]
        expected_top_sentence = 'In a park somewhere, a bunch of ' \
                                'puppies played fetch with their owners today.'
        expected_top_mead_score = 2.40038

        rounded_score = float("{:.5f}".format(best.mead_score))

        self.assertEqual(best.raw_sentence, expected_top_sentence)
        self.assertEqual(rounded_score, expected_top_mead_score)
Пример #17
0
class MeldaSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeldaSummaryGenerator
    """

    # variables used in multiple tests
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    topics = {'PUPWAR': doc_list}

    w_set = {
        'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their',
        'owner', 'today', 'they', 'all', 'run', 'around', 'their', 'tail',
        'wag', 'tongue', 'hang', 'out', 'have', 'load', 'fun', 'sun', 'love',
        'our', 'country', 'go', 'war', 'soldier', 'go', 'fight', 'travel',
        'wherever', 'fight', 'enemy', 'try', 'kill', 'before', 'get', 'kill',
        'themselves', '-PRON-', 'playing'
    }

    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def test_melda_info_ordering(self):
        """
        Check that order_information leaves the length of the selected
        content unchanged
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        summarizer = MeldaSummaryGenerator(self.doc_list,
                                           MeldaContentSelector(), self.args)
        selected = summarizer.select_content(self.idf)
        length_before = len(selected)

        summarizer.order_information()
        length_after = len(selected)

        self.assertEqual(length_before, length_after)

    def test_melda_generate_summary(self):
        """
        Check that generate_summary returns something other than None for
        each topic
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        for documents in self.topics.values():
            summarizer = MeldaSummaryGenerator(documents,
                                               MeldaContentSelector(),
                                               self.args)
            generated = summarizer.generate_summary(self.idf)
            self.assertIsNot(generated, None)

    def test_ifvalid_sent(self):
        """
        Check the value ifvalid_sent_reg returns for a set of raw strings
        :return:
        """
        # A summarizer over the first topic's documents is all that is needed.
        first_topic_documents = next(iter(self.topics.values()))
        summarizer = MeldaSummaryGenerator(first_topic_documents,
                                           MeldaContentSelector(),
                                           self.args)
        cases = (
            ("--", 1),
            ("---", 0),
            ("-342--", 1),
            ("-342dafd23480134", 0),
            ("\n\nsafadj\n\n", 0),
            ("-342dafd23480", 1),
        )
        for raw_sentence, expected_flag in cases:
            self.assertEqual(summarizer.ifvalid_sent_reg(raw_sentence),
                             expected_flag)