Exemplo n.º 1
0
    def test_get_output_filename(self):
        """
        Check the output path built for topic 'PUP1A' when --output_dir is given
        :return:
        """
        cli_tokens = ['test_data/test_topics.xml', 'test']
        cli_tokens += ['--output_dir', '../outputs/D0/']
        parsed = parse_args(cli_tokens)

        expected_path = '../outputs/D0/PUP1-A.M.100.A.test-B-max-111'
        actual_path = get_output_filename('PUP1A', parsed)

        self.assertEqual(actual_path, expected_path)
Exemplo n.º 2
0
class MeadContentSelectorTests(unittest.TestCase):
    """
    Tests for MeadContentSelector

    NOTE(review): the fixtures below are class attributes shared by every
    test. test_get_centroid_score rewrites args.c_threshold in place and
    test_select_content relies on whatever WordMap state an earlier test left
    behind, so results may depend on execution order -- confirm intended.
    """

    # fixtures shared by several tests (evaluated once, in the class body)
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}

    # vocabulary in id order: the n-th word listed gets id n (1-based)
    _vocabulary = (
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog',
        'fun', 'toy', 'tongue', 'take', 'ran', 'in', 'sun', 'love',
        'somewhere', 'many', 'together', 'around', 'puppy', 'today', 'load',
        'fight', 'small', "n't", '-PRON-', 'wag', 'hang', 'loads', 'bunch',
        'get', 'playing', 'they', 'like', 'tail', 'run', 'there',
    )
    w_set = set(_vocabulary)
    w_map = {
        word: word_id for word_id, word in enumerate(_vocabulary, start=1)
    }

    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.c_threshold = 'min'

    def test_get_sentence_position(self):
        """
        Position score is 1 for a sentence at position 0 and 0.5 for one at
        position 50, both scored against a total of 100
        :return:
        """
        selector = MeadContentSelector()
        first = Sentence("Here is a test sentence.", 0)
        middle = Sentence("Here is another one.", 50)

        first_score = selector.get_sentence_position(first, 100)
        middle_score = selector.get_sentence_position(middle, 100)

        self.assertEqual(1, first_score)
        self.assertEqual(50 / 100, middle_score)

    def test_get_cluster_centroid(self):
        """
        The centroid built from the two test documents has 29 non-zero entries
        :return:
        """
        selector = MeadContentSelector()
        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        threshold = self.args.c_threshold
        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 threshold)

        self.assertEqual(np.count_nonzero(centroid), 29)

    def test_get_centroid_score(self):
        """
        Centroid score of a short sentence, compared to one decimal place
        :return:
        """
        selector = MeadContentSelector()
        probe = Sentence("Puppies love playing fetch.", 0)
        # NOTE(review): this writes to the Namespace shared by the whole class
        self.args.c_threshold = 'mean'

        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf,
                                                 self.args.c_threshold)
        actual_score = selector.get_centroid_score(probe, centroid)

        self.assertAlmostEqual(6.3, actual_score, 1)

    def test_apply_redundancy_penalty(self):
        """
        MEAD scores of the selected content after the redundancy penalty has
        been applied with respect to the top-ranked sentence
        :return:
        """
        selector = MeadContentSelector()

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        ranked = selector.select_content(self.doc_list, self.args, self.idf)
        selector.apply_redundancy_penalty(ranked[0],
                                          selector.selected_content)

        expected_scores = [
            1.9003829413846463, 1.6243717975775935, 0.6522065176000799,
            2.3571461578060453, 1.532600545620478, 1.7661796758000055
        ]
        actual_scores = [
            sent.mead_score for sent in selector.selected_content
        ]

        self.assertEqual(actual_scores, expected_scores)

    def test_select_content(self):
        """
        The first selected sentence and its MEAD score (5 decimal places)
        :return:
        """
        selector = MeadContentSelector()
        Vectors().create_freq_vectors(self.topics)
        ranked = selector.select_content(self.topics['PUP1A'], self.args,
                                         self.idf)
        best = ranked[0]

        # compare the score after formatting it to five decimals
        rounded_score = float(format(best.mead_score, ".5f"))

        self.assertEqual(
            best.raw_sentence,
            'In a park somewhere, a bunch of '
            'puppies played fetch with their owners today.')
        self.assertEqual(rounded_score, 2.40038)
Exemplo n.º 3
0
    def test_argparse(self):
        """
        The parsed namespace exposes exactly 10 options
        :return:
        """
        args = parse_args(['test_data/test_topics.xml', 'test'])

        # vars() is the public way to view a Namespace as a dict; it avoids
        # relying on the private Namespace._get_kwargs() helper
        self.assertEqual(len(vars(args)), 10)
Exemplo n.º 4
0
class MeldaSentenceCompressionTests(unittest.TestCase):
    """
    Tests for sentence compression in MeldaContentRealizer
    """
    Preprocessor.load_models()

    s0 = Sentence(
        "In a park somewhere, a bunch of puppies played fetch with their owners today.",
        1)
    s1 = Sentence("I took my small puppy to the dog park today.", 1)
    s2 = Sentence(
        "He loves playing so he liked to run around with the other dogs playing fetch.",
        1)
    s3 = Sentence("Puppies love playing fetch.", 1)

    input_summary = [s0, s1, s2, s3]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.n = 1

    realizer = MeldaContentRealizer()

    def _compress(self, text):
        """
        Run one raw sentence through the realizer's compression step
        :param text: raw sentence string
        :return: the compressed text of the returned sentences, newline-joined
        """
        compressed = self.realizer.compress_sentences([Sentence(text, 1)])
        return "\n".join([sent.compressed for sent in compressed])

    def test_remove_adverbs(self):
        summary = self._compress(
            "Puppies love running quickly and playing loudly.")

        self.assertEqual(summary, "Puppies love running and playing.")

    def test_remove_initial_conj(self):
        summary = self._compress("But, puppies are great.")

        self.assertEqual(summary, "Puppies are great.")

    def test_remove_parens(self):
        summary = self._compress("The puppy (aka Mr. Mayor) was the cutest.")

        self.assertEqual(summary, "The puppy was the cutest.")

    def test_remove_appositives(self):
        summary = self._compress(
            "Dennis, the cutest puppy in the park, ran towards the ball.")

        self.assertEqual(summary, "Dennis ran towards the ball.")

    def test_remove_junk(self):
        summary = self._compress("Seattle, WA --- Puppies are great.")

        self.assertEqual(summary, "Puppies are great.")

    def test_remove_attributions(self):
        summary = self._compress("Julia said that puppies are cute.")

        self.assertEqual(summary, "Puppies are cute.")

    def test_remove_attribution_phrases(self):
        summary = self._compress(
            "Seattle State Bureau of Animal Rating said "
            "in a press release that puppies are cute.")

        self.assertEqual(summary, "Puppies are cute.")

    def test_remove_temporal_mod(self):
        summary = self._compress(
            "By 8 a.m. on Saturday the park was full of puppies.")

        self.assertEqual(summary, "The park was full of puppies.")

    def test_remove_mod_rel(self):
        summary = self._compress(
            "Joe said that by 8 a.m. on Saturday the park was full of puppies.")

        self.assertEqual(summary, "The park was full of puppies.")

    def test_bad(self):
        """
        A long sentence that exercises several compression rules at once
        :return:
        """
        # the run of three spaces before "the state safety benchmark" is part
        # of the input and must be kept as is
        summary = self._compress(
            "Heilongjiang Provincial Bureau of Environmental Protection said "
            "in a press release that by 6 a.m. on Saturday, concentration of "
            "nitrobenzene monitored at Sujiatun upstream Sifangtai, one major "
            "water intake spot of Harbin, capital of northeast China's "
            "Heilongjiang Province, fell to 0.0793 mg per liter, but above the "
            "state safety standard of 0.017 mg per liter, but the density of "
            "benzene stood at 0.0011 mg per liter, which is within   the state "
            "safety benchmark.")

        expected = "Concentration of nitrobenzene monitored at Sujiatun upstream Sifangtai fell, " \
                   "but above the state safety standard, but the density of benzene stood, " \
                   "which is within the state safety benchmark."
        self.assertEqual(summary, expected)

    def test_remove_npadvmod(self):
        summary = self._compress(
            "Joe said Saturday that the park was full of puppies.")

        self.assertEqual(summary, "The park was full of puppies.")
Exemplo n.º 5
0
class MeldaContentSelectorTests(unittest.TestCase):
    """
    Tests for MeldaContentSelector
    """
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    topics = {'PUPWAR': doc_list}

    # a few words are listed twice; the set literal collapses them, leaving
    # 39 distinct words (the same length as idf below)
    w_set = {'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their', 'owner', 'today', 'they', 'all', 'run',
             'around', 'their', 'tail', 'wag', 'tongue', 'hang', 'out', 'have', 'load', 'fun', 'sun', 'love', 'our',
             'country', 'go', 'war', 'soldier', 'go', 'fight', 'travel', 'wherever', 'fight', 'enemy', 'try', 'kill',
             'before', 'get', 'kill', 'themselves', '-PRON-', 'playing'}

    idf = [4.032940937780854, 2.420157081061118, 1.3730247377110034,
           2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
           3.25478968739721, 2.7107216430469343, 3.7319109421168726,
           4.032940937780854, 3.3339709334448346, 4.032940937780854,
           1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
           2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
           1.5660733174267443, 2.024340766018936, 1.2476111027700865,
           4.032940937780854, 0.9959130580250786, 3.7319109421168726,
           2.5415792439465807, 1.7107216430469343, 4.032940937780854,
           3.4308809464528913, 4.032940937780854, 3.4308809464528913,
           3.5558196830611912, 3.5558196830611912, 4.032940937780854,
           1.734087861371147, 3.0786984283415286, 0.9055121599292547,
           3.5558196830611912, 3.5558196830611912, 1.9876179589941962]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def _build_selector_and_model(self, with_freq_vectors=False):
        """
        Shared setup: reset the WordMap to this class's vocabulary, build the
        vectors and train an LDA model on the test documents
        :param with_freq_vectors: also call create_freq_vectors (needed by the
            tests that compute MEAD scores) before create_term_doc_freq
        :return: (selector, lda_model)
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        if with_freq_vectors:
            Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list,
                                             self.args.lda_topics)
        return selector, lda_model

    def test_document_topics(self):
        """
        A token list gets one probability per LDA topic, summing to ~1
        :return:
        """
        _, lda_model = self._build_selector_and_model()
        testtok = ['puppy', 'soldier', 'war', 'fetch']
        testsen = Vectors().create_term_sen_freq(testtok)
        document_topics = lda_model.get_document_topics(testsen, minimum_probability=0)
        topic_dist = [prob[1] for prob in document_topics]

        self.assertEqual(len(topic_dist), self.args.lda_topics)
        # assertAlmostEquals is a deprecated alias that newer Python versions
        # no longer provide; assertAlmostEqual is the supported spelling
        self.assertAlmostEqual(sum(topic_dist), 1, 2)

    def test_term_topics(self):
        """
        'puppy' and 'war' must lean towards different topics: whichever topic
        favours one word over the other, the remaining topic does the opposite
        :return:
        """
        _, lda_model = self._build_selector_and_model()

        puppy_topics = lda_model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
        war_topics = lda_model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
        puppy_dist = [prob[1] for prob in puppy_topics]
        war_dist = [prob[1] for prob in war_topics]

        puppy_war = puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]
        war_puppy = puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]

        self.assertTrue(puppy_war or war_puppy)

    def test_get_lda_scores(self):
        """
        A scored sentence carries one LDA score per topic, summing to ~1
        :return:
        """
        selector, lda_model = self._build_selector_and_model()

        sentence = self.doc_list[0].sens[0]
        selector.calculate_lda_scores([sentence], lda_model)
        lda_scores = sentence.lda_scores

        self.assertEqual(len(lda_scores), self.args.lda_topics)
        self.assertAlmostEqual(sum(lda_scores), 1, 2)

    def test_get_melda_scores(self):
        """
        A scored sentence carries one MELDA score per topic
        :return:
        """
        selector, lda_model = self._build_selector_and_model(
            with_freq_vectors=True)

        sentence = self.doc_list[0].sens[0]
        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        selector.calculate_lda_scores(sentences, lda_model)
        selector.calculate_melda_scores(sentences)
        melda_scores = sentence.melda_scores

        self.assertEqual(len(melda_scores), self.args.lda_topics)

    def test_get_top_n(self):
        """
        Selecting the top 1 sentence per topic yields lda_topics sentences
        :return:
        """
        selector, lda_model = self._build_selector_and_model(
            with_freq_vectors=True)

        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        sentences = selector.calculate_lda_scores(sentences, lda_model)
        sentences = selector.calculate_melda_scores(sentences)
        selector.select_top_n(sentences, self.args.lda_topics, 1)

        self.assertEqual(len(selector.selected_content), self.args.lda_topics)
Exemplo n.º 6
0
class MeadSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeadSummaryGenerator
    """

    # variables used in multiple tests
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}
    w_set = {
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog',
        'fun', 'toy', 'tongue', 'take', 'ran', 'in', 'sun', 'love',
        'somewhere', 'many', 'together', 'around', 'puppy', 'today', 'load',
        'fight', 'small', "n't", '-PRON-', 'wag', 'hang', 'loads', 'bunch',
        'get', 'playing', 'they', 'like', 'tail', 'run', 'there'
    }

    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    WordMap.reset()

    @staticmethod
    def _build_two_topic_corpus():
        """
        Build a fresh two-topic corpus (four documents per topic)
        :return: dict mapping topic id to a list of newly created Documents
        """
        return {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }

    def test_order_information(self):
        """
        Test ordering Sentences by MEAD score
        :return:
        """
        doc_id_1 = 'TST_ENG_20190101.0001'
        sentence_1 = 'Puppies love playing fetch.'
        sentence_2 = 'They all ran around with their tails wagging ' \
                     'and their tongues hanging out having loads of fun in the sun.'
        sentence_3 = "He loves playing so he liked to run around with the other dogs playing fetch."
        expected_info = [
            Sentence(sentence_1, 1, doc_id_1),
            Sentence(sentence_3, 3, doc_id_1),
            Sentence(sentence_2, 2, doc_id_1)
        ]

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()

        first_sentences = generator.content_selector.selected_content[:3]

        self.assertListEqual(expected_info, first_sentences)

    def test_realize_content(self):
        """
        Test applying redundancy penalty during realize_content
        :return:
        """
        expected_content = "I took my small puppy to the dog park today.\n" \
                           "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                           "There were many bigger puppies but he didn't get in a fight with any of them, " \
                           "they just played together with their toys and chased each other.\n" \
                           "They all ran around with their tails wagging and their tongues hanging out having " \
                           "loads of fun in the sun.\n" \
                           "He loves playing so he liked to run around with the other dogs playing fetch.\n" \
                           "Puppies love playing fetch."

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                         self.args)
        generator.select_content(self.idf)
        generator.order_information()
        realized_content = generator.realize_content()
        self.assertEqual(expected_content, realized_content)

    def test_get_idf_array(self):
        """
        IDF values looked up through the WordMap, compared as strings
        formatted to five decimal places
        :return:
        """
        words = [
            "i", "eat", "cake", "is", "delicious", "puppies", "are", "cute",
            "cats", "furry", "bank", "company", "sugar", "dollar", "however",
            "say"
        ]
        # Must override WordMap dictionary for test
        WordMap.word_to_id = {
            'delicious': 0,
            'eat': 1,
            'furry': 2,
            'puppies': 3,
            'i': 4,
            'cats': 5,
            'are': 6,
            'is': 7,
            'cute': 8,
            'cake': 9,
            'bank': 10,
            'company': 11,
            'sugar': 12,
            'dollar': 13,
            'however': 14,
            'say': 15
        }

        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        scores = [
            "{:.5f}".format(idf[WordMap.id_of(word)]) for word in words
        ]

        expected_scores = [
            '2.69897', '0.80688', '1.49485', '2.69897', '2.69897', '2.69897',
            '2.69897', '1.92082', '2.69897', '2.69897', '1.04576', '0.65365',
            '1.44370', '0.98297', '0.24718', '0.10018'
        ]

        # the third positional argument of assertListEqual is the failure
        # message, not a precision, so none is passed
        self.assertListEqual(scores, expected_scores)

    def test_mead_summary_length(self):
        """
        Test length of summary is at most 100 words
        :return:
        """
        topics = self._build_two_topic_corpus()
        WordMap.create_mapping()
        vec = Vectors()
        vec.create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()
        max_length = 100

        for documents in topics.values():
            generator = MeadSummaryGenerator(documents, MeadContentSelector(),
                                             self.args)
            generator.select_content(idf)
            generator.order_information()
            realized_content = generator.realize_content()
            # split() with no argument breaks on any whitespace run (spaces
            # and the newlines between sentences) and never yields empty
            # strings, so this is the real word count
            content_length = len(realized_content.split())
            self.assertLessEqual(content_length, max_length)

    def test_generate_summary(self):
        """
        generate_summary returns something for every topic
        :return:
        """
        topics = self._build_two_topic_corpus()
        WordMap.create_mapping()
        vec = Vectors()
        vec.create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(),
                                   self.args).get_idf_array()

        for documents in topics.values():
            summarizer = MeadSummaryGenerator(documents, MeadContentSelector(),
                                              self.args)
            summary = summarizer.generate_summary(idf)
            self.assertIsNot(summary, None)
Exemplo n.º 7
0
class MeldaSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeldaSummaryGenerator
    """

    # fixtures shared by the tests below (evaluated once, in the class body)
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    topics = {'PUPWAR': doc_list}

    w_set = {
        'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their',
        'owner', 'today', 'they', 'all', 'run', 'around', 'their', 'tail',
        'wag', 'tongue', 'hang', 'out', 'have', 'load', 'fun', 'sun', 'love',
        'our', 'country', 'go', 'war', 'soldier', 'go', 'fight', 'travel',
        'wherever', 'fight', 'enemy', 'try', 'kill', 'before', 'get', 'kill',
        'themselves', '-PRON-', 'playing'
    }

    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def test_melda_info_ordering(self):
        """
        Ordering must not change the number of selected items
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)

        summarizer = MeldaSummaryGenerator(self.doc_list,
                                           MeldaContentSelector(), self.args)
        selected = summarizer.select_content(self.idf)
        length_before = len(selected)

        summarizer.order_information()

        self.assertEqual(length_before, len(selected))

    def test_melda_generate_summary(self):
        """
        generate_summary returns something for every topic
        :return:
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)

        for documents in self.topics.values():
            generator = MeldaSummaryGenerator(documents,
                                              MeldaContentSelector(),
                                              self.args)
            self.assertIsNot(generator.generate_summary(self.idf), None)

    def test_ifvalid_sent(self):
        """
        ifvalid_sent_reg returns 1 or 0 for a handful of raw strings
        :return:
        """
        # only the first topic's documents are needed to build a summarizer
        documents = next(iter(self.topics.values()))
        summarizer = MeldaSummaryGenerator(documents,
                                           MeldaContentSelector(),
                                           self.args)

        # (raw sentence, expected return value), checked in this order
        cases = [
            ("--", 1),
            ("---", 0),
            ("-342--", 1),
            ("-342dafd23480134", 0),
            ("\n\nsafadj\n\n", 0),
            ("-342dafd23480", 1),
        ]
        for raw_sent, expected in cases:
            self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent), expected)