def test_select_content(self):
    """Check the lead selector's selected content against the expected Sentence per document."""
    first_doc_id = 'TST_ENG_20190101.0001'
    first_text = 'In a park somewhere, a bunch of puppies played fetch with their owners today.'
    second_doc_id = 'TST_ENG_20190101.0002'
    second_text = 'I took my small puppy to the dog park today.'

    selector = LeadSentenceSelector()
    documents = [Document(first_doc_id), Document(second_doc_id)]
    expected_sentences = [
        Sentence(first_text, 1, first_doc_id),
        Sentence(second_text, 1, second_doc_id),
    ]

    selector.select_content(documents, [])

    # Order-insensitive comparison of the selected sentences.
    self.assertCountEqual(expected_sentences, selector.selected_content)
def test_generate_summary(self):
    """Generate a MEAD summary for every test topic and check that one is returned."""
    topics = {
        'PUP1A': [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
            Document('TST20190201.0001'),
            Document('TST20190201.0002')
        ],
        'WAR2A': [
            Document('TST_ENG_20190301.0001'),
            Document('TST_ENG_20190301.0002'),
            Document('TST20190401.0001'),
            Document('TST20190401.0002')
        ]
    }
    WordMap.create_mapping()
    Vectors().create_freq_vectors(topics)
    idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()

    # Only the document lists are needed, so iterate the values directly.
    for documents in topics.values():
        summarizer = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
        summary = summarizer.generate_summary(idf)
        self.assertIsNotNone(summary)
def test_create_mapping(self):
    """Check that WordMap maps every known word to a distinct id."""
    Preprocessor.load_models()
    # Start from an empty WordMap so only the two documents below contribute.
    WordMap.word_set = set()
    WordMap.word_to_id = {}
    # Constructed for their side effects only.
    # NOTE(review): presumably Document registers its words with WordMap — confirm.
    Document("TST_ENG_20190101.0001")
    Document("TST_ENG_20190101.0002")
    WordMap.create_mapping()
    mapping = WordMap.get_mapping()

    # Each word in word_set got added to the dictionary.
    self.assertCountEqual(self.word_set, mapping.keys())
    # Each id value in the dict is unique. The check must look at the values:
    # (key, value) pairs are always distinct because dict keys are.
    self.assertEqual(len(mapping), len(set(mapping.values())))
def test_parse_doc_id2(self):
    """Parse a doc id written without a language segment and check each parsed field."""
    parsed = Document("TST20190201.0001")
    # Checked in this order; the first mismatch fails the test.
    expected_fields = (
        ('src', 'TST'),
        ('lang', '_ENG'),
        ('date', '20190201'),
        ('art_id', '0001'),
        ('docid', 'TST20190201.0001'),
    )
    for field_name, expected_value in expected_fields:
        self.assertEqual(getattr(parsed, field_name), expected_value)
def test_realize_content(self):
    """Run the lead pipeline on four documents and compare the realized text."""
    doc_ids = (
        'TST_ENG_20190101.0001',
        'TST_ENG_20190101.0002',
        'TST20190201.0001',
        'TST20190201.0002',
    )
    documents = [Document(doc_id) for doc_id in doc_ids]

    # The realized summary is expected to be these sentences, one per line.
    expected_lines = [
        "In a park somewhere, a bunch of puppies played fetch with their owners today.",
        "I took my small puppy to the dog park today.",
        "Puppies are cute because many of them are small.",
        "Puppies love to play with toys.",
    ]
    expected_content = "\n".join(expected_lines)

    generator = LeadSummaryGenerator(documents, LeadSentenceSelector(), [])
    generator.select_content()
    generator.order_information()

    self.assertEqual(expected_content, generator.realize_content())
def test_order_information(self):
    """Check the order of the lead generator's selected content after ordering."""
    later_doc_id = 'TST20190201.0001'
    later_text = 'Puppies are cute because many of them are small.'
    earlier_doc_id = 'TST_ENG_20190101.0001'
    earlier_text = 'In a park somewhere, a bunch of puppies played fetch with their owners today.'

    # Expected order: the 20190101 sentence first, then the 20190201 one.
    expected_info = [
        Sentence(earlier_text, 1, earlier_doc_id),
        Sentence(later_text, 1, later_doc_id),
    ]
    documents = [Document('TST_ENG_20190101.0001'), Document('TST20190201.0001')]

    generator = LeadSummaryGenerator(documents, LeadSentenceSelector(), [])
    generator.select_content()
    generator.order_information()

    ordered = generator.content_selector.selected_content
    self.assertListEqual(expected_info, ordered)
def load_documents(topic):
    """
    Load the documents for the given topic
    :param topic: object exposing ``find_all('doc')``; each result is indexed by ``'id'``
    :return: list with one Document per ``doc`` entry, in the order ``find_all`` yields them
    """
    return [Document(doc_node['id']) for doc_node in topic.find_all('doc')]
class VectorsTests(unittest.TestCase):
    """Tests for the sentence, document and topic vectors built by Vectors."""

    # Class-level fixtures: these statements run once, when the class body is executed.
    Preprocessor.load_models()
    topics = {
        1: [Document('TST_ENG_20190101.0001'), Document('TST_ENG_20190101.0002')],
    }
    WordMap.create_mapping()
    mapping = WordMap.get_mapping()
    topic_one = topics.get(1)  # list of Documents

    def test_create_freq_vectors(self):
        """Each document's vector matrix has one row per sentence."""
        expected_rows = 3
        Vectors().create_freq_vectors(self.topics)
        every_doc = (doc for doc_list in self.topics.values() for doc in doc_list)
        for doc in every_doc:
            # check that there's a vector for each sentence
            row_count = doc.vectors.get_shape()[0]
            self.assertEqual(row_count, expected_rows)

    def test_sentence_vector(self):
        """The sentence vector has a positive entry for each of the sentence's tokens."""
        # Sentence text: 'He loves playing so he liked to run around with the
        # other dogs playing fetch.'
        sentence = self.topics.get(1)[1].sens[1]  # a Sentence object
        playing_id = WordMap.id_of('playing')
        self.assertEqual(sentence.vector.getcol(playing_id).sum(), 1)
        for token in sentence.tokens:
            column_total = sentence.vector.getcol(WordMap.id_of(token)).sum()
            self.assertGreater(column_total, 0)

    def test_get_topic_matrix(self):
        """All sentences from all of the topic's documents make it into the matrix."""
        expected_num_sentences = 6
        matrix = Vectors().get_topic_matrix(self.topic_one)
        self.assertEqual(expected_num_sentences, matrix.get_shape()[0])
def test_get_documents_for_topics(self):
    """Load topics from the test XML and check both topic ids and their documents."""
    topic_soup = make_soup('test_data/test_topics.xml')
    expected_topics = {
        'PUP1A': [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
            Document('TST20190201.0001'),
            Document('TST20190201.0002')
        ],
        'WAR2A': [
            Document('TST_ENG_20190301.0001'),
            Document('TST_ENG_20190301.0002'),
            Document('TST20190401.0001'),
            Document('TST20190401.0002')
        ]
    }
    topics = load_documents_for_topics(topic_soup)

    # Iterating a dict yields its keys, so this only compares the topic ids.
    self.assertCountEqual(topics, expected_topics)

    # Also compare the documents of each topic, by document id and ignoring order.
    # NOTE(review): assumes load_documents_for_topics returns a mapping of
    # topic id -> list of Document — confirm against its definition.
    for topic_id, expected_docs in expected_topics.items():
        self.assertCountEqual(
            [doc.docid for doc in topics[topic_id]],
            [doc.docid for doc in expected_docs],
        )
def test_mead_summary_length(self):
    """
    Test length of summary is at most 100 words
    """
    topics = {
        'PUP1A': [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
            Document('TST20190201.0001'),
            Document('TST20190201.0002')
        ],
        'WAR2A': [
            Document('TST_ENG_20190301.0001'),
            Document('TST_ENG_20190301.0002'),
            Document('TST20190401.0001'),
            Document('TST20190401.0002')
        ]
    }
    WordMap.create_mapping()
    Vectors().create_freq_vectors(topics)
    idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()
    max_length = 100

    for documents in topics.values():
        generator = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
        generator.select_content(idf)
        generator.order_information()
        realized_content = generator.realize_content()
        # Split on any whitespace so every word is counted and blank tokens are
        # dropped. The previous filter (`if not " "`) was always False, which
        # left the word list empty and made this assertion unable to fail.
        content_length = len(realized_content.split())
        self.assertLessEqual(content_length, max_length)
def test_document_headline2(self):
    """Check the headline read for document TST_ENG_20190101.0002."""
    expected_headline = "Playing in the dog park"
    headline = Document("TST_ENG_20190101.0002").headline
    self.assertEqual(headline, expected_headline)
def test_document_headline(self):
    """Check the headline read for document TST_ENG_20190101.0001."""
    expected_headline = "Puppies play fetch in the park"
    headline = Document("TST_ENG_20190101.0001").headline
    self.assertEqual(headline, expected_headline)
class MeldaContentSelectorTests(unittest.TestCase):
    """
    Tests for MeldaContentSelector
    """

    # variables used in multiple tests (evaluated once, when the class body runs)
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    topics = {'PUPWAR': doc_list}
    w_set = {'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their', 'owner', 'today',
             'they', 'all', 'run', 'around', 'their', 'tail', 'wag', 'tongue', 'hang', 'out',
             'have', 'load', 'fun', 'sun', 'love', 'our', 'country', 'go', 'war', 'soldier',
             'go', 'fight', 'travel', 'wherever', 'fight', 'enemy', 'try', 'kill', 'before',
             'get', 'kill', 'themselves', '-PRON-', 'playing'}
    idf = [4.032940937780854, 2.420157081061118, 1.3730247377110034,
           2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
           3.25478968739721, 2.7107216430469343, 3.7319109421168726,
           4.032940937780854, 3.3339709334448346, 4.032940937780854,
           1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
           2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
           1.5660733174267443, 2.024340766018936, 1.2476111027700865,
           4.032940937780854, 0.9959130580250786, 3.7319109421168726,
           2.5415792439465807, 1.7107216430469343, 4.032940937780854,
           3.4308809464528913, 4.032940937780854, 3.4308809464528913,
           3.5558196830611912, 3.5558196830611912, 4.032940937780854,
           1.734087861371147, 3.0786984283415286, 0.9055121599292547,
           3.5558196830611912, 3.5558196830611912, 1.9876179589941962]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def _build_lda_model(self, with_freq_vectors=False):
        """
        Reset the WordMap from w_set, build the vectors and train the LDA model.

        :param with_freq_vectors: also call Vectors.create_freq_vectors before
            building the term-document frequencies
        :return: tuple of (MeldaContentSelector, LDA model built by it)
        """
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        if with_freq_vectors:
            Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        selector = MeldaContentSelector()
        lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
        return selector, lda_model

    def test_document_topics(self):
        """The topic distribution of a token list has one entry per topic and sums to ~1."""
        _, lda_model = self._build_lda_model()

        testtok = ['puppy', 'soldier', 'war', 'fetch']
        testsen = Vectors().create_term_sen_freq(testtok)
        document_topics = lda_model.get_document_topics(testsen, minimum_probability=0)
        topic_dist = [prob[1] for prob in document_topics]

        self.assertEqual(len(topic_dist), self.args.lda_topics)
        # assertAlmostEquals is a deprecated alias that newer Python versions removed.
        self.assertAlmostEqual(sum(topic_dist), 1, 2)

    def test_term_topics(self):
        """'puppy' and 'war' each dominate a different one of the two topics."""
        _, lda_model = self._build_lda_model()

        puppy_topics = lda_model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
        war_topics = lda_model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
        puppy_dist = [prob[1] for prob in puppy_topics]
        war_dist = [prob[1] for prob in war_topics]

        # Topic numbering is not fixed, so accept either assignment of topics to words.
        puppy_war = puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]
        war_puppy = puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]

        self.assertTrue(puppy_war or war_puppy)

    def test_get_lda_scores(self):
        """A sentence gets one LDA score per topic and the scores sum to ~1."""
        selector, lda_model = self._build_lda_model()

        sentence = self.doc_list[0].sens[0]
        selector.calculate_lda_scores([sentence], lda_model)
        lda_scores = sentence.lda_scores

        self.assertEqual(len(lda_scores), self.args.lda_topics)
        self.assertAlmostEqual(sum(lda_scores), 1, 2)

    def test_get_melda_scores(self):
        """A sentence gets one MELDA score per topic."""
        selector, lda_model = self._build_lda_model(with_freq_vectors=True)

        sentence = self.doc_list[0].sens[0]
        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        selector.calculate_lda_scores(sentences, lda_model)
        selector.calculate_melda_scores(sentences)
        melda_scores = sentence.melda_scores

        self.assertEqual(len(melda_scores), self.args.lda_topics)

    def test_get_top_n(self):
        """Selecting the top 1 sentence per topic yields lda_topics sentences."""
        selector, lda_model = self._build_lda_model(with_freq_vectors=True)

        sentences = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
        sentences = selector.calculate_lda_scores(sentences, lda_model)
        sentences = selector.calculate_melda_scores(sentences)
        selector.select_top_n(sentences, self.args.lda_topics, 1)

        self.assertEqual(len(selector.selected_content), self.args.lda_topics)
class MeadSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeadSummaryGenerator
    """

    # variables used in multiple tests (evaluated once, when the class body runs)
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}
    w_set = {
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog', 'fun', 'toy',
        'tongue', 'take', 'ran', 'in', 'sun', 'love', 'somewhere', 'many', 'together',
        'around', 'puppy', 'today', 'load', 'fight', 'small', "n't", '-PRON-', 'wag',
        'hang', 'loads', 'bunch', 'get', 'playing', 'they', 'like', 'tail', 'run', 'there'
    }
    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    WordMap.reset()

    def test_order_information(self):
        """
        Test ordering Sentences by MEAD score
        """
        doc_id_1 = 'TST_ENG_20190101.0001'
        sentence_1 = 'Puppies love playing fetch.'
        sentence_2 = 'They all ran around with their tails wagging ' \
                     'and their tongues hanging out having loads of fun in the sun.'
        sentence_3 = "He loves playing so he liked to run around with the other dogs playing fetch."
        expected_info = [
            Sentence(sentence_1, 1, doc_id_1),
            Sentence(sentence_3, 3, doc_id_1),
            Sentence(sentence_2, 2, doc_id_1)
        ]

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args)
        generator.select_content(self.idf)
        generator.order_information()

        # Only the first three ordered sentences are compared.
        first_sentences = generator.content_selector.selected_content[:3]

        self.assertListEqual(expected_info, first_sentences)

    def test_realize_content(self):
        """
        Test applying redundancy penalty during realize_content
        """
        expected_content = "I took my small puppy to the dog park today.\n" \
                           "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                           "There were many bigger puppies but he didn't get in a fight with any of them, " \
                           "they just played together with their toys and chased each other.\n" \
                           "They all ran around with their tails wagging and their tongues hanging out having " \
                           "loads of fun in the sun.\n" \
                           "He loves playing so he liked to run around with the other dogs playing fetch.\n" \
                           "Puppies love playing fetch."

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args)
        generator.select_content(self.idf)
        generator.order_information()
        realized_content = generator.realize_content()

        self.assertEqual(expected_content, realized_content)

    def test_get_idf_array(self):
        """Compare the idf value of selected words, formatted to five decimals."""
        words = [
            "i", "eat", "cake", "is", "delicious",
            "puppies", "are", "cute", "cats", "furry",
            "bank", "company", "sugar", "dollar", "however",
            "say"
        ]
        # Must override WordMap dictionary for test
        WordMap.word_to_id = {
            'delicious': 0, 'eat': 1, 'furry': 2, 'puppies': 3, 'i': 4,
            'cats': 5, 'are': 6, 'is': 7, 'cute': 8, 'cake': 9,
            'bank': 10, 'company': 11, 'sugar': 12, 'dollar': 13, 'however': 14,
            'say': 15
        }

        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()

        scores = ["{:.5f}".format(idf[WordMap.id_of(word)]) for word in words]

        expected_scores = [
            '2.69897', '0.80688', '1.49485', '2.69897', '2.69897',
            '2.69897', '2.69897', '1.92082', '2.69897', '2.69897',
            '1.04576', '0.65365', '1.44370', '0.98297', '0.24718',
            '0.10018'
        ]

        # The third positional argument of assertListEqual is the failure
        # message, not a precision, so the stray `5` is dropped.
        self.assertListEqual(scores, expected_scores)

    def test_mead_summary_length(self):
        """
        Test length of summary is at most 100 words
        """
        topics = {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }
        WordMap.create_mapping()
        Vectors().create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()
        max_length = 100

        for documents in topics.values():
            generator = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
            generator.select_content(idf)
            generator.order_information()
            realized_content = generator.realize_content()
            # Split on any whitespace so every word is counted and blank tokens
            # are dropped. The previous filter (`if not " "`) was always False,
            # which left the word list empty and made this assertion unable to fail.
            content_length = len(realized_content.split())
            self.assertLessEqual(content_length, max_length)

    def test_generate_summary(self):
        """Generate a MEAD summary for every test topic and check that one is returned."""
        topics = {
            'PUP1A': [
                Document('TST_ENG_20190101.0001'),
                Document('TST_ENG_20190101.0002'),
                Document('TST20190201.0001'),
                Document('TST20190201.0002')
            ],
            'WAR2A': [
                Document('TST_ENG_20190301.0001'),
                Document('TST_ENG_20190301.0002'),
                Document('TST20190401.0001'),
                Document('TST20190401.0002')
            ]
        }
        WordMap.create_mapping()
        Vectors().create_freq_vectors(topics)
        idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()

        for documents in topics.values():
            summarizer = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
            summary = summarizer.generate_summary(idf)
            self.assertIsNotNone(summary)
def test_lead_summary_length(self):
    """Feed 32 documents to the lead generator and check the space-split length cap."""
    doc_ids = (
        'TST_ENG_20190101.0001',
        'TST_ENG_20190101.0002',
        'TST20190201.0001',
        'TST20190201.0002',
    )
    repetitions = 8
    # The same four ids repeated eight times, each as its own Document instance.
    documents = [Document(doc_id) for _ in range(repetitions) for doc_id in doc_ids]
    max_length = 100

    generator = LeadSummaryGenerator(documents, LeadSentenceSelector(), [])
    generator.select_content()
    generator.order_information()
    tokens = generator.realize_content().split(" ")

    self.assertLessEqual(len(tokens), max_length)
class MeadContentSelectorTests(unittest.TestCase):
    """
    Tests for MeadContentSelector
    """

    # variables used in multiple tests (evaluated once, when the class body runs)
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_2 = Document("TST_ENG_20190101.0002")
    doc_list = [doc_1, doc_2]
    topics = {'PUP1A': [doc_1, doc_2]}
    w_set = {
        'he', 'owner', 'i', 'play', 'big', 'chase', 'fetch', 'park', 'dog', 'fun', 'toy',
        'tongue', 'take', 'ran', 'in', 'sun', 'love', 'somewhere', 'many', 'together',
        'around', 'puppy', 'today', 'load', 'fight', 'small', "n't", '-PRON-', 'wag',
        'hang', 'loads', 'bunch', 'get', 'playing', 'they', 'like', 'tail', 'run', 'there'
    }
    w_map = {
        'he': 1, 'owner': 2, 'i': 3, 'play': 4, 'big': 5, 'chase': 6, 'fetch': 7,
        'park': 8, 'dog': 9, 'fun': 10, 'toy': 11, 'tongue': 12, 'take': 13, 'ran': 14,
        'in': 15, 'sun': 16, 'love': 17, 'somewhere': 18, 'many': 19, 'together': 20,
        'around': 21, 'puppy': 22, 'today': 23, 'load': 24, 'fight': 25, 'small': 26,
        "n't": 27, '-PRON-': 28, 'wag': 29, 'hang': 30, 'loads': 31, 'bunch': 32,
        'get': 33, 'playing': 34, 'they': 35, 'like': 36, 'tail': 37, 'run': 38,
        'there': 39
    }
    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.c_threshold = 'min'

    def test_get_sentence_position(self):
        """Position score is 1 for position 0 and 0.5 for position 50 of 100."""
        selector = MeadContentSelector()
        sentence_1 = Sentence("Here is a test sentence.", 0)
        sentence_2 = Sentence("Here is another one.", 50)

        pos_score_1 = selector.get_sentence_position(sentence_1, 100)
        pos_score_2 = selector.get_sentence_position(sentence_2, 100)

        expected_score_1 = 1
        expected_score_2 = 50 / 100

        self.assertEqual(expected_score_1, pos_score_1)
        self.assertEqual(expected_score_2, pos_score_2)

    def test_get_cluster_centroid(self):
        """The cluster centroid has the expected number of non-zero entries."""
        selector = MeadContentSelector()

        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf, self.args.c_threshold)

        actual_non_zero = np.count_nonzero(centroid)
        should_be_non_zero = 29

        self.assertEqual(actual_non_zero, should_be_non_zero)

    def test_get_centroid_score(self):
        """The centroid score of a sentence is ~6.3 with the 'mean' threshold."""
        selector = MeadContentSelector()
        sent_1 = Sentence("Puppies love playing fetch.", 0)
        # NOTE(review): args is a class attribute shared by every test, so this
        # assignment persists for tests that run afterwards. The expected values
        # in those tests may depend on it — confirm before isolating it.
        self.args.c_threshold = 'mean'

        WordMap.word_set = self.w_set
        WordMap.word_to_id = self.w_map
        Vectors().create_freq_vectors(self.topics)

        centroid = selector.get_cluster_centroid(self.doc_list, self.idf, self.args.c_threshold)

        expected_centroid_score = 6.3
        c_score = selector.get_centroid_score(sent_1, centroid)

        self.assertAlmostEqual(expected_centroid_score, c_score, 1)

    def test_apply_redundancy_penalty(self):
        """
        Test the function to apply the redundancy penalty
        """
        selector = MeadContentSelector()

        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)

        selected = selector.select_content(self.doc_list, self.args, self.idf)
        selector.apply_redundancy_penalty(selected[0], selector.selected_content)

        scores = [s.mead_score for s in selector.selected_content]
        expected_scores = [
            1.9003829413846463, 1.6243717975775935, 0.6522065176000799,
            2.3571461578060453, 1.532600545620478, 1.7661796758000055
        ]

        # Compare element-wise with a tolerance: exact float equality on
        # computed scores is fragile across platforms and library versions.
        self.assertEqual(len(scores), len(expected_scores))
        for actual, expected in zip(scores, expected_scores):
            self.assertAlmostEqual(actual, expected)

    def test_select_content(self):
        """The top selected sentence and its MEAD score (five decimals) match."""
        selector = MeadContentSelector()
        Vectors().create_freq_vectors(self.topics)
        selected = selector.select_content(self.topics['PUP1A'], self.args, self.idf)
        top_sentence = selected[0]
        expected_top_sentence = 'In a park somewhere, a bunch of ' \
                                'puppies played fetch with their owners today.'

        top_mead_score = float("{:.5f}".format(top_sentence.mead_score))
        expected_top_mead_score = 2.40038

        self.assertEqual(top_sentence.raw_sentence, expected_top_sentence)
        self.assertEqual(top_mead_score, expected_top_mead_score)
class MeldaSummaryGeneratorTests(unittest.TestCase):
    """
    Tests for MeldaSummaryGenerator
    """

    # variables used in multiple tests (evaluated once, when the class body runs)
    Preprocessor.load_models()
    doc_1 = Document("TST_ENG_20190101.0001")
    doc_3 = Document("TST_ENG_20190301.0001")
    doc_list = [doc_1, doc_3]
    topics = {'PUPWAR': doc_list}
    w_set = {
        'park', 'somewhere', 'bunch', 'puppy', 'play', 'fetch', 'their', 'owner', 'today',
        'they', 'all', 'run', 'around', 'their', 'tail', 'wag', 'tongue', 'hang', 'out',
        'have', 'load', 'fun', 'sun', 'love', 'our', 'country', 'go', 'war', 'soldier',
        'go', 'fight', 'travel', 'wherever', 'fight', 'enemy', 'try', 'kill', 'before',
        'get', 'kill', 'themselves', '-PRON-', 'playing'
    }
    idf = [
        4.032940937780854, 2.420157081061118, 1.3730247377110034,
        2.8868129021026157, 2.7776684326775474, 3.7319109421168726,
        3.25478968739721, 2.7107216430469343, 3.7319109421168726,
        4.032940937780854, 3.3339709334448346, 4.032940937780854,
        1.9257309681329853, 2.5705429398818973, 0.21458305982249878,
        2.3608430798451363, 3.5558196830611912, 3.3339709334448346,
        1.5660733174267443, 2.024340766018936, 1.2476111027700865,
        4.032940937780854, 0.9959130580250786, 3.7319109421168726,
        2.5415792439465807, 1.7107216430469343, 4.032940937780854,
        3.4308809464528913, 4.032940937780854, 3.4308809464528913,
        3.5558196830611912, 3.5558196830611912, 4.032940937780854,
        1.734087861371147, 3.0786984283415286, 0.9055121599292547,
        3.5558196830611912, 3.5558196830611912, 1.9876179589941962
    ]

    args = parse_args(['test_data/test_topics.xml', 'test'])
    args.lda_topics = 2

    def test_melda_info_ordering(self):
        """Ordering the information does not change the number of selected items."""
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        summarizer = MeldaSummaryGenerator(self.doc_list, MeldaContentSelector(), self.args)
        content_selector = summarizer.select_content(self.idf)
        expected_len = len(content_selector)
        summarizer.order_information()
        # Length of the same returned object, measured again after ordering.
        actual_len = len(content_selector)

        self.assertEqual(expected_len, actual_len)

    def test_melda_generate_summary(self):
        """A summary is returned for every topic."""
        WordMap.word_set = self.w_set
        WordMap.create_mapping()
        Vectors().create_freq_vectors(self.topics)
        Vectors().create_term_doc_freq(self.topics)
        for documents in self.topics.values():
            summarizer = MeldaSummaryGenerator(documents, MeldaContentSelector(), self.args)
            summary = summarizer.generate_summary(self.idf)
            self.assertIsNotNone(summary)

    def test_ifvalid_sent(self):
        """Check the 0/1 result of ifvalid_sent_reg on a set of raw strings."""
        # Any topic's documents will do; take the first one.
        documents = next(iter(self.topics.values()))
        summarizer = MeldaSummaryGenerator(documents, MeldaContentSelector(), self.args)

        # (raw sentence, expected return value), checked in this order.
        cases = [
            ("--", 1),
            ("---", 0),
            ("-342--", 1),
            ("-342dafd23480134", 0),
            ("\n\nsafadj\n\n", 0),
            ("-342dafd23480", 1),
        ]
        for raw_sent, expected in cases:
            self.assertEqual(summarizer.ifvalid_sent_reg(raw_sent), expected)