def test_realize_content(self):
    """
    Test applying redundancy penalty during realize_content.

    Builds MEAD-selected, ordered content for the test topic and checks
    the realized summary text against the expected sentence sequence.
    :return: None
    """
    expected_content = "I took my small puppy to the dog park today.\n" \
                       "In a park somewhere, a bunch of puppies played fetch with their owners today.\n" \
                       "There were many bigger puppies but he didn't get in a fight with any of them, " \
                       "they just played together with their toys and chased each other.\n" \
                       "They all ran around with their tails wagging and their tongues hanging out having " \
                       "loads of fun in the sun.\n" \
                       "He loves playing so he liked to run around with the other dogs playing fetch.\n" \
                       "Puppies love playing fetch."
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)
    generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args)
    generator.select_content(self.idf)
    generator.order_information()
    # NOTE(review): the original re-assigned
    # generator.content_selector.selected_content to itself here — a no-op;
    # removed.
    realized_content = generator.realize_content()
    self.assertEqual(expected_content, realized_content)
def test_order_information(self):
    """
    Test ordering Sentences by MEAD score.

    After select_content + order_information, the first three selected
    sentences should appear in descending MEAD-score order.
    :return: None
    """
    doc_id = 'TST_ENG_20190101.0001'
    text_a = 'Puppies love playing fetch.'
    text_b = 'They all ran around with their tails wagging ' \
             'and their tongues hanging out having loads of fun in the sun.'
    text_c = "He loves playing so he liked to run around with the other dogs playing fetch."
    expected_order = [
        Sentence(text_a, 1, doc_id),
        Sentence(text_c, 3, doc_id),
        Sentence(text_b, 2, doc_id),
    ]
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)
    generator = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args)
    generator.select_content(self.idf)
    generator.order_information()
    self.assertListEqual(expected_order, generator.content_selector.selected_content[:3])
def test_generate_summary(self):
    """
    Test that a MEAD summary is produced (non-None) for each topic.
    :return: None
    """
    topics = {
        'PUP1A': [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
            Document('TST20190201.0001'),
            Document('TST20190201.0002')
        ],
        'WAR2A': [
            Document('TST_ENG_20190301.0001'),
            Document('TST_ENG_20190301.0002'),
            Document('TST20190401.0001'),
            Document('TST20190401.0002')
        ]
    }
    WordMap.create_mapping()
    vec = Vectors()
    vec.create_freq_vectors(topics)
    idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()
    # topic ids are not needed, so iterate values() only
    for documents in topics.values():
        summarizer = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
        summary = summarizer.generate_summary(idf)
        # assertIsNotNone states the intent directly
        # (original used assertIsNot(summary, None))
        self.assertIsNotNone(summary)
def test_melda_generate_summary(self):
    """
    Test that a MELDA summary is produced (non-None) for each topic.
    :return: None
    """
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)
    Vectors().create_term_doc_freq(self.topics)
    # topic ids are not needed, so iterate values() only
    for documents in self.topics.values():
        summarizer = MeldaSummaryGenerator(documents, MeldaContentSelector(), self.args)
        summary = summarizer.generate_summary(self.idf)
        # assertIsNotNone states the intent directly
        # (original used assertIsNot(summary, None))
        self.assertIsNotNone(summary)
def test_get_lda_scores(self):
    """
    Test per-sentence LDA scores: one score per LDA topic, summing to ~1.
    :return: None
    """
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_term_doc_freq(self.topics)
    selector = MeldaContentSelector()
    model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
    first_sentence = self.doc_list[0].sens[0]
    selector.calculate_lda_scores([first_sentence], model)
    scores = first_sentence.lda_scores
    self.assertEqual(len(scores), self.args.lda_topics)
    # scores form a probability distribution over topics (to 2 places)
    self.assertAlmostEqual(sum(scores), 1, 2)
def test_document_topics(self):
    """
    Test that a document's LDA topic distribution has one probability per
    topic and sums to ~1.
    :return: None
    """
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_term_doc_freq(self.topics)
    selector = MeldaContentSelector()
    lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
    testtok = ['puppy', 'soldier', 'war', 'fetch']
    testsen = Vectors().create_term_sen_freq(testtok)
    document_topics = lda_model.get_document_topics(testsen, minimum_probability=0)
    topic_dist = [prob[1] for prob in document_topics]
    self.assertEqual(len(topic_dist), self.args.lda_topics)
    # assertAlmostEquals is a deprecated alias; use assertAlmostEqual
    self.assertAlmostEqual(sum(topic_dist), 1, 2)
def test_get_top_n(self):
    """
    Test select_top_n: keeping 1 sentence per LDA topic should leave
    exactly lda_topics sentences selected.
    :return: None
    """
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)
    Vectors().create_term_doc_freq(self.topics)
    selector = MeldaContentSelector()
    model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
    scored = selector.calculate_mead_scores(self.doc_list, self.args, self.idf)
    scored = selector.calculate_lda_scores(scored, model)
    scored = selector.calculate_melda_scores(scored)
    selector.select_top_n(scored, self.args.lda_topics, 1)
    self.assertEqual(len(selector.selected_content), self.args.lda_topics)
def test_melda_info_ordering(self):
    """
    Test that order_information neither adds nor drops selected sentences.
    :return: None
    """
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_freq_vectors(self.topics)
    Vectors().create_term_doc_freq(self.topics)
    summarizer = MeldaSummaryGenerator(self.doc_list, MeldaContentSelector(), self.args)
    selected = summarizer.select_content(self.idf)
    length_before = len(selected)
    summarizer.order_information()
    length_after = len(selected)
    self.assertEqual(length_before, length_after)
def test_create_mapping(self):
    """
    Test that create_mapping assigns a unique integer id to every word
    collected in WordMap.word_set.
    :return: None
    """
    Preprocessor.load_models()
    WordMap.word_set = set()
    WordMap.word_to_id = {}
    # Loading documents populates WordMap.word_set as a side effect
    Document("TST_ENG_20190101.0001")
    Document("TST_ENG_20190101.0002")
    WordMap.create_mapping()
    mapping = WordMap.get_mapping()
    # each word in word_set got added to the dictionary
    self.assertCountEqual(self.word_set, mapping.keys())
    # BUG FIX: the original compared against len(set(mapping.items())),
    # which always equals len(mapping) because dict keys are unique, so
    # the check was vacuous. Compare distinct *values* to verify that
    # each id in the dict is unique.
    self.assertEqual(len(mapping), len(set(mapping.values())))
def test_term_topics(self):
    """
    Test that 'puppy' and 'war' dominate opposite LDA topics,
    regardless of which topic index the model assigned to each.
    :return: None
    """
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_term_doc_freq(self.topics)
    selector = MeldaContentSelector()
    model = selector.build_lda_model(self.doc_list, self.args.lda_topics)
    puppy_topics = model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
    war_topics = model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
    puppy_dist = [p[1] for p in puppy_topics]
    war_dist = [p[1] for p in war_topics]
    # one word must lead in topic 0 while trailing in topic 1, or vice versa
    puppy_leads_first = puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]
    war_leads_first = puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]
    self.assertTrue(puppy_leads_first or war_leads_first)
def load_documents_for_topics(topic_soup):
    """
    Load documents for each topic
    :param topic_soup: parsed topic XML (BeautifulSoup-style object)
    :return: dict mapping topic id -> list of loaded documents
    """
    topics = {
        topic['id']: load_documents(topic)
        for topic in topic_soup.find_all('topic')
    }
    # At this point, all docs have been loaded and all unique words are
    # stored in the WordMap set, so trigger creation of the mapping and
    # of the vectors now.
    WordMap.create_mapping()
    vec = Vectors()
    vec.create_freq_vectors(topics)
    # do we need to have this here if we don't run mead based content selection
    vec.create_term_doc_freq(topics)
    return topics
def test_mead_summary_length(self):
    """
    Test length of summary is less than 100 words
    :return: None
    """
    topics = {
        'PUP1A': [
            Document('TST_ENG_20190101.0001'),
            Document('TST_ENG_20190101.0002'),
            Document('TST20190201.0001'),
            Document('TST20190201.0002')
        ],
        'WAR2A': [
            Document('TST_ENG_20190301.0001'),
            Document('TST_ENG_20190301.0002'),
            Document('TST20190401.0001'),
            Document('TST20190401.0002')
        ]
    }
    WordMap.create_mapping()
    vec = Vectors()
    vec.create_freq_vectors(topics)
    idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()
    max_length = 100
    for documents in topics.values():
        generator = MeadSummaryGenerator(documents, MeadContentSelector(), self.args)
        generator.select_content(idf)
        generator.order_information()
        realized_content = generator.realize_content()
        # BUG FIX: the original filtered with `if not " "`, which is always
        # False (a non-empty string is truthy), so the word list was always
        # empty and the length check was vacuous. split() with no argument
        # splits on any whitespace (including newlines) and drops empties.
        words = realized_content.split()
        self.assertLessEqual(len(words), max_length)
class VectorsTests(unittest.TestCase):
    """Tests for sentence/document frequency-vector construction."""

    # Class-level fixture: load models, build two test documents for one
    # topic, and create the word-id mapping once for all tests.
    Preprocessor.load_models()
    topics = {
        1: [Document('TST_ENG_20190101.0001'), Document('TST_ENG_20190101.0002')]
    }
    WordMap.create_mapping()
    mapping = WordMap.get_mapping()
    topic_one = topics.get(1)  # list of Documents

    def test_create_freq_vectors(self):
        """Each document's vector matrix has one row per sentence."""
        Vectors().create_freq_vectors(self.topics)
        expected_rows = 3
        for documents in self.topics.values():
            for document in documents:
                # check that there's a vector for each sentence
                rows = document.vectors.get_shape()[0]
                self.assertEqual(rows, expected_rows)

    def test_sentence_vector(self):
        """A sentence vector has nonzero columns for its own tokens."""
        # second sentence of the second doc in topic 1:
        # 'He loves playing so he liked to run around with the other dogs playing fetch.'
        sentence = self.topics.get(1)[1].sens[1]
        self.assertEqual(sentence.vector.getcol(WordMap.id_of('playing')).sum(), 1)
        for token in sentence.tokens:
            self.assertGreater(sentence.vector.getcol(WordMap.id_of(token)).sum(), 0)

    def test_get_topic_matrix(self):
        """All sentences from all topic docs make it into the topic matrix."""
        matrix = Vectors().get_topic_matrix(self.topic_one)
        expected_num_sentences = 6
        self.assertEqual(expected_num_sentences, matrix.get_shape()[0])