def test_sentence_vector(self):
    s = self.topics.get(1)[1].sens[1]  # s is a Sentence object
    # s text: 'He loves playing so he liked to run around with the other dogs playing fetch.'
    id_of_playing = WordMap.id_of('playing')
    self.assertEqual(s.vector.getcol(id_of_playing).sum(), 1)
    for word in s.tokens:
        id_of_word = WordMap.id_of(word)
        self.assertGreater(s.vector.getcol(id_of_word).sum(), 0)
def create_term_doc_freq(self, topics):
    """
    create term frequencies for each document in each topic
    :param topics: Dictionary {topic -> list of Documents}
    :return: None; each Document's tdf is set to a list of (word_id, freq) pairs,
        e.g. [(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (6, 5), (9, 2), (10, 1), (11, 1), ...]
    """
    for cluster in topics.values():
        for document in cluster:
            term_doc_freq_dict = {}
            for sentence in document.sens:
                for word in sentence.tokenized():
                    word_id = WordMap.id_of(word)
                    if word_id is None:
                        warnings.warn('Word \'' + word + '\' not in WordMap', Warning)
                        continue
                    if word_id not in term_doc_freq_dict:
                        term_doc_freq_dict[word_id] = 0
                    term_doc_freq_dict[word_id] += 1
            # sort by word id so the tdf list has a stable, predictable order
            term_doc_freq_list = []
            for word_id in sorted(term_doc_freq_dict):
                term_doc_freq_list.append((word_id, term_doc_freq_dict[word_id]))
            document.set_tdf(term_doc_freq_list)
def create_freq_vectors(self, topics):
    """
    creates a frequency vector for each sentence in each document in each topic in topics;
    stores single vectors in the relevant Sentence objects and per-document matrices in
    the relevant Document objects
    :param topics: Dictionary {topic -> list of Documents}
    :return: None
    pre: WordMap.create_mapping has been called
        (should happen in run_summarization document loading)
    """
    for cluster in topics.values():
        for document in cluster:
            doc_vectors = dok_matrix((0, self.num_unique_words))
            for sentence in document.sens:
                sentence_vector = dok_matrix((1, self.num_unique_words))
                for word in sentence.tokenized():
                    # maybe check that sentence.tokenized() is the right thing here
                    word_id = WordMap.id_of(word)
                    if word_id is None:
                        warnings.warn('Word \'' + word + '\' not in WordMap', Warning)
                        warnings.warn('Sentence: ' + sentence.raw_sentence, Warning)
                    else:
                        sentence_vector[0, word_id] += 1
                # assign vector to sentence object
                sentence.set_vector(sentence_vector)
                # add sentence vector to document matrix
                doc_vectors = vstack([doc_vectors, sentence_vector])
            # assign matrix to document
            document.set_vectors(doc_vectors)
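# Hedged usage sketch (not part of the Vectors class): the call order implied by the
# docstrings above. Imports are omitted because the project's module paths are not shown
# here, and build_all_vectors is a hypothetical helper name; `topics` is assumed to be the
# {topic -> list of Documents} dictionary produced by document loading.
def build_all_vectors(topics):
    WordMap.create_mapping()               # precondition: fixes the word -> id table
    vectors = Vectors()
    vectors.create_term_doc_freq(topics)   # per-document (word_id, freq) lists
    vectors.create_freq_vectors(topics)    # per-sentence dok_matrix rows, stacked per document
    return vectors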
def test_term_topics(self):
    WordMap.word_set = self.w_set
    WordMap.create_mapping()
    Vectors().create_term_doc_freq(self.topics)
    selector = MeldaContentSelector()
    lda_model = selector.build_lda_model(self.doc_list, self.args.lda_topics)

    puppy_topics = lda_model.get_term_topics(WordMap.id_of('puppy'), minimum_probability=0)
    war_topics = lda_model.get_term_topics(WordMap.id_of('war'), minimum_probability=0)
    puppy_dist = [prob[1] for prob in puppy_topics]
    war_dist = [prob[1] for prob in war_topics]

    # 'puppy' and 'war' should dominate opposite topics, whichever order LDA assigns them
    puppy_war = puppy_dist[0] > war_dist[0] and puppy_dist[1] < war_dist[1]
    war_puppy = puppy_dist[0] < war_dist[0] and puppy_dist[1] > war_dist[1]
    self.assertTrue(puppy_war or war_puppy)
def get_idf_array(self):
    """
    Use external corpus to get IDF scores for cluster centroid calculations
    :return: numpy array of idf values
    """
    corpus = brown
    if self.args.corpus == 'R':
        corpus = reuters
    num_words = Vectors().num_unique_words
    n = len(corpus.fileids())  # number of documents in corpus
    docs_word_matrix = np.zeros([n, num_words])
    for doc_idx, doc_id in enumerate(corpus.fileids()):
        sentences = list(corpus.sents(doc_id))
        words_in_doc = set()
        for s in sentences:
            s = ' '.join(s)
            proc_s = Preprocessor.get_processed_tokens(Preprocessor.get_processed_sentence(s))
            if proc_s:
                words_in_doc = words_in_doc.union(proc_s)
        for word in words_in_doc:
            word_idx = WordMap.id_of(word)
            if word_idx is not None:  # 'is not None' so the word with id 0 is not skipped
                docs_word_matrix[doc_idx, word_idx] = 1

    docs_per_word = np.sum(docs_word_matrix, axis=0)
    self.idf_array = np.log10(np.divide(n, docs_per_word + 1))  # add one to avoid divide-by-zero errors

    return self.idf_array
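# A minimal, self-contained check of the IDF formula used above, idf = log10(n / (df + 1)),
# on a toy document-frequency vector. It does not touch the Brown/Reuters corpora; the
# corpus size and counts below are illustrative only.
import numpy as np

n_docs = 1000                                # pretend corpus size
docs_per_word = np.array([0, 9, 99, 999])    # documents containing each word
idf = np.log10(np.divide(n_docs, docs_per_word + 1))
print(idf)                                   # -> [3. 2. 1. 0.]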
def get_centroid_score(self, sentence, centroid):
    """
    Get the centroid score for this sentence
    :param sentence: Sentence object
    :param centroid: array of centroid values indexed by word id
    :return: float
    """
    centroid_score = 0
    for word in sentence.tokens:
        word_id = WordMap.id_of(word)  # None for words outside the WordMap vocabulary
        centroid_score += centroid[word_id] if word_id is not None else 0
    # return centroid_score / (sentence.word_count() + 1)  # optional length normalization
    return centroid_score
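# Toy illustration of the scoring rule above: the score is the sum of the centroid values
# of the sentence's mapped tokens, and tokens missing from the map contribute 0.
# word_to_id, centroid and tokens below are made-up stand-ins, not project data.
word_to_id = {'puppy': 0, 'fetch': 1}
centroid = [0.5, 0.25]
tokens = ['puppy', 'fetch', 'zzz']           # 'zzz' is out of vocabulary

score = sum(centroid[word_to_id[t]] for t in tokens if t in word_to_id)
print(score)                                 # -> 0.75 ('zzz' contributes nothing)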
def test_get_idf_array(self):
    words = ["i", "eat", "cake", "is", "delicious", "puppies", "are", "cute",
             "cats", "furry", "bank", "company", "sugar", "dollar", "however", "say"]
    # Must override WordMap dictionary for test
    WordMap.word_to_id = {'delicious': 0, 'eat': 1, 'furry': 2, 'puppies': 3,
                          'i': 4, 'cats': 5, 'are': 6, 'is': 7, 'cute': 8,
                          'cake': 9, 'bank': 10, 'company': 11, 'sugar': 12,
                          'dollar': 13, 'however': 14, 'say': 15}
    idf = MeadSummaryGenerator(self.doc_list, MeadContentSelector(), self.args).get_idf_array()

    scores = []
    for word in words:
        curr_score = idf[WordMap.id_of(word)]
        scores.append("{:.5f}".format(curr_score))

    expected_scores = ['2.69897', '0.80688', '1.49485', '2.69897', '2.69897',
                       '2.69897', '2.69897', '1.92082', '2.69897', '2.69897',
                       '1.04576', '0.65365', '1.44370', '0.98297', '0.24718', '0.10018']

    self.assertListEqual(scores, expected_scores)
def create_term_sen_freq(self, sen):
    """
    create term frequencies for a tokenized sentence
    :param sen: list of tokens in one sentence
    :return: list of (word_id, freq) pairs sorted by word id
    """
    term_sen_freq_dict = {}
    for tok in sen:
        word_id = WordMap.id_of(tok)
        if word_id is None:
            warnings.warn('Word \'' + tok + '\' not in WordMap', Warning)
            continue
        if word_id not in term_sen_freq_dict:
            term_sen_freq_dict[word_id] = 0
        term_sen_freq_dict[word_id] += 1
    term_sen_freq_list = []
    for word_id in sorted(term_sen_freq_dict):
        term_sen_freq_list.append((word_id, term_sen_freq_dict[word_id]))
    return term_sen_freq_list
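# Equivalent sketch of the counting loop above using collections.Counter, assuming the
# same WordMap.id_of behaviour (None for out-of-vocabulary tokens); it yields the same
# sorted (word_id, freq) list. term_sen_freq_with_counter is a hypothetical name.
from collections import Counter

def term_sen_freq_with_counter(sen):
    ids = (WordMap.id_of(tok) for tok in sen)
    counts = Counter(word_id for word_id in ids if word_id is not None)
    return sorted(counts.items())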