def getSentenceDistributions(self):
    # computes topic distributions for each sentence
    # output: list of lists
    #   each inner list corresponds to a document and stores one tuple per
    #   sentence: the 1st element is the sentence number within the group,
    #   the 2nd element is a (topic_id, weight) tuple
    distributions = list()
    get_bow = self.dictionary.doc2bow
    get_document_topics = self.lda.get_document_topics
    for sentences in self.sentence_groups:
        sentence_distributions = list()
        for k, sentence in sentences:
            tkns = tokenizer(sentence)
            if tkns is None:
                continue
            bow = get_bow(tkns)
            dist = get_document_topics(bow)
            # to get the list of dominant topic ids in decreasing order of weight:
            # dist.sort(key=lambda x: x[1], reverse=True)
            # dist = [d[0] for d in dist]
            #
            # to get the single dominant (topic_id, weight) pair:
            try:
                dist = max(dist, key=lambda x: x[1])
            except ValueError:
                # empty distribution: skip this sentence
                continue
            sentence_distributions.append((k, dist))
        distributions.append(sentence_distributions)
    return distributions
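# A minimal sketch (not part of the original class) of the dominant-topic
# selection above, assuming a gensim-style list of (topic_id, weight)
# pairs; the sample distribution is hypothetical. max() keyed on the
# weight returns the heaviest pair, and raises ValueError on an empty
# list, which is why the loop above catches it and skips the sentence:
#
#     dist = [(0, 0.12), (3, 0.71), (7, 0.17)]
#     max(dist, key=lambda x: x[1])  # -> (3, 0.71)
#     max([], key=lambda x: x[1])    # -> raises ValueError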
def summarize(self, documents):
    # end-to-end pipeline: tokenize the documents, apply the bigram model,
    # build the bag-of-words corpus, then pick summary sentences by topic
    tokens = [tokenizer(document) for document in documents]
    tokens = [self.bigramizer[tkn] for tkn in tokens]
    corpus = [self.dictionary.doc2bow(tkn) for tkn in tokens]
    self.dominant_topic_ids = self.getDominantTopics(corpus)
    self.sentence_groups = self.splitIntoSentences(documents)
    self.distributions = self.getSentenceDistributions()
    self.summary_data = self.sentenceSelection(verbose=False)
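# A minimal usage sketch, assuming these methods live on a summarizer
# class; `Summarizer` and its constructor arguments are hypothetical
# names, not the original API. `dictionary`, `lda`, and `bigramizer`
# would be a gensim Dictionary, a trained LdaModel, and a Phrases-style
# bigram model, respectively:
#
#     summarizer = Summarizer(dictionary=dictionary, lda=lda,
#                             bigramizer=bigramizer)
#     summarizer.summarize(documents)
#     print(summarizer.summary_data)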