Пример #1
0
 def getSentenceDistributions(self):
     """Pick the dominant topic entry for each sentence in ``self.sentence_groups``.

     Every sentence is tokenized with ``tokenizer``, turned into a
     bag-of-words via ``self.dictionary.doc2bow`` and handed to
     ``self.lda.get_document_topics``. Of the entries that come back, only
     the one with the largest second element is kept.

     Returns:
         A list with one inner list per sentence group. Each inner list
         holds ``(k, best)`` tuples, where ``k`` is the first item of the
         group's ``(k, sentence)`` pair (the sentence number in the group)
         and ``best`` is the selected ``(topic_id, weight)`` entry.
         Sentences for which the tokenizer returns ``None`` or for which no
         topic entry is available are left out.
     """
     to_bow = self.dictionary.doc2bow
     infer_topics = self.lda.get_document_topics
     result = []
     for group in self.sentence_groups:
         per_sentence = []
         for number, text in group:
             words = tokenizer(text)
             if words is not None:
                 topics = infer_topics(to_bow(words))
                 try:
                     # keep only the single highest-weighted entry
                     top = max(topics, key=lambda pair: pair[1])
                 except ValueError:
                     # nothing to choose from: skip this sentence
                     pass
                 else:
                     per_sentence.append((number, top))
         result.append(per_sentence)
     return result
 def getSentenceDistributions(self):
     """Compute the dominant topic entry for each sentence of every group.

     Each sentence in ``self.sentence_groups`` is tokenized with
     ``tokenizer``, converted to a bag-of-words with
     ``self.dictionary.doc2bow`` and passed to
     ``self.lda.get_document_topics``; the entry with the largest second
     element is kept.

     Returns:
         A list of lists. Each inner list corresponds to one sentence group
         and stores one ``(k, dominant)`` tuple per retained sentence, where
         ``k`` is the sentence number in the group and ``dominant`` is a
         ``(topic_id, weight)`` tuple. Sentences for which the tokenizer
         returns ``None`` or for which no topic entry is available are
         skipped.
     """
     distributions = []
     get_bow = self.dictionary.doc2bow
     get_document_topics = self.lda.get_document_topics
     for sentences in self.sentence_groups:
         sentence_distributions = []
         for k, sentence in sentences:
             tkns = tokenizer(sentence)
             if tkns is None:
                 continue
             dist = get_document_topics(get_bow(tkns))
             try:
                 # keep the dominant entry only (not the full ranking)
                 dominant = max(dist, key=lambda x: x[1])
             except ValueError:
                 # max() raises ValueError on an empty sequence: no topic
                 # was assigned to this sentence, so leave it out
                 continue
             sentence_distributions.append((k, dominant))
         distributions.append(sentence_distributions)
     # hand the collected result back to the caller (summarize stores it)
     return distributions
Пример #3
0
 def summarize(self, documents):
     """Run the summarization pipeline over ``documents``.

     The documents are tokenized, each token list is passed through
     ``self.bigramizer`` and then converted to a bag-of-words with
     ``self.dictionary.doc2bow``. The results of the later stages are
     stored on the instance as ``dominant_topic_ids``, ``sentence_groups``,
     ``distributions`` and ``summary_data``; nothing is returned.
     """
     # Pass 1: raw documents -> token lists.
     token_lists = []
     for doc in documents:
         token_lists.append(tokenizer(doc))

     # Pass 2: look every token list up in the bigramizer.
     phrased = []
     for token_list in token_lists:
         phrased.append(self.bigramizer[token_list])

     # Pass 3: token lists -> bag-of-words corpus.
     bows = []
     for token_list in phrased:
         bows.append(self.dictionary.doc2bow(token_list))

     self.dominant_topic_ids = self.getDominantTopics(bows)

     self.sentence_groups = self.splitIntoSentences(documents)

     # reads self.sentence_groups, so it must run after the split above
     self.distributions = self.getSentenceDistributions()

     self.summary_data = self.sentenceSelection(verbose=False)
 def summarize(self, documents):
     """Build the summary data for ``documents`` and keep it on the instance.

     Steps, in order: tokenize each document; pass each token list through
     ``self.bigramizer``; convert each result to a bag-of-words with
     ``self.dictionary.doc2bow``; then fill in ``self.dominant_topic_ids``,
     ``self.sentence_groups``, ``self.distributions`` and
     ``self.summary_data``. The method has no return value.
     """
     tokenized = []
     for document in documents:
         tokenized.append(tokenizer(document))

     with_bigrams = []
     for words in tokenized:
         with_bigrams.append(self.bigramizer[words])

     corpus = []
     for words in with_bigrams:
         corpus.append(self.dictionary.doc2bow(words))

     # dominant topics are derived from the bag-of-words corpus
     self.dominant_topic_ids = self.getDominantTopics(corpus)

     # sentence splitting works on the raw documents, not on the tokens
     self.sentence_groups = self.splitIntoSentences(documents)

     self.distributions = self.getSentenceDistributions()

     self.summary_data = self.sentenceSelection(verbose=False)