def __init__(self, args, opts):
    ''' Constructor '''
    self.tkzr = WordTokenizer(stem=False)
    self.s_t = SentTokenizer(offsets=False)
    self.util = Utils()
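# Imports assumed by the summarize methods below (a sketch inferred from the
# names they use; WordTokenizer, SentTokenizer, Utils, and `logger` are
# project-local helpers defined elsewhere in this package and not imported here).
from collections import defaultdict
from heapq import heappush, heappushpop

import community  # python-louvain, used for Louvain community detection
import networkx as nx
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer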
def summarize(self, extracted_refs, facet_results, max_length=250):
    '''
    Summarizes the extracted references based on the facet results.
    Chooses sentences from the facets naively.

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference,
            see data/task1b_results1.json
        max_length(int) -- maximum length of the summary
    '''
    # Group the extracted reference sentences by topic and predicted facet.
    summaries = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        topic = t[0]['topic']
        citance = t[0]['citance_number']
        if isinstance(t[0]['sentence'][0], list):
            logger.warn('Unexpected nested sentence list, should check')
        summaries[topic.upper()][
            facet_results[topic.upper()][str(citance)]['SVM_LABEL']].append(
                t[0]['sentence'])

    # Sentence-tokenize the grouped text and count sentences per facet.
    final_summ = defaultdict(lambda: defaultdict(dict))
    counts = defaultdict(lambda: defaultdict(dict))
    sent_tok = SentTokenizer(offsets=False)
    for t in summaries:
        for facet in summaries[t]:
            sents = []
            for e in summaries[t][facet]:
                sents.extend(sent_tok(e))
            final_summ[t][facet] = sents
            counts[t][facet] = len(final_summ[t][facet])

    # Greedily add sentences, facet by facet, until the word budget is reached.
    summ = defaultdict(list)
    tokzr = WordTokenizer(stem=False)
    for k in final_summ:
        for f in final_summ[k]:
            for i in range(len(final_summ[k][f])):
                if tokzr.count_words(summ[k]) < max_length:
                    summ[k].append(final_summ[k][f][i])
    return summ
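# A hypothetical input sketch for the facet-based summarizer above. The field
# names mirror the accesses in the code; the topic ID, citance number, facet
# label, and sentence text are illustrative only.
extracted_refs_example = [
    [{'topic': 'D1418_TRAIN',
      'citance_number': 3,
      'sentence': ['A reference sentence extracted for this citance.']}],
]
facet_results_example = {
    'D1418_TRAIN': {'3': {'SVM_LABEL': 'Method_Citation'}},
}
# Assuming `method` is an instance of the class defining summarize() above:
# summ = method.summarize(extracted_refs_example, facet_results_example)
# `summ` then maps each topic to the selected sentences within the word budget.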
def summarize(self, doc, max_length=10):
    '''
    Summarizes a document or a list of documents using MMR:

        MMR(S) = lambda * Sim(S, D) - (1 - lambda) * Sim(S, Summary)

    Args:
        doc: (list) | (str)
        max_length: The maximum length of the desired summary

    Returns
    -------
    list(str)
    '''
    if isinstance(doc, str):
        # A raw string: sentence-tokenize it first.
        s_t = SentTokenizer()
        docs = s_t(doc, offsets=False)
        docs += [doc]  # Dummy sentence: the whole document
    else:
        # Already a list of sentences, no need to tokenize.
        docs = doc + [' '.join(doc)]
    tokzr = self.get_tokenizer('regex', True)
    vectorizer = TfidfVectorizer(min_df=1,
                                 max_df=len(docs) * .95,
                                 tokenizer=tokzr,
                                 stop_words=stopwords.words('english'))
    vectors = vectorizer.fit_transform(docs).toarray()
    doc_texts = {i: v for i, v in enumerate(docs)}
    doc_dict = {i: v for i, v in enumerate(vectors)}
    feature_names = vectorizer.get_feature_names()

    summ_scores = []  # tuples of (negated mmr_score, sentence_id)
    for i, s in doc_texts.iteritems():
        # Iterate through sentences to select them for the summary.
        if len(summ_scores) > 0:
            # Text of the sentences already selected for the summary.
            summ_v = ' '.join([doc_texts[e[1]] for e in summ_scores])
        else:
            summ_v = ''
        if summ_v != '':
            summ_v = vectorizer.transform([summ_v]).toarray()[0]  # to tf-idf
        score = -1 * self._mmr(
            vectorizer.transform([s]).toarray()[0],
            doc_dict[len(doc_dict) - 1],  # the whole-document vector
            summ_v,
            self.lmbda,
            self.cossim)
        if len(summ_scores) < max_length / 30 + 3:
            # Scores are negated so heapq (a min-heap) acts as a max heap.
            heappush(summ_scores, (score, i))
        else:
            # Heap is full: push the new score and pop the smallest stored value.
            heappushpop(summ_scores, (score, i))

    # Collect the selected sentences up to the word budget.
    final_sum = []
    for s in summ_scores:
        if self.w_t.count_words(final_sum) < max_length:
            final_sum.append(doc_texts[s[1]])
    if self.w_t.count_words(final_sum) > max_length:
        tmp = final_sum.pop()
        if self.w_t.count_words(final_sum) == 0:
            final_sum.append(tmp)
    return final_sum
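# A minimal, standalone sketch of the MMR score from the docstring above,
# assuming cosine similarity as Sim() and dense tf-idf vectors; the class's
# own self._mmr and self.cossim helpers are defined elsewhere and may differ.
import numpy as np

def _cosine(u, v):
    # Cosine similarity of two dense vectors; 0.0 if either is all zeros.
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0

def _mmr_score(sent_vec, doc_vec, summ_vec, lmbda=0.7):
    # MMR(S) = lambda * Sim(S, D) - (1 - lambda) * Sim(S, Summary)
    # lmbda=0.7 is an arbitrary illustrative default, not the class setting.
    sim_doc = _cosine(sent_vec, doc_vec)
    # An empty summary contributes no redundancy penalty.
    sim_summ = _cosine(sent_vec, summ_vec) if len(summ_vec) else 0.0
    return lmbda * sim_doc - (1 - lmbda) * sim_summ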
def summarize(self, extracted_refs, facet_results, max_length=250,
              mode='citance'):
    '''
    Summarizes the extracted references based on community detection.

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference,
            see data/task1b_results1.json
        max_length(int) -- maximum length of the summary
        mode(str) -- can be citance, reference
    '''
    citances = defaultdict(list)
    summarizer = LexRankSummarizer(Stemmer('english'))
    summary = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        citances[t[0]['topic']].append({
            'refs': t[0]['sentence'],
            'citance': self.clean_citation(t[0]['citation_text'])})

    for topic, citance in citances.iteritems():
        # Build a graph whose edges are pairwise citation similarities.
        vectorizer = TfidfVectorizer(tokenizer=self.tokenize,
                                     min_df=1,
                                     max_df=len(citance) * .9)
        cit_vectors = vectorizer.fit_transform(
            [e['citance'] for e in citance]).toarray()
        cit_text = {i: v for i, v in enumerate(citance)}
        cit_dict = {i: v for i, v in enumerate(cit_vectors)}
        cits = []
        for e in cit_dict:  # vector (numpy array)
            for e1 in cit_dict:
                if e != e1:
                    simil = self.cossim(cit_dict[e], cit_dict[e1])
                    if simil > 0.1:
                        cits.append((e, e1, simil))

        # Partition the similarity graph into communities (clusters).
        G = nx.Graph()
        G.add_weighted_edges_from(cits)
        part = community.best_partition(G)
        clusters = defaultdict(list)
        tokenize = SentTokenizer(offsets=False)
        for k, v in part.iteritems():
            clusters[v].extend(tokenize(citance[k]['refs']))
        # `clusters` holds the reference sentences that belong in each cluster.

        # Find the most salient sentences in each cluster.
        sal_in_cluster = {}  # salient sentences for each cluster
        for i in clusters:
            parser = PlaintextParser.from_string(
                ' '.join(clusters[i]).replace('\\', ''),
                Tokenizer('english'))
            # 5 is the number of sentences returned by LexRank.
            summ = summarizer(parser.document, 5)
            # The most salient sentences in each cluster.
            sal_in_cluster[i] = [unicode(s) for s in summ]
        summary[topic.upper()] = self.pick_from_cluster(
            sal_in_cluster, max_length, weighted=False)
    return summary
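# A self-contained sketch of the clustering step above: build a weighted
# citation-similarity graph with networkx and partition it with the Louvain
# method (python-louvain's community.best_partition). The edge list is toy
# data standing in for cosine similarities above the 0.1 threshold.
import networkx as nx
import community  # python-louvain

edges = [(0, 1, 0.82), (1, 2, 0.64), (0, 2, 0.35), (3, 4, 0.91)]
G = nx.Graph()
G.add_weighted_edges_from(edges)
part = community.best_partition(G)  # {citance_index: cluster_id}
clusters = {}
for node, cluster_id in part.items():
    # Group citance indices by cluster, mirroring the `clusters` dict above.
    clusters.setdefault(cluster_id, []).append(node)
print(clusters)  # e.g. {0: [0, 1, 2], 1: [3, 4]}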