Exemplo n.º 1
0
class LeadOracle():
    """Oracle baseline built from the leading sentences of single articles.

    For every article a candidate summary is formed from its first sentences
    (as many as fit into the length budget). Each candidate is scored with
    ``compute_rouge_n`` against the reference and the best one is returned.
    """

    def __init__(self, rouge_n=1, metric='f'):
        """Store the ROUGE settings and create the helper ``Summarizer``.

        Args:
            rouge_n: N-gram order forwarded to ``compute_rouge_n``.
            metric: Key used to read the score out of the ROUGE result
                (presumably ``'f'`` is the F-score -- confirm against
                ``compute_rouge_n``).
        """
        self.rouge_n = rouge_n
        self.metric = metric
        self.summarizer = Summarizer()

    def summarize(self,
                  ref,
                  articles,
                  max_len=40,
                  len_type='words',
                  in_titles=False,
                  out_titles=False,
                  min_sent_tokens=7,
                  max_sent_tokens=40):
        """Return the lead summary that scores best against ``ref``.

        Args:
            ref: Reference summary handed to ``compute_rouge_n``.
            articles: Articles, passed through ``Summarizer._preprocess``
                before use.
            max_len: Length budget of a summary, measured by
                ``Summarizer._sent_len`` with ``len_type``.
            len_type: Length unit forwarded to ``Summarizer._sent_len``.
            in_titles: Title sentences are only kept when both ``in_titles``
                and ``out_titles`` are truthy.
            out_titles: See ``in_titles``.
            min_sent_tokens: Accepted for interface compatibility; not used
                by this method.
            max_sent_tokens: Accepted for interface compatibility; not used
                by this method.

        Returns:
            The text of the best-scoring lead summary, or ``''`` when no
            article yields a candidate (no articles, or the first usable
            sentence of every article already exceeds ``max_len``).
        """
        articles = self.summarizer._preprocess(articles)
        keep_titles = in_titles and out_titles

        best_summary = ''
        best_score = None
        for a in articles:
            sents = a.sents
            if not keep_titles:
                sents = [s for s in sents if not s.is_title]

            selected_sents = []
            current_len = 0
            for s in sents:
                new_len = current_len + self.summarizer._sent_len(s, len_type)
                # Lead summaries are contiguous prefixes: stop at the first
                # sentence that does not fit instead of skipping over it.
                if new_len > max_len:
                    break
                selected_sents.append(s.text)
                current_len = new_len

            if not selected_sents:
                continue

            summary = ' '.join(selected_sents)
            rouge_scores = compute_rouge_n(summary,
                                           ref,
                                           self.rouge_n,
                                           tokenize=True)
            score = rouge_scores[self.metric]
            # Strict comparison keeps the earliest article on ties.
            if best_score is None or score > best_score:
                best_summary, best_score = summary, score

        return best_summary
Exemplo n.º 2
0
class Oracle():
    """Greedy oracle that picks sentences maximising ROUGE-N to a reference.

    Sentences from all articles are pooled and deduplicated. In every step
    the sentence that gives the highest score when added to the current
    selection (and still fits the length budget) is appended.
    """

    def __init__(self, rouge_n=1, metric='f', early_stopping=True):
        """Store the ROUGE settings and create the helper ``Summarizer``.

        Args:
            rouge_n: N-gram order forwarded to ``compute_rouge_n``.
            metric: Key used to read the score out of the ROUGE result
                (presumably ``'f'`` is the F-score -- confirm against
                ``compute_rouge_n``).
            early_stopping: If truthy, the best-scoring intermediate
                selection is returned; otherwise the final (longest) greedy
                selection is returned.
        """
        self.rouge_n = rouge_n
        self.metric = metric
        self.early_stopping = early_stopping
        self.summarizer = Summarizer()

    def summarize(self,
                  ref,
                  articles,
                  max_len=40,
                  len_type='words',
                  in_titles=False,
                  out_titles=False,
                  min_sent_tokens=7,
                  max_sent_tokens=40):
        """Return a greedily built extractive summary for ``ref``.

        Args:
            ref: Reference summary; tokenized with ``word_tokenize``.
            articles: Articles, passed through ``Summarizer._preprocess``
                before use.
            max_len: Length budget of the summary, measured by
                ``Summarizer._sent_len`` with ``len_type``.
            len_type: Length unit forwarded to ``Summarizer._sent_len``.
            in_titles: Title sentences are only kept when both ``in_titles``
                and ``out_titles`` are truthy.
            out_titles: See ``in_titles``.
            min_sent_tokens: Accepted for interface compatibility; not used
                by this method.
            max_sent_tokens: Accepted for interface compatibility; not used
                by this method.

        Returns:
            The selected sentence texts joined by single spaces (in selection
            order), or ``''`` when nothing could be selected.
        """
        articles = self.summarizer._preprocess(articles)
        sents = [s for a in articles for s in a.sents]
        sents = self.summarizer._deduplicate(sents)
        if not (in_titles and out_titles):
            sents = [s for s in sents if not s.is_title]
        if not sents:
            # Nothing to choose from; skip tokenizing the reference.
            return ''

        sent_lens = [self.summarizer._sent_len(s, len_type) for s in sents]
        ref_words = word_tokenize(ref)
        current_len = 0
        remaining = list(range(len(sents)))
        selected = []
        # One entry per greedy step: (indices selected so far, score).
        scored_selections = []

        while current_len < max_len and remaining:
            current_summary_words = [
                tok for i in selected for tok in sents[i].words
            ]
            scored = []
            for i in remaining:
                if current_len + sent_lens[i] > max_len:
                    continue
                try:
                    summary_words = current_summary_words + sents[i].words
                    rouge_scores = compute_rouge_n(summary_words,
                                                   ref_words,
                                                   rouge_n=self.rouge_n,
                                                   tokenize=False)
                    score = rouge_scores[self.metric]
                except Exception:
                    # Best effort: a candidate that cannot be scored is
                    # skipped. Unlike a bare ``except`` this lets
                    # KeyboardInterrupt/SystemExit propagate.
                    continue
                scored.append((i, score))
            if not scored:
                break
            # max() returns the first maximal item, i.e. the lowest index
            # wins on ties (same as a stable descending sort).
            best_idx, best_score = max(scored, key=lambda x: x[1])
            selected.append(best_idx)
            scored_selections.append((list(selected), best_score))
            current_len += sent_lens[best_idx]
            remaining.remove(best_idx)

        if not scored_selections:
            return ''

        if self.early_stopping:
            # Best-scoring prefix of the greedy selection; shortest on ties.
            best_selection = max(scored_selections, key=lambda x: x[1])[0]
        else:
            # No early stopping: drop the shorter intermediate selections
            # and keep the full greedy selection (always the last entry).
            best_selection = scored_selections[-1][0]

        return ' '.join(sents[i].text for i in best_selection)