Example #1
    def __init__(self, args, opts):
        '''
        Constructor
        '''
        # Word tokenizer without stemming, sentence tokenizer without
        # character offsets, and a general utility helper.
        self.tkzr = WordTokenizer(stem=False)
        self.s_t = SentTokenizer(offsets=False)
        self.util = Utils()
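WordTokenizer, SentTokenizer, and Utils are project-specific helpers that are not shown on this page. Judging only from how they are used in the examples below (SentTokenizer(offsets=False) acts as a callable that splits text into sentences, and WordTokenizer.count_words counts the word tokens in a list of sentences), a minimal NLTK-based stand-in could look like the sketch below; the internals are an assumption, not the project's actual implementation.

import nltk

class SentTokenizer(object):
    '''Hypothetical stand-in: splits a document into sentences.'''

    def __init__(self, offsets=False):
        self.offsets = offsets  # character offsets are not supported here

    def __call__(self, text, offsets=None):
        if isinstance(text, list):
            text = ' '.join(text)
        return nltk.sent_tokenize(text)

class WordTokenizer(object):
    '''Hypothetical stand-in: word-level tokenization and counting.'''

    def __init__(self, stem=False):
        self.stem = stem  # stemming is not implemented in this sketch

    def count_words(self, sentences):
        # Total number of word tokens across a list of sentences.
        return sum(len(nltk.word_tokenize(s)) for s in sentences)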
Example #2
    def summarize(self, extracted_refs, facet_results, max_length=250):
        '''
        Summarizes the extracted references based on the facet results

        Chooses from facets naively

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
        '''
        summaries = defaultdict(lambda: defaultdict(list))
        for t in extracted_refs:
            topic = t[0]['topic']
            citance = t[0]['citance_number']
            if isinstance(t[0]['sentence'][0], list):
                logger.warning('Unexpected nested sentence list, should check')
            facet = facet_results[topic.upper()][str(citance)]['SVM_LABEL']
            summaries[topic.upper()][facet].append(t[0]['sentence'])

        final_summ = defaultdict(lambda: defaultdict(dict))
        counts = defaultdict(lambda: defaultdict(dict))
        sent_tok = SentTokenizer(offsets=False)
        for t in summaries:
            for facet in summaries[t]:
                sents = []
                for e in summaries[t][facet]:
                    sents.extend(sent_tok(e))
                final_summ[t][facet] = sents
                counts[t][facet] = len(final_summ[t][facet])
        summ = defaultdict(list)
        tokzer = WordTokenizer(stem=False)
        for k in final_summ:
            for f in final_summ[k]:
                # Append sentences facet by facet until the word budget is hit
                for sent in final_summ[k][f]:
                    if tokzer.count_words(summ[k]) < max_length:
                        summ[k].append(sent)
        return summ
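As a rough illustration, the call below shows inputs shaped the way summarize() accesses them; the summarizer instance, the topic name, and the field values are made up for the example, while the field names mirror those read in the method body.

extracted_refs = [
    [{'topic': 'd14_train',
      'citance_number': 1,
      'sentence': ['The reference sentence that was extracted.']}],
]
facet_results = {
    'D14_TRAIN': {'1': {'SVM_LABEL': 'method'}},
}

summ = summarizer.summarize(extracted_refs, facet_results, max_length=250)
# summ maps each topic to a list of sentences whose total word count
# is capped at roughly max_length.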
Example #3
    def summarize(self, doc, max_length=10):
        '''
        Summarizes a document or a list of documents using Maximal Marginal
        Relevance (MMR):
        MMR(S) = lambda*Sim(S, D) - (1 - lambda)*Sim(S, Summary)

        Args:
            doc: (list) | (str)
            max_length: The maximum length (in words) of the desired summary

        Returns:
            list -- the selected summary sentences
        '''
        if isinstance(doc, str):  # a single string: split it into sentences
            s_t = SentTokenizer()
            docs = s_t(doc, offsets=False)
            docs += [doc]  # dummy "sentence": the whole document
        else:  # already a list of sentences, no need to tokenize
            docs = doc + [' '.join(doc)]
        tokzr = self.get_tokenizer('regex', True)
        vectorizer = TfidfVectorizer(min_df=1,
                                     max_df=0.95,  # drop terms in >95% of sentences
                                     tokenizer=tokzr,
                                     stop_words=stopwords.words('english'))
        vectors = vectorizer.fit_transform(docs).toarray()
        doc_texts = {i: v for i, v in enumerate(docs)}
        doc_dict = {i: v for i, v in enumerate(vectors)}
        feature_names = vectorizer.get_feature_names_out()  # vocabulary (unused below)
        # idf_vals = vectorizer.idf_

        summ_scores = []  # heap of (mmr_score, sentence_id) tuples
        for i, s in doc_texts.items():  # score each sentence for the summary
            if len(summ_scores) > 0:
                # Text of the sentences selected so far (the running summary)
                summ_v = ' '.join([doc_texts[e[1]] for e in summ_scores])
            else:
                summ_v = ''
            if summ_v != '':
                summ_v = vectorizer.transform([summ_v]).toarray()[0]  # to tf-idf
            score = self._mmr(
                vectorizer.transform([s]).toarray()[0],
                doc_dict[len(doc_dict) - 1], summ_v, self.lmbda, self.cossim)
            if len(summ_scores) < max_length // 30 + 3:
                heappush(summ_scores, (score, i))
            else:
                # Fixed-size min-heap: evict the lowest-scoring candidate so
                # only the highest-MMR sentences are kept.
                heappushpop(summ_scores, (score, i))
        final_sum = []
        # Take the highest-scoring sentences first until the word budget runs out
        for score, idx in sorted(summ_scores, reverse=True):
            if self.w_t.count_words(final_sum) < max_length:
                final_sum.append(doc_texts[idx])
        if self.w_t.count_words(final_sum) > max_length:
            tmp = final_sum.pop()
            # If dropping the last sentence emptied the summary, keep it even
            # though it exceeds the length budget.
            if not final_sum:
                final_sum.append(tmp)
        return final_sum
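The _mmr and cossim helpers (along with self.w_t and self.lmbda) are attributes of the surrounding class and do not appear on this page; the snippet also assumes module-level imports such as from heapq import heappush, heappushpop, from sklearn.feature_extraction.text import TfidfVectorizer, and from nltk.corpus import stopwords. A minimal sketch consistent with the docstring formula MMR(S) = lambda*Sim(S, D) - (1 - lambda)*Sim(S, Summary), with signatures inferred from the call site, might look like this; treat it as an assumption, not the project's code.

import numpy as np

def cossim(u, v):
    # Cosine similarity between two dense tf-idf vectors.
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.0

def _mmr(sent_vec, doc_vec, summ_vec, lmbda, sim):
    # MMR(S) = lambda * Sim(S, D) - (1 - lambda) * Sim(S, Summary).
    # summ_vec is the empty string while no sentence has been selected yet,
    # so the redundancy term is skipped on the first iteration.
    relevance = sim(sent_vec, doc_vec)
    redundancy = 0.0 if isinstance(summ_vec, str) else sim(sent_vec, summ_vec)
    return lmbda * relevance - (1 - lmbda) * redundancy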
Example #4
    def summarize(self,
                  extracted_refs,
                  facet_results,
                  max_length=250,
                  mode='citance'):
        '''
        Summarizes the extracted references based on community detection

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
            mode(str) -- can be 'citance' or 'reference'

        '''
        citances = defaultdict(list)
        summarizer = LexRankSummarizer(Stemmer('english'))
        summary = defaultdict(lambda: defaultdict(list))
        for t in extracted_refs:
            citances[t[0]['topic']].append({
                'refs': t[0]['sentence'],
                'citance': self.clean_citation(t[0]['citation_text'])
            })

        for topic, citance in citances.items():
            # Create a graph of pairwise citation similarities
            vectorizer = TfidfVectorizer(tokenizer=self.tokenize,
                                         min_df=1,
                                         max_df=0.9)  # drop terms in >90% of citances
            cit_vectors = vectorizer.fit_transform(
                [e['citance'] for e in citance]).toarray()
            cit_text = {i: v for i, v in enumerate(citance)}
            cit_dict = {i: v for i, v in enumerate(cit_vectors)}
            cits = []
            for e in cit_dict:  # vector (numpy array)
                for e1 in cit_dict:
                    if e != e1:
                        simil = self.cossim(cit_dict[e], cit_dict[e1])
                        if simil > 0.1:
                            cits.append((e, e1, simil))
            G = nx.Graph()
            G.add_weighted_edges_from(cits)
            part = community.best_partition(G)
            clusters = defaultdict(list)
            tokenize = SentTokenizer(offsets=False)
            for k, v in part.items():
                clusters[v].extend(tokenize(citance[k]['refs']))
            # clusters includes ref sentences that belong in each cluster
            # Find the most salient sentence in each cluster
            sal_in_cluster = {}  # salient sentences for each cluster
            for i in clusters:
                parser = PlaintextParser.from_string(
                    ' '.join(clusters[i]).replace('\\', ''),
                    Tokenizer('english'))
                summ = summarizer(parser.document, 5)
                # 5 is the number of sentences returned by LexRank
                sal_in_cluster[i] = [str(s) for s in summ]
                # The most salient sentences in each cluster
            summary[topic.upper()] = self.pick_from_cluster(
                sal_in_cluster, max_length, weighted=False)
        return summary
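clean_citation and pick_from_cluster are methods of the same class and are not shown here. Based only on how pick_from_cluster is called above, one plausible, purely hypothetical implementation interleaves the salient sentences of each cluster until the word budget is spent:

from itertools import zip_longest

def pick_from_cluster(sal_in_cluster, max_length, weighted=False):
    # Hypothetical helper: round-robin over the clusters, taking their most
    # salient sentences first, until the summary reaches max_length words.
    # The weighted flag is ignored in this sketch.
    summary, length = [], 0
    rounds = zip_longest(*[sal_in_cluster[c] for c in sorted(sal_in_cluster)])
    for rnd in rounds:
        for sentence in rnd:
            if sentence is None:
                continue
            n_words = len(sentence.split())
            if length + n_words > max_length:
                return summary
            summary.append(sentence)
            length += n_words
    return summary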