    def load_sume_sentences(self, docs, parse_type=None, parse_info=None):
        """

        :param docs: the documents to load
        :param parse_type:
        :param parse_info:
        :return: list[Sentence]

        @type docs: list[tuple]
        @type parse_type: str
        @type parse_info: list
        """
        if parse_info is None:
            parse_info = []

        doc_sentences = []
        doc_id = 0
        for doc_id, doc in enumerate(docs):
            doc_name, doc_sents = doc
            for sent_pos, sentence in enumerate(doc_sents):
                token_sentence = word_tokenize(sentence, self.LANGUAGE)
                if parse_info:
                    parse_sent = parse_info[0][doc_id][1][sent_pos]
                    hash_tokens_pos, phrases = get_parse_info(parse_sent, self.stemmer, self.LANGUAGE, self.stoplist)
                    pruned_phrases = prune_phrases(phrases, self.stoplist, self.stemmer, self.LANGUAGE)
                    sentence_s = Sentence(token_sentence, doc_id, sent_pos+1, pruned_phrases, hash_tokens_pos)
                else:
                    sentence_s = Sentence(token_sentence, doc_id, sent_pos+1)
                            
                #print token_sentence
                untokenized_form = untokenize(token_sentence)
                sentence_s.untokenized_form = untokenized_form
                sentence_s.length = len(untokenized_form.split(' '))
                doc_sentences.append(sentence_s)
            
        return doc_sentences
Example #2
    def load_sume_sentences(self, docs, parse_type=None, parse_info=None):
        """

        :param docs: the documents to load
        :param parse_type:
        :param parse_info:
        :return: list[Sentence]

        @type docs: list[tuple]
        @type parse_type: str
        @type parse_info: list
        """
        if parse_info is None:
            parse_info = []
            # The system should work even without a parser; it is optional
            # print("Warning!!!!! No parse_info available for docs %s" % docs)
            # raise BaseException("Warning!!!!! No parse_info available")

        doc_sentences = []
        doc_id = 0
        for doc_id, doc in enumerate(docs):
            doc_name, doc_sents = doc
            for sent_pos, sentence in enumerate(doc_sents):
                token_sentence = word_tokenize(sentence, self.LANGUAGE)
                if parse_info:
                    parse_sent = parse_info[0][doc_id][1][sent_pos]
                    # _, raw_phrases = get_parse_info(parse_sent, self.stemmer, self.LANGUAGE, self.stoplist, use_stems=False)
                    hash_tokens_pos, phrases = get_parse_info(parse_sent, self.stemmer, self.LANGUAGE, self.stoplist)
                    pruned_phrases = prune_phrases(phrases, self.stoplist, self.stemmer, self.LANGUAGE)
                    sentence_s = Sentence(token_sentence, doc_id, sent_pos+1, pruned_phrases, hash_tokens_pos)
                else:
                    sentence_s = Sentence(token_sentence, doc_id, sent_pos+1)

                #print token_sentence
                untokenized_form = untokenize(token_sentence)
                sentence_s.untokenized_form = untokenized_form
                sentence_s.length = len(untokenized_form.split(' '))
                doc_sentences.append(sentence_s)
            
        return doc_sentences
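
Both variants above expect docs to be a list of (doc_name, sentence_list) tuples; when parse information is supplied it is indexed as parse_info[0][doc_id][1][sent_pos]. A minimal, hypothetical usage sketch without parse information (the loader object is a stand-in for whatever instance exposes this method; it is not constructed on this page):

# Hypothetical usage; loader stands in for the object providing load_sume_sentences.
docs = [
    ('doc-A', ['The first sentence of document A.', 'The second sentence of document A.']),
    ('doc-B', ['The only sentence of document B.']),
]
sentences = loader.load_sume_sentences(docs)  # no parse_info: plain Sentence objects
for s in sentences:
    print(s.untokenized_form, s.length)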
Example #3
    def __init__(self, rouge, models, parse_info, language, stemmer, summary_length=100, N=2, stopwords=None,
                 ub_score=None,
                 ub_summary=None, summarizer=None, parser_type=None):
        self.rouge = rouge
        self.models = models
        self.language = language
        self.stopwords = stopwords or set()
        self.summary_length = summary_length

        self.ref_ngrams = set()  # set of ngrams that are in the reference summaries (for the feedback to peek at)
        self.ref_phrases = set()  # set of phrases that are in the reference summaries (for the feedback to peek at)
        self.__ub_summary__ = ub_summary or []
        self.__ub_score__ = ub_score or (0.0, 0.0, 0.0)

        # this only deals with the reference summaries
        parse_info = parse_info or []
        for model_name, model in models:
            y = set(extract_ngrams2(model, stemmer, language, N))
            self.ref_ngrams = self.ref_ngrams.union(y)
            if parser_type == PARSE_TYPE_PARSE:
                for _, parse_sents in parse_info[1]:
                    for parse_sent in parse_sents:
                        _, phrases = get_parse_info(parse_sent, stemmer, language,
                                                    stopwords)
                        y = set(prune_phrases(phrases, stopwords, stemmer, language))
                        self.ref_phrases = self.ref_phrases.union(y)

        if summarizer is not None:
            if parser_type is None or parser_type == PARSE_TYPE_NGRAMS:
                concept_match = [key for key in summarizer.weights if key in self.ref_ngrams]
                log.debug('Total uniq ref concepts (ngr):   %s' % (len(self.ref_ngrams)))
            elif parser_type == PARSE_TYPE_PARSE:
                concept_match = [key for key in summarizer.weights if key in self.ref_phrases]
                log.debug('Total uniq ref concepts (phr):   %s' % (len(self.ref_phrases)))
            else:
                raise ValueError("parse_type '%s' is invalid, should be %s, %s or %s" %
                                 (parser_type, None, PARSE_TYPE_NGRAMS, PARSE_TYPE_PARSE))
            log.debug('UB Accept concepts:  %s' % (len(concept_match)))
    def __call__(self,
                 docs,
                 models,
                 summary_length,
                 oracle_type,
                 ub_score,
                 ub_summary,
                 parser_type=None,
                 parse_info=None,
                 max_iteration_count=11,
                 weights_override=None,
                 clear_before_override=None,
                 propagation=False):
        """
        This starts the simulated feedback for a single cluster of documents, against a list of models, i.e. the
        models are united, and then the feedback loop is simulated.

        :param docs:
        :param models:
        :param summary_length:
        :param oracle_type:
        :param ub_score:
        :param ub_summary:
        :param parser_type:
        :param parse_info:
        :param max_iteration_count: int: Maximum number of iterations to run.
        :param weights_override: dict: (concept -> double) dictionary containing the override weights for propagation
        """

        if parse_info is None:
            parse_info = []
        if weights_override is None:
            weights_override = {}

        self.models = models
        self.summary_length = summary_length
        self.ub_score = ub_score
        self.parse_type = parser_type
        self.cluster_size = len(docs)
        self.MAX_WEIGHT = len(docs)

        for model_name, model in models:
            y = set(extract_ngrams2(model, self.stemmer, self.language,
                                    self.N))
            self.ref_ngrams = self.ref_ngrams.union(y)
            if parser_type == PARSE_TYPE_PARSE:
                for _, parse_sents in parse_info[1]:
                    for parse_sent in parse_sents:
                        _, phrases = get_parse_info(parse_sent, self.stemmer,
                                                    self.language,
                                                    self.stoplist)
                        y = set(
                            prune_phrases(phrases, self.stoplist, self.stemmer,
                                          self.language))
                        self.ref_phrases = self.ref_phrases.union(y)

        self.summarizer.sentences = self.SumeWrap.load_sume_sentences(
            docs, parser_type, parse_info)
        parse_info = []

        # extract bigrams as concepts
        if self.parse_type == PARSE_TYPE_PARSE:
            print('Get concept types Phrases')
            self.summarizer.extract_ngrams2(concept_type='phrase')
        if self.parse_type is None:
            print('Get concept types ngrams')
            self.summarizer.extract_ngrams2(concept_type='ngrams')

        # compute document frequency as concept weights
        self.summarizer.compute_document_frequency()

        # compute word_frequency
        self.summarizer.compute_word_frequency()

        old_sentences = self.summarizer.sentences

        self.summarizer.prune_sentences(remove_citations=True,
                                        remove_redundancy=True,
                                        imp_list=[])

        # from all concepts that are going to be pruned, keep only those that also appear elsewhere

        retained_concepts = [
            concept for s in self.summarizer.sentences
            for concept in s.concepts
        ]

        print('Total concepts before sentence pruning: ',
              len(self.summarizer.weights))

        for sentence in set(old_sentences).difference(
                self.summarizer.sentences):
            for concept in sentence.concepts:
                if concept not in retained_concepts and concept in self.summarizer.weights:
                    del self.summarizer.weights[concept]

        print('Total concepts found: ', len(self.summarizer.weights))

        if self.parse_type is None:
            concept_match = [
                key for key in self.summarizer.weights
                if key in self.ref_ngrams
            ]
            print('Total ref concepts:   ', len(self.ref_ngrams))
        elif self.parse_type == PARSE_TYPE_PARSE:
            concept_match = [
                key for key in self.summarizer.weights
                if key in self.ref_phrases
            ]
            print('Total ref concepts:   ', len(self.ref_phrases))
        print('UB Accept concepts:   ', len(concept_match))

        if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
            self.get_feature_vector()
            self.data = np.array(self.fvector)
            model = svm.SVC(kernel='linear',
                            C=1.0,
                            probability=True,
                            class_weight='balanced')

        self.initial_weights = self.summarizer.weights

        self.__apply_initial_weights_override__(weights_override,
                                                clear_before_override)
        '''
        # create the co-occurrence graph
        self.graph.clear()
        self.graph.add_sentences(self.summarizer.sentences)
        dump_dir=tempfile.mkdtemp(dir=self.debug_dump_target_dir)
        '''

        print('Summarizing %s sentences down to %s words' %
              (len(self.summarizer.sentences), self.summary_length))
        # core algorithm for feedback calculation... (as in paper)
        flag = 0
        # get_details is the personalizedSummary function, which gets the updated weights in every iteration,
        # starting with the Boudin weights (except when weights_override is given).

        # initial iteration
        summary, self.score, subset = self.get_details(1, summary_length,
                                                       oracle_type)
        self.prev_score = (0.0, 0.0, 0.0)
        prev_summary = ''
        for iteration in range(2, max_iteration_count):
            self.dump_current_weight_map(self.debug_dump_target_dir,
                                         max_iteration_count)
            # here, depending on the oracle_type, an intermediate summary is generated. This intermediate summary
            # satisfies other optimization criteria, so that the amount/probability of getting useful feedback is maximized
            if iteration > 2:
                subset = self.__generate_optimal_feedback_summary__(
                    flag, oracle_type, summary_length)

            print('Summary Subset:', subset)

            # acquire feedback and record it using the flight_recorder
            #new_accepts, new_rejects, new_implicits = self.get_feedback(subset, RECOMMENDER_METHOD_HIGHEST_WEIGHT)
            new_accepts, new_rejects, new_implicits = self.get_feedback(subset)
            self.flight_recorder.record(new_accepts, new_rejects,
                                        new_implicits)

            # update the summarizer weights for next iteration
            self.recalculate_weights(oracle_type, propagation)

            summary, self.score, _ = self.get_details(iteration,
                                                      summary_length,
                                                      oracle_type)

            if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
                self.uncertainity, self.labels = self.get_uncertainity_labels(
                    model)

            if self.check_break_condition(iteration, prev_summary, summary,
                                          ub_summary, self.prev_score):
                break

            self.prev_score = self.score
            prev_summary = summary

        return summary
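
The driving code for this __call__ method is not shown on this page. A rough, hypothetical sketch of a single invocation (the feedback instance and the 'accept_reject' oracle_type string are assumptions, not values taken from the source):

# Hypothetical usage sketch; feedback is assumed to be an already-constructed
# instance of the class defining __call__ above, and 'accept_reject' is only a
# placeholder oracle_type.
docs = [('doc-A', ['First sentence of document A.', 'Second sentence of document A.'])]
models = [('model-1', ['A reference summary sentence.'])]
summary = feedback(docs, models,
                   summary_length=100,
                   oracle_type='accept_reject',
                   ub_score=(0.0, 0.0, 0.0),
                   ub_summary=[],
                   parser_type=None,
                   parse_info=[],
                   max_iteration_count=11)
print(summary)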
Example #5
    "King Sihanouk declined to chair talks in either place.",
    "A U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.2",
    "But in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.",
    "Left out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians."
]

sents = [
    "Budget negotiations between the White House and House Republicans were delayed on several issues.",
    "At issue were provisions that included requiring Federal Health Insurance providers to provide contraceptives to women as Well as a provision to build a road across a wildlife preserve in Alaska.",
    "The contraceptive issue faced an uncertain future while Clinton likely will veto the road.",
    "There is disagreement also on how to spend the funding on education.",
    "This year's budget discussions also have been hampered because it is the first time since budget procedures were established in 1974 that there has been a surplus, preventing agreement on a budget resolution."
]
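
# parser, the NLTK imports, and the project helpers used below are not defined in
# this snippet. One plausible setup (an assumption, not shown on this page) is
# NLTK's CoreNLP client, which exposes raw_parse_sents() and requires a running
# CoreNLP server:
from nltk import Tree
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
# get_parse_info and prune_phrases are the project's own helpers shown in the
# examples above; their import path is not given here.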

sentences = parser.raw_parse_sents(sents)
language = 'english'
stemmer = SnowballStemmer(language)
stoplist = set(stopwords.words(language))

for sent in sentences:
    phrases = []
    parsestr = str(list(sent)[0])
    # print('Sent:', parsestr)
    tokens = Tree.fromstring(parsestr).leaves()
    print(tokens)
    hash_pos_tokens, phrases = get_parse_info(parsestr, stemmer, language,
                                              stoplist)
    check = prune_phrases(phrases, stoplist, stemmer, language)
    for x in check:
        print(x)
    print('No. of phrases:', len(check))