    def extract_ngrams2(self, concept_type='ngrams', n=2):
        """Extract the ngrams of words from the input sentences.

        Args:
            concept_type (str): either 'ngrams' or 'phrase', defaults to
              'ngrams'
            n (int): the number of words for ngrams, defaults to 2
        """
        for i, sentence in enumerate(self.sentences):
            untokenized_concepts = []
            if concept_type == 'ngrams':
                ngrams = extract_ngrams2([sentence.untokenized_form],
                                         self.stemmer, self.LANGUAGE, n)
                pruned_list = prune_ngrams(ngrams, self.stoplist, n)
            elif concept_type == 'phrase':
                pruned_list = self.sentences[i].phrases

            for concept in pruned_list:
                wrds = unstem_ngram(concept, sentence)
                untokenized_concepts.append(" ".join(wrds))

            self.sentences[i].concepts = pruned_list
            self.sentences[i].untokenized_concepts = untokenized_concepts
            #print(untokenized_concepts)
            if len(self.sentences[i].concepts) != len(
                    self.sentences[i].untokenized_concepts):
                raise ValueError(
                    "unexpected length difference between concepts and untokenized_concepts"
                )
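The example above stems each sentence, extracts its n-grams, drops those dominated by stopwords, and keeps an untokenized surface form for each surviving concept. Below is a rough, self-contained sketch of that extract-then-prune idea; it is not the codebase's `extract_ngrams2`/`prune_ngrams` (which also handle stemming and language selection), and the stoplist, helper names and pruning rule are made up for illustration.

# Hypothetical illustration of the extract-then-prune step; stoplist, helper
# names and pruning rule are assumptions, not the codebase's own helpers.
STOPLIST = {"the", "a", "of", "and", "in", "over"}

def extract_bigrams(sentence, n=2):
    # naive whitespace tokenization, no stemming
    words = sentence.lower().split()
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

def prune_stopword_ngrams(ngrams, stoplist):
    # drop an n-gram whose first or last word is a stopword
    kept = []
    for ngram in ngrams:
        words = ngram.split()
        if words[0] in stoplist or words[-1] in stoplist:
            continue
        kept.append(ngram)
    return kept

sentence = "the quick brown fox jumps over the lazy dog"
print(prune_stopword_ngrams(extract_bigrams(sentence), STOPLIST))
# ['quick brown', 'brown fox', 'fox jumps', 'lazy dog']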
Example #2
    def prune_concepts(self, method="threshold", value=3, rejected_list=None):
        """Prune the concepts for efficient summarization.

        Args:
            method (str): the method for pruning concepts, either a minimal
              value for concept scores ('threshold'), the top-N highest
              scoring concepts ('top-n'), concepts consisting of stopwords
              only ('stopwords') or concepts given in rejected_list ('list'),
              defaults to 'threshold'.
            value (int): the value used for pruning concepts, defaults to 3.
            rejected_list (list): the concepts to remove when method is
              'list', defaults to an empty list.

        """
        if rejected_list is None:
            rejected_list = []
        # 'stopwords' pruning method
        if method == 'stopwords':
            # copy the keys so entries can be deleted while iterating
            concepts = list(self.weights.keys())
            for concept in concepts:
                pruned_list = prune_ngrams([concept], self.stoplist, 1)
                if not pruned_list:
                    # print(concept, self.weights[concept])
                    del self.weights[concept]

        if method == "list":
            concepts = self.weights.keys()
            for concept in concepts:
                if concept in rejected_list:
                    #print concept, self.weights[concept]
                    del self.weights[concept]

        # 'threshold' pruning method
        if method == "threshold":

            # iterates over the concept weights
            concepts = list(self.weights.keys())
            for concept in concepts:
                if self.weights[concept] < value:
                    del self.weights[concept]

        # 'top-n' pruning method
        elif method == "top-n":

            # sort concepts by scores
            sorted_concepts = sorted(self.weights,
                                     key=lambda x: self.weights[x],
                                     reverse=True)

            # iterates over the concept weights
            concepts = list(self.weights.keys())
            for concept in concepts:
                if concept not in sorted_concepts[:value]:
                    del self.weights[concept]

        # iterates over the sentences
        for i in range(len(self.sentences)):

            # current sentence concepts
            concepts = self.sentences[i].concepts

            # prune concepts
            self.sentences[i].concepts = [
                c for c in concepts if c in self.weights
            ]
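prune_concepts removes low-value entries from self.weights and then filters each sentence's concept list against the surviving keys. The following is a minimal standalone sketch of the 'threshold' and 'top-n' strategies, using a made-up weights dictionary and hypothetical helper names rather than the class above.

# Made-up weights, just to illustrate the two pruning strategies.
weights = {"machine learning": 5, "neural network": 4, "the of": 1, "data set": 3}

def prune_threshold(weights, value=3):
    # keep concepts whose score is at least `value`
    return {c: w for c, w in weights.items() if w >= value}

def prune_top_n(weights, value=2):
    # keep only the `value` highest scoring concepts
    top = sorted(weights, key=weights.get, reverse=True)[:value]
    return {c: weights[c] for c in top}

print(prune_threshold(weights))  # {'machine learning': 5, 'neural network': 4, 'data set': 3}
print(prune_top_n(weights))      # {'machine learning': 5, 'neural network': 4}

Building a new dictionary side-steps the delete-while-iterating pitfall that the in-place version above works around by copying the keys first.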
    def __call__(self, docs, models, length, ngram_type=2):
        self.sum_length = int(length)
        self.load_data(docs, models)
        self.get_ref_ngrams(ngram_type)
        self.ref_ngrams = prune_ngrams(self.ref_ngrams, self.stoplist, ngram_type)
        #self.prune_sentences(remove_citations=True, remove_redundancy=True)

        self.sentences_idx = list(range(len(self.sentences)))
        self.ref_ngrams_idx = list(range(len(self.ref_ngrams)))

        summary_idx = self.solve_ilp(ngram_type)
        summary_txt = self.get_summary_text(summary_idx)

        return summary_txt
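A hypothetical invocation of the summarizer above; only the __call__ signature comes from the example, while `Summarizer`, `docs`, `models` and the constructor configuration (stemmer, stoplist, language) are placeholder assumptions.

# Hypothetical usage sketch; everything except the call signature is assumed.
summarizer = Summarizer()  # configuration not shown in this example
summary = summarizer(docs, models, length=100, ngram_type=2)
print(summary)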
    def extract_ngrams2(self, concept_type='ngrams', n=2):
        """Extract the ngrams of words from the input sentences.

        Args:
            n (int): the number of words for ngrams, defaults to 2
        """
        for i, sentence in enumerate(self.sentences):
            if concept_type == 'ngrams':
                ngrams = extract_ngrams2([sentence.untokenized_form], self.stemmer, self.LANGUAGE, n)
                pruned_list = prune_ngrams(ngrams, self.stoplist, n)
            elif concept_type == 'phrase':
                pruned_list = self.sentences[i].phrases
                
            self.sentences[i].concepts = pruned_list
Example #5
    def recommend_highest_weight(self, samples, limit=1, prune=True):
        """Recommend the highest weighted concepts from samples that have not yet received feedback."""
        w = dict(self.graph.get_weights())
        s = sorted(w, key=w.get, reverse=True)
        seen = self.flight_recorder.union()
        s = [item for item in s if
             item not in seen.reject
             and item not in seen.accept
             and item not in seen.implicit_reject]

        # optionally drop ngrams that consist mostly of stopwords
        candidates = prune_ngrams(s, self.stoplist, self.N) if prune else s
        result = []
        for concept in candidates:
            if concept in samples:
                # log.debug("adding %s with weight %s to result" % (concept, w[concept]))
                result.append(concept)

        return result[:limit]
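recommend_highest_weight boils down to ranking the graph's concepts by weight, skipping anything the user already accepted or rejected, and returning the top `limit` concepts that actually occur in the candidate summary. The toy snippet below, with made-up weights and feedback sets, shows that ranking-and-filtering step.

# Made-up data; illustrates ranking by weight and skipping concepts that
# already received feedback.
weights = {"neural network": 0.9, "data set": 0.7, "machine learning": 0.4}
already_seen = {"data set"}                        # accepted or rejected earlier
samples = {"neural network", "machine learning"}   # concepts in the summary

ranked = sorted(weights, key=weights.get, reverse=True)
candidates = [c for c in ranked if c not in already_seen]
recommended = [c for c in candidates if c in samples][:1]
print(recommended)  # ['neural network']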
    def get_feedback(self, subset, recommender=None):
        """
            Generate feedback for the subset sentences by peeking into the reference summary.

        :param subset: The indices of the sentences to get feedback for.
        :param allowed_number_of_feedbacks: how many concepts may be sent to the oracle, default all
        """
        new_implicit_rejects = set(
        )  # currently not used (all writing occurences are commented out)

        summary = [
            self.summarizer.sentences[j].untokenized_form for j in subset
        ]
        # print('Feedback-optimal summary:', summary)

        if self.parse_type == 'parse':
            print('feedback on phrases')
            summary_phrases = [
                self.summarizer.sentences[j].phrases for j in subset
            ]

            samples = list(itertools.chain(*summary_phrases))
            references = self.ref_phrases

        elif self.parse_type is None:
            print('feedback on ngrams')
            summary_concepts = [
                self.summarizer.sentences[j].concepts for j in subset
            ]

            samples = list(itertools.chain(*summary_concepts))
            references = self.ref_ngrams

        # from all samples, use a sub-set
        if recommender is None:
            use_samples = samples
        elif recommender == RECOMMENDER_METHOD_SAMPLING:
            use_samples = random.sample(
                samples, self.allowed_number_of_feedback_per_iteration)
        elif recommender == RECOMMENDER_METHOD_HIGHEST_WEIGHT:
            use_samples = self.recommend_highest_weight(
                samples, self.allowed_number_of_feedback_per_iteration)

        new_rejects = list(
            self.Oracle.reject_concepts(use_samples, references) -
            self.flight_recorder.union().reject)
        new_accepts = list(
            self.Oracle.accept_concepts(use_samples, references) -
            self.flight_recorder.union().accept)

        new_rejects = prune_ngrams(new_rejects, self.stoplist, self.N)
        new_accepts = prune_ngrams(new_accepts, self.stoplist, self.N)
        '''
        if self.parse_type == 'parse':
            self.recorder.total_accept_keys += self.project_phrase_ngrams(self.recorder.accepted_concepts)
            self.recorder.total_reject_keys += self.project_phrase_ngrams(self.recorder.rejected_concepts)
            
            x = list(Set(self.recorder.total_accept + self.recorder.union.reject))
            new_implicit_rejects = list(self.get_implicit_feedback(summ_ngrams, x) - Set(self.recorder.total_implicit_reject))
            # self.recorder.total_implicit_reject += self.recorder.latest().implicit_reject
        '''

        # self.recorder.total_accept += self.recorder.accepted_concepts
        # self.recorder.total_reject += self.recorder.rejected_concepts
        # self.recorder.total_implicit_reject += self.recorder.latest().implicit_reject
        return (new_accepts, new_rejects, new_implicit_rejects)
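In this example the oracle feedback amounts to set operations: concepts from the candidate summary that also occur in the reference n-grams are accepted, the rest are rejected, and anything already recorded by the flight recorder is filtered out before pruning. Below is a plain-set sketch of that split; it assumes the oracle accepts a concept iff it appears in the references, which is an assumption about `Oracle.accept_concepts`/`reject_concepts` rather than their shown implementation.

# Plain-set sketch of the accept/reject split; the membership rule is assumed.
samples = {"neural network", "data set", "the of"}
references = {"neural network", "machine learning"}
previously_accepted = set()
previously_rejected = {"the of"}

new_accepts = (samples & references) - previously_accepted
new_rejects = (samples - references) - previously_rejected
print(new_accepts)  # {'neural network'}
print(new_rejects)  # {'data set'}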