def load_sume_sentences(self, docs, parse_type=None, parse_info=None):
    """Convert raw documents into sume Sentence objects.

    :param docs: the documents to load, as (doc_name, sentences) tuples
    :param parse_type: optional parse type (kept for interface compatibility)
    :param parse_info: optional constituency-parse information; the parser is
        optional, so the system must work without it
    :return: list[Sentence]
    @type docs: list[tuple]
    @type parse_type: str
    @type parse_info: list
    """
    if parse_info is None:
        # Normalize to an empty list instead of using a mutable default
        # argument, which would be shared across calls.
        parse_info = []
    doc_sentences = []
    for doc_id, doc in enumerate(docs):
        doc_name, doc_sents = doc
        for sent_pos, sentence in enumerate(doc_sents):
            token_sentence = word_tokenize(sentence, self.LANGUAGE)
            if parse_info:
                # parse_info[0] holds the per-document parses, one parse
                # string per sentence.
                parse_sent = parse_info[0][doc_id][1][sent_pos]
                hash_tokens_pos, phrases = get_parse_info(
                    parse_sent, self.stemmer, self.LANGUAGE, self.stoplist)
                pruned_phrases = prune_phrases(
                    phrases, self.stoplist, self.stemmer, self.LANGUAGE)
                sentence_s = Sentence(token_sentence, doc_id, sent_pos + 1,
                                      pruned_phrases, hash_tokens_pos)
            else:
                sentence_s = Sentence(token_sentence, doc_id, sent_pos + 1)
            untokenized_form = untokenize(token_sentence)
            sentence_s.untokenized_form = untokenized_form
            sentence_s.length = len(untokenized_form.split(' '))
            doc_sentences.append(sentence_s)
    return doc_sentences
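# Usage sketch (illustrative, not part of the original module): `sw` stands in for
# an instance of the class owning load_sume_sentences; `docs` follows the shape
# unpacked above, i.e. a list of (doc_name, [sentence, ...]) tuples.
#
#     docs = [("doc1", ["The cat sat on the mat.", "It purred."]),
#             ("doc2", ["A dog barked outside."])]
#     sentences = sw.load_sume_sentences(docs)  # no parse_info: n-gram-only Sentences
#     for s in sentences:
#         print(s.untokenized_form, s.length)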
def __init__(self, rouge, models, parse_info, language, stemmer, summary_length=100,
             N=2, stopwords=None, ub_score=None, ub_summary=None, summarizer=None,
             parser_type=None):
    self.rouge = rouge
    self.models = models
    self.language = language
    self.stopwords = stopwords or set()
    self.summary_length = summary_length
    # N-grams that occur in the reference summaries (for the feedback to peek at).
    self.ref_ngrams = set()
    # Phrases that occur in the reference summaries (for the feedback to peek at).
    self.ref_phrases = set()
    self.__ub_summary__ = ub_summary or []
    self.__ub_score__ = ub_score or (0.0, 0.0, 0.0)

    # This only deals with the reference summaries.
    parse_info = parse_info or []
    for model_name, model in models:
        y = set(extract_ngrams2(model, stemmer, language, N))
        self.ref_ngrams = self.ref_ngrams.union(y)
    if parser_type == PARSE_TYPE_PARSE:
        for _, parse_sents in parse_info[1]:
            for parse_sent in parse_sents:
                _, phrases = get_parse_info(parse_sent, stemmer, language, stopwords)
                y = set(prune_phrases(phrases, stopwords, stemmer, language))
                self.ref_phrases = self.ref_phrases.union(y)

    if summarizer is not None:
        if parser_type is None or parser_type == PARSE_TYPE_NGRAMS:
            concept_match = [key for key in summarizer.weights
                             if key in self.ref_ngrams]
            log.debug('Total uniq ref concepts (ngr): %s' % len(self.ref_ngrams))
        elif parser_type == PARSE_TYPE_PARSE:
            concept_match = [key for key in summarizer.weights
                             if key in self.ref_phrases]
            log.debug('Total uniq ref concepts (phr): %s' % len(self.ref_phrases))
        else:
            raise ValueError("parser_type '%s' is invalid, should be %s or %s"
                             % (parser_type, None, PARSE_TYPE_PARSE))
        log.debug('UB Accept concepts: %s' % len(concept_match))
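# Shape of `parse_info`, inferred from the indexing in load_sume_sentences
# (parse_info[0][doc_id][1][sent_pos]) and the parse_info[1] loop above; a sketch,
# not an authoritative spec:
#
#     parse_info = [
#         [(doc_name,   [parse_str, ...]), ...],   # [0]: per-document parses
#         [(model_name, [parse_str, ...]), ...],   # [1]: reference-summary parses
#     ]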
def __call__(self, docs, models, summary_length, oracle_type, ub_score, ub_summary,
             parser_type=None, parse_info=None, max_iteration_count=11,
             weights_override=None, clear_before_override=None, propagation=False):
    """Run the simulated feedback loop for a single cluster of documents against a
    list of models, i.e. the models are united and the feedback loop is simulated.

    :param docs: the cluster documents to summarize
    :param models: the reference summaries
    :param summary_length: target summary length in words
    :param oracle_type: which feedback oracle to simulate
    :param ub_score: upper-bound ROUGE score for the cluster
    :param ub_summary: upper-bound summary for the cluster
    :param parser_type: None for n-gram concepts, or PARSE_TYPE_PARSE for phrases
    :param parse_info: constituency-parse information for docs and models
    :param max_iteration_count: int: maximum number of iterations to run
    :param weights_override: dict: (concept -> double) dictionary containing the
        override weights for propagation
    """
    if parse_info is None:
        parse_info = []
    if weights_override is None:
        weights_override = {}

    self.models = models
    self.summary_length = summary_length
    self.ub_score = ub_score
    self.parse_type = parser_type
    self.cluster_size = len(docs)
    self.MAX_WEIGHT = len(docs)

    # Collect reference n-grams (and phrases, when parses are available) so the
    # oracle can peek at the gold concepts.
    for model_name, model in models:
        y = set(extract_ngrams2(model, self.stemmer, self.language, self.N))
        self.ref_ngrams = self.ref_ngrams.union(y)
    if parser_type == PARSE_TYPE_PARSE:
        for _, parse_sents in parse_info[1]:
            for parse_sent in parse_sents:
                _, phrases = get_parse_info(parse_sent, self.stemmer,
                                            self.language, self.stoplist)
                y = set(prune_phrases(phrases, self.stoplist, self.stemmer,
                                      self.language))
                self.ref_phrases = self.ref_phrases.union(y)

    self.summarizer.sentences = self.SumeWrap.load_sume_sentences(
        docs, parser_type, parse_info)
    parse_info = []  # drop the local reference; the parses are no longer needed

    # Extract concepts: phrases when a parse is available, bigrams otherwise.
    if self.parse_type == PARSE_TYPE_PARSE:
        print('Get concept types Phrases')
        self.summarizer.extract_ngrams2(concept_type='phrase')
    if self.parse_type is None:
        print('Get concept types ngrams')
        self.summarizer.extract_ngrams2(concept_type='ngrams')

    # Compute document frequency as concept weights.
    self.summarizer.compute_document_frequency()
    # Compute word frequency.
    self.summarizer.compute_word_frequency()

    old_sentences = self.summarizer.sentences
    self.summarizer.prune_sentences(remove_citations=True,
                                    remove_redundancy=True, imp_list=[])

    # From all concepts belonging to pruned sentences, keep only those that
    # also appear elsewhere; delete the weights of the rest.
    retained_concepts = [concept for s in self.summarizer.sentences
                         for concept in s.concepts]
    print('Total concepts before sentence pruning: ', len(self.summarizer.weights))
    for sentence in set(old_sentences).difference(self.summarizer.sentences):
        for concept in sentence.concepts:
            if concept not in retained_concepts and concept in self.summarizer.weights:
                del self.summarizer.weights[concept]
    print('Total concepts found: ', len(self.summarizer.weights))

    if self.parse_type is None:
        concept_match = [key for key in self.summarizer.weights
                         if key in self.ref_ngrams]
        print('Total ref concepts: ', len(self.ref_ngrams))
    elif self.parse_type == PARSE_TYPE_PARSE:
        concept_match = [key for key in self.summarizer.weights
                         if key in self.ref_phrases]
        print('Total ref concepts: ', len(self.ref_phrases))
    print('UB Accept concepts: ', len(concept_match))

    if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
        self.get_feature_vector()
        self.data = np.array(self.fvector)
        model = svm.SVC(kernel='linear', C=1.0, probability=True,
                        class_weight='balanced')

    self.initial_weights = self.summarizer.weights
    self.__apply_initial_weights_override__(weights_override, clear_before_override)

    print('Summarizing %s sentences down to %s words'
          % (len(self.summarizer.sentences), self.summary_length))
    # Core algorithm for the feedback calculation (as in the paper).
    flag = 0
    # get_details is the personalized-summary function, which receives updated
    # weights in every iteration, starting from the Boudin weights (except when
    # weights_override is given).
    summary, self.score, subset = self.get_details(1, summary_length, oracle_type)
    self.prev_score = (0.0, 0.0, 0.0)
    prev_summary = ''
    for iteration in range(2, max_iteration_count):
        self.dump_current_weight_map(self.debug_dump_target_dir,
                                     max_iteration_count)
        # Depending on the oracle_type, an intermediate summary is generated.
        # This intermediate summary satisfies other optimization criteria, so
        # that the amount/probability of useful feedback is maximized.
        if iteration > 2:
            subset = self.__generate_optimal_feedback_summary__(
                flag, oracle_type, summary_length)
        print('Summary Subset:', subset)

        # Acquire feedback and record it using the flight_recorder.
        new_accepts, new_rejects, new_implicits = self.get_feedback(subset)
        self.flight_recorder.record(new_accepts, new_rejects, new_implicits)

        # Update the summarizer weights for the next iteration.
        self.recalculate_weights(oracle_type, propagation)

        summary, self.score, _ = self.get_details(iteration, summary_length,
                                                  oracle_type)
        if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
            self.uncertainity, self.labels = self.get_uncertainity_labels(model)

        if self.check_break_condition(iteration, prev_summary, summary,
                                      ub_summary, self.prev_score):
            break
        self.prev_score = self.score
        prev_summary = summary
    return summary
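# End-to-end driver sketch (hypothetical: the class owning __call__ is not shown
# here, and `SimulatedFeedback`, `sf`, and the argument values are illustrative
# assumptions, not the original code).
#
#     sf = SimulatedFeedback(...)  # constructed elsewhere with stemmer, stoplist, etc.
#     summary = sf(docs, models,
#                  summary_length=100,
#                  oracle_type=ORACLE_TYPE_ACTIVE_LEARNING,  # constant used above
#                  ub_score=(0.0, 0.0, 0.0),
#                  ub_summary=[],
#                  parser_type=None,  # n-gram concepts (see extract_ngrams2 branch)
#                  max_iteration_count=11)
#     print(summary)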
"King Sihanouk declined to chair talks in either place.", "A U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.2", "But in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.", "Left out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians." ] sents = [ "Budget negotiations between the White House and House Republicans were delayed on several issues.", "At issue were provisions that included requiring Federal Health Insurance providers to provide contraceptives to women as Well as a provision to build a road across a wildlife preserve in Alaska.", "The contraceptive issue faced an uncertain future while Clinton likely will veto the road.", "There is disagreement also on how to spend the funding on education.", "This year's budget discussions also have been hampered because it is the first time since budget procedures were established in 1974 that there has been a surplus, preventing agreement on a budget resolution." ] sentences = parser.raw_parse_sents(sents) language = 'english' stemmer = SnowballStemmer(language) stoplist = set(stopwords.words(language)) for sent in sentences: phrases = [] parsestr = unicode(list(sent)[0]) #print 'Sent:', parsestr tokens = Tree.fromstring(parsestr).leaves() print tokens hash_pos_tokens, phrases = get_parse_info(parsestr, stemmer, language, stoplist) check = prune_phrases(phrases, stoplist, stemmer, language) for x in check: print(unicode(x)) print('No. of phrases:', len(check))