def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set([])
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0: max_order = 0

        skip = False
        if sent.length <= 5: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0:
            skip = True
            max_order = 0

        if skip:
            max_order += 1
            continue

        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path+id)

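## The snippets in this file all rely on util.get_ngrams(seq, n, bounds=..., as_string=...).
## The helper below is NOT the project's implementation, only a minimal sketch consistent
## with how it is called here: slide a window of size n over the tokens, optionally pad
## with boundary markers, and optionally join each n-gram with '_' so it can serve as a
## set/dict key.
def sketch_get_ngrams(tokens, n=2, bounds=False, as_string=False):
    # accept either a pre-tokenized list or a whitespace-separated string
    if isinstance(tokens, str):
        tokens = tokens.split()
    if bounds:
        tokens = ['<s>'] + tokens + ['</s>']
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
    if as_string:
        ngrams = ['_'.join(g) for g in ngrams]
    return ngrams

## e.g. sketch_get_ngrams('the dog barked', 2, bounds=False, as_string=True)
## -> ['the_dog', 'dog_barked']
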
def make_concepts_gold(id, path, sents, gold_sents):

    ## get gold concepts
    all_concepts = collections.defaultdict(set)
    for sent in gold_sents:
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        for concept in concepts:
            all_concepts[concept].add(sent.doc)

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    ## get sentence concepts
    seen_sents = set()
    for sent_index in range(len(sents)):
        sent = sents[sent_index]
        sent.concepts = set([])

        ## skip some sents
        skip = False
        #if sent.order >= 3: skip = True
        if not sent.new_par: skip = True
        if sent.length < 20: skip = True
        if sent.orig in seen_sents: skip = True
        if sent.length <= 5: skip = True
        if skip: continue

        seen_sents.add(sent.orig)
        s = util.porter_stem_sent(util.tokenize(fix_text(sent.orig)))
        concepts = set(util.get_ngrams(s, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)

def make_concepts(id, path, sents, R1=True, R2=False, R4=False, SU4=False):

    all_concepts = collections.defaultdict(int)
    print "******************make_concepts", id

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set()
        if R1:
            concepts = set(util.get_ngrams(sent.tok2, 1, bounds=False, as_string=True))
        elif R2:
            concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        elif R4:
            concepts = set(util.get_ngrams(sent.tok2, 4, bounds=False, as_string=True))
        elif SU4:
            concepts = set(util.get_su4(sent.tok2, as_string=True))

        for concept in concepts:
            all_concepts[concept] += 1
        sent.concepts = concepts

    return create_ilp_output(sents, all_concepts, path+id)

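## The R1/R2/R4/SU4 flags above mirror ROUGE-1/2/4/SU4 units. The helper below is a
## sketch only (not the project's util.get_su4 / util.get_skip_bigrams): it enumerates
## ordered token pairs at most k positions apart, which is the usual skip-bigram unit
## behind the SU4 setting (SU additionally includes unigrams, as in __init__ below).
def sketch_get_skip_bigrams(tokens, k=4, as_string=False):
    if isinstance(tokens, str):
        tokens = tokens.split()
    pairs = []
    for i in range(len(tokens)):
        for j in range(i + 1, min(i + 1 + k, len(tokens))):
            pairs.append((tokens[i], tokens[j]))
    if as_string:
        pairs = ['_'.join(p) for p in pairs]
    return pairs
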
def build_program(problem, concept_weight, length=100, sentences=None):
    """
    the ILP keeps track of the constraints
    s<num> variables handle sentences, subsentences and removable subtrees
    c<num> variables represent concepts in those selected pseudo-sentences
    """
    program = compression.SentenceSelectionILP(concept_weight, length,
                                               use_subsentences=True,
                                               use_removables=True,
                                               use_min_length=True,
                                               use_min_length_ratio=False)
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in sentences:
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping([s.compression_node for s in sentences])

    for sentence in sentences:
        ## generate a compression candidate tree
        candidates = sentence.compression_node.getCandidateTree(nounPhraseMapping)
        candidate_root = compression.TreebankNode(candidates)
        candidate_root.sentence = sentence

        ## (or a non compressed tree)
        #candidate_root = treenode.TreeNode(sentence.compression_node.getNonCompressedCandidate())

        if candidate_root.isLeaf(): continue

        ## debugging
        #candidate_root.original = root
        #candidate_root.original_text = candidates

        # update ILP with the new sentence
        program.addSentence(candidate_root,
                lambda x: compression.get_bigrams_from_node(x,
                    node_skip=lambda y: not re.match(r'[A-Za-z0-9]', y.label),
                    node_transform=lambda y: text.text_processor.porter_stem(y.text.lower())))

        # skip debugging part
        continue

        sentence_concepts = program.getConcepts(candidate_root,
                lambda x: compression.get_bigrams_from_node(x,
                    node_skip=lambda y: not re.match(r'[A-Za-z0-9]', y.label),
                    node_transform=lambda y: text.text_processor.porter_stem(y.text.lower())))
        print sentence.original
        print candidate_root.getPrettyCandidates()
        for concept in sentence_concepts.keys():
            if concept not in concept_weight:
                del sentence_concepts[concept]
        print sorted(sentence_concepts.keys())
        units = dict([(x, 1) for x in util.get_ngrams(sentence.stemmed, n=2, bounds=False)])
        for concept in units.keys():
            if concept not in concept_weight:
                del units[concept]
        print sorted(units.keys())

    return program

def __init__(self, summary_problem, units='n2'):
    self.unit_name = units
    self.problem = summary_problem

    if units == 'n1':
        self.unit_selector = lambda x: util.get_ngrams(x, n=1)
    elif units == 'n2':
        self.unit_selector = lambda x: util.get_ngrams(x, n=2)
    elif units == 'n3':
        self.unit_selector = lambda x: util.get_ngrams(x, n=3)
    elif units == 'n4':
        self.unit_selector = lambda x: util.get_ngrams(x, n=4)
    elif units == 'su4':
        self.unit_selector = lambda x: util.get_skip_bigrams(x, k=4) + util.get_ngrams(x, n=1)
    else:
        self.unit_selector = util.get_ngrams  # default option

    ## variables to set later
    self.concepts = None
    self.concept_weights = None
    self.concept_index = None
    self.relevant_sents = None
    self.relevant_sent_concepts = None

    ## defaults
    self.min_sent_length = 5
    self.max_sents = 10000

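## Illustrative only (hypothetical helper, not part of the original code): how the
## unit_selector configured above is typically applied to count concept units across
## sentences, assuming the selector returns hashable units (tuples or joined strings).
def demo_unit_selector(selector, stemmed_sents):
    counts = {}
    for sent in stemmed_sents:
        for unit in selector(sent):
            counts[unit] = counts.get(unit, 0) + 1
    return counts

## e.g. with units='n2', selector = lambda x: util.get_ngrams(x, n=2)
## demo_unit_selector(selector, ['the dog bark', 'the dog sleep'])
## would count the bigram ('the', 'dog') twice.
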
def prep_docs(path, out_path):
    files = os.popen('ls %s*.sent' % path).read().splitlines()

    ## on the first pass, create a vocab mapping
    vocab = set()
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()
        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            vocab.update(concepts)

    fh = open(out_path + 'vocab', 'w')
    vocab = zip(vocab, range(len(vocab)))
    for concept, count in vocab:
        fh.write('%s %d\n' % (concept, count))
    fh.close()
    vocab = dict(vocab)

    ## on the second pass, output one doc per line
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()
        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            for concept in concepts:
                doc[concept] += 1

        ## doc output
        output = '%d %s' % (len(doc), ' '.join(['%s:%d' % (vocab[t], c) for t, c in doc.items()]))
        print output

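## For reference (illustrative ids, not produced by this code verbatim): each printed
## document line follows a sparse bag-of-words convention,
##   '<num_distinct_concepts> <vocab_id>:<count> <vocab_id>:<count> ...'
## e.g. a document containing three distinct unigram concepts might print as
##   3 17:2 4:1 230:5
## where the ids come from the 'vocab' file written above. This resembles the input
## format expected by LDA-style topic-model tools.
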
def __init__(self, summary_problem, units='n2', df=None):
    self.unit_name = units
    self.problem = summary_problem
    self.df = df

    use_bounds = False
    if units == 'n1':
        self.unit_selector = lambda x: util.get_ngrams(x, n=1, bounds=use_bounds)
    elif units == 'n2':
        self.unit_selector = lambda x: util.get_ngrams(x, n=2, bounds=use_bounds)
    elif units == 'n3':
        self.unit_selector = lambda x: util.get_ngrams(x, n=3, bounds=use_bounds)
    elif units == 'n12':
        self.unit_selector = lambda x: util.get_ngrams(x, n=1) + util.get_ngrams(x, n=2)
    elif units == 'n23':
        self.unit_selector = lambda x: util.get_ngrams(x, n=2) + util.get_ngrams(x, n=3)
    elif units == 's2':
        self.unit_selector = lambda x: get_skip_bigrams(x, k=4) + util.get_ngrams(x, n=1)
    else:
        self.unit_selector = util.get_ngrams  # default option

    ## variables to set later
    self.concept_sets = None
    self.concept_weight_sets = None
    self.concept_index_sets = None
    self.relevant_sent_sets = None
    self.relevant_sent_concepts = None

    ## defaults
    self.min_sent_length = 5

with open(prob, 'r') as fin, open('%s/%s.bigram.pos' % (output_dir, doc_id), 'w') as fout:
    while True:
        # line of original sentence
        line = fin.readline()
        if line == '':
            break
        line = line.strip()
        # line of toks
        fin.readline()
        # line of stems
        stems = fin.readline().strip().split('\t')
        # line of pos
        fin.readline()
        # line of deps
        fin.readline()
        # line of labels
        fin.readline()

        units = util.get_ngrams(stems, 2)
        assert len(units) == (len(stems) - 1)
        contained_bigrams = [x for x in enumerate(units) if x[1] in bigrams]
        for pos, bigram in contained_bigrams:
            bigram_pos[bigram].append((sent_id, pos, pos+1))

        # empty line
        fin.readline()
        sent_id += 1

    for bigram, position in bigram_pos.items():
        fout.write('%s\t%s\n' % (' '.join(bigram),
                                 ' '.join(['_'.join(str(x) for x in pos) for pos in position])))

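## A minimal sketch (assumption, not part of the original code) of how one line of the
## '<doc_id>.bigram.pos' file written above could be read back: the space-joined bigram
## and its positions are tab-separated, and each position is encoded as
## <sent_id>_<start_token>_<end_token>, with positions separated by spaces.
def parse_bigram_pos_line(line):
    bigram_text, positions = line.rstrip('\n').split('\t')
    bigram = tuple(bigram_text.split(' '))
    spans = [tuple(int(x) for x in pos.split('_')) for pos in positions.split(' ')]
    return bigram, spans

## e.g. (with made-up stems and positions)
## parse_bigram_pos_line('unit state\t0_3_4 2_7_8')
## -> (('unit', 'state'), [(0, 3, 4), (2, 7, 8)])
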
def get_features(sent, tag):
    tokens = sent.split()
    tags = [t.split('/')[-1] for t in tag.split()]
    #structs = [p.split()[0] for p in parse.split('(') if len(p.strip())>0]
    feats = {}

    ## number of tokens
    length = len(tokens)
    feats['len'] = length / 30.0

    ## connector words
    connectors = set(['however', 'because', 'and', 'so', 'also', 'nonetheless', 'still', 'but'])
    feats['connect'] = tokens[0].lower() in connectors

    ## number of capitalized words (assume 1st word is always capitalized)
    num_cap = len([1 for token in tokens[1:] if token[0].isupper()])
    feats['cap'] = 1.0 * num_cap / max((length-1), 1)

    ## pronouns (PRP$ are possessive; WP are who, what, which, when)
    num_pron = len([1 for tag in tags if tag in ['PRP$']])
    feats['prn'] = 1.0 * num_pron / length

    ## definite articles
    num_da = len([1 for token in tokens if token.lower() in ['the', 'that', 'these', 'those', 'this']])
    feats['da'] = 1.0 * num_da / length

    ## the [A-Z] construction
    feats['cap_cons'] = len(re.findall('[t|T]he [A-Z]', sent))

    ## first word
    #feats['first=%s' %tokens[0].lower()] = 1
    #feats['first_pos=%s' %tags[0]] = 1

    ## token ngrams
    ngrams = util.get_ngrams(tokens, 1, False) + util.get_ngrams(tokens, 2, True)
    for ngram in ngrams:
        feats['tok=%s' %'_'.join(ngram).lower()] = 1

    ## tag ngrams
    ngrams = util.get_ngrams(tags, 2, True) #+ util.get_ngrams(tags, 4, True)
    for ngram in ngrams:
        feats['pos=%s' %'_'.join(ngram)] = 1

    ## parser ngrams
    #ngrams = util.get_ngrams(structs, 4, True)
    #for ngram in ngrams:
    #    feats['struct=%s' %'_'.join(ngram)] = 1

    ## quotes
    feats['quotes'] = int('"' in tokens)
    #num_tokens_in_quotes = 0
    #num_quotes = 0
    #in_quotes = False
    #for token in tokens:
    #    if token == '"':
    #        in_quotes = not in_quotes
    #        num_quotes += 1
    #    else:
    #        if in_quotes: num_tokens_in_quotes += 1
    #feats['in_quotes'] = 1.0 * num_tokens_in_quotes / (length - num_quotes)

    return feats

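## Illustrative usage only (hypothetical input, not from the original code): get_features
## expects the raw sentence and its 'word/TAG word/TAG ...' string, and returns a sparse
## feature dict suitable for a linear classifier.
def demo_get_features():
    sent = 'The committee approved the plan .'
    tag = 'The/DT committee/NN approved/VBD the/DT plan/NN ./.'
    feats = get_features(sent, tag)
    # feats then holds dense cues such as feats['len'] and feats['da'],
    # plus indicator features such as feats['tok=the_plan'] and feats['pos=DT_NN'].
    return feats
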
def build_alternative_program(problem, concept_weight, length=100, sentences=None, longuest_candidate_only=False):
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in sentences:
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping([s.compression_node for s in sentences])
    #print "generating acronyms"
    acronymMapping = compression.generateAcronymMapping(problem.get_new_sentences())
    print problem.id, acronymMapping

    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(node.getCandidates(mapping=nounPhraseMapping))
        if longuest_candidate_only:
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax != None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate), sentence.order, sentence.source, sentence.date)
            if new_sentence.length <= 5: continue # skip short guys
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)

    log_file = open("%s.log" % problem.id, "w")
    for sentence in compressed_sentences:
        log_file.write("%d %s\n" % (group_id, str(sentence)))
    log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())

    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0: continue
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups:
            groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index:
                    acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs] for cs in sentence_concepts]
    concept_weights = filtered_concepts

    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if not concept in curr_concept_sents:
                curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()

    program.objective["score"] = ' + '.join(['%f c%d' % (concept_weight[concept], concept_index[concept]) for concept in concept_index])

    s1 = ' + '.join(['%d s%d' % (relevant_sentences[sent_index].length, sent_index) for sent_index in range(len(relevant_sentences))])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join(['%d a%d' % (acronym_length[acronym], acronym_id[acronym]) for acronym in acronym_id])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' % length
    program.constraints["length"] = s1 + s_acronyms + s2

    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join(['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' % index
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        s1 = ' + '.join(['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        s2 = '- %d c%d <= 0' % (len(curr_concept_sents[index]), index)
        program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, index in acronym_index.items():
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - a%d >= 0' % acronym_id[acronym]
        program.constraints["acronym_presence_%d" % acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = '- %d a%d <= 0' % (len(index), acronym_id[acronym])
        program.constraints["acronym_absence_%d" % acronym_id[acronym]] = s1 + s2

    # add sentence compression groups
    for group in groups:
        program.constraints["group_%d" % group] = " + ".join(["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, concept_index in concept_index.items():
        program.binary["c%d" % concept_index] = 1
    for acronym, id in acronym_id.items():
        program.binary["a%d" % id] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n" % (len(relevant_sentences), len(sentences)))

    program.acronyms = acronymMapping
    return program

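## For clarity (illustrative indices, not produced by this code verbatim): the ILP is
## assembled as plain constraint strings. For a concept with index 7 that occurs in
## sentences 2 and 5, the two coverage constraints built above read roughly
##
##   presence_7:  s2 + s5 - c7 >= 0     (c7 can be credited only if a covering sentence is selected)
##   absence_7:   s2 + s5 - 2 c7 <= 0   (if c7 is not selected, no sentence containing it may be)
##
## and the length constraint sums sentence lengths plus acronym definition lengths against
## the word budget, e.g.  24 s0 + 31 s1 + ... + 3 a0 <= 100.
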
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)

    ## different processing for set A and set B
    if '-B' in id:
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else:
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0:
                    all_concepts[concept].add('first' + sent.doc)
                else:
                    all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.ignore: skip = True
        if skip: continue

        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    for sent in compressed_sents:
        sent.concepts = set([])
        if sent.unresolved: continue
        if sent.length < 10: continue
        if re.match('^["(].*[")]$', sent.orig): skip = True
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(compressed_sents, final_concepts, path+id)

def __init__(self, concepts, problem):
    # bigrams are tuples
    self.bigrams = concepts
    self.problem = problem
    self.unit_selector = lambda x: util.get_ngrams(x, n=2)

def make_concepts_exp(id, path, sents, query):
    """
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)

        if iter >= 1: break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)

        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path+id)

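## Note on make_concepts_exp: the reweighting loop alternates propagating normalized
## values between sentences and their concepts (one bipartite update per pass), but as
## written it breaks after the first pass, so the concept weights passed to the ILP are
## effectively a single round of query-overlap propagation.
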