import sys, os

import ilp

if len(sys.argv) < 5 or len(sys.argv) > 8:
    sys.stderr.write('USAGE: %s <length_constraint> <sentence_lengths> <concepts_in_sentences> <concept_weights> [sentence_groups] [dependencies] [atleast]\n' % sys.argv[0])
    sys.exit(1)

solver = ilp.IntegerLinearProgram(
    debug=0,
    tmp="tmp_decoder.%d.%s.%s" % (os.getpid(), os.environ["USER"], os.environ["HOSTNAME"]))

# read concept weights: one "<concept> <weight>" pair per line
concept_id = {}
concept = 0
concept_weights = {}
for line in open(sys.argv[4]):
    tokens = line.strip().split()
    weight = float(tokens[1])
    if tokens[0] in concept_id:
        sys.stderr.write('ERROR: duplicate concept "%s", line %d in %s\n' % (tokens[0], concept + 1, sys.argv[4]))
        sys.exit(1)
    concept_id[tokens[0]] = concept
    concept_weights[concept] = weight
    concept += 1

# read the concepts contained in each sentence; the body of this loop was
# truncated in the original and is reconstructed from the identical loop in
# decode() below
index = {}
sentence_concepts = {}
sentence = 0
for line in open(sys.argv[3]):
    tokens = line.strip().split()
    concepts = {}
    for token in tokens:
        concepts[token] = True
    mapped_concepts = {}
    for concept in concepts:
        if concept not in concept_id:
            sys.stderr.write('ERROR: no weight for concept "%s", line %d in %s\n' % (concept, sentence + 1, sys.argv[3]))
            sys.exit(1)
        id = concept_id[concept]
        if id not in index:
            index[id] = []
        index[id].append(sentence)
        mapped_concepts[id] = True
    if len(mapped_concepts) > 0:
        sentence_concepts[sentence] = mapped_concepts
    sentence += 1
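# ----------------------------------------------------------------------------
# Illustrative note (added; not part of the original script): the input file
# layouts implied by the parsing loops above.  File names are hypothetical.
#
#   <concept_weights> (sys.argv[4]): one "<concept> <weight>" pair per line,
#   e.g.
#       he_said   0.5
#       oil_spill 2.5
#
#   <concepts_in_sentences> (sys.argv[3]): one line per sentence listing the
#   space-separated concepts it contains (a blank line means no concepts),
#   e.g.
#       oil_spill he_said
#       he_said
#
#   <sentence_lengths> (sys.argv[2]): one word count per line, aligned with
#   the sentence file, e.g.
#       24
#       11
# ----------------------------------------------------------------------------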
def build_alternative_program(problem, concept_weight, length=100, sentences=None,
                              longuest_candidate_only=False, providedAcronyms=None):
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in problem.get_new_and_old_sentences():
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping(
        [s.compression_node for s in problem.get_new_and_old_sentences()])
    #print "generating acronyms"
    if providedAcronyms:
        acronymMapping = providedAcronyms
    else:
        acronymMapping = compression.generateAcronymMapping(problem.get_new_and_old_sentences())
    print problem.id, acronymMapping

    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(node.getCandidates(beam=100, mapping=nounPhraseMapping,
                                                 use_mandatory_removals=True))
        if longuest_candidate_only:
            # note: "longuest" is a typo for "longest", kept for API compatibility
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax is not None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate),
                                         sentence.order, sentence.source, sentence.date)
            if new_sentence.length <= 5:
                continue  # skip short guys
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)
    #log_file = open("%s.log" % problem.id, "w")
    #for sentence in compressed_sentences:
    #    log_file.write("%d %s\n" % (group_id, str(sentence)))
    #log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())

    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0:
            continue  # get rid of sentences that do not overlap with concepts
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups:
            groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index:
                    acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs] for cs in sentence_concepts]
    concept_weights = filtered_concepts
    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if not concept in curr_concept_sents:
                curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()

    program.objective["score"] = ' + '.join([
        '%f c%d' % (concept_weight[concept], concept_index[concept])
        for concept in concept_index])

    s1 = ' + '.join(['%d s%d' % (relevant_sentences[sent_index].length, sent_index)
                     for sent_index in range(len(relevant_sentences))])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join(['%d a%d' % (acronym_length[acronym], acronym_id[acronym])
                             for acronym in acronym_id])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' % length
    program.constraints["length"] = s1 + s_acronyms + s2

    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join(['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' % index
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        #### this constraint is disabled since it is not necessary when all sentences contain at least one concept
        #### it might also be the reason for singular matrices that crash the solver
        #s1 = ' + '.join(['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        #s2 = ' - %d c%d <= 0' % (len(curr_concept_sents[index]), index)
        #program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, sent_indices in acronym_index.items():
        s1 = ' + '.join(['s%d' % sent_index for sent_index in sent_indices])
        s2 = ' - a%d >= 0' % acronym_id[acronym]
        program.constraints["acronym_presence_%d" % acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join(['s%d' % sent_index for sent_index in sent_indices])
        s2 = ' - %d a%d <= 0' % (len(sent_indices), acronym_id[acronym])
        program.constraints["acronym_absence_%d" % acronym_id[acronym]] = s1 + s2

    # add sentence compression groups
    for group in groups:
        if len(groups[group]) > 1:
            program.constraints["group_%d" % group] = " + ".join(
                ["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, index in concept_index.items():
        program.binary["c%d" % index] = 1
    for acronym, id in acronym_id.items():
        program.binary["a%d" % id] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n"
                     % (len(relevant_sentences), len(sentences)))
    program.acronyms = acronymMapping
    return program
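# ----------------------------------------------------------------------------
# Sketch (added; not in the original): the LP text that build_alternative_program
# assembles, shown for a toy instance and assuming the ilp module maximizes the
# "score" objective.  Two candidate sentences, where s0 (12 words) covers
# concept c0, s1 (8 words) covers c0 and c1 and contains acronym a0 (whose
# definition is 3 words long), and both come from the same compression group:
#
#   score:              2.500000 c0 + 0.500000 c1
#   length:             12 s0 + 8 s1 + 3 a0 <= 100
#   presence_0:         s0 + s1 - c0 >= 0   (c0 = 1 => a covering sentence is selected)
#   presence_1:         s1 - c1 >= 0
#   acronym_presence_0: s1 - a0 >= 0        (a0 = 1 => a sentence using it is selected)
#   acronym_absence_0:  s1 - 1 a0 <= 0      (a sentence using it is selected => a0 = 1,
#                                            charging the definition length)
#   group_0:            s0 + s1 <= 1        (at most one compression per original sentence)
#   with s0, s1, c0, c1, a0 all binary
# ----------------------------------------------------------------------------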
def run_standard(options, max_sents=10000):

    ## create output directory
    try:
        os.popen('rm -rf %s' % options.output)
    except:
        pass
    try:
        os.popen('mkdir -p %s' % options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' % options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' % options.task)
    sys.stderr.write('length limit [%d]\n' % task.length_limit)
    sys.stderr.write('writing output to [%s]\n' % options.output)

    map_times, run_times = {}, {}

    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if not '-A' in problem.id:
                continue
            sys.stderr.write("%s %d\n" % (problem.id,
                                          sum([len(doc.sentences) for doc in problem.new_docs])))
            #mapper = concept_mapper.HeuristicMapper(problem, "n2")
            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weights = mapper.concept_weights
            #print concept_weights
            #program = framework.build_program(problem, concept_weights, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(
                problem, concept_weights, length=task.length_limit,
                sentences=mapper.relevant_sents, longuest_candidate_only=False)
            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$', variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)
            #summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            output_file.write(summary)
            output_file.close()

    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500:
                continue
            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                sentence.used = used_sent_count < max_sents
            problem.query.set_text(problem.query.original)
            sys.stdout.write("%s %d\n" % (problem.id,
                                          sum([len(doc.sentences) for doc in problem.new_docs])))

            # compute idf values
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = {}
                for sentence in doc.sentences:
                    if not sentence.used:
                        continue
                    for word in sentence.no_stop_freq:
                        if word not in seen_words:
                            seen_words[word] = 1
                for word in seen_words:
                    if word not in word_idf:
                        word_idf[word] = 1
                    else:
                        word_idf[word] += 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]

            # compare sentences to centroid and derive McDonald's relevance score
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join([sentence.original for sentence in doc.sentences
                                     if sentence.used])
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used:
                        continue
                    sentence.compute_norm()
                    # float division: the original "1 / (sentence.order + 1)" truncates
                    # to 0 in Python 2 for every sentence but the first
                    sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + 1.0 / (sentence.order + 1)
                    #sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + sentence.sim_cosine(problem.query, word_idf)
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1

            # apply cutoff: keep the highest-scoring sentences
            sentences.sort(key=lambda s: s.rel_score, reverse=True)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]

            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append("%+g s%d" % (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append("%+g s%d" % (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer:
                        continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        objective.append("%+g s%d_%d" % (-score, sentence.index, peer.index))
                        program.binary["s%d_%d" % (sentence.index, peer.index)] = [sentence, peer]
                        program.constraints["c1_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, sentence.index)
                        program.constraints["c2_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, peer.index)
                        program.constraints["c3_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d + s%d - s%d_%d <= 1" % (sentence.index, peer.index,
                                                         sentence.index, peer.index)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = " ".join(length_constraint) + " <= %g" % task.length_limit

            run_times[problem.id] = time.time()
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]

            selection = []
            score = 0
            # get solution and check consistency
            for variable in program.binary:
                if variable in program.output and program.output[variable] == 1:
                    if type(program.binary[variable]) == type(sentences[0]):
                        selection.append(program.binary[variable])
                        score += program.binary[variable].rel_score
                        for peer in program.output:
                            if program.output[peer] == 0 or peer == variable \
                                    or type(program.binary[peer]) != type(sentences[0]):
                                continue
                            if program.binary[variable].sim_cosine(program.binary[peer], word_idf) == 0:
                                continue
                            quadratic = "s%d_%d" % (program.binary[variable].index,
                                                    program.binary[peer].index)
                            if quadratic not in program.output or program.output[quadratic] != 1:
                                print "WARNING: %s selected but %s not selected" % (variable, quadratic)
                    else:
                        score -= program.binary[variable][0].sim_cosine(program.binary[variable][1], word_idf)
                        if program.output["s%d" % program.binary[variable][0].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (
                                variable, program.binary[variable][0].index)
                        if program.output["s%d" % program.binary[variable][1].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (
                                variable, program.binary[variable][1].index)
            #if math.fabs(program.result["score"] - score) > .1:
            #    print "WARNING: difference between score = %g and expected = %g" % (program.result["score"], score)

            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            output_file = open("%s/%s" % (options.output, new_id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()

    else:
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            #if num_problem_sentences < 300: continue
            if not '-A' in problem.id:
                continue
            if options.ir:
                #docs = [doc for doc, val in problem.ir_docs]
                #for doc in docs: doc.get_sentences()
                num_overlap = len(set([d.id for d in problem.ir_docs]).intersection(
                    set([d.id for d in problem.new_docs])))
                print '%s overlap: %d' % (problem.id, num_overlap)
                info_fh.write('%s overlap [%d]\n' % (problem.id, num_overlap))

            sys.stderr.write('problem [%s] input sentences [%d]' % (problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)

            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(problem, options.units)

            ## timing test
            mapper.max_sents = max_sents

            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success:
                sys.exit()

            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success:
                sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]

            ## testing
            #fh = open('concept_matrix', 'w')
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
                #fh.write(''.join(['%d, ' % concept for concept in sent[:-1]]))
                #fh.write('%d\n' % sent[-1])
            hist[0] += (num_problem_sentences - len(mapper.relevant_sent_concepts))
            #hist.displaySorted(N=100)
            #sys.exit()
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            word_count = 0
            for sentence in selection:
                output_file.write(sentence.original + '\n')
                word_count += len(sentence.original.split())
            output_file.close()

            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n' % (word_count, curr_time))
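# ----------------------------------------------------------------------------
# Note (added; not in the original): the c1/c2/c3 constraints in the mcd
# branch above are the standard linearization of the quadratic redundancy
# term s_i * s_j.  With the auxiliary binary s_ij standing for the product:
#
#   c1: s_ij - s_i <= 0          s_ij = 1 only if s_i = 1
#   c2: s_ij - s_j <= 0          s_ij = 1 only if s_j = 1
#   c3: s_i + s_j - s_ij <= 1    s_i = s_j = 1 forces s_ij = 1
#
# so each objective term -sim(i, j) * s_ij subtracts the cosine similarity
# exactly when both sentences are selected, once per ordered pair with
# positive similarity.
# ----------------------------------------------------------------------------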
def format_output(self, style=None, max_length=100):
    """
    Step 3: create formatted output
    """
    ## make sure step 2 has been completed
    if not self.relevant_sent_sets:
        sys.stderr.write('Error: need to run choose_sents first\n')
        return None

    outputs = {}
    sentences = {}
    self.concept_sents_sets = []  ## new member variable (clean this up!)
    for update_index in range(len(self.concept_sets)):
        output = []
        id = self.problem.id
        ## deal with update id convention
        if len(self.concept_sets) > 1:
            id += '-' + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')[update_index]
        id += '.%s.%d.%s.%d' % ('M', max_length, self.problem.id, 1)

        curr_sents = self.relevant_sent_sets[update_index]              # list of sentences
        curr_sent_concepts = self.relevant_sent_concepts[update_index]  # dict of concepts in each sentence {0: [1, 4, ...], 1: ...}
        curr_concept_weights = self.concept_weight_sets[update_index]   # dict of weight for each concept {'(he, said)': 3.4, ...}
        curr_concept_index = self.concept_index_sets[update_index]      # dict of index for each concept {'(he, said)': 100, ...}

        curr_concept_sents = {}  # dict of sentences for each concept
        for sent_index in range(len(curr_sents)):
            concepts = curr_sent_concepts[sent_index]
            for concept in concepts:
                if not concept in curr_concept_sents:
                    curr_concept_sents[concept] = []
                curr_concept_sents[concept].append(sent_index)
        self.concept_sents_sets.append(curr_concept_sents)

        ## custom format
        if style != 'ilp':
            output.append('%s NUM_SENTENCES %d' % (id, len(curr_sents)))
            output.append('%s NUM_CONCEPTS %d' % (id, len(curr_concept_index)))
            output.append('%s TEXT TOPIC %s %s' % (id, self.problem.title, self.problem.narr))
            ## sentence info
            for sent_index in range(len(curr_sents)):
                output.append('%s LENGTH %d %d' % (id, sent_index, curr_sents[sent_index].length))
                output.append('%s TEXT %d %s' % (id, sent_index, curr_sents[sent_index].original))
                concept_list = ' '.join(map(str, curr_sent_concepts[sent_index]))
                output.append('%s CONCEPTS %d %d %s' % (id, sent_index,
                                                        len(curr_sent_concepts[sent_index]),
                                                        concept_list))
            ## concept info
            for concept in curr_concept_weights.keys():
                str_concept = '_'.join(concept)
                concept_weight = curr_concept_weights[concept]
                concept_index = curr_concept_index[concept]
                output.append('%s CONCEPT_INFO %d %1.4f %s' % (id, concept_index,
                                                               concept_weight, str_concept))

        ## ILP output format used by glpsol (glpk)
        else:
            problem = ilp.IntegerLinearProgram()

            problem.objective["score"] = ' + '.join(['%f c%d' % (weight, curr_concept_index[concept])
                                                     for concept, weight in curr_concept_weights.items()])

            s1 = ' + '.join(['%d s%d' % (curr_sents[sent_index].length, sent_index)
                             for sent_index in range(len(curr_sents))])
            s2 = ' <= %s\n' % max_length
            problem.constraints["length"] = s1 + s2

            for concept, concept_index in curr_concept_index.items():
                ## at least one sentence containing a selected bigram must be selected
                s1 = ' + '.join(['s%d' % sent_index
                                 for sent_index in curr_concept_sents[concept_index]])
                s2 = ' - c%d >= 0' % concept_index
                problem.constraints["presence_%d" % concept_index] = s1 + s2
                ## if a bigram is not selected then all sentences containing it are deselected
                s1 = ' + '.join(['s%d' % sent_index
                                 for sent_index in curr_concept_sents[concept_index]])
                s2 = ' - %d c%d <= 0' % (len(curr_concept_sents[concept_index]), concept_index)
                problem.constraints["absence_%d" % concept_index] = s1 + s2

            for sent_index in range(len(curr_sents)):
                problem.binary["s%d" % sent_index] = curr_sents[sent_index]
            for concept, concept_index in curr_concept_index.items():
                problem.binary["c%d" % concept_index] = 1

            #problem.debug = 1
            problem.run()
            output = []
            for sent_index in range(len(curr_sents)):
                if problem.output["s%d" % sent_index] == 1:
                    output.append(curr_sents[sent_index])
            # note: with style == 'ilp' this returns after the first concept set
            return output

        outputs[id] = output
        sentences[id] = curr_sents

    return outputs, sentences
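# ----------------------------------------------------------------------------
# Minimal sketch (added; not in the original) of how this file drives the
# ilp.IntegerLinearProgram wrapper: populate the objective/constraints/binary
# dicts with LP-format strings, call run(), then read 0/1 assignments back
# from output.  This assumes only the behavior implied by the call sites
# above; the variable names and demo function are hypothetical.
def _toy_ilp_demo():
    program = ilp.IntegerLinearProgram()
    # maximize concept coverage: c0 is worth 2.0, c1 is worth 1.0
    program.objective["score"] = "2.000000 c0 + 1.000000 c1"
    # a 10-word and a 20-word sentence under a 25-word budget
    program.constraints["length"] = "10 s0 + 20 s1 <= 25\n"
    # a concept counts only if a covering sentence is selected
    program.constraints["presence_0"] = "s0 + s1 - c0 >= 0"
    program.constraints["presence_1"] = "s1 - c1 >= 0"
    for name in ("s0", "s1", "c0", "c1"):
        program.binary[name] = 1
    program.run()
    # return the names of the variables set to 1 in the solution
    return [v for v in program.output if program.output[v] == 1]
# ----------------------------------------------------------------------------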
def decode(max_length, sentence_length_file, concepts_in_sentence_file, concept_weight_file,
           sentence_group_file=None, dependency_file=None, atleast=None, command="glpsol"):
    solver = ilp.IntegerLinearProgram(
        debug=1,
        tmp="tmp_decoder.%d.%s.%s" % (os.getpid(), os.environ["USER"], os.environ["HOSTNAME"]),
        command=command)
    #solver = ilp_to_localsolver.IntegerLinearProgram(debug=1, tmp=concept_weight_file, time_limit=1)

    concept_id = {}
    concept = 0
    concept_weights = {}
    for line in open(concept_weight_file):
        tokens = line.strip().split()
        weight = float(tokens[1])
        if tokens[0] in concept_id:
            sys.stderr.write('ERROR: duplicate concept "%s", line %d in %s\n'
                             % (tokens[0], concept + 1, concept_weight_file))
            sys.exit(1)
        concept_id[tokens[0]] = concept
        concept_weights[concept] = weight
        concept += 1

    index = {}
    sentence_concepts = {}
    sentence = 0
    for line in open(concepts_in_sentence_file):
        tokens = line.strip().split()
        concepts = {}
        for token in tokens:
            concepts[token] = True
        mapped_concepts = {}
        for concept in concepts:
            if concept not in concept_id:
                sys.stderr.write('ERROR: no weight for concept "%s", line %d in %s\n'
                                 % (concept, sentence + 1, concepts_in_sentence_file))
                sys.exit(1)
            id = concept_id[concept]
            if id not in index:
                index[id] = []
            index[id].append(sentence)
            mapped_concepts[id] = True
        if len(mapped_concepts) > 0:
            sentence_concepts[sentence] = mapped_concepts
        sentence += 1

    # build objective
    objective = []
    for concept, weight in concept_weights.items():
        if concept not in index:
            continue  # skip unused concepts
        objective.append("%+g c%d" % (weight, concept))
        solver.binary["c%d" % concept] = concept
    solver.objective["score"] = " ".join(objective)

    # sentence => concepts
    for sentence, concepts in sentence_concepts.items():
        solver.binary["s%d" % sentence] = sentence
        #for concept in concepts:
        #    solver.constraints["sent_%d" % len(solver.constraints)] = "s%d - c%d <= 0" % (sentence, concept)

    # concept => sentence
    for concept in index:
        solver.constraints["index_%d" % len(solver.constraints)] = \
            " + ".join(["s%d" % x for x in index[concept]]) + " - c%d >= 0" % concept

    if sentence_group_file is not None:
        groups = {}
        sentence = 0
        for line in open(sentence_group_file):
            if sentence in sentence_concepts:
                if line != '\n':
                    if line not in groups:
                        groups[line] = []
                    groups[line].append(sentence)
            sentence += 1
        for group in groups:
            solver.constraints["group_%d" % len(solver.constraints)] = \
                " + ".join(["s%d" % x for x in groups[group]]) + " <= 1"

    if dependency_file is not None:
        groups = {}
        sentence = 0
        for line in open(dependency_file):
            if sentence in sentence_concepts:
                if line != '\n':
                    for id in line.strip().split():
                        id = int(id)
                        if "s%d" % id not in solver.binary:
                            solver.constraints["depend_%d" % len(solver.constraints)] = \
                                "s%d = 0" % sentence
                        else:
                            solver.constraints["depend_%d" % len(solver.constraints)] = \
                                "s%d - s%d >= 0" % (id, sentence)
            sentence += 1

    length_constraint = []
    sentence = 0
    for line in open(sentence_length_file):
        if sentence in sentence_concepts:
            length = line.strip()
            length_constraint.append("%s s%d" % (length, sentence))
            # small per-word penalty to break ties in favor of shorter summaries
            solver.objective["score"] += " - %g s%d" % (float(length) / 1000.0, sentence)
        sentence += 1
    solver.constraints["length_%d" % len(solver.constraints)] = \
        " + ".join(length_constraint) + " <= " + str(max_length)

    if atleast is not None:
        at_least = []
        sentence = 0
        for line in open(atleast):
            line = line.strip()
            if sentence in sentence_concepts:
                if line == "1":
                    at_least.append("s%d" % sentence)
            sentence += 1
        if len(at_least) > 0:
            solver.constraints["at_least_%d" % len(solver.constraints)] = \
                " + ".join(at_least) + " >= 1"  # select at least one of those

    sys.stderr.write("ilp: %d sentences, %d concepts\n" % (len(sentence_concepts), len(index)))
    if len(sentence_concepts) > 0 and len(index) > 0:
        solver.run()
    output = []
    for variable in solver.output:
        if variable.startswith("s") and solver.output[variable] == 1:
            output.append(int(variable[1:]))
    return output
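# ----------------------------------------------------------------------------
# Illustrative call (added; not in the original).  File names are
# hypothetical; the mandatory input files must be line-aligned per sentence:
#
#   selected = decode(100, "sents.lengths", "sents.concepts", "concepts.weights")
#   # selected is the list of chosen sentence indices, e.g. [0, 4, 17]
# ----------------------------------------------------------------------------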
def run(self, max_length=100, style='ilp'):
    """
    Step 3: create formatted output
    """
    ## make sure step 2 has been completed
    if not self.relevant_sents:
        sys.stderr.write('\nError: need to run choose_sents first\n')
        return None

    output = []
    curr_sents = self.relevant_sents                  # list of sentences
    curr_sent_concepts = self.relevant_sent_concepts  # dict of concepts in each sentence {0: [1, 4, ...], 1: ...}
    curr_concept_weights = self.concept_weights       # dict of weight for each concept {'(he, said)': 3.4, ...}
    curr_concept_index = self.concept_index           # dict of index for each concept {'(he, said)': 100, ...}

    curr_concept_sents = {}  # dict of sentences for each concept
    for sent_index in range(len(curr_sents)):
        concepts = curr_sent_concepts[sent_index]
        for concept in concepts:
            if not concept in curr_concept_sents:
                curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    ## testing code
    #num_sents = len(curr_sents)
    #num_concepts = len(curr_concept_index)
    #sent_lengths = [len(s.tokens) for s in curr_sents]
    #print
    #print 'sents [%d] concepts [%d] [%1.2f]' % (num_sents, num_concepts, 1.0 * num_concepts / num_sents)
    #print 'avg sent length [%1.2f]' % (1.0 * sum(sent_lengths) / len(sent_lengths))
    #print

    ## custom format
    if style != 'ilp':
        ## TODO: this is broken! Add local search solver class
        ## (note: `id` is not defined in this branch)
        output.append('%s NUM_SENTENCES %d' % (id, len(curr_sents)))
        output.append('%s NUM_CONCEPTS %d' % (id, len(curr_concept_index)))
        output.append('%s TEXT TOPIC %s %s' % (id, self.problem.title, self.problem.narr))
        ## sentence info
        for sent_index in range(len(curr_sents)):
            output.append('%s LENGTH %d %d' % (id, sent_index, curr_sents[sent_index].length))
            output.append('%s TEXT %d %s' % (id, sent_index, curr_sents[sent_index].original))
            concept_list = ' '.join(map(str, curr_sent_concepts[sent_index]))
            output.append('%s CONCEPTS %d %d %s' % (id, sent_index,
                                                    len(curr_sent_concepts[sent_index]),
                                                    concept_list))
        ## concept info
        for concept in curr_concept_weights.keys():
            str_concept = '_'.join(concept)
            concept_weight = curr_concept_weights[concept]
            concept_index = curr_concept_index[concept]
            output.append('%s CONCEPT_INFO %d %1.4f %s' % (id, concept_index,
                                                           concept_weight, str_concept))

    ## ILP output format used by glpsol (glpk)
    else:
        problem = ilp.IntegerLinearProgram()

        obj = ' + '.join(['%f c%d' % (weight, curr_concept_index[concept])
                          for concept, weight in curr_concept_weights.items()])
        obj = obj.replace(' + -', ' - ')
        problem.objective["score"] = obj

        s1 = ' + '.join(['%d s%d' % (curr_sents[sent_index].length, sent_index)
                         for sent_index in range(len(curr_sents))])
        s2 = ' <= %s\n' % max_length
        problem.constraints["length"] = s1 + s2

        for concept, concept_index in curr_concept_index.items():
            ## at least one sentence containing a selected bigram must be selected
            s1 = ' + '.join(['s%d' % sent_index
                             for sent_index in curr_concept_sents[concept_index]])
            s2 = ' - c%d >= 0' % concept_index
            problem.constraints["presence_%d" % concept_index] = s1 + s2
            ## if a bigram is not selected then all sentences containing it are deselected
            s1 = ' + '.join(['s%d' % sent_index
                             for sent_index in curr_concept_sents[concept_index]])
            s2 = ' - %d c%d <= 0' % (len(curr_concept_sents[concept_index]), concept_index)
            problem.constraints["absence_%d" % concept_index] = s1 + s2

        for sent_index in range(len(curr_sents)):
            problem.binary["s%d" % sent_index] = curr_sents[sent_index]
        for concept, concept_index in curr_concept_index.items():
            problem.binary["c%d" % concept_index] = 1

        #problem.debug = 1
        problem.run()
        output = []
        for sent_index in range(len(curr_sents)):
            if problem.output["s%d" % sent_index] == 1:
                output.append(curr_sents[sent_index])
        return output
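# ----------------------------------------------------------------------------
# Note (added; not in the original): the two constraint families emitted in
# run() tie each concept variable to the sentences that cover it.  For a
# concept c covered by sentences {s_a, s_b}:
#
#   presence:  s_a + s_b - c >= 0      c = 1 => at least one covering sentence
#   absence:   s_a + s_b - 2 c <= 0    any covering sentence selected => c = 1
#
# Together they make c the exact indicator that the summary covers the
# concept, so the objective sum_c(w_c * c) is the total weight of covered
# concepts.  build_alternative_program above keeps only the presence half,
# which suffices when the score is maximized with nonnegative weights (see
# the comment there).
# ----------------------------------------------------------------------------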