Example #1
import sys, os
import ilp

if len(sys.argv) < 5 or len(sys.argv) > 8:
    sys.stderr.write(
        'USAGE: %s <length_constraint> <sentence_lengths> <concepts_in_sentences> <concept_weights> [sentence_groups] [dependencies] [atleast]\n'
        % sys.argv[0])
    sys.exit(1)

solver = ilp.IntegerLinearProgram(
    debug=0,
    tmp="tmp_decoder.%d.%s.%s" %
    (os.getpid(), os.environ["USER"], os.environ["HOSTNAME"]))
concept_id = {}
concept = 0
concept_weights = {}
for line in open(sys.argv[4]):
    tokens = line.strip().split()
    weight = float(tokens[1])
    if tokens[0] in concept_id:
        sys.stderr.write('ERROR: duplicate concept \"%s\", line %d in %s\n' %
                         (tokens[0], concept + 1, sys.argv[4]))
        sys.exit(1)
    concept_id[tokens[0]] = concept
    concept_weights[concept] = weight
    concept += 1

index = {}
sentence_concepts = {}
sentence = 0
for line in open(sys.argv[3]):
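
The loop above reads the concept-weight file given as sys.argv[4], assigning each concept an integer id; each line is expected to hold one "<concept> <weight>" pair (Example #5 below packages the same decoder logic as a function). A minimal, self-contained sketch of that format, with invented concept names and weights:

# Hypothetical concept-weights input: one "<concept> <weight>" pair per line.
toy_weights = """he_said 3.4
oil_spill 2.0
the_coast 1.5
"""
concept_id, concept_weights = {}, {}
for concept, line in enumerate(toy_weights.splitlines()):
    tokens = line.strip().split()
    concept_id[tokens[0]] = concept              # e.g. {'he_said': 0, ...}
    concept_weights[concept] = float(tokens[1])  # e.g. {0: 3.4, ...}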
Example #2
def build_alternative_program(problem,
                              concept_weight,
                              length=100,
                              sentences=None,
                              longuest_candidate_only=False,
                              providedAcronyms=None):
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in problem.get_new_and_old_sentences():
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(
                sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping(
        [s.compression_node for s in problem.get_new_and_old_sentences()])
    #print "generating acronyms"
    acronymMapping = None
    if providedAcronyms:
        acronymMapping = providedAcronyms
    else:
        acronymMapping = compression.generateAcronymMapping(
            problem.get_new_and_old_sentences())
    print problem.id, acronymMapping

    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(
            compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(
                node.getCandidates(beam=100,
                                   mapping=nounPhraseMapping,
                                   use_mandatory_removals=True))
        if longuest_candidate_only:
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax != None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate),
                                         sentence.order, sentence.source,
                                         sentence.date)
            if new_sentence.length <= 5: continue  # skip short guys
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)
    #log_file = open("%s.log" % problem.id, "w")
    #for sentence in compressed_sentences:
    #    log_file.write("%d %s\n" %( group_id, str(sentence)))
    #log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())

    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0:
            continue  # get rid of sentences that do not overlap with concepts
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups: groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index: acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs]
                              for cs in sentence_concepts]
    concept_weights = filtered_concepts
    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if not concept in curr_concept_sents:
                curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()

    program.objective["score"] = ' + '.join([
        '%f c%d' % (concept_weight[concept], concept_index[concept])
        for concept in concept_index
    ])

    s1 = ' + '.join([
        '%d s%d' % (relevant_sentences[sent_index].length, sent_index)
        for sent_index in range(len(relevant_sentences))
    ])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join([
        '%d a%d' % (acronym_length[acronym], acronym_id[acronym])
        for acronym in acronym_id
    ])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' % length
    program.constraints["length"] = s1 + s_acronyms + s2

    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join(
            ['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' % index
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        #### this constraint is disabled since it is not necessary when all sentences contain at least one concept
        #### it might also be the reason for singular matrices that crash the solver
        #s1 = ' + '.join([ 's%d' %sent_index for sent_index in curr_concept_sents[index]])
        #s2 = ' - %d c%d <= 0' %(len(curr_concept_sents[index]), index)
        #program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, index in acronym_index.items():
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - a%d >= 0' % acronym_id[acronym]
        program.constraints["acronym_presence_%d" %
                            acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - %d a%d <= 0' % (len(index), acronym_id[acronym])
        program.constraints["acronym_absence_%d" %
                            acronym_id[acronym]] = s1 + s2

    # add sentence compression groups
    for group in groups:
        if len(groups[group]) > 1:
            program.constraints["group_%d" % group] = " + ".join(
                ["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, concept_index in concept_index.items():
        program.binary["c%d" % concept_index] = 1
    for acronym, id in acronym_id.items():
        program.binary["a%d" % id] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n" %
                     (len(relevant_sentences), len(sentences)))
    program.acronyms = acronymMapping
    return program
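
For a toy instance with two compression candidates from the same source sentence, one shared concept, and one acronym occurring only in the first candidate, the strings assembled by build_alternative_program would look roughly as follows (lengths and weights are invented; the variable names follow the s/c/a scheme used above):

# program.objective["score"]                -> "2.500000 c0"
# program.constraints["length"]             -> "12 s0 + 9 s1 + 4 a0 <= 100\n"
# program.constraints["presence_0"]         -> "s0 + s1 - c0 >= 0"
# program.constraints["acronym_presence_0"] -> "s0 - a0 >= 0"
# program.constraints["acronym_absence_0"]  -> "s0 - 1 a0 <= 0"
# program.constraints["group_0"]            -> "s0 + s1 <= 1"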
Example #3
def run_standard(options, max_sents=10000):

    ## create output directory
    try:
        os.popen('rm -rf %s' % options.output)
    except:
        pass
    try:
        os.popen('mkdir -p %s' % options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' %
                         options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' % options.task)
    sys.stderr.write('length limit [%d]\n' % task.length_limit)
    sys.stderr.write('writing output to [%s]\n' % options.output)

    map_times, run_times = {}, {}

    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if not '-A' in problem.id: continue
            sys.stderr.write(
                "%s %d\n" %
                (problem.id,
                 sum([len(doc.sentences) for doc in problem.new_docs])))
            #mapper = concept_mapper.HeuristicMapper(problem, "n2")
            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weights = mapper.concept_weights
            #print concept_weight
            #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(
                problem,
                concept_weights,
                length=task.length_limit,
                sentences=mapper.relevant_sents,
                longuest_candidate_only=False)
            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$',
                            variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)
            #summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            output_file.write(summary)
            output_file.close()

    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500: continue
            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                if used_sent_count < max_sents: sentence.used = True
                else: sentence.used = False
            problem.query.set_text(problem.query.original)
            sys.stdout.write(
                "%s %d\n" %
                (problem.id,
                 sum([len(doc.sentences) for doc in problem.new_docs])))

            # compute idf values
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = {}
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    for word in sentence.no_stop_freq:
                        if word not in seen_words: seen_words[word] = 1
                for word in seen_words:
                    if word not in word_idf: word_idf[word] = 1
                    else: word_idf[word] += 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]

            # compare sentences to centroid and derive McDonald's relevance score
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join([
                    sentence.original for sentence in doc.sentences
                    if sentence.used
                ])
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    sentence.compute_norm()
                    sentence.rel_score = sentence.sim_cosine(
                        centroid, word_idf) + 1 / (sentence.order + 1)
                    #sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + sentence.sim_cosine(problem.query, word_idf)
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1

            # apply cutoff
            sentences.sort(key=lambda x: x.rel_score, reverse=True)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]

            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append("%+g s%d" %
                                 (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append("%+g s%d" %
                                         (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer: continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        objective.append("%+g s%d_%d" %
                                         (-score, sentence.index, peer.index))
                        program.binary["s%d_%d" %
                                       (sentence.index, peer.index)] = [
                                           sentence, peer
                                       ]
                        program.constraints["c1_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, sentence.index)
                        program.constraints["c2_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, peer.index)
                        program.constraints["c3_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d + s%d - s%d_%d <= 1" % (sentence.index, peer.index, sentence.index, peer.index)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = " ".join(
                length_constraint) + " <= %g" % task.length_limit

            run_times[problem.id] = time.time()
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]

            selection = []
            score = 0
            # get solution and check consistency
            for variable in program.binary:
                if variable in program.output and program.output[variable] == 1:
                    if type(program.binary[variable]) == type(sentences[0]):
                        selection.append(program.binary[variable])
                        score += program.binary[variable].rel_score
                        for peer in program.output:
                            if program.output[
                                    peer] == 0 or peer == variable or type(
                                        program.binary[peer]) != type(
                                            sentences[0]):
                                continue
                            if program.binary[variable].sim_cosine(
                                    program.binary[peer], word_idf) == 0:
                                continue
                            quadratic = "s%d_%d" % (
                                program.binary[variable].index,
                                program.binary[peer].index)
                            if quadratic not in program.output or program.output[
                                    quadratic] != 1:
                                print "WARNING: %s selected but %s not selected" % (
                                    variable, quadratic)

                    else:
                        score -= program.binary[variable][0].sim_cosine(
                            program.binary[variable][1], word_idf)
                        if program.output[
                                "s%d" %
                                program.binary[variable][0].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (
                                variable, program.binary[variable][0].index)
                        if program.output[
                                "s%d" %
                                program.binary[variable][1].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (
                                variable, program.binary[variable][1].index)
            #if math.fabs(program.result["score"] - score) > .1:
            #    print "WARNING: difference between score = %g and expected = %g" % (program.result["score"], score)
            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            output_file = open("%s/%s" % (options.output, new_id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()

    else:
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            #if num_problem_sentences < 300: continue
            if not '-A' in problem.id: continue

            if options.ir:
                #docs = [doc for doc, val in problem.ir_docs]
                #for doc in docs: doc.get_sentences()
                num_overlap = len(
                    set([d.id for d in problem.ir_docs
                         ]).intersection(set([d.id
                                              for d in problem.new_docs])))
                print '%s overlap: %d' % (problem.id, num_overlap)
                info_fh.write('%s overlap [%d]\n' % (problem.id, num_overlap))

            sys.stderr.write('problem [%s] input sentences [%d]' %
                             (problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)

            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(
                    problem, options.units)

            ## timing test
            mapper.max_sents = max_sents

            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success: sys.exit()

            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success: sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]

            ## testing
            #fh = open('concept_matrix', 'w')
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
                #fh.write(''.join(['%d, ' %concept for concept in sent[:-1]]))
                #fh.write('%d\n' %sent[-1])
            hist[0] += (num_problem_sentences -
                        len(mapper.relevant_sent_concepts))
            #hist.displaySorted(N=100)
            #sys.exit()
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]

            output_file = open('%s/%s' % (options.output, output_id), 'w')
            word_count = 0
            for sentence in selection:
                output_file.write(sentence.original + '\n')
                word_count += len(sentence.original.split())
            output_file.close()
            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n' %
                             (word_count, curr_time))
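
The options.mcd branch above linearizes the quadratic redundancy terms of McDonald-style selection: for each sentence pair it introduces an auxiliary binary s<i>_<j> that the three constraints c1/c2/c3 force to equal the product of s<i> and s<j>. A standalone sanity check of that linearization (hypothetical, not part of the codebase):

# For every binary assignment of s_i and s_j, the constraints c1, c2, c3 used
# above admit exactly one value for the auxiliary variable: s_ij = s_i * s_j.
for s_i in (0, 1):
    for s_j in (0, 1):
        feasible = [s_ij for s_ij in (0, 1)
                    if s_ij - s_i <= 0             # c1: s_ij <= s_i
                    and s_ij - s_j <= 0            # c2: s_ij <= s_j
                    and s_i + s_j - s_ij <= 1]     # c3: s_ij >= s_i + s_j - 1
        assert feasible == [s_i * s_j]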
Example #4
    def format_output(self, style=None, max_length=100):
        """
        Step 3: create formatted output
        """
        ## make sure step 2 has been completed
        if not self.relevant_sent_sets:
            sys.stderr.write('Error: need to run choose_sents first\n')
            return None

        outputs = {}
        sentences = {}
        self.concept_sents_sets = []  ## new member variable (clean this up!)

        for update_index in range(len(self.concept_sets)):
            output = []
            id = self.problem.id

            ## deal with update id convention
            if len(self.concept_sets) > 1:
                id += '-' + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')[update_index]

            id += '.%s.%d.%s.%d' %('M', max_length, self.problem.id, 1)

            curr_sents = self.relevant_sent_sets[update_index]                # list of sentences
            curr_sent_concepts = self.relevant_sent_concepts[update_index]    # dict of concepts in each sentence {0: [1, 4, ... ], 1: ... }
            curr_concept_weights = self.concept_weight_sets[update_index]     # dict of weight for each concept {'(he, said)': 3.4, ... }
            curr_concept_index = self.concept_index_sets[update_index]        # dict of index for each concept  {'(he, said)': 100, ... }
            curr_concept_sents = {}                                           # dict of sentences for each concept
            for sent_index in range(len(curr_sents)):
                concepts = curr_sent_concepts[sent_index]
                for concept in concepts:
                    if not concept in curr_concept_sents: curr_concept_sents[concept] = []
                    curr_concept_sents[concept].append(sent_index)
            self.concept_sents_sets.append(curr_concept_sents)
                    
            ## custom format
            if style != 'ilp':
                output.append('%s NUM_SENTENCES %d' %(id, len(curr_sents)))
                output.append('%s NUM_CONCEPTS %d' %(id, len(curr_concept_index)))
                output.append('%s TEXT TOPIC %s %s' %(id, self.problem.title, self.problem.narr))
    
                ## sentence info
                for sent_index in range(len(curr_sents)):
                    output.append('%s LENGTH %d %d' %(id, sent_index, curr_sents[sent_index].length))
                    output.append('%s TEXT %d %s' %(id, sent_index, curr_sents[sent_index].original))
                    concept_list = ' '.join(map(str, curr_sent_concepts[sent_index]))
                    output.append('%s CONCEPTS %d %d %s' %(id, sent_index, len(curr_sent_concepts[sent_index]), concept_list))
    
                ## concept info
                for concept in curr_concept_weights.keys():
                    str_concept = '_'.join(concept)
                    concept_weight = curr_concept_weights[concept]
                    concept_index = curr_concept_index[concept]
                    output.append('%s CONCEPT_INFO %d %1.4f %s' %(id, concept_index, concept_weight, str_concept))

            ## ILP output format used by glpsol (glpk)
            else:
                problem = ilp.IntegerLinearProgram()
                problem.objective["score"] = ' + '.join(['%f c%d' %(weight, curr_concept_index[concept]) for concept, weight in curr_concept_weights.items()])
                
                s1 = ' + '.join(['%d s%d' %(curr_sents[sent_index].length, sent_index) for sent_index in range(len(curr_sents))])
                s2 = ' <= %s\n' %max_length
                problem.constraints["length"] = s1 + s2
                
                for concept, concept_index in curr_concept_index.items():
                    ## at least one sentence containing a selected bigram must be selected
                    s1 = ' + '.join([ 's%d' %sent_index for sent_index in curr_concept_sents[concept_index]])
                    s2 = ' - c%d >= 0' %concept_index                    
                    problem.constraints["presence_%d" % concept_index] = s1 + s2
                    ## if a bigram is not selected then all sentences containing it are deselected
                    s1 = ' + '.join([ 's%d' %sent_index for sent_index in curr_concept_sents[concept_index]])
                    s2 = ' - %d c%d <= 0' %(len(curr_concept_sents[concept_index]), concept_index)
                    problem.constraints["absence_%d" % concept_index] = s1 + s2
                for sent_index in range(len(curr_sents)):
                    problem.binary["s%d" % sent_index] = curr_sents[sent_index]
                for concept, concept_index in curr_concept_index.items():
                    problem.binary["c%d" % concept_index] = 1
                #problem.debug = 1
                problem.run()
                output = []
                for sent_index in range(len(curr_sents)):
                    if problem.output["s%d" % sent_index] == 1:
                        output.append(curr_sents[sent_index])
                return output
                    
            outputs[id] = output
            sentences[id] = curr_sents
        return outputs, sentences
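
When style is not 'ilp', format_output emits one line-oriented record per summary id instead of solving. On a toy problem with two sentences and a single concept, the output lines would look roughly like this (the id, lengths, and weight are invented for illustration):

# D0901-A.M.100.D0901.1 NUM_SENTENCES 2
# D0901-A.M.100.D0901.1 NUM_CONCEPTS 1
# D0901-A.M.100.D0901.1 TEXT TOPIC <title> <narrative>
# D0901-A.M.100.D0901.1 LENGTH 0 12
# D0901-A.M.100.D0901.1 TEXT 0 <original sentence text>
# D0901-A.M.100.D0901.1 CONCEPTS 0 1 0
# D0901-A.M.100.D0901.1 CONCEPT_INFO 0 3.4000 he_said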
Example #5
def decode(max_length,
           sentence_length_file,
           concepts_in_sentence_file,
           concept_weight_file,
           sentence_group_file=None,
           dependency_file=None,
           atleast=None,
           command="glpsol"):
    solver = ilp.IntegerLinearProgram(
        debug=1,
        tmp="tmp_decoder.%d.%s.%s" %
        (os.getpid(), os.environ["USER"], os.environ["HOSTNAME"]),
        command=command)
    #solver = ilp_to_localsolver.IntegerLinearProgram(debug=1, tmp = concept_weight_file, time_limit=1)
    concept_id = {}
    concept = 0
    concept_weights = {}
    for line in open(concept_weight_file):
        tokens = line.strip().split()
        weight = float(tokens[1])
        if tokens[0] in concept_id:
            sys.stderr.write(
                'ERROR: duplicate concept \"%s\", line %d in %s\n' %
                (tokens[0], concept + 1, concept_weight_file))
            sys.exit(1)
        concept_id[tokens[0]] = concept
        concept_weights[concept] = weight
        concept += 1

    index = {}
    sentence_concepts = {}
    sentence = 0
    for line in open(concepts_in_sentence_file):
        tokens = line.strip().split()
        concepts = {}
        for token in tokens:
            concepts[token] = True
        mapped_concepts = {}
        for concept in concepts:
            if concept not in concept_id:
                sys.stderr.write(
                    'ERROR: no weight for concept \"%s\", line %d in %s\n' %
                    (concept, sentence + 1, concepts_in_sentence_file))
                sys.exit(1)
            id = concept_id[concept]
            if id not in index: index[id] = []
            index[id].append(sentence)
            mapped_concepts[id] = True
        if len(mapped_concepts) > 0:
            sentence_concepts[sentence] = mapped_concepts
        sentence += 1

    # build objective
    objective = []
    for concept, weight in concept_weights.items():
        if concept not in index: continue  # skip unused concepts
        objective.append("%+g c%d" % (weight, concept))
        solver.binary["c%d" % concept] = concept
    solver.objective["score"] = " ".join(objective)

    # sentence => concepts
    for sentence, concepts in sentence_concepts.items():
        solver.binary["s%d" % sentence] = sentence
    #    for concept in concepts:
    #        solver.constraints["sent_%d" % len(solver.constraints)] = "s%d - c%d <= 0" % (sentence, concept)

    # concept => sentence
    for concept in index:
        solver.constraints["index_%d" % len(solver.constraints)] = " + ".join(
            ["s%d" % x for x in index[concept]]) + " - c%d >= 0" % concept

    if sentence_group_file != None:
        groups = {}
        sentence = 0
        for line in open(sentence_group_file):
            if sentence in sentence_concepts:
                if line != '\n':
                    if line not in groups:
                        groups[line] = []
                    groups[line].append(sentence)
            sentence += 1
        for group in groups:
            solver.constraints["group_%d" %
                               len(solver.constraints)] = " + ".join(
                                   ["s%d" % x
                                    for x in groups[group]]) + " <= 1"

    if dependency_file != None:
        groups = {}
        sentence = 0
        for line in open(dependency_file):
            if sentence in sentence_concepts:
                if line != '\n':
                    for id in line.strip().split():
                        id = int(id)
                        if "s%d" % id not in solver.binary:
                            solver.constraints["depend_%d" % len(
                                solver.constraints)] = "s%d = 0" % (sentence)
                        else:
                            solver.constraints["depend_%d" % len(
                                solver.constraints)] = "s%d - s%d >= 0" % (
                                    id, sentence)
            sentence += 1

    length_constraint = []
    sentence = 0
    for line in open(sentence_length_file):
        if sentence in sentence_concepts:
            length = line.strip()
            length_constraint.append("%s s%d" % (length, sentence))
            solver.objective["score"] += " - %g s%d" % (float(length) / 1000.0,
                                                        sentence)
        sentence += 1

    solver.constraints["length_%d" % len(solver.constraints)] = " + ".join(
        length_constraint) + " <= " + str(max_length)

    if atleast != None:
        at_least = []
        sentence = 0
        for line in open(atleast):
            line = line.strip()
            if sentence in sentence_concepts:
                if line == "1":
                    at_least.append("s%d" % sentence)
            sentence += 1
        if len(at_least) > 0:
            solver.constraints["at_least_%d" % len(
                solver.constraints)] = " + ".join(
                    at_least) + " >= 1"  # select at least one of those

    sys.stderr.write("ilp: %d sentences, %d concepts\n" %
                     (len(sentence_concepts), len(index)))

    if len(sentence_concepts) > 0 and len(index) > 0:
        solver.run()
    output = []
    for variable in solver.output:
        if variable.startswith("s") and solver.output[variable] == 1:
            output.append(int(variable[1:]))
    return output
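
A minimal, hypothetical driver for decode(): it writes the line-oriented inputs the function expects (one line per sentence, or one "<concept> <weight>" pair per line for the weights file) and solves a three-sentence toy instance. The file names, concepts, lengths, and weights are invented; it assumes glpsol is installed and that USER and HOSTNAME are set in the environment, as required by the temporary-file name above.

# Toy inputs (invented): 3 sentences, 3 concepts, a 25-word budget.
open("toy.lengths", "w").write("12\n9\n15\n")            # one length per sentence
open("toy.concepts", "w").write("A B\nB C\nA C\n")       # concepts in each sentence
open("toy.weights", "w").write("A 2.0\nB 1.5\nC 1.0\n")  # one "<concept> <weight>" per line
open("toy.groups", "w").write("g1\ng1\n\n")              # sentences 0 and 1 are alternatives

selected = decode(25, "toy.lengths", "toy.concepts", "toy.weights",
                  sentence_group_file="toy.groups")
print selected  # indices of the sentences picked by the solver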
Example #6
    def run(self, max_length=100, style='ilp'):
        """
        Step 3: create formatted output
        """
        ## make sure step 2 has been completed
        if not self.relevant_sents:
            sys.stderr.write('\nError: need to run choose_sents first\n')
            return None

        output = []
        curr_sents = self.relevant_sents  # list of sentences
        curr_sent_concepts = self.relevant_sent_concepts  # dict of concepts in each sentence {0: [1, 4, ... ], 1: ... }
        curr_concept_weights = self.concept_weights  # dict of weight for each concept {'(he, said)': 3.4, ... }
        curr_concept_index = self.concept_index  # dict of index for each concept  {'(he, said)': 100, ... }

        curr_concept_sents = {}  # dict of sentences for each concept
        for sent_index in range(len(curr_sents)):
            concepts = curr_sent_concepts[sent_index]
            for concept in concepts:
                if not concept in curr_concept_sents:
                    curr_concept_sents[concept] = []
                curr_concept_sents[concept].append(sent_index)

        ## testing code
        #num_sents = len(curr_sents)
        #num_concepts = len(curr_concept_index)
        #sent_lengths = [len(s.tokens) for s in curr_sents]
        #print
        #print 'sents [%d] concepts [%d]  [%1.2f]' %(num_sents, num_concepts, 1.0*num_concepts/num_sents)
        #print 'avg sent length [%1.2f]' %(1.0*sum(sent_lengths)/len(sent_lengths))
        #print

        ## custom format
        if style != 'ilp':
            ## TODO: this is broken! Add local search solver class
            output.append('%s NUM_SENTENCES %d' % (id, len(curr_sents)))
            output.append('%s NUM_CONCEPTS %d' % (id, len(curr_concept_index)))
            output.append('%s TEXT TOPIC %s %s' %
                          (id, self.problem.title, self.problem.narr))

            ## sentence info
            for sent_index in range(len(curr_sents)):
                output.append('%s LENGTH %d %d' %
                              (id, sent_index, curr_sents[sent_index].length))
                output.append(
                    '%s TEXT %d %s' %
                    (id, sent_index, curr_sents[sent_index].original))
                concept_list = ' '.join(
                    map(str, curr_sent_concepts[sent_index]))
                output.append(
                    '%s CONCEPTS %d %d %s' %
                    (id, sent_index, len(
                        curr_sent_concepts[sent_index]), concept_list))

            ## concept info
            for concept in curr_concept_weights.keys():
                str_concept = '_'.join(concept)
                concept_weight = curr_concept_weights[concept]
                concept_index = curr_concept_index[concept]
                output.append('%s CONCEPT_INFO %d %1.4f %s' %
                              (id, concept_index, concept_weight, str_concept))

        ## ILP output format used by glpsol (glpk)
        else:
            problem = ilp.IntegerLinearProgram()
            obj = ' + '.join([
                '%f c%d' % (weight, curr_concept_index[concept])
                for concept, weight in curr_concept_weights.items()
            ])
            obj = obj.replace(' + -', ' - ')
            problem.objective["score"] = obj

            s1 = ' + '.join([
                '%d s%d' % (curr_sents[sent_index].length, sent_index)
                for sent_index in range(len(curr_sents))
            ])
            s2 = ' <= %s\n' % max_length
            problem.constraints["length"] = s1 + s2

            for concept, concept_index in curr_concept_index.items():
                ## at least one sentence containing a selected bigram must be selected
                s1 = ' + '.join([
                    's%d' % sent_index
                    for sent_index in curr_concept_sents[concept_index]
                ])
                s2 = ' - c%d >= 0' % concept_index
                problem.constraints["presence_%d" % concept_index] = s1 + s2

                ## if a bigram is not selected then all sentences containing it are deselected
                s1 = ' + '.join([
                    's%d' % sent_index
                    for sent_index in curr_concept_sents[concept_index]
                ])
                s2 = ' - %d c%d <= 0' % (len(
                    curr_concept_sents[concept_index]), concept_index)
                problem.constraints["absence_%d" % concept_index] = s1 + s2

            for sent_index in range(len(curr_sents)):
                problem.binary["s%d" % sent_index] = curr_sents[sent_index]
            for concept, concept_index in curr_concept_index.items():
                problem.binary["c%d" % concept_index] = 1

            #problem.debug = 1
            problem.run()
            output = []
            for sent_index in range(len(curr_sents)):
                if problem.output["s%d" % sent_index] == 1:
                    output.append(curr_sents[sent_index])

        return output
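
For comparison, the coverage ILP that run() writes out for glpsol (maximize concept weights subject to the length budget and the presence/absence links between sentences and concepts) can also be expressed with a generic modelling library. A minimal sketch using PuLP, which this codebase does not use, on invented toy data:

import pulp

# Toy data (invented): 3 sentences with lengths and the concepts each contains.
sent_length = [12, 9, 15]
sent_concepts = [{0, 1}, {1, 2}, {0, 2}]
concept_weight = {0: 2.0, 1: 1.5, 2: 1.0}
budget = 25

prob = pulp.LpProblem("summary", pulp.LpMaximize)
s = [pulp.LpVariable("s%d" % i, cat="Binary") for i in range(len(sent_length))]
c = dict((j, pulp.LpVariable("c%d" % j, cat="Binary")) for j in concept_weight)

# objective: total weight of the concepts marked as covered
prob += pulp.lpSum(concept_weight[j] * c[j] for j in concept_weight)
# length budget over the selected sentences
prob += pulp.lpSum(sent_length[i] * s[i] for i in range(len(s))) <= budget
for j in concept_weight:
    covering = [s[i] for i in range(len(s)) if j in sent_concepts[i]]
    # presence: a covered concept needs at least one selected sentence containing it
    prob += pulp.lpSum(covering) - c[j] >= 0
    # absence: if no covering sentence is selected, the concept cannot count
    prob += pulp.lpSum(covering) - len(covering) * c[j] <= 0

prob.solve()
print [i for i in range(len(s)) if pulp.value(s[i]) == 1]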