Example #1
#if problem.id != 'D0704': continue
sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
acronyms = framework.removeAcronymsFromProblem(problem)
mapper = concept_mapper.HeuristicMapperExp(problem, "n2", None)
mapper.map_concepts()
mapper.choose_sents()
concept_weight = mapper.concept_weight_sets[0]
#print concept_weight.keys()
#program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
program = framework.build_alternative_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0], longuest_candidate_only=False, providedAcronyms=acronyms)
# run the program and get the output
program.debug = 0
program.run()
#selection = framework.get_program_result(program)
selection = []
for variable in program.output:
    if re.match(r'^s\d+$', variable) and program.output[variable] == 1:
        selection.append(program.binary[variable])
if len(selection) == 0:
    sys.stderr.write('ERROR: empty summary, check the output of the solver\n')
Example #2
            for sentence in problem.get_new_sentences():
                sentence.set_text(sentence.original)
            #    sentence.stemmed = sentence.no_stop
            problem.query.set_text(problem.query.original)
            #problem.query.stemmed = problem.query.no_stop

            #if problem.id != 'D0704': continue
            sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
            acronyms = framework.removeAcronymsFromProblem(problem)
            mapper = concept_mapper.HeuristicMapperExp(problem, "n2", None)
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weight = mapper.concept_weight_sets[0]
            #print concept_weight.keys()
            #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0], longuest_candidate_only=False, providedAcronyms=acronyms)
            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$', variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            if len(selection) == 0:
                sys.stderr.write('ERROR: empty summary, check the output of the solver\n')
                sys.exit(1)
            #selection = ordering.by_date(selection)
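            # order by a concept dendrogram rather than by date; presumably this
            # clusters sentences hierarchically using the concept weights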
            selection = ordering.by_dendrogram(selection, concept_weight, problem)
            summary = "\n".join(sentence.original for sentence in selection)
            summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)
Example #3
def run_standard(options, max_sents=10000):

    ## create output directory
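    ## note: os.popen does not raise when the shell command fails, so these
    ## except clauses are unlikely to ever trigger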
    try:
        os.popen('rm -rf %s' % options.output)
    except:
        pass
    try:
        os.popen('mkdir -p %s' % options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' % options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' % options.task)
    sys.stderr.write('length limit [%d]\n' % task.length_limit)
    sys.stderr.write('writing output to [%s]\n' % options.output)

    map_times, run_times = {}, {}

    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if not '-A' in problem.id: continue
            sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
            #mapper = concept_mapper.HeuristicMapper(problem, "n2")
            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weights = mapper.concept_weights
            #print concept_weight
            #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(
                problem,
                concept_weights,
                length=task.length_limit,
                sentences=mapper.relevant_sents,
                longuest_candidate_only=False)
            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$', variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)
            #summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)

            ## TAC id convention is annoying
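            ## drops the character at index 5, e.g. a hypothetical 'D0901-A' becomes 'D0901A'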
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            output_file.write(summary)
            output_file.close()

    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500: continue
            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                if used_sent_count < max_sents: sentence.used = True
                else: sentence.used = False
            problem.query.set_text(problem.query.original)
            sys.stdout.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))

            # compute idf values
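            # idf is approximated as 1/df over the documents of this problem
            # (no log scaling); a word seen in every document gets the lowest weight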
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = {}
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    for word in sentence.no_stop_freq:
                        if word not in seen_words: seen_words[word] = 1
                for word in seen_words:
                    if word not in word_idf: word_idf[word] = 1
                    else: word_idf[word] += 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]

            # compare sentences to centroid and derive McDonald's relevance score
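            # rel_score = cosine similarity to the document centroid plus a
            # positional prior 1/(order+1) that favors early sentences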
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join([
                    sentence.original for sentence in doc.sentences
                    if sentence.used
                ])
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    sentence.compute_norm()
                    sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + 1 / (sentence.order + 1)
                    #sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + sentence.sim_cosine(problem.query, word_idf)
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1

            # apply cutoff
            sentences.sort(lambda x, y: 1 if x.rel_score < y.rel_score else -1)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]

            # construct ILP
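            # objective: maximize sum(rel_score_i * s_i) - sum(sim_ij * s_i_j), where
            # the quadratic variable s_i_j stands for the product s_i * s_j;
            # constraints c1-c3 below are the standard linearization of that product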
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append("%+g s%d" %
                                 (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append("%+g s%d" %
                                         (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer: continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        objective.append("%+g s%d_%d" %
                                         (-score, sentence.index, peer.index))
                        program.binary["s%d_%d" %
                                       (sentence.index, peer.index)] = [
                                           sentence, peer
                                       ]
                        program.constraints["c1_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, sentence.index)
                        program.constraints["c2_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, peer.index)
                        program.constraints["c3_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d + s%d - s%d_%d <= 1" % (sentence.index, peer.index, sentence.index, peer.index)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = " ".join(
                length_constraint) + " <= %g" % task.length_limit

            run_times[problem.id] = time.time()
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]

            selection = []
            score = 0
            # get solution and check consistency
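            # every selected pair of similar sentences must have its quadratic
            # variable set, and every set quadratic variable must have both of
            # its sentence variables set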
            for variable in program.binary:
                if variable in program.output and program.output[variable] == 1:
                    if type(program.binary[variable]) == type(sentences[0]):
                        selection.append(program.binary[variable])
                        score += program.binary[variable].rel_score
                        for peer in program.output:
                            if program.output[peer] == 0 or peer == variable or type(program.binary[peer]) != type(sentences[0]):
                                continue
                            if program.binary[variable].sim_cosine(program.binary[peer], word_idf) == 0:
                                continue
                            quadratic = "s%d_%d" % (program.binary[variable].index, program.binary[peer].index)
                            if quadratic not in program.output or program.output[quadratic] != 1:
                                print "WARNING: %s selected but %s not selected" % (variable, quadratic)

                    else:
                        score -= program.binary[variable][0].sim_cosine(program.binary[variable][1], word_idf)
                        if program.output["s%d" % program.binary[variable][0].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][0].index)
                        if program.output["s%d" % program.binary[variable][1].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][1].index)
            #if math.fabs(program.result["score"] - score) > .1:
            #    print "WARNING: difference between score = %g and expected = %g" % (program.result["score"], score)
            selection = ordering.by_date(selection)
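            # strip the character preceding the trailing '-X' suffix from the problem id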
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            output_file = open("%s/%s" % (options.output, new_id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()

    else:
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            #if num_problem_sentences < 300: continue
            if not '-A' in problem.id: continue

            if options.ir:
                #docs = [doc for doc, val in problem.ir_docs]
                #for doc in docs: doc.get_sentences()
                num_overlap = len(set([d.id for d in problem.ir_docs]).intersection(set([d.id for d in problem.new_docs])))
                print '%s overlap: %d' % (problem.id, num_overlap)
                info_fh.write('%s overlap [%d]\n' % (problem.id, num_overlap))

            sys.stderr.write('problem [%s] input sentences [%d]' % (problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)

            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(problem, options.units)

            ## timing test
            mapper.max_sents = max_sents

            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success: sys.exit()

            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success: sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]

            ## testing
            #fh = open('concept_matrix', 'w')
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
                #fh.write(''.join(['%d, ' %concept for concept in sent[:-1]]))
                #fh.write('%d\n' %sent[-1])
            hist[0] += (num_problem_sentences - len(mapper.relevant_sent_concepts))
            #hist.displaySorted(N=100)
            #sys.exit()
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]

            output_file = open('%s/%s' % (options.output, output_id), 'w')
            word_count = 0
            for sentence in selection:
                output_file.write(sentence.original + '\n')
                word_count += len(sentence.original.split())
            output_file.close()
            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n' % (word_count, curr_time))
Example #4
def run_standard(options, max_sents=10000):

    ## create output directory
    try: os.popen('rm -rf %s' %options.output)
    except: pass
    try: os.popen('mkdir -p %s' %options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' % options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' %options.task)
    sys.stderr.write('length limit [%d]\n' %task.length_limit)
    sys.stderr.write('writing output to [%s]\n' %options.output)

    map_times, run_times = {}, {}
    
    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if not '-A' in problem.id: continue
            sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
            #mapper = concept_mapper.HeuristicMapper(problem, "n2")
            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weights = mapper.concept_weights
            #print concept_weight
            #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(problem, concept_weights, length=task.length_limit, sentences=mapper.relevant_sents, longuest_candidate_only=False)
            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$', variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)
            #summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']: output_id = problem.id[:5]+problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            output_file.write(summary)
            output_file.close()
    
    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500: continue
            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                if used_sent_count < max_sents: sentence.used = True
                else: sentence.used = False
            problem.query.set_text(problem.query.original)
            sys.stdout.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
    
            # compute idf values
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = {}
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    for word in sentence.no_stop_freq:
                        if word not in seen_words: seen_words[word] = 1
                for word in seen_words:
                    if word not in word_idf: word_idf[word] = 1
                    else: word_idf[word] += 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]
            
            # compare sentences to centroid and derive McDonald's relevance score
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join([sentence.original for sentence in doc.sentences if sentence.used])
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    sentence.compute_norm()
                    sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + 1 / (sentence.order + 1)
                    #sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + sentence.sim_cosine(problem.query, word_idf)
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1
    
            # apply cutoff
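            # Python 2 cmp-style sort: orders sentences by rel_score, highest first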
            sentences.sort(lambda x, y: 1 if x.rel_score < y.rel_score else -1)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]
    
            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append("%+g s%d" % (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append("%+g s%d" % (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer: continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        objective.append("%+g s%d_%d" % (-score, sentence.index, peer.index))
                        program.binary["s%d_%d" % (sentence.index, peer.index)] = [sentence, peer]
                        program.constraints["c1_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, sentence.index)
                        program.constraints["c2_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, peer.index)
                        program.constraints["c3_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d + s%d - s%d_%d <= 1" % (sentence.index, peer.index, sentence.index, peer.index)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = " ".join(length_constraint) + " <= %g" % task.length_limit

            run_times[problem.id] = time.time()    
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]
            
            selection = []
            score = 0
            # get solution and check consistency
            for variable in program.binary:
                if variable in program.output and program.output[variable] == 1:
                    if type(program.binary[variable]) == type(sentences[0]):
                        selection.append(program.binary[variable])
                        score += program.binary[variable].rel_score
                        for peer in program.output:
                            if program.output[peer] == 0 or peer == variable or type(program.binary[peer]) != type(sentences[0]):
                                continue
                            if program.binary[variable].sim_cosine(program.binary[peer], word_idf) == 0:
                                continue
                            quadratic = "s%d_%d" % (program.binary[variable].index, program.binary[peer].index)
                            if quadratic not in program.output or program.output[quadratic] != 1:
                                print "WARNING: %s selected but %s not selected" % (variable, quadratic)
    
                    else:
                        score -= program.binary[variable][0].sim_cosine(program.binary[variable][1], word_idf)
                        if program.output["s%d" % program.binary[variable][0].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][0].index)
                        if program.output["s%d" % program.binary[variable][1].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][1].index)
            #if math.fabs(program.result["score"] - score) > .1:
            #    print "WARNING: difference between score = %g and expected = %g" % (program.result["score"], score)
            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            output_file = open("%s/%s" % (options.output, new_id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()
        
    else:
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            #if num_problem_sentences < 300: continue
            if not '-A' in problem.id: continue

            if options.ir: 
                #docs = [doc for doc, val in problem.ir_docs]
                #for doc in docs: doc.get_sentences()
                num_overlap = len(set([d.id for d in problem.ir_docs]).intersection(set([d.id for d in problem.new_docs])))
                print '%s overlap: %d' %(problem.id, num_overlap)
                info_fh.write('%s overlap [%d]\n' %(problem.id, num_overlap))

            sys.stderr.write('problem [%s] input sentences [%d]' %(problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)
    
            ## select a concept mapper
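            ## CheatingMapper presumably derives concepts from the reference summaries
            ## (an oracle), while HeuristicMapperExp estimates them from the input alone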
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(problem, options.units)
            
            ## timing test
            mapper.max_sents = max_sents
    
            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success: sys.exit()
    
            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success: sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]
            
            ## testing
            #fh = open('concept_matrix', 'w')
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
                #fh.write(''.join(['%d, ' %concept for concept in sent[:-1]]))
                #fh.write('%d\n' %sent[-1])
            hist[0] += (num_problem_sentences-len(mapper.relevant_sent_concepts))
            #hist.displaySorted(N=100)
            #sys.exit()
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']: output_id = problem.id[:5]+problem.id[6:]

            output_file = open('%s/%s' % (options.output, output_id), 'w')
            word_count = 0
            for sentence in selection:
                output_file.write(sentence.original + '\n')
                word_count += len(sentence.original.split())
            output_file.close()
            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n' %(word_count, curr_time))