Example #1
def run_standard(options):

    ## create output directory
    ## (note: os.popen does not raise on command failure, so these excepts are best-effort)
    try: os.popen('rm -rf %s' % options.output)
    except: pass
    try: os.popen('mkdir -p %s' % options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' % options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' %options.task)
    sys.stderr.write('length limit [%d]\n' %task.length_limit)
    sys.stderr.write('writing output to [%s]\n' %options.output)

    map_times, run_times = {}, {}
    input_sents = []
    for problem in task.problems:
        if 'A' not in problem.id: continue
        sys.stderr.write('problem [%s] input sentences [%d]' %(problem.id, len(problem.get_new_sentences())))
        
        ## select a concept mapper
        map_times[problem.id] = time.time()
        if options.cheat:
            mapper = concept_mapper.CheatingMapper(problem, options.units)
        else:
            mapper = concept_mapper.HeuristicMapper(problem, options.units)
            
        ## map input concepts to weights
        success = mapper.map_concepts()
        if not success: sys.exit()
    
        ## choose a subset of the input sentences based on the mapping
        success = mapper.choose_sents()
        if not success: sys.exit()
        map_times[problem.id] = time.time() - map_times[problem.id]
            
        ## setup and run the ILP
        run_times[problem.id] = time.time()
        selection = mapper.run(task.length_limit)

        ## new ilp
        #mapper = mapping.LocationMapper(problem)
        #mapper.setup()
        
        selection = ordering.by_date(selection)
        run_times[problem.id] = time.time() - run_times[problem.id]

        ## TAC id convention is annoying
        output_id = problem.id
        if options.task == 'u08': output_id = problem.id[:5]+problem.id[6:]

        output_file = open('%s/%s' % (options.output, output_id), 'w')
        word_count = 0
        for sentence in selection:
            output_file.write(sentence.original + '\n')
            word_count += len(sentence.original.split())
        output_file.close()
        curr_time = map_times[problem.id] + run_times[problem.id]
        sys.stderr.write(' word count [%d] time [%1.2fs]\n' %(word_count, curr_time))

    ## timing information
    avg_run_time = 1.0*sum(run_times.values())/len(run_times)
    std_run_time = scipy.array(run_times.values()).std()
    sys.stderr.write('\nTiming results\n')
    sys.stderr.write('Mapper time:  total [%2.2fs]  min [%1.2fs]  max [%1.2fs]\n' %(sum(map_times.values()), min(map_times.values()), max(map_times.values())))
    sys.stderr.write('Run time:     total [%2.2fs]  min [%1.2fs]  max [%1.2fs] avg [%1.4f] std [%1.4f]\n' %(sum(run_times.values()), min(run_times.values()), max(run_times.values()), avg_run_time, std_run_time))
    sys.stderr.write('------\n')    
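run_standard relies on module-level state: a loaded `task` (problems, length_limit) plus the project's `concept_mapper` and `ordering` modules. A minimal sketch of a driver, with hypothetical flag names mirroring the attributes the function actually reads (output, task, cheat, units); the real project's option parsing may differ:

# hypothetical driver; `task` must already be built at module level,
# and all flag names below are assumptions based on the code above
import optparse

parser = optparse.OptionParser()
parser.add_option('--output', default='out/summaries')            # summary output directory
parser.add_option('--task', default='u08')                        # TAC task id
parser.add_option('--cheat', action='store_true', default=False)  # use CheatingMapper
parser.add_option('--units', default='n2')                        # concept unit type
(options, args) = parser.parse_args()

run_standard(options)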
Example #2
            output_file.write(summary)
            output_file.close()

            # allow memory cleanup
            problem.old_problems = []
            problem.old_docs = []
            problem.new_docs = []
    else:    
        ## no sentence compression
        for problem in task.problems:
            for sentence in problem.get_new_sentences():
                sentence.set_text(sentence.original)
            problem.query.set_text(problem.query.original)
            sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
            mapper = concept_mapper.HeuristicMapperExp(problem, "n2", None)
            #mapper = concept_mapper.CheatingMapper(problem, "n2", None)
            mapper.map_concepts()
            mapper.choose_sents()
            selection = mapper.format_output("ilp", task.length_limit)
            selection = ordering.by_date(selection)
            output_file = open("%s/%s" % (options.output, problem.id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()
    ## evaluate
    if options.task != 'u08':
        cmd = '%s %s %s' % (ROUGE_SCORER, task.manual_path, options.output)
        rouge_output = os.popen(cmd).read()
        [rouge_1, rouge_2, rouge_su4] = re.findall(r': (\d\.\d+)', rouge_output)
        print rouge_output
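The evaluation step shells out to a ROUGE wrapper and destructures the first three floats that follow ': ' in its output into ROUGE-1, ROUGE-2, and ROUGE-SU4 scores. A self-contained check of that regex against a fabricated sample (the real scorer's output format may differ):

# fabricated lines in the rough shape of ROUGE recall output; illustration only
import re

sample = ("1 ROUGE-1 Average_R: 0.38272 (95%-conf.int. 0.37155 - 0.39389)\n"
          "1 ROUGE-2 Average_R: 0.09271 (95%-conf.int. 0.08514 - 0.10028)\n"
          "1 ROUGE-SU4 Average_R: 0.13160 (95%-conf.int. 0.12433 - 0.13887)\n")
print re.findall(r': (\d\.\d+)', sample)
# -> ['0.38272', '0.09271', '0.13160']

If the wrapper printed any additional ': x.xxx' floats, the three-way unpack would raise a ValueError, so the regex is implicitly tied to the wrapper's exact output.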
Example #3
def run_standard(options, max_sents=10000):

    ## create output directory
    ## (note: os.popen does not raise on command failure, so these excepts are best-effort)
    try: os.popen('rm -rf %s' % options.output)
    except: pass
    try: os.popen('mkdir -p %s' % options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' % options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' %options.task)
    sys.stderr.write('length limit [%d]\n' %task.length_limit)
    sys.stderr.write('writing output to [%s]\n' %options.output)

    map_times, run_times = {}, {}
    
    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if '-A' not in problem.id: continue
            sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
            #mapper = concept_mapper.HeuristicMapper(problem, "n2")
            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weights = mapper.concept_weights
            #print concept_weight
            #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(problem, concept_weights, length=task.length_limit, sentences=mapper.relevant_sents, longuest_candidate_only=False)
            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$', variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)
            #summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']: output_id = problem.id[:5]+problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            output_file.write(summary)
            output_file.close()
    
    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500: continue
            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                if used_sent_count < max_sents: sentence.used = True
                else: sentence.used = False
            problem.query.set_text(problem.query.original)
            sys.stdout.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
    
            # compute idf values
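            # (document frequency is inverted as a plain 1/df, not the usual log(N/df))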
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = {}
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    for word in sentence.no_stop_freq:
                        if word not in seen_words: seen_words[word] = 1
                for word in seen_words:
                    if word not in word_idf: word_idf[word] = 1
                    else: word_idf[word] += 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]
            
            # compare sentences to centroid and derive McDonald's relevance score
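            # relevance = cosine similarity to a per-document centroid pseudo-sentence,
            # plus a positional bonus for sentences appearing early in the document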
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join([sentence.original for sentence in doc.sentences if sentence.used])
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    sentence.compute_norm()
                    # nb: with Python 2 integer division, 1 / (sentence.order + 1)
                    # adds 1 for a document's first sentence and 0 for all others
                    sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + 1 / (sentence.order + 1)
                    #sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + sentence.sim_cosine(problem.query, word_idf)
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1
    
            # apply cutoff: keep only the highest-scoring sentences
            sentences.sort(key=lambda s: s.rel_score, reverse=True)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]
    
            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append("%+g s%d" % (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append("%+g s%d" % (sentence.length, sentence.index))
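                # redundancy terms: for each similar pair (i, j), subtract their
                # similarity from the objective through an auxiliary binary s{i}_{j};
                # constraints c1/c2/c3 below force s{i}_{j} = s{i} AND s{j}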
                for peer in sentences:
                    if sentence == peer: continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        objective.append("%+g s%d_%d" % (-score, sentence.index, peer.index))
                        program.binary["s%d_%d" % (sentence.index, peer.index)] = [sentence, peer]
                        program.constraints["c1_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, sentence.index)
                        program.constraints["c2_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, peer.index)
                        program.constraints["c3_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d + s%d - s%d_%d <= 1" % (sentence.index, peer.index, sentence.index, peer.index)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = " ".join(length_constraint) + " <= %g" % task.length_limit

            run_times[problem.id] = time.time()    
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]
            
            selection = []
            score = 0
            # get solution and check consistency
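            # verify the linearization in the solver output: each pair variable
            # s{i}_{j} should be 1 exactly when both member sentences are selected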
            for variable in program.binary:
                if variable in program.output and program.output[variable] == 1:
                    if type(program.binary[variable]) == type(sentences[0]):
                        selection.append(program.binary[variable])
                        score += program.binary[variable].rel_score
                        for peer in program.output:
                            if program.output[peer] == 0 or peer == variable or type(program.binary[peer]) != type(sentences[0]):
                                continue
                            if program.binary[variable].sim_cosine(program.binary[peer], word_idf) == 0:
                                continue
                            quadratic = "s%d_%d" % (program.binary[variable].index, program.binary[peer].index)
                            if quadratic not in program.output or program.output[quadratic] != 1:
                                print "WARNING: %s selected but %s not selected" % (variable, quadratic)
    
                    else:
                        score -= program.binary[variable][0].sim_cosine(program.binary[variable][1], word_idf)
                        if program.output["s%d" % program.binary[variable][0].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][0].index)
                        if program.output["s%d" % program.binary[variable][1].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][1].index)
            #if math.fabs(program.result["score"] - score) > .1:
            #    print "WARNING: difference between score = %g and expected = %g" % (program.result["score"], score)
            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            output_file = open("%s/%s" % (options.output, new_id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()
        
    else:
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            #if num_problem_sentences < 300: continue
            if '-A' not in problem.id: continue

            if options.ir: 
                #docs = [doc for doc, val in problem.ir_docs]
                #for doc in docs: doc.get_sentences()
                num_overlap = len(set([d.id for d in problem.ir_docs]).intersection(set([d.id for d in problem.new_docs])))
                print '%s overlap: %d' %(problem.id, num_overlap)
                info_fh.write('%s overlap [%d]\n' %(problem.id, num_overlap))

            sys.stderr.write('problem [%s] input sentences [%d]' %(problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)
    
            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(problem, options.units)
            
            ## timing test
            mapper.max_sents = max_sents
    
            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success: sys.exit()
    
            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success: sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]
            
            ## testing
            #fh = open('concept_matrix', 'w')
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
                #fh.write(''.join(['%d, ' %concept for concept in sent[:-1]]))
                #fh.write('%d\n' %sent[-1])
            hist[0] += (num_problem_sentences-len(mapper.relevant_sent_concepts))
            #hist.displaySorted(N=100)
            #sys.exit()
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']: output_id = problem.id[:5]+problem.id[6:]

            output_file = open('%s/%s' % (options.output, output_id), 'w')
            word_count = 0
            for sentence in selection:
                output_file.write(sentence.original + '\n')
                word_count += len(sentence.original.split())
            output_file.close()
            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n' %(word_count, curr_time))
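The c1/c2/c3 constraints in the McDonald branch are the textbook linearization of a product of two binary variables: s_ij <= s_i, s_ij <= s_j, and s_i + s_j - s_ij <= 1 together leave exactly one feasible value for the auxiliary variable, namely s_ij = s_i * s_j. A brute-force check of that claim in plain Python, independent of the ilp module:

# exhaustive 0/1 check that the pairwise constraints force s_ij = s_i * s_j
for s_i in (0, 1):
    for s_j in (0, 1):
        feasible = [s_ij for s_ij in (0, 1)
                    if s_ij - s_i <= 0            # c1
                    and s_ij - s_j <= 0           # c2
                    and s_i + s_j - s_ij <= 1]    # c3
        assert feasible == [s_i * s_j]
print 'c1/c2/c3 pin s_ij to s_i * s_j at every 0/1 point'

This is what makes the consistency check above meaningful: a selected pair variable whose member sentences are not both selected (or vice versa) indicates a solver or model bug.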