Example #1
import compression

def removeAcronymsFromProblem(problem):
    acronymMapping = compression.generateAcronymMapping(
        problem.get_new_and_old_sentences())
    # this will modify the sentences, but keep parse trees
    compression.replaceAcronyms(problem.get_new_sentences(), acronymMapping)
    return acronymMapping
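
# A minimal usage sketch (hypothetical driver code): `problem` is assumed to
# expose the get_new_sentences()/get_new_and_old_sentences() accessors used
# above.
# mapping = removeAcronymsFromProblem(problem)
# for definition, acronym in mapping.items():
#     print(definition, "->", acronym)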
Example #2
import re
import sys

import compression
import ilp
import text
import util

def build_alternative_program(problem, concept_weight, length=100, sentences=None, longuest_candidate_only=False):
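    """Build an ILP that jointly compresses and selects sentences.

    The objective rewards covering weighted concepts (stemmed bigrams); the
    length constraint keeps the summary under `length` words while reserving
    room for acronym definitions, and group constraints allow at most one
    compression candidate per original sentence.
    """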
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in sentences:
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping([s.compression_node for s in sentences])
    #print "generating acronyms"
    acronymMapping = compression.generateAcronymMapping(problem.get_new_sentences())
    print(problem.id, acronymMapping)
    
    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
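    # generate compression candidates from each sentence's parse tree;
    # candidates from the same original sentence share a group_id so that at
    # most one of them can be selected later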
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(node.getCandidates(mapping=nounPhraseMapping))
        if longuest_candidate_only:
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax is not None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate), sentence.order, sentence.source, sentence.date)
            if new_sentence.length <= 5: continue  # skip very short sentences
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)
    log_file = open("%s.log" % problem.id, "w")
    for sentence in compressed_sentences:
        log_file.write("%d %s\n" % (sentence.group_id, str(sentence)))
    log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())
    
    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
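    # concepts are stemmed bigrams; sentences covering no weighted concept
    # are left out of the ILP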
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0: continue
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups: groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index: acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs] for cs in sentence_concepts]
    concept_weights = filtered_concepts
    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if concept not in curr_concept_sents: curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()
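    # objective: total weight of the selected concepts (the c variables);
    # the ILP solver is assumed to maximize this score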

    program.objective["score"] = ' + '.join(['%f c%d' %(concept_weight[concept], concept_index[concept]) for concept in concept_index])
    
    s1 = ' + '.join(['%d s%d' %(relevant_sentences[sent_index].length, sent_index) for sent_index in range(len(relevant_sentences))])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join(['%d a%d' %(acronym_length[acronym], acronym_id[acronym]) for acronym in acronym_id])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' %length
    program.constraints["length"] = s1 + s_acronyms + s2
    
    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join(['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' % index
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        s1 = ' + '.join(['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        s2 = ' - %d c%d <= 0' % (len(curr_concept_sents[index]), index)
        program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, index in acronym_index.items():
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - a%d >= 0' % acronym_id[acronym]
        program.constraints["acronym_presence_%d" % acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - %d a%d <= 0' % (len(index), acronym_id[acronym])
        program.constraints["acronym_absence_%d" % acronym_id[acronym]] = s1 + s2

    # add sentence compression groups
    for group in groups:
        program.constraints["group_%d" % group] = " + ".join(["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, c_index in concept_index.items():
        program.binary["c%d" % c_index] = 1
    for acronym, a_index in acronym_id.items():
        program.binary["a%d" % a_index] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n" % (len(relevant_sentences), len(sentences)))
    program.acronyms = acronymMapping
    return program
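
# A hypothetical usage sketch; how the returned program is solved depends on
# the ilp module's interface, which is not shown in these snippets:
# concept_weight = {}  # stemmed bigram -> weight
# program = build_alternative_program(problem, concept_weight, length=100)
# the sentences whose s variables the solver sets to 1 form the summary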
Example #3
import re
import sys

import compression
import ilp
import text
import util

def build_alternative_program(problem,
                              concept_weight,
                              length=100,
                              sentences=None,
                              longuest_candidate_only=False,
                              providedAcronyms=None):
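    """Variant of build_alternative_program that can reuse a precomputed
    acronym mapping, generates candidates with a beam of 100 and mandatory
    removals, and disables the concept-absence constraints (see the comment
    in the concept loop below).
    """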
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in problem.get_new_and_old_sentences():
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(
                sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping(
        [s.compression_node for s in problem.get_new_and_old_sentences()])
    #print "generating acronyms"
    if providedAcronyms:
        acronymMapping = providedAcronyms
    else:
        acronymMapping = compression.generateAcronymMapping(
            problem.get_new_and_old_sentences())
    print(problem.id, acronymMapping)

    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(
            compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(
                node.getCandidates(beam=100,
                                   mapping=nounPhraseMapping,
                                   use_mandatory_removals=True))
        if longuest_candidate_only:
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax is not None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate),
                                         sentence.order, sentence.source,
                                         sentence.date)
            if new_sentence.length <= 5: continue  # skip very short sentences
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)
    #log_file = open("%s.log" % problem.id, "w")
    #for sentence in compressed_sentences:
    #    log_file.write("%d %s\n" %( group_id, str(sentence)))
    #log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())

    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0:
            continue  # get rid of sentences that do not overlap with concepts
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups: groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index: acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs]
                              for cs in sentence_concepts]
    concept_weights = filtered_concepts
    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if concept not in curr_concept_sents:
                curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()

    program.objective["score"] = ' + '.join([
        '%f c%d' % (concept_weight[concept], concept_index[concept])
        for concept in concept_index
    ])

    s1 = ' + '.join([
        '%d s%d' % (relevant_sentences[sent_index].length, sent_index)
        for sent_index in range(len(relevant_sentences))
    ])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join([
        '%d a%d' % (acronym_length[acronym], acronym_id[acronym])
        for acronym in acronym_id
    ])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' % length
    program.constraints["length"] = s1 + s_acronyms + s2

    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join(
            ['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' % index
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        #### this constraint is disabled since it is not necessary when all sentences contain at least one concept
        #### it might also be the reason for singular matrices that crash the solver
        #s1 = ' + '.join([ 's%d' %sent_index for sent_index in curr_concept_sents[index]])
        #s2 = ' - %d c%d <= 0' %(len(curr_concept_sents[index]), index)
        #program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, index in acronym_index.items():
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - a%d >= 0' % acronym_id[acronym]
        program.constraints["acronym_presence_%d" %
                            acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - %d a%d <= 0' % (len(index), acronym_id[acronym])
        program.constraints["acronym_absence_%d" %
                            acronym_id[acronym]] = s1 + s2

    # add sentence compression groups
    for group in groups:
        if len(groups[group]) > 1:
            program.constraints["group_%d" % group] = " + ".join(
                ["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, c_index in concept_index.items():
        program.binary["c%d" % c_index] = 1
    for acronym, a_index in acronym_id.items():
        program.binary["a%d" % a_index] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n" %
                     (len(relevant_sentences), len(sentences)))
    program.acronyms = acronymMapping
    return program
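
# A self-contained sketch of the presence/absence linking pattern used above,
# with hypothetical names; it only builds the constraint strings:
def link_concept_to_sentences(concept_idx, sent_indices):
    terms = ' + '.join('s%d' % i for i in sent_indices)
    # presence: selecting the concept requires selecting a covering sentence
    presence = '%s - c%d >= 0' % (terms, concept_idx)
    # absence: selecting any covering sentence forces the concept on
    absence = '%s - %d c%d <= 0' % (terms, len(sent_indices), concept_idx)
    return presence, absence

# print(link_concept_to_sentences(0, [1, 3]))
# -> ('s1 + s3 - c0 >= 0', 's1 + s3 - 2 c0 <= 0')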