Example #1
def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0
    for sent in sents:
        
        ## store this sentence's concepts
        sent.concepts = set([])
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0: max_order = 0
        skip = False
        if sent.length <= 5: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0: 
            skip = True
            max_order = 0
        
        if skip: 
            max_order += 1
            continue
        
        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
        
    return create_ilp_output(sents, final_concepts, path+id)
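Every example on this page calls util.get_ngrams from the icsisumm util module, which is not reproduced here. As a reading aid, the following is a minimal stand-in reconstructed from the call sites (a list result that can be concatenated with +, underscore-joined strings when as_string=True, one bigram per adjacent token pair when bounds=False); the boundary padding tokens are an assumption, and this is not the project's actual implementation.

def get_ngrams(sent, n=2, bounds=False, as_string=False):
    """Minimal stand-in for util.get_ngrams, inferred from its call sites above.

    sent may be a whitespace-separated string or a list of tokens; the result
    is a list, so len(get_ngrams(tokens, 2)) == len(tokens) - 1 when bounds=False.
    """
    tokens = sent.split() if isinstance(sent, str) else list(sent)
    if bounds:
        # assumed: pad with pseudo-tokens so boundary n-grams are included
        tokens = ['<s>'] + tokens + ['</s>']
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    if as_string:
        return ['_'.join(g) for g in ngrams]
    return ngrams

# get_ngrams('the cat sat', 2, as_string=True) -> ['the_cat', 'cat_sat']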
Example #2
def make_concepts_gold(id, path, sents, gold_sents):

    ## get gold concepts
    all_concepts = collections.defaultdict(set)
    for sent in gold_sents:
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        for concept in concepts:
            all_concepts[concept].add(sent.doc)

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        if util.is_just_stopwords(concept.split("_")):
            continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    ## get sentence concepts
    seen_sents = set()
    for sent_index in range(len(sents)):
        sent = sents[sent_index]
        sent.concepts = set([])

        ## skip some sents
        skip = False
        # if sent.order >= 3: skip = True
        if not sent.new_par:
            skip = True
        if sent.length < 20:
            skip = True

        if sent.orig in seen_sents:
            skip = True
        if sent.length <= 5:
            skip = True
        if skip:
            continue

        seen_sents.add(sent.orig)
        s = util.porter_stem_sent(util.tokenize(fix_text(sent.orig)))
        concepts = set(util.get_ngrams(s, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
Example #3
def make_concepts(id, path, sents, R1=True, R2=False, R4=False, SU4=False):
    ## exactly one of R1/R2/R4/SU4 is expected to be set (R1, unigrams, by default)
    all_concepts = collections.defaultdict(int)

    print "******************make_concepts", id
    for sent in sents:
        ## store this sentence's concepts
        sent.concepts = set()
        if R1:
            concepts = set(util.get_ngrams(sent.tok2, 1, bounds=False, as_string=True))
        elif R2:
            concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        elif R4:
            concepts = set(util.get_ngrams(sent.tok2, 4, bounds=False, as_string=True))
        elif SU4:
            concepts = set(util.get_su4(sent.tok2, as_string=True))

        for concept in concepts:
            all_concepts[concept] += 1
        sent.concepts = concepts
    return create_ilp_output(sents, all_concepts, path + id)
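The SU4 branch above calls util.get_su4, and Example #5 below builds the same kind of unit from util.get_skip_bigrams(x, k=4) plus unigrams. Those helpers are not shown on this page either; the sketch below assumes ROUGE-SU4-style units (skip bigrams with at most k intervening tokens, plus unigrams) and is only an illustration of that assumption.

def get_skip_bigrams(sent, k=4):
    """Assumed behaviour: ordered token pairs with at most k intervening tokens."""
    tokens = sent.split() if isinstance(sent, str) else list(sent)
    pairs = []
    for i in range(len(tokens)):
        for j in range(i + 1, min(i + k + 2, len(tokens))):
            pairs.append((tokens[i], tokens[j]))
    return pairs

def get_su4(sent, as_string=False):
    """Assumed SU4 unit set: skip bigrams (k=4) plus unigrams."""
    tokens = sent.split() if isinstance(sent, str) else list(sent)
    units = get_skip_bigrams(tokens, k=4) + [(t,) for t in tokens]
    if as_string:
        return ['_'.join(u) for u in units]
    return units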
Example #4
def build_program(problem, concept_weight, length=100, sentences = None):
    """
    the ILP keeps tracks of the constraints
    s<num> variables handle sentences, subsentences and removable subtrees
    c<num> variables represent concepts in those selected pseudo-sentences
    """
    program = compression.SentenceSelectionILP(concept_weight, length, use_subsentences=True, use_removables=True, 
                                                        use_min_length=True, use_min_length_ratio=False)
    if not sentences:
        sentences = problem.get_new_sentences()
    for sentence in sentences:
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping([s.compression_node for s in sentences])
    
    for sentence in sentences:
        ## generate a compression candidate tree
        candidates = sentence.compression_node.getCandidateTree(nounPhraseMapping)
        candidate_root = compression.TreebankNode(candidates)
        candidate_root.sentence = sentence
        
        ## (or a non compressed tree)
        #candidate_root = treenode.TreeNode(sentence.compression_node.getNonCompressedCandidate())
        
        if candidate_root.isLeaf(): continue
        
        ## debugging
        #candidate_root.original = root
        #candidate_root.original_text = candidates

        # update ILP with the new sentence
        program.addSentence(candidate_root, lambda x: compression.get_bigrams_from_node(x, 
            node_skip=lambda y: not re.match(r'[A-Za-z0-9]', y.label), node_transform=lambda y: text.text_processor.porter_stem(y.text.lower())))

        # skip debugging part
        continue
        sentence_concepts = program.getConcepts(candidate_root, lambda x: compression.get_bigrams_from_node(x,
                    node_skip=lambda y: not re.match(r'[A-Za-z0-9]', y.label), node_transform=lambda y: text.text_processor.porter_stem(y.text.lower())))
        print sentence.original
        print candidate_root.getPrettyCandidates()
        for concept in sentence_concepts.keys():
            if concept not in concept_weight:
                del sentence_concepts[concept]
        print sorted(sentence_concepts.keys())
        units = dict([(x, 1) for x in util.get_ngrams(sentence.stemmed, n=2, bounds=False)])
        for concept in units.keys():
            if concept not in concept_weight:
                del units[concept]
        print sorted(units.keys())

    return program
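The addSentence call above hands compression.get_bigrams_from_node two callbacks: node_skip drops leaves whose label does not start with an alphanumeric character (i.e. punctuation tags), and node_transform lowercases and Porter-stems the leaf text. The snippet below only demonstrates those two predicates on hand-made (label, text) pairs, with plain lowercasing standing in for the stemmer; it does not use the real compression or text modules.

import re

# hypothetical parse-tree leaves: (POS label, surface text)
leaves = [('DT', 'The'), ('NNP', 'Fed'), ('VBD', 'raised'),
          (',', ','), ('NNS', 'rates'), ('.', '.')]

node_skip = lambda label: not re.match(r'[A-Za-z0-9]', label)
node_transform = lambda text: text.lower()   # stand-in for porter_stem(text.lower())

tokens = [node_transform(text) for label, text in leaves if not node_skip(label)]
bigrams = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
print(bigrams)   # [('the', 'fed'), ('fed', 'raised'), ('raised', 'rates')]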
Example #5
    def __init__(self, summary_problem, units='n2'):
        
        self.unit_name = units
        self.problem = summary_problem

        if   units == 'n1': self.unit_selector = lambda x: util.get_ngrams(x, n=1)
        elif units == 'n2': self.unit_selector = lambda x: util.get_ngrams(x, n=2)
        elif units == 'n3': self.unit_selector = lambda x: util.get_ngrams(x, n=3)
        elif units == 'n4': self.unit_selector = lambda x: util.get_ngrams(x, n=4)
        elif units == 'su4' : self.unit_selector = lambda x: util.get_skip_bigrams(x, k=4) + util.get_ngrams(x, n=1)
        else: self.unit_selector = util.get_ngrams  # default options

        ## variables to set later
        self.concepts = None
        self.concept_weights = None
        self.concept_index = None
        self.relevant_sents = None
        self.relevant_sent_concepts = None

        ## defaults
        self.min_sent_length = 5
        self.max_sents = 10000
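The if/elif chain above maps a short unit name ('n1' ... 'su4') to a selector over a token sequence. The same dispatch can be written table-driven; the sketch below is only an illustrative refactor (make_unit_selector is a made-up name, the bigram fallback is an assumption, and it reuses the get_ngrams / get_skip_bigrams stand-ins sketched earlier rather than the project's util module).

def make_unit_selector(units='n2'):
    """Illustrative table-driven version of the unit_selector dispatch above."""
    table = {
        'n1': lambda x: get_ngrams(x, n=1),
        'n2': lambda x: get_ngrams(x, n=2),
        'n3': lambda x: get_ngrams(x, n=3),
        'n4': lambda x: get_ngrams(x, n=4),
        'su4': lambda x: get_skip_bigrams(x, k=4) + get_ngrams(x, n=1),
    }
    # assumed fallback: bigrams, the most common setting in these examples
    return table.get(units, table['n2'])

# make_unit_selector('n2')(['interest', 'rate', 'cut'])
# -> [('interest', 'rate'), ('rate', 'cut')]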
Example #6
File: lda.py Project: DrDub/icsisumm
def prep_docs(path, out_path):
    files = os.popen('ls %s*.sent' %path).read().splitlines()

    ## on the first pass, create a vocab mapping
    vocab = set()
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            vocab.update(concepts)

    fh = open(out_path+'vocab', 'w')
    vocab = zip(vocab, range(len(vocab)))
    for concept, count in vocab:
        fh.write('%s %d\n' %(concept, count))
    fh.close()
    vocab = dict(vocab)

    ## on the second pass, output one doc per line
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            for concept in concepts:
                doc[concept] += 1

        ## doc output
        output = '%d %s' %(len(doc), ' '.join(['%s:%d' %(vocab[t],c) for t,c in doc.items()]))
        print output
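prep_docs writes a 'vocab' file with one 'term id' pair per line and then prints one document per line in the sparse '<num_terms> id:count id:count ...' format that LDA tools commonly consume. A tiny worked example of that output line, with plain dicts in place of prob_util.Counter and made-up terms, is shown below.

# toy stemmed-unigram counts for one document
vocab = {'economi': 0, 'rate': 1, 'fed': 2}
doc = {'rate': 3, 'fed': 1}

line = '%d %s' % (len(doc),
                  ' '.join('%s:%d' % (vocab[t], c) for t, c in sorted(doc.items())))
print(line)   # "2 2:1 1:3"  (two distinct terms: fed -> id 2, rate -> id 1)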
Example #7
    def __init__(self, summary_problem, units='n2', df=None):
        
        self.unit_name = units
        self.problem = summary_problem
        self.df = df

        use_bounds = False
        if   units == 'n1': self.unit_selector = lambda x: util.get_ngrams(x, n=1, bounds=use_bounds)
        elif units == 'n2': self.unit_selector = lambda x: util.get_ngrams(x, n=2, bounds=use_bounds)
        elif units == 'n3': self.unit_selector = lambda x: util.get_ngrams(x, n=3, bounds=use_bounds)
        elif units == 'n12': self.unit_selector = lambda x: util.get_ngrams(x, n=1) + util.get_ngrams(x, n=2)
        elif units == 'n23': self.unit_selector = lambda x: util.get_ngrams(x, n=2) + util.get_ngrams(x, n=3)
        elif units == 's2' : self.unit_selector = lambda x: get_skip_bigrams(x, k=4) + util.get_ngrams(x, n=1)
        else: self.unit_selector = util.get_ngrams  # default options

        ## variables to set later
        self.concept_sets = None
        self.concept_weight_sets = None
        self.concept_index_sets = None
        self.relevant_sent_sets = None
        self.relevant_sent_concepts = None

        ## defaults
        self.min_sent_length = 5
Example #8
    with open(prob, 'r') as fin, open('%s/%s.bigram.pos' % (output_dir, doc_id), 'w') as fout:
      while True:
        # line of original sentence
        line = fin.readline()
        if line == '':
          break
        line = line.strip()
        # line of toks
        fin.readline()
        # line of stems
        stems = fin.readline().strip().split('\t')
        # line of pos
        fin.readline()
        # line of deps
        fin.readline()
        # line of labels
        fin.readline()

        units = util.get_ngrams(stems, 2)
        assert len(units) == (len(stems) - 1)
        contained_bigrams = [x for x in enumerate(units) if x[1] in bigrams]
        for pos, bigram in contained_bigrams:
          bigram_pos[bigram].append((sent_id, pos, pos+1))

        # empty line
        fin.readline()
        sent_id += 1

      for bigram, position in bigram_pos.items():
        fout.write('%s\t%s\n' % (' '.join(bigram), ' '.join(['_'.join(str(x) for x in pos) for pos in position])))
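This fragment (its enclosing function and the bigrams, bigram_pos, sent_id and output_dir/doc_id variables are not shown on this page) records, for each tracked bigram, the (sentence, start, end) token positions at which it occurs; the assert relies on get_ngrams over a token list yielding exactly one bigram per adjacent pair. The toy run below mirrors that bookkeeping with hand-made stems, reusing the get_ngrams stand-in sketched under Example #1.

import collections

bigrams = {('interest', 'rate'), ('rate', 'cut')}   # bigrams whose positions we track
bigram_pos = collections.defaultdict(list)

stems = ['the', 'interest', 'rate', 'cut', 'surprised', 'markets']
sent_id = 0

units = get_ngrams(stems, 2)                 # adjacent pairs, len == len(stems) - 1
for pos, bigram in enumerate(units):
    if bigram in bigrams:
        bigram_pos[bigram].append((sent_id, pos, pos + 1))

print(dict(bigram_pos))
# {('interest', 'rate'): [(0, 1, 2)], ('rate', 'cut'): [(0, 2, 3)]}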
Example #9
def get_features(sent, tag):

    tokens = sent.split()
    tags = [t.split('/')[-1] for t in tag.split()]
    #structs = [p.split()[0] for p in parse.split('(') if len(p.strip())>0]

    feats = {}

    ## number of tokens
    length = len(tokens)
    feats['len'] = length / 30.0

    ## connector words
    connectors = set(['however', 'because', 'and', 'so', 'also', 'nonetheless',
                      'still', 'but'])
    feats['connect'] = tokens[0].lower() in connectors

    ## number of capitalized words (assume 1st word is always capitalized)
    num_cap = len([1 for token in tokens[1:] if token[0].isupper()])
    feats['cap'] = 1.0 * num_cap / max((length-1), 1)

    ## pronouns (PRP$ are possessive; WP are who, what, which, when)
    num_pron = len([1 for tag in tags if tag in ['PRP$']])
    feats['prn'] = 1.0 * num_pron / length

    ## definite articles
    num_da = len([1 for token in tokens if token.lower() in ['the', 'that', 'these', 'those', 'this']])
    feats['da'] = 1.0 * num_da / length

    ## the [A-Z] construction
    feats['cap_cons'] = len(re.findall('[t|T]he [A-Z]', sent))

    ## first word
    #feats['first=%s' %tokens[0].lower()] = 1
    #feats['first_pos=%s' %tags[0]] = 1

    ## token ngrams
    ngrams = util.get_ngrams(tokens, 1, False) + util.get_ngrams(tokens, 2, True) 
    for ngram in ngrams:
        feats['tok=%s' %'_'.join(ngram).lower()] = 1

    ## tag ngrams
    ngrams = util.get_ngrams(tags, 2, True) #+ util.get_ngrams(tags, 4, True)
    for ngram in ngrams:
        feats['pos=%s' %'_'.join(ngram)] = 1

    ## parser ngrams
    #ngrams = util.get_ngrams(structs, 4, True)
    #for ngram in ngrams:
    #    feats['struct=%s' %'_'.join(ngram)] = 1

    ## quotes
    feats['quotes'] = int('"' in tokens)
    #num_tokens_in_quotes = 0
    #num_quotes = 0
    #in_quotes = False
    #for token in tokens:
    #    if token == '"':
    #        in_quotes = not in_quotes
    #        num_quotes += 1
    #    else:
    #        if in_quotes: num_tokens_in_quotes += 1
    #feats['in_quotes'] = 1.0 * num_tokens_in_quotes / (length - num_quotes)
    
    return feats
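get_features expects a whitespace-tokenized sentence string and a parallel token/TAG string (only the part after the last '/' of each tag token is kept). A minimal usage sketch with a hand-tagged sentence follows; the tags are made up, and running it assumes the surrounding module's util import (the get_ngrams stand-in above has a compatible signature).

sent = 'However , the Fed said " no " .'
tag = 'However/RB ,/, the/DT Fed/NNP said/VBD "/`` no/DT "/\'\' ./.'

feats = get_features(sent, tag)
print(feats['len'])       # 9 tokens / 30.0 = 0.3
print(feats['connect'])   # True: first token 'however' is a connector
print(feats['quotes'])    # 1: a bare " token occurs in the sentence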
Example #10
def build_alternative_program(problem, concept_weight, length=100, sentences = None, longuest_candidate_only=False):
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in sentences:
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping([s.compression_node for s in sentences])
    #print "generating acronyms"
    acronymMapping = compression.generateAcronymMapping(problem.get_new_sentences())
    print problem.id, acronymMapping
    
    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(node.getCandidates(mapping=nounPhraseMapping))
        if longuest_candidate_only:
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax != None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate), sentence.order, sentence.source, sentence.date)
            if new_sentence.length <= 5: continue # skip short guys
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)
    log_file = open("%s.log" % problem.id, "w")
    for sentence in compressed_sentences:
        log_file.write("%d %s\n" %( group_id, str(sentence)))
    log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())
    
    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0: continue
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups: groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index: acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs] for cs in sentence_concepts]
    concept_weights = filtered_concepts
    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if not concept in curr_concept_sents: curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()

    program.objective["score"] = ' + '.join(['%f c%d' %(concept_weight[concept], concept_index[concept]) for concept in concept_index])
    
    s1 = ' + '.join(['%d s%d' %(relevant_sentences[sent_index].length, sent_index) for sent_index in range(len(relevant_sentences))])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join(['%d a%d' %(acronym_length[acronym], acronym_id[acronym]) for acronym in acronym_id])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' %length
    program.constraints["length"] = s1 + s_acronyms + s2
    
    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join([ 's%d' %sent_index for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' %index                    
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        s1 = ' + '.join([ 's%d' %sent_index for sent_index in curr_concept_sents[index]])
        s2 = '- %d c%d <= 0' %(len(curr_concept_sents[index]), index)
        program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, index in acronym_index.items():
        s1 = ' + '.join([ 's%d' %sent_index for sent_index in index])
        s2 = ' - a%d >= 0' %acronym_id[acronym]                    
        program.constraints["acronym_presence_%d" % acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join([ 's%d' %sent_index for sent_index in index])
        s2 = '- %d a%d <= 0' %(len(index), acronym_id[acronym])
        program.constraints["acronym_absence_%d" % acronym_id[acronym]] = s1 + s2

    # add sentence compression groups
    for group in groups:
        program.constraints["group_%d" % group] = " + ".join(["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, concept_index in concept_index.items():
        program.binary["c%d" % concept_index] = 1
    for acronym, id in acronym_id.items():
        program.binary["a%d" % id] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n" % (len(relevant_sentences), len(sentences)))
    program.acronyms = acronymMapping
    return program
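The program is assembled as plain text over binary variables: s<i> for candidate sentences, c<j> for concepts (bigrams) and a<k> for acronyms, with a length budget, presence/absence constraints tying each concept to the sentences that contain it, the same pattern for acronyms, and one 'at most one candidate per original sentence' constraint per compression group. For a toy instance (two candidates of lengths 12 and 9, both containing concept c0, in the same group, no acronyms) the generated constraint strings look like this; the numbers are made up to show the format.

# toy instance: sentence lengths, one shared concept, one compression group
lengths = [12, 9]
concept_sents = {0: [0, 1]}          # concept index -> indices of sentences containing it
budget = 100

constraints = {}
constraints['length'] = ' + '.join('%d s%d' % (l, i) for i, l in enumerate(lengths)) \
                        + ' <= %s\n' % budget
for c, sent_ids in concept_sents.items():
    s1 = ' + '.join('s%d' % i for i in sent_ids)
    constraints['presence_%d' % c] = s1 + ' - c%d >= 0' % c
    constraints['absence_%d' % c] = s1 + '- %d c%d <= 0' % (len(sent_ids), c)
constraints['group_0'] = ' + '.join('s%d' % i for i in [0, 1]) + ' <= 1'

for name, body in sorted(constraints.items()):
    print('%s: %s' % (name, body.strip()))
# absence_0: s0 + s1- 2 c0 <= 0
# group_0: s0 + s1 <= 1
# length: 12 s0 + 9 s1 <= 100
# presence_0: s0 + s1 - c0 >= 0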
Example #11
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    """

    query_words = set(
        util.porter_stem_sent(
            util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    ## different processing for set A and set B
    if '-B' in id:
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else:
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(
            sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0:
                    all_concepts[concept].add('first' + sent.doc)
                else:
                    all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.ignore: skip = True
        if skip: continue

        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    for sent in compressed_sents:
        sent.concepts = set([])
        if sent.unresolved: continue
        if sent.length < 10: continue
        if re.match('^["(].*[")]$', sent.orig): continue
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(compressed_sents, final_concepts, path + id)
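Concept scoring in this variant is a document-frequency count with a first-sentence bonus: first-sentence mentions are stored under a separate 'first'+doc key, count is the number of distinct keys plus first_weight for each 'first' key, and bigrams below count_thresh are dropped. A small worked example of that arithmetic, with made-up document names and the set-A settings, is below.

first_weight, count_thresh = 1, 3        # the set-A settings above

# keys collected for one bigram: first sentences of d1 and d2, a later sentence of d3
docs = {'firstd1', 'firstd2', 'd3'}

count = len(docs)                                    # 3 distinct keys
firsts = len([1 for d in docs if 'first' in d])      # 2 first-sentence keys
count = count + (first_weight * firsts)              # 3 + 1*2 = 5
print('count = %d, kept = %s' % (count, count >= count_thresh))   # count = 5, kept = True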
Example #13
    def __init__(self, concepts, problem):
        # bigrams are tuples
        self.bigrams = concepts
        self.problem = problem
        self.unit_selector = lambda x: util.get_ngrams(x, n=2)
Example #17
def make_concepts_exp(id, path, sents, query):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]            
    concept_vals = prob_util.normalize(concept_vals)
    
    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)
        if iter >= 1: break
        
        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)
        
        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]            
        concept_vals = prob_util.normalize(concept_vals)
    
    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept
        
    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)        
        
    return create_ilp_output(sents, final_concepts, path+id)
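make_concepts_exp alternates between scoring sentences by the concepts they contain and scoring concepts by the sentences containing them, normalizing each side in turn (although, as written, the loop breaks after printing the first entropy line, so only the initial query-overlap weighting is used). The sketch below runs one such alternation on a toy two-sentence problem with plain dicts; the normalize helper stands in for prob_util.normalize.

def normalize(d):
    """Stand-in for prob_util.normalize: scale values to sum to 1."""
    total = float(sum(d.values())) or 1.0
    return {k: v / total for k, v in d.items()}

sent_concepts = {'s1': {'interest_rate', 'rate_cut'},
                 's2': {'rate_cut'}}
sent_vals = normalize({'s1': 2.0, 's2': 1.0})        # e.g. query-overlap counts

# score concepts by the sentences that contain them
concept_vals = {}
for sent, concepts in sent_concepts.items():
    for c in concepts:
        concept_vals[c] = concept_vals.get(c, 0.0) + sent_vals[sent]
concept_vals = normalize(concept_vals)               # rate_cut 0.6, interest_rate 0.4

# rescore sentences by their concepts
sent_vals = {}
for sent, concepts in sent_concepts.items():
    for c in concepts:
        sent_vals[sent] = sent_vals.get(sent, 0.0) + concept_vals[c]
sent_vals = normalize(sent_vals)                     # s1 0.625, s2 0.375

print(concept_vals)
print(sent_vals)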