Example #1
def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0
    for sent in sents:
        
        ## store this sentence's concepts
        sent.concepts = set([])
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0: max_order = 0
        skip = False
        if sent.length <= 5: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0: 
            skip = True
            max_order = 0
        
        if skip: 
            max_order += 1
            continue
        
        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
        
    return create_ilp_output(sents, final_concepts, path+id)
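All of these make_concepts_* variants share one core step: extract stemmed word-bigram "concepts" from each sentence and weight each concept by how many documents it appears in. Below is a minimal, self-contained sketch of that document-frequency weighting in plain Python; the tokenizer, stopword list, and function names are simplified stand-ins for illustration, not the project's actual util API.

import collections
import re

STOPWORDS = {'the', 'a', 'an', 'of', 'in', 'on', 'and', 'to', 'is', 'was'}  # toy stopword list

def bigram_concepts(text):
    # lowercased word bigrams, underscore-joined (the code above splits concepts on '_')
    words = re.findall(r"[a-z0-9]+", text.lower())
    return set('_'.join(pair) for pair in zip(words, words[1:]))

def weight_concepts(docs):
    # weight each concept by the number of documents it occurs in
    concept_docs = collections.defaultdict(set)
    for doc_id, sentences in docs.items():
        for sent in sentences:
            for concept in bigram_concepts(sent):
                if all(w in STOPWORDS for w in concept.split('_')):
                    continue  # drop concepts made only of stopwords
                concept_docs[concept].add(doc_id)
    return dict((c, len(ds)) for c, ds in concept_docs.items())

docs = {
    'd1': ['The court ruled on the appeal.', 'The appeal was denied.'],
    'd2': ['Another court ruled on a similar appeal.'],
}
print(weight_concepts(docs))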
Example #2
def prep_docs(path, out_path):
    files = os.popen('ls %s*.sent' %path).read().splitlines()

    ## on the first pass, create a vocab mapping
    vocab = set()
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            vocab.update(concepts)

    fh = open(out_path+'vocab', 'w')
    vocab = zip(vocab, range(len(vocab)))
    for concept, index in vocab:
        fh.write('%s %d\n' %(concept, index))
    fh.close()
    vocab = dict(vocab)

    ## on the second pass, output one doc per line
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            for concept in concepts:
                doc[concept] += 1

        ## doc output
        output = '%d %s' %(len(doc), ' '.join(['%s:%d' %(vocab[t],c) for t,c in doc.items()]))
        print output
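prep_docs makes two passes: the first builds a term-to-id vocab over all documents, the second emits each document as one sparse bag-of-words line, "<num_unique_terms> <term_id>:<count> ...". A self-contained sketch of the same idea follows, with a simplified tokenizer standing in for util.porter_stem_sent/util.tokenize; for simplicity it counts raw token occurrences rather than the per-sentence concept counts used above.

import collections
import re

def simple_tokens(text):
    return re.findall(r"[a-z0-9]+", text.lower())

def prep_docs_sketch(documents):
    ## pass 1: vocabulary mapping
    vocab = {}
    for doc in documents:
        for tok in simple_tokens(doc):
            vocab.setdefault(tok, len(vocab))
    ## pass 2: one sparse bag-of-words line per document
    lines = []
    for doc in documents:
        counts = collections.Counter(simple_tokens(doc))
        body = ' '.join('%d:%d' % (vocab[t], c) for t, c in sorted(counts.items()))
        lines.append('%d %s' % (len(counts), body))
    return vocab, lines

vocab, lines = prep_docs_sketch(['the cat sat on the mat', 'the dog sat'])
for line in lines:
    print(line)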
Example #3
 def __init__(self, id, order, orig, doc, tok=None, parse=None, par=None, unresolved=False):
     self.id = id
     self.order = order
     self.orig = orig
     self.tok = tok
     self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
     self.doc = doc
     self.parse = parse
     self.new_par = (par == '1')
     self.length = len(self.orig.split())
     self.depends = set()
     self.groups = []
     self.skip = False
     self.skip_concepts = False
     self.unresolved = unresolved
     self.atleast = ""
Example #4
def make_concepts_gold(id, path, sents, gold_sents):

    ## get gold concepts
    all_concepts = collections.defaultdict(set)
    for sent in gold_sents:
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        for concept in concepts:
            all_concepts[concept].add(sent.doc)

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        if util.is_just_stopwords(concept.split("_")):
            continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    ## get sentence concepts
    seen_sents = set()
    for sent_index in range(len(sents)):
        sent = sents[sent_index]
        sent.concepts = set([])

        ## skip some sents
        skip = False
        # if sent.order >= 3: skip = True
        if not sent.new_par:
            skip = True
        if sent.length < 20:
            skip = True

        if sent.orig in seen_sents:
            skip = True
        if sent.length <= 5:
            skip = True
        if skip:
            continue

        seen_sents.add(sent.orig)
        s = util.porter_stem_sent(util.tokenize(fix_text(sent.orig)))
        concepts = set(util.get_ngrams(s, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
Example #5
	def __init__(self, bytes, id, order, orig, doc, tok=None, par=None, unresolved=False, lang='fr'):
		self.id = id
		self.order = order
		self.orig = orig
		self.tok = tok
		if lang=='en':
			self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
		elif lang=='fr':
			self.tok2 = " ".join([stmr.stem(w) for w in nltk.tokenize.word_tokenize(self.orig.decode('utf8'))])
		else:
			print 'Unsupported language...'
			sys.exit(0)
#		print "TOK2", len(self.tok2)
		self.doc = doc
		self.new_par = (par == '1')
		if bytes > -1:
			self.length = len(self.orig) #for bytes
		else:
			self.length = len(self.orig.split())
		self.unresolved = unresolved
Example #6
def make_concepts_gold(id, path, sents, gold_sents):

    ## get gold concepts
    all_concepts = collections.defaultdict(set)
    for sent in gold_sents:
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        for concept in concepts:
            all_concepts[concept].add(sent.doc)

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    ## get sentence concepts
    seen_sents = set()
    for sent_index in range(len(sents)):
        sent = sents[sent_index]
        sent.concepts = set([])

        ## skip some sents
        skip = False
        #if sent.order >= 3: skip = True
        if not sent.new_par: skip = True
        if sent.length < 20: skip = True

        if sent.orig in seen_sents: skip = True
        if sent.length <= 5: skip = True
        if skip: continue

        seen_sents.add(sent.orig)
        s = util.porter_stem_sent(util.tokenize(fix_text(sent.orig)))
        concepts = set(util.get_ngrams(s, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
Example #7
 def __init__(self,
              id,
              order,
              orig,
              doc,
              tok=None,
              parse=None,
              par=None,
              unresolved=False):
     self.id = id
     self.order = order
     self.orig = orig
     self.tok = tok
     self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
     self.doc = doc
     self.parse = parse
     self.new_par = (par == '1')
     self.length = len(self.orig.split())
     self.depends = set()
     self.groups = []
     self.skip = False
     self.skip_concepts = False
     self.unresolved = unresolved
     self.atleast = ""
Example #8
def make_concepts_exp(id, path, sents, query):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]            
    concept_vals = prob_util.normalize(concept_vals)
    
    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)
        if iter >= 1: break
        
        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)
        
        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]            
        concept_vals = prob_util.normalize(concept_vals)
    
    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept
        
    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)        
        
    return create_ilp_output(sents, final_concepts, path+id)
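The loop in make_concepts_exp (cut short here by the "if iter >= 1: break") alternates between the two weight vectors: concept values are the normalized sums of the values of the sentences containing them, and sentence values are then recomputed from their concepts. Below is a self-contained sketch of that alternating propagation with plain dicts in place of prob_util.Counter/normalize; the names, data, and iteration count are illustrative.

def normalize(vals):
    total = float(sum(vals.values()))
    return dict((k, v / total) for k, v in vals.items()) if total > 0 else dict(vals)

def propagate(sent_concepts, sent_vals, iterations=5):
    # alternate sentence -> concept and concept -> sentence weight updates
    sent_vals = normalize(sent_vals)
    concept_vals = {}
    for _ in range(iterations):
        concept_vals = {}
        for sent, concepts in sent_concepts.items():
            for c in concepts:
                concept_vals[c] = concept_vals.get(c, 0.0) + sent_vals.get(sent, 0.0)
        concept_vals = normalize(concept_vals)
        sent_vals = dict((s, sum(concept_vals.get(c, 0.0) for c in concepts))
                         for s, concepts in sent_concepts.items())
        sent_vals = normalize(sent_vals)
    return sent_vals, concept_vals

sent_concepts = {'s1': {'court_ruled', 'ruled_appeal'}, 's2': {'ruled_appeal'}, 's3': {'new_topic'}}
print(propagate(sent_concepts, {'s1': 2.0, 's2': 1.0, 's3': 1.0}))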
Example #9
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    ## different processing for set A and set B
    if '-B' in id: 
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else: 
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:
        
        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0: all_concepts[concept].add('first' + sent.doc)
                else: all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.ignore: skip = True
        if skip: continue
        
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
        
    for sent in compressed_sents:
        sent.concepts = set([])
        if sent.unresolved: continue
        if sent.length < 10: continue
        if re.match('^["(].*[")]$', sent.orig): skip = True
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)
        
    return create_ilp_output(compressed_sents, final_concepts, path+id)
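The concept score in make_concepts_compress2 is the document count plus an extra first_weight for each document where the concept appeared in a first sentence (those documents are tagged with a 'first' prefix in the aggregation loop above); concepts below count_thresh are dropped. A tiny sketch of that scoring rule, using the set-A/-B values hard-coded above and made-up doc ids:

def score_concept(doc_labels, first_weight):
    # doc_labels mirror the aggregation above: 'firstAPW001' marks a first-sentence hit in doc APW001
    count = len(doc_labels)
    firsts = sum(1 for d in doc_labels if d.startswith('first'))
    return count + first_weight * firsts

# set A: first_weight=1, count_thresh=3; set B: first_weight=2, count_thresh=4
print(score_concept({'firstAPW001', 'NYT002', 'NYT003'}, first_weight=2))  # 3 docs + 2*1 first hit -> 5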
Example #10
def make_concepts_exp(id, path, sents, query):
    """
    """

    query_words = set(
        util.porter_stem_sent(
            util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(
            sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >> sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' % (
            iter, se, ce)
        if iter >= 1: break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
Example #11
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    """

    query_words = set(
        util.porter_stem_sent(
            util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    ## different processing for set A and set B
    if '-B' in id:
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else:
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(
            sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0:
                    all_concepts[concept].add('first' + sent.doc)
                else:
                    all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.ignore: skip = True
        if skip: continue

        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    for sent in compressed_sents:
        sent.concepts = set([])
        if sent.unresolved: continue
        if sent.length < 10: continue
        if re.match('^["(].*[")]$', sent.orig): skip = True
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(compressed_sents, final_concepts, path + id)
Example #12
def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """

    query_words = set(
        util.porter_stem_sent(
            util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0
    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set([])
        concepts = set(
            util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(
            sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0: max_order = 0
        skip = False
        if sent.length <= 5: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0:
            skip = True
            max_order = 0

        if skip:
            max_order += 1
            continue

        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)