def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0
    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set([])
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0: max_order = 0
        skip = False
        if sent.length <= 5: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0:
            skip = True
            max_order = 0

        if skip:
            max_order += 1
            continue

        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path+id)
def prep_docs(path, out_path):
    files = os.popen('ls %s*.sent' %path).read().splitlines()

    ## on the first pass, create a vocab mapping
    vocab = set()
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()
        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            vocab.update(concepts)

    fh = open(out_path+'vocab', 'w')
    vocab = zip(vocab, range(len(vocab)))
    for concept, count in vocab:
        fh.write('%s %d\n' %(concept, count))
    fh.close()
    vocab = dict(vocab)

    ## on the second pass, output one doc per line
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()
        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            for concept in concepts:
                doc[concept] += 1

        ## doc output
        output = '%d %s' %(len(doc), ' '.join(['%s:%d' %(vocab[t],c) for t,c in doc.items()]))
        print output
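## The two helpers below are not part of the original code: they are a minimal
## sketch of how the files written by prep_docs could be read back, based only
## on the formats used above (vocab lines are "<concept> <id>", doc lines are
## "<num_concepts> <id>:<count> ..."). The names load_vocab and parse_doc_line
## are hypothetical.
def load_vocab(vocab_file):
    ## map each integer id back to its concept string
    id2concept = {}
    for line in open(vocab_file):
        concept, concept_id = line.split()
        id2concept[int(concept_id)] = concept
    return id2concept

def parse_doc_line(line, id2concept):
    ## turn "<num_concepts> <id>:<count> ..." back into a {concept: count} dict
    fields = line.split()
    counts = {}
    for pair in fields[1:]:
        concept_id, count = pair.split(':')
        counts[id2concept[int(concept_id)]] = int(count)
    return counts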
def __init__(self, id, order, orig, doc, tok=None, parse=None, par=None, unresolved=False):
    self.id = id
    self.order = order
    self.orig = orig
    self.tok = tok
    self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
    self.doc = doc
    self.parse = parse
    self.new_par = (par == '1')
    self.length = len(self.orig.split())
    self.depends = set()
    self.groups = []
    self.skip = False
    self.skip_concepts = False
    self.unresolved = unresolved
    self.atleast = ""
def make_concepts_gold(id, path, sents, gold_sents):

    ## get gold concepts
    all_concepts = collections.defaultdict(set)
    for sent in gold_sents:
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        for concept in concepts:
            all_concepts[concept].add(sent.doc)

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    ## get sentence concepts
    seen_sents = set()
    for sent_index in range(len(sents)):
        sent = sents[sent_index]
        sent.concepts = set([])

        ## skip some sents
        skip = False
        #if sent.order >= 3: skip = True
        if not sent.new_par: skip = True
        if sent.length < 20: skip = True
        if sent.orig in seen_sents: skip = True
        if sent.length <= 5: skip = True
        if skip: continue

        seen_sents.add(sent.orig)
        s = util.porter_stem_sent(util.tokenize(fix_text(sent.orig)))
        concepts = set(util.get_ngrams(s, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
def __init__(self, bytes, id, order, orig, doc, tok=None, par=None, unresolved=False, lang='fr'):
    self.id = id
    self.order = order
    self.orig = orig
    self.tok = tok
    if lang == 'en':
        self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
    elif lang == 'fr':
        self.tok2 = " ".join([stmr.stem(w) for w in nltk.tokenize.word_tokenize(self.orig.decode('utf8'))])
    else:
        print 'Unsupported language...'
        sys.exit(0)
    #print "TOK2", len(self.tok2)
    self.doc = doc
    self.new_par = (par == '1')
    if bytes > -1:
        self.length = len(self.orig)  #for bytes
    else:
        self.length = len(self.orig.split())
    self.unresolved = unresolved
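## Note: `stmr` and `nltk` are used by the multilingual __init__ above but are
## not defined in this snippet; they are assumed to be set up at module level.
## A plausible setup (an assumption, not the original code) would be:
##
##   import nltk
##   from nltk.stem.snowball import FrenchStemmer
##   stmr = FrenchStemmer()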
def make_concepts_exp(id, path, sents, query):
    """
    score sentences by query overlap and propagate the scores to their
    bigram concepts
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)
        if iter >= 1: break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)

        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path+id)
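## The while-loop in make_concepts_exp above performs alternating score
## propagation between sentences and concepts: sent_vals starts from
## query-word overlap, concept_vals is the normalized sum of sentence values
## over the sentences containing each concept, and sentences can then be
## rescored from concept values. With the current `if iter >= 1: break`, the
## loop exits after the first entropy report, so the re-estimation steps below
## the break are never reached.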
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    weight concepts from the original sentences, then assign them to
    compressed sentence candidates
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)

    ## different processing for set A and set B
    if '-B' in id:
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else:
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0:
                    all_concepts[concept].add('first' + sent.doc)
                else:
                    all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.ignore: skip = True
        if skip: continue

        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    for sent in compressed_sents:
        sent.concepts = set([])
        if sent.unresolved: continue
        if sent.length < 10: continue
        ## skip sentences that are entirely quoted or parenthesized
        if re.match('^["(].*[")]$', sent.orig): continue
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(compressed_sents, final_concepts, path+id)