def query_expand(docs, unit_selector, query):
    """Iteratively re-weight sentences and their units starting from query similarity.

    Sentences are initialized with sim_basic(query), then sentence and unit
    probability distributions are alternately re-estimated from each other
    until entropy stops decreasing or the sentence distribution converges
    (KL distance < 0.0001).

    Args:
        docs: iterable of documents, each exposing a ``sentences`` list;
            sentences expose ``original``, ``stemmed``, and ``sim_basic``.
        unit_selector: callable mapping a stemmed sentence to its units
            (e.g. an n-gram extractor).
        query: the query object passed to ``sent.sim_basic``.

    Returns:
        (unit_values, sent_values): two prob_util.Counter distributions.
    """
    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)

    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try:
            sent_values[sent.original] = sent.sim_basic(query)
        except Exception:
            # best-effort: fall back to a uniform weight when similarity fails
            sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit):
                continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    # BUG FIX: previous entropies were initialized to 0; entropy is always
    # >= 0, so the check `entropy >= previous` fired on the very first
    # iteration and the loop never actually iterated. Start at +inf so the
    # first iteration always runs.
    previous_entropy_sent = float('inf')
    previous_entropy_unit = float('inf')
    unit_values = prob_util.Counter()
    for iteration in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit]  #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        ## check for convergence: stop as soon as entropy stops decreasing
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        if entropy_sent >= previous_entropy_sent or entropy_unit >= previous_entropy_unit:
            break
        previous_entropy_sent = entropy_sent
        previous_entropy_unit = entropy_unit

        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]\n'
                         % (iteration, entropy_sent, entropy_unit, dist))
        if dist < 0.0001:
            sys.stderr.write('----------------------------')
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
def query_expand(docs, unit_selector, query):
    """Iteratively re-weight sentences and units, keeping the last improving state.

    NOTE(review): this redefines ``query_expand`` and shadows the earlier
    version in this module — the earlier definition is dead code; confirm
    which variant is intended and remove the other.

    Like the first variant, but remembers the previous iteration's
    distributions and rolls back to them if entropy stops decreasing, and
    hard-caps the loop at 2 iterations.

    Args:
        docs: iterable of documents exposing ``sentences``.
        unit_selector: callable mapping a stemmed sentence to its units.
        query: the query object passed to ``sent.sim_basic``.

    Returns:
        (unit_values, sent_values): two prob_util.Counter distributions.
    """
    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)

    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try:
            sent_values[sent.original] = sent.sim_basic(query)
        except Exception:
            # best-effort: fall back to a uniform weight when similarity fails
            sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit):
                continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    # BUG FIX: previous entropies were initialized to 0; entropy is always
    # >= 0, so the rollback check `entropy >= previous` fired on iteration 1
    # and the function returned prev_unit_values = {} — an EMPTY unit
    # distribution. Start at +inf so iteration 1 always completes.
    prev_unit_entropy = float('inf')
    prev_sent_entropy = float('inf')
    prev_unit_values = {}
    prev_sent_values = {}
    unit_values = prob_util.Counter()
    for iteration in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit]  #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        ## interpolation with the original sentence weights is disabled;
        ## the previous code computed the blend every iteration but never
        ## assigned it (dead work, removed). To re-enable:
        #sent_prior = 0.1
        #for sent in sent_values:
        #    sent_values[sent] = (sent_prior * original_sent_values[sent]) + \
        #                        ((1 - sent_prior) * sent_values[sent])

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]\n'
                         % (iteration, entropy_sent, entropy_unit, dist))

        # hard cap: only run two alternation rounds
        if iteration == 2:
            break

        # if entropy stopped decreasing on both sides, roll back one step
        if (entropy_unit >= prev_unit_entropy) and (entropy_sent >= prev_sent_entropy):
            unit_values = prev_unit_values
            sent_values = prev_sent_values
            break
        prev_unit_entropy = entropy_unit
        prev_sent_entropy = entropy_sent
        prev_unit_values = unit_values
        prev_sent_values = sent_values

        if dist < 0.0001:
            break

    #prob_util.Counter(unit_values).displaySorted(N=10)
    #prob_util.Counter(sent_values).displaySorted(N=20)
    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
def map_iterative_sents(docs, unit_selector, query):
    """Alternately re-estimate sentence and unit distributions from a uniform prior.

    Sentences with no query overlap are dropped up front; the remaining
    sentences start with equal weight. Unit weights are then derived from
    sentence weights and vice versa, for at most two rounds (or earlier if
    the sentence distribution converges).

    Args:
        docs: iterable of documents exposing ``sentences``.
        unit_selector: callable mapping a stemmed sentence to its units.
        query: optional query; falsy means keep every sentence.

    Returns:
        (unit_values, sent_values): two prob_util.Counter distributions.
    """
    ## collect candidate sentences, dropping those with no query overlap
    candidates = []
    for doc in docs:
        for sentence in doc.sentences:
            ## skip short sentences
            #if sentence.length <= 5: continue
            overlap = sentence.sim_basic(query) if query else 1
            if overlap <= 0:
                continue
            candidates.append(sentence)

    ## uniform sentence prior
    sent_values = prob_util.Counter()
    for sentence in candidates:
        sent_values[sentence.original] = 1
    sent_values = sent_values.makeProbDist()

    ## per-sentence unit counts (stopword-only units excluded)
    sent_units = {}
    for sentence in candidates:
        counts = prob_util.Counter()
        for unit in unit_selector(sentence.stemmed):
            if not text.text_processor.is_just_stopwords(unit):
                counts[unit] += 1
        sent_units[sentence.original] = counts

    ## alternate unit/sentence re-estimation until convergence
    for round_num in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## unit values from sentence values
        unit_values = prob_util.Counter()
        for key in sent_units:
            for unit in sent_units[key]:
                unit_values[unit] += sent_values[key]
        unit_values = unit_values.makeProbDist()

        ## sentence values from unit values
        sent_values = prob_util.Counter()
        for key in sent_units:
            for unit in sent_units[key]:
                sent_values[key] += unit_values[unit]  #/ len(sent_units[key])
        sent_values = sent_values.makeProbDist()

        ## convergence diagnostics (entropies kept for the disabled trace below)
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]' %(round_num, entropy_sent, entropy_unit, dist)

        ## hard cap at two rounds, with an early exit on convergence
        if round_num == 2:
            break
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)