def _find_top_k_similar_entities(self, query, h_doc_e_texts):
    """
    find the top k most similar entities in h_doc_e_texts, judged per entity field
    just use the lm score
    :param query: query string
    :param h_doc_e_texts: entity id -> {entity field -> text}
    :return: entity field -> top k entity ids, ranked by lm score
    """
    q_lm = text2lm(query)
    h_field_top_k_entities = {}
    for e_field in self.l_entity_fields:
        l_e_score = []
        for e, h_field_texts in h_doc_e_texts.items():
            e_text = h_field_texts.get(e_field, "")
            if not e_text:
                continue
            h_e_lm = text2lm(e_text.lower())
            term_stat = TermStat()
            term_stat.set_from_raw(q_lm, h_e_lm, {})
            lm_score = term_stat.lm()
            l_e_score.append((e, lm_score))
        l_e_score.sort(key=lambda item: -item[1])
        h_field_top_k_entities[e_field] = [
            item[0] for item in l_e_score[:self.top_k]
        ]
    return h_field_top_k_entities
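
# Neither text2lm() nor TermStat is defined in this section.  The standalone
# sketch below is illustrative only: it assumes text2lm() returns a plain
# {term: count} dict and that TermStat.lm() is a smoothed query-likelihood
# score.  The toy_* names are hypothetical stand-ins, not this repo's API.
from collections import Counter
import math


def toy_text2lm(text):
    # assumed behavior of text2lm: lowercase, whitespace tokenize, count terms
    return dict(Counter(text.lower().split()))


def toy_lm_score(q_lm, e_lm, alpha=0.1):
    # add-alpha smoothed log p(query | entity text)
    e_len = float(sum(e_lm.values()))
    vocab = max(len(e_lm), 1)
    score = 0.0
    for t, q_tf in q_lm.items():
        p = (e_lm.get(t, 0) + alpha) / (e_len + alpha * vocab)
        score += q_tf * math.log(p)
    return score


def toy_top_k_by_lm(query, h_doc_e_texts, e_field='desp', top_k=3):
    # mirrors the per-field loop above: score every entity's field text
    # against the query and keep the top_k entity ids
    q_lm = toy_text2lm(query)
    l_e_score = []
    for e, h_field_texts in h_doc_e_texts.items():
        e_text = h_field_texts.get(e_field, "")
        if not e_text:
            continue
        l_e_score.append((e, toy_lm_score(q_lm, toy_text2lm(e_text))))
    l_e_score.sort(key=lambda item: -item[1])
    return [e for e, _ in l_e_score[:top_k]]
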
def _extract_q_doc_e_topk_merged_text_sim(self, query, h_field_top_k_entities, h_doc_e_texts):
    """
    form an expanded document from the top k entities of each entity field
    calc textual similarities between q and the expanded documents
    :param query: query string
    :param h_field_top_k_entities: top k most similar entities in each entity field
    :param h_doc_e_texts: entities' texts
    :return: h_feature
    """
    h_feature = {}
    l_field_expanded_texts = []
    for e_field, l_topk_e in h_field_top_k_entities.items():
        text = ""
        for e in l_topk_e:
            text += h_doc_e_texts.get(e, {}).get(e_field, "") + ' '
        l_field_expanded_texts.append((e_field, text))

    q_lm = text2lm(query)
    total_df = self.h_corpus_stat[body_field]['total_df']
    avg_doc_len = 100.0  # hard-coded average length for the expanded pseudo-documents
    h_doc_df = self.h_field_h_df[body_field]
    for e_field, text in l_field_expanded_texts:
        exp_lm = text2lm(text, clean=True)
        term_stat = TermStat()
        term_stat.set_from_raw(q_lm, exp_lm, h_doc_df, total_df, avg_doc_len)
        l_sim_score = term_stat.mul_scores()
        for sim, score in l_sim_score:
            if sim in self.l_model:
                h_feature[self.feature_name_pre + 'Exp'
                          + e_field.title() + sim.title()] = score
    return h_feature
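
# Hedged illustration of the expansion step above: the expanded "document" for
# an entity field is simply the concatenation of that field's text over the
# top k entities, and each surviving similarity gets a feature key of the form
# feature_name_pre + 'Exp' + <e_field>.title() + <sim>.title().  The helper
# names and the 'QDocE' prefix below are made-up placeholders, not this
# class's real prefix or API.
def toy_expanded_text(l_topk_e, h_doc_e_texts, e_field):
    # concatenate one entity field's text across the selected entities
    return ' '.join(h_doc_e_texts.get(e, {}).get(e_field, "") for e in l_topk_e)


def toy_exp_feature_name(e_field, sim, feature_name_pre='QDocE'):
    # e.g. toy_exp_feature_name('desp', 'lm') -> 'QDocEExpDespLm'
    return feature_name_pre + 'Exp' + e_field.title() + sim.title()
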
def _extract_per_e(self, h_q_info, e):
    """
    calc textual similarities between the query and one entity's texts
    :param h_q_info: query info dict, uses h_q_info['query']
    :param e: entity id
    :return: h_feature
    """
    h_feature = {}
    total_df = self.h_corpus_stat['bodyText']['total_df']
    avg_doc_len = self.h_corpus_stat['bodyText']['average_len']
    h_doc_df = self.h_field_h_df['bodyText']
    q_lm = text2lm(h_q_info['query'])
    for e_field in self.l_entity_fields:
        cnt = 0
        e_text = ""
        if e in self.h_entity_texts:
            if e_field in self.h_entity_texts[e]:
                e_text = self.h_entity_texts[e][e_field]
                if isinstance(e_text, list):
                    e_text = ' '.join(e_text)
        e_lm = text2lm(e_text, clean=True)
        cnt += 1
        term_stat = TermStat()
        term_stat.set_from_raw(q_lm, e_lm, h_doc_df, total_df, avg_doc_len)
        l_sim_score = term_stat.mul_scores()
        for sim, score in l_sim_score:
            if sim in self.s_model:
                h_feature[self.feature_name_pre + e_field.title() + sim.title()] = score
    return h_feature

def extract(self, qid, docno, h_q_info, h_doc_info):
    """
    calc textual similarities between each query entity's texts and the
    document's text fields, averaged over the query entities
    :return: h_feature
    """
    h_feature = {}
    l_q_e = [
        ana['entities'][0]['id'] for ana in h_q_info[self.tagger]['query']
    ]
    for field in self.l_text_fields:
        total_df = self.h_corpus_stat[field]['total_df']
        avg_doc_len = self.h_corpus_stat[field]['average_len']
        h_doc_df = self.h_field_h_df[field]
        h_doc_tf = {}
        if field in h_doc_info:
            h_doc_tf = text2lm(h_doc_info[field].lower())
        for e_field in self.l_entity_fields:
            h_sim_score = {}
            cnt = 0
            for e in l_q_e:
                if e not in self.h_entity_texts:
                    continue
                if e_field not in self.h_entity_texts[e]:
                    continue
                e_text = self.h_entity_texts[e][e_field]
                if isinstance(e_text, list):
                    e_text = ' '.join(e_text)
                h_tf = text2lm(e_text, clean=True)
                cnt += 1
                term_stat = TermStat()
                term_stat.set_from_raw(h_tf, h_doc_tf, h_doc_df, total_df,
                                       avg_doc_len)
                # if field == 'title':
                #     title_ts = term_stat
                for sim, score in term_stat.mul_scores():
                    if sim in self.s_model:
                        if sim not in h_sim_score:
                            h_sim_score[sim] = score
                        else:
                            h_sim_score[sim] += score
            if cnt:
                # mean-pool the accumulated scores over the query entities
                for sim in h_sim_score:
                    h_sim_score[sim] /= cnt
            for sim, score in h_sim_score.items():
                h_feature[self.feature_name_pre + e_field.title()
                          + field.title() + sim.title()] = score
    return h_feature
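
# A minimal, self-contained sketch of the pooling used in extract() above:
# per-query-entity similarity scores are summed per similarity name and then
# divided by the number of entities that contributed scores, i.e. mean pooling.
# The helper name toy_mean_pool is hypothetical, shown only to isolate the
# cnt-normalization pattern.
def toy_mean_pool(l_h_scores):
    # l_h_scores: one {sim_name: score} dict per query entity
    h_sum = {}
    cnt = 0
    for h_scores in l_h_scores:
        if not h_scores:
            continue
        cnt += 1
        for sim, score in h_scores.items():
            h_sum[sim] = h_sum.get(sim, 0.0) + score
    if not cnt:
        return {}
    return dict((sim, score / cnt) for sim, score in h_sum.items())
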
def extract_for_text(self, query, docno, h_q_info, h_doc_info):
    """
    calc textual similarities between the query and each document text field
    :return: h_feature
    """
    h_feature = {}
    # logging.info('extracting IR fusion for q [%s], doc [%s]', query, docno)
    # logging.info('q_info %s', json.dumps(h_q_info))
    # logging.info('doc_info %s', json.dumps(h_doc_info))
    h_tf = text2lm(query.lower())
    # title_ts = None
    for field in self.l_text_fields:
        total_df = self.h_corpus_stat[field]['total_df']
        avg_doc_len = self.h_corpus_stat[field]['average_len']
        h_doc_df = self.h_field_h_df[field]
        h_doc_tf = {}
        if field in h_doc_info:
            h_doc_tf = text2lm(h_doc_info[field].lower(), clean=True)
        term_stat = TermStat()
        term_stat.set_from_raw(h_tf, h_doc_tf, h_doc_df, total_df, avg_doc_len)
        # if field == 'title':
        #     title_ts = term_stat
        l_sim_score = term_stat.mul_scores()
        for sim, score in l_sim_score:
            if sim in self.s_model:
                feature_name = self.feature_name_pre + sim.title() + field.title()
                h_feature[feature_name] = score

    # for feature, score in h_feature.items():
    #     if score != h_old_feature[feature]:
    #         logging.warn('ltr feature value different')
    #         logging.warn('old feature: %s', json.dumps(h_old_feature))
    #         logging.warn('new feature: %s', json.dumps(h_feature))
    #
    #         logging.warn('old ts: %s', title_old_ts.pretty_print())
    #         logging.warn('new ts: %s', title_ts.pretty_print())
    #         logging.warn('query: %s, h_tf: %s', query, json.dumps(h_tf))
    #         break
    return h_feature
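
# TermStat itself is not defined in this file.  The class below is a hedged,
# minimal stand-in that only illustrates what set_from_raw() / mul_scores()
# plausibly consume and produce: raw query and document term counts, per-term
# document frequencies, the corpus document count and average document length,
# and a list of (model name, score) pairs.  The real class presumably returns
# several retrieval models; only a textbook BM25 is sketched here.
import math


class ToyTermStat(object):
    def set_from_raw(self, q_tf, doc_tf, h_doc_df, total_df, avg_doc_len):
        self.q_tf, self.doc_tf = q_tf, doc_tf
        self.h_doc_df, self.total_df = h_doc_df, float(total_df)
        self.avg_doc_len = float(avg_doc_len)
        self.doc_len = float(sum(doc_tf.values()))

    def bm25(self, k1=1.2, b=0.75):
        score = 0.0
        for t in self.q_tf:
            tf = self.doc_tf.get(t, 0)
            if not tf:
                continue
            df = self.h_doc_df.get(t, 0)
            idf = math.log((self.total_df - df + 0.5) / (df + 0.5))
            norm = tf + k1 * (1.0 - b + b * self.doc_len / self.avg_doc_len)
            score += idf * tf * (k1 + 1.0) / norm
        return score

    def mul_scores(self):
        return [('bm25', self.bm25())]
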
def _extract_q_doc_e_textual_features(self, query, l_h_doc_e_lm, h_doc_e_texts):
    """
    calc textual similarities between the query and the texts of the
    document's entities, pooled over entities per text field
    :param query: query string
    :param l_h_doc_e_lm: one entity language model ({entity id: tf}) per text field
    :param h_doc_e_texts: entity id -> {entity field -> text}
    :return: h_feature
    """
    if not self.h_entity_texts:
        return {}
    h_feature = {}
    q_lm = text2lm(query)
    for p in xrange(len(self.l_text_fields)):
        field = self.l_text_fields[p]
        if self.l_top_k:
            self.top_k = self.l_top_k[p]
        h_doc_e_lm = l_h_doc_e_lm[p]
        total_df = self.h_corpus_stat[field]['total_df']
        avg_doc_len = self.h_corpus_stat[field]['average_len']
        h_doc_df = self.h_field_h_df[field]
        l_h_scores = []
        l_e_tf = []
        for e, e_tf in h_doc_e_lm.items():
            h_scores = {}
            l_e_tf.append(e_tf)
            h_e_texts = h_doc_e_texts.get(e, {})
            for e_field in self.l_entity_fields:
                text = h_e_texts.get(e_field, "")
                e_lm = text2lm(text, clean=True)
                term_stat = TermStat()
                term_stat.set_from_raw(q_lm, e_lm, h_doc_df, total_df,
                                       avg_doc_len)
                l_sim_score = term_stat.mul_scores()
                for sim, score in l_sim_score:
                    if sim in self.l_model:
                        h_scores[e_field.title() + sim.title()] = score
            l_h_scores.append(h_scores)
        h_pooled_scores = self._merge_entity_sim(l_h_scores, l_e_tf)
        for name, score in h_pooled_scores.items():
            h_feature[self.feature_name_pre + field.title() + name] = score
    logging.debug(json.dumps(h_feature))
    return h_feature
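
# _merge_entity_sim() is not shown in this section; the sketch below is only a
# guess at the kind of pooling it might perform, given that it receives both
# the per-entity score dicts and each entity's in-document frequency: a max
# pool and a frequency-weighted mean pool over the document's entities.  The
# helper name, the pooling strategies, and the assumption that each e_tf is a
# scalar frequency are all hypothetical.
def toy_pool_entity_scores(l_h_scores, l_e_tf):
    h_pooled = {}
    total_tf = float(sum(l_e_tf)) or 1.0
    for h_scores, e_tf in zip(l_h_scores, l_e_tf):
        for name, score in h_scores.items():
            max_key, mean_key = 'Max' + name, 'TfMean' + name
            h_pooled[max_key] = max(h_pooled.get(max_key, float('-inf')), score)
            h_pooled[mean_key] = h_pooled.get(mean_key, 0.0) + score * e_tf / total_tf
    return h_pooled
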
def _calc_esearch_per_pair(self, h_q_info, h_doc_info):
    # h_res = {}
    if 'tagme' not in h_doc_info:
        return []
    l_e_name = [(ana[0], ana[-1])
                for ana in h_doc_info['tagme'][body_field]]
    query = h_q_info['query']
    q_lm = text2lm(query, clean=True)
    total_df, avg_len = self.h_corpus_stat[body_field]['total_df'], 100.0
    l_e_score = []
    for e, name in l_e_name:
        desp = self.h_entity_texts[e]['desp']
        e_lm = text2lm(desp, clean=True)
        term_stat = TermStat()
        term_stat.set_from_raw(q_lm, e_lm, self.h_field_h_df[body_field],
                               total_df, avg_len)
        lm_dir = term_stat.lm_dir()
        l_e_score.append((e, name, lm_dir))
    l_e_score.sort(key=lambda item: -item[-1])
    # h_res['e_lm_dir'] = l_e_score[:10]
    return l_e_score[:10]
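
# lm_dir() is also defined elsewhere; the standard Dirichlet-smoothed query
# likelihood it presumably implements is sketched below.  Approximating the
# collection probability p(t|C) from document frequencies is an assumption
# made for this sketch; the real TermStat may estimate it differently, and the
# helper name toy_lm_dir is hypothetical.
import math


def toy_lm_dir(q_tf, doc_tf, h_doc_df, total_df, mu=2500.0):
    # score(q, d) = sum_t qtf(t) * log((tf(t, d) + mu * p(t|C)) / (|d| + mu))
    doc_len = float(sum(doc_tf.values()))
    score = 0.0
    for t, qtf in q_tf.items():
        p_c = (h_doc_df.get(t, 0) + 1.0) / (float(total_df) + 1.0)  # crude p(t|C)
        p = (doc_tf.get(t, 0) + mu * p_c) / (doc_len + mu)
        score += qtf * math.log(p)
    return score
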