def _entity_passage_features(self, q_info, l_grid, field):
    """Score the query against all of the entity's grid sentences as one passage.

    Concatenates every grid sentence into a single pseudo-passage,
    builds a language model from it, and runs the full retrieval-model
    feature set against the query LM using *field*'s corpus statistics.

    :param q_info: query info dict carrying the raw text under 'query'
    :param l_grid: list of grid dicts, each with a 'sent' string
    :param field: document field whose corpus statistics to use
    :return: h_feature: {'EntityPassage' + score_name: score}
    """
    l_grid_sent = [grid['sent'] for grid in l_grid]
    q_lm = text2lm(q_info['query'])
    grid_lm = text2lm(' '.join(l_grid_sent))
    r_model = RetrievalModel()
    r_model.set_from_raw(
        q_lm, grid_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None)
    )
    # NOTE: a per-sentence variant with mean/max pooling used to live here
    # as commented-out code; it was dead and has been removed.
    h_feature = dict(r_model.scores())
    h_feature = add_feature_prefix(h_feature, 'EntityPassage')
    return h_feature
def _find_top_k_similar_entities(self, query, h_doc_e_texts):
    """
    Pick, per entity field, the top-k entities most similar to the query.

    Similarity is the plain language-model score of the query against
    the entity's lower-cased field text; entities with no text in a
    field are skipped for that field.
    :param query: raw query string
    :param h_doc_e_texts: entity id -> {field -> text}
    :return: field -> list of top-k entity ids (best first)
    """
    q_lm = text2lm(query)
    h_field_top_k_entities = {}
    for e_field in self.l_entity_fields:
        l_scored = []
        for e, h_field_texts in h_doc_e_texts.items():
            e_text = h_field_texts.get(e_field, "")
            if not e_text:
                continue
            term_stat = TermStat()
            term_stat.set_from_raw(q_lm, text2lm(e_text.lower()), {})
            l_scored.append((e, term_stat.lm()))
        l_scored.sort(key=lambda pair: pair[1], reverse=True)
        h_field_top_k_entities[e_field] = [
            e for e, __ in l_scored[:self.top_k]
        ]
    return h_field_top_k_entities
def _extract_q_doc_e_topk_merged_text_sim(self, query, h_field_top_k_entities, h_doc_e_texts):
    """
    Similarity between the query and documents expanded with top-k entity texts.

    For each entity field, the texts of its top-k entities are
    concatenated into one expanded pseudo-document, and every retrieval
    similarity in self.l_model is computed against the query.
    :param query: raw query string
    :param h_field_top_k_entities: field -> top-k most similar entity ids
    :param h_doc_e_texts: entity id -> {field -> text}
    :return: h_feature: feature name -> score
    """
    h_feature = {}
    q_lm = text2lm(query)
    # body-field statistics stand in for the expanded pseudo-document
    total_df = self.h_corpus_stat[body_field]['total_df']
    avg_doc_len = 100.0
    h_doc_df = self.h_field_h_df[body_field]
    for e_field, l_topk_e in h_field_top_k_entities.items():
        text = ""
        for e in l_topk_e:
            text += h_doc_e_texts.get(e, {}).get(e_field, "") + ' '
        term_stat = TermStat()
        term_stat.set_from_raw(q_lm, text2lm(text, clean=True),
                               h_doc_df, total_df, avg_doc_len)
        for sim, score in term_stat.mul_scores():
            if sim not in self.l_model:
                continue
            f_name = self.feature_name_pre + 'Exp' + e_field.title() + sim.title()
            h_feature[f_name] = score
    return h_feature
def _extract_per_e(self, h_q_info, e):
    """Retrieval similarities between the query and one entity's field texts.

    For each entity field, builds an LM from the entity's text (list
    values are joined with spaces) and computes every similarity model
    listed in self.s_model, using bodyText corpus statistics.

    :param h_q_info: query info dict with raw text under 'query'
    :param e: entity id
    :return: h_feature: feature name -> score
    """
    h_feature = {}
    total_df = self.h_corpus_stat['bodyText']['total_df']
    avg_doc_len = self.h_corpus_stat['bodyText']['average_len']
    h_doc_df = self.h_field_h_df['bodyText']
    q_lm = text2lm(h_q_info['query'])
    for e_field in self.l_entity_fields:
        # missing entity or field falls back to an empty text / empty LM
        e_text = self.h_entity_texts.get(e, {}).get(e_field, "")
        if isinstance(e_text, list):  # some fields store a list of strings
            e_text = ' '.join(e_text)
        e_lm = text2lm(e_text, clean=True)
        term_stat = TermStat()
        term_stat.set_from_raw(q_lm, e_lm, h_doc_df, total_df, avg_doc_len)
        for sim, score in term_stat.mul_scores():
            if sim in self.s_model:
                h_feature[self.feature_name_pre + e_field.title() + sim.title()] = score
    return h_feature
def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
    """
    extract e-d features
    do:
        get top k nlss form doc lm retrieval, as a whole of individually
        sum up to features
    :param q_info: query info
    :param ana: query entity annotation
    :param doc_info: document info dict
    :param l_qe_nlss: natural-language supporting sentences for the entity
    :return: h_feature: entity features for this nlss set
        max-pooled per-sentence scores + 'Conca'-prefixed scores of the
        concatenation of all top sentences
    """
    l_top_nlss = self._find_top_k_nlss_for_q(q_info, ana, l_qe_nlss)
    l_top_sent = [nlss[0] for nlss in l_top_nlss]
    # the concatenation of all top sentences is appended as one extra "sent";
    # it is scored like the others and later split off as the 'Conca' features
    l_top_sent.append(' '.join(l_top_sent))
    # NOTE(review): dead branch — the append above guarantees at least one
    # element (an empty string when there are no nlss), so this never fires
    if not l_top_sent:
        l_top_sent.append('')  # place holder for empty nlss e
    l_h_per_sent_feature = []
    # one document LM per target field, computed once and reused per sentence
    l_field_doc_lm = [
        text2lm(doc_info.get(field, ""), clean=True)
        for field in self.l_target_fields
    ]
    for sent in l_top_sent:
        h_per_sent_feature = {}
        h_sent_lm = text2lm(sent, clean=True)
        for field, lm in zip(self.l_target_fields, l_field_doc_lm):
            r_model = RetrievalModel()
            r_model.set_from_raw(
                h_sent_lm, lm,
                self.resource.corpus_stat.h_field_df.get(field, None),
                self.resource.corpus_stat.h_field_total_df.get(
                    field, None),
                self.resource.corpus_stat.h_field_avg_len.get(field, None))
            l_retrieval_score = r_model.scores()
            # normalize by sentence length so long sentences don't dominate
            q_len = float(
                max(sum([item[1] for item in h_sent_lm.items()]), 1))
            h_per_sent_feature.update(
                dict([(field + name, score / q_len)
                      for name, score in l_retrieval_score]))
        l_h_per_sent_feature.append(h_per_sent_feature)
    # last element is the concatenated-sentence score; the rest are pooled
    h_max_feature = max_pool_feature(l_h_per_sent_feature[:-1])
    h_mean_feature = add_feature_prefix(l_h_per_sent_feature[-1], 'Conca')
    h_feature = h_max_feature
    h_feature.update(h_mean_feature)
    return h_feature
def _desp_passage_features(self, e_id, l_grid, field):
    """Score the entity's description against its grid sentences as one passage.

    Uses the description text as the "query", the concatenated grid
    sentences as the "document", and *field*'s corpus statistics.

    :param e_id: entity id (description looked up in resource.h_e_desp)
    :param l_grid: list of grid dicts, each with a 'sent' string
    :param field: document field whose corpus statistics to use
    :return: h_feature: {'DespPassage' + score_name: score}
    """
    l_grid_sent = [grid['sent'] for grid in l_grid]
    q_lm = text2lm(self.resource.h_e_desp.get(e_id, ""))
    grid_lm = text2lm(' '.join(l_grid_sent))
    r_model = RetrievalModel()
    r_model.set_from_raw(
        q_lm, grid_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None)
    )
    h_score = dict(r_model.scores())
    # drop the two-way LM score; pop() with a default avoids a KeyError
    # if the retrieval model ever stops emitting it
    h_score.pop('lm_twoway', None)
    h_feature = add_feature_prefix(h_score, 'DespPassage')
    return h_feature
def extract_per_entity(self, q_info, ana, doc_info):
    """
    Assemble all enabled per-entity feature groups across the target fields.
    :param q_info: query info
    :param ana: entity annotation ('id' is the query entity)
    :param doc_info: document info dict
    :return: h_feature: '<field>_'-prefixed feature name -> score
    """
    h_feature = {}
    qe = ana['id']
    for field in self.l_target_fields:
        l_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
        l_qe_grid = self._filter_e_grid(qe, l_grid)
        doc_lm = text2lm(doc_info.get(field, ""))
        use_grid = 'grid' in self.l_feature
        if use_grid:
            # annotate the filtered grids with per-entity scores up front;
            # the later feature groups read these scores
            l_qe_grid = self._calc_grid_scores(l_qe_grid, doc_lm)
        prefix = field + '_'
        if 'passage' in self.l_feature:
            h_feature.update(add_feature_prefix(
                self._entity_passage_features(q_info, l_qe_grid, field), prefix))
        if 'desp' in self.l_feature:
            h_feature.update(add_feature_prefix(
                self._desp_passage_features(qe, l_qe_grid, field), prefix))
        if use_grid:
            h_feature.update(add_feature_prefix(
                self._grid_score_features(qe, l_qe_grid), prefix))
        if 'coherence' in self.l_feature and field == body_field:
            # coherence is only defined on the body field
            h_feature.update(add_feature_prefix(
                self._qe_grid_coherence(qe, l_grid), prefix))
        if 'esr' in self.l_feature:
            h_feature.update(add_feature_prefix(
                self._local_esr(qe, l_qe_grid), prefix))
    return h_feature
def _construct_e_nlss_cash_info(self, q_info, h_nlss):
    """
    Build the per-query nlss cache: for each query entity, its nlss list,
    the sentence LMs, and an index e -> [pos in l_this_nlss] of the tail
    entities appearing in those sentences.
    Populates self.h_qe_idx, self.ll_this_nlss, self.ll_this_nlss_lm,
    self.l_h_e_nlss_idx (all aligned by query-entity index).
    :param q_info: query info dict
    :param h_nlss: entity id -> list of (sentence, [tail entity ids])
    :return: None (side effects on self only)
    """
    logging.info('constructing nlss cash for q [%s]', q_info['qid'])
    l_q_ana = form_boe_per_field(q_info, QUERY_FIELD)
    l_qe = list(set([ana['id'] for ana in l_q_ana]))
    # entity id -> position into the three aligned lists below
    self.h_qe_idx = dict(zip(l_qe, range(len(l_qe))))
    self.ll_this_nlss = []
    self.ll_this_nlss_lm = []
    self.l_h_e_nlss_idx = []
    for qe in l_qe:
        logging.info('forming nlss cash for qe [%s]', qe)
        l_this_nlss = h_nlss.get(qe, [])
        l_this_nlss_lm = [text2lm(sent, clean=True) for sent, __ in l_this_nlss]
        h_e = dict()
        for p in xrange(len(l_this_nlss)):
            l_e = l_this_nlss[p][1]
            for e in l_e:
                # NOTE(review): substring containment, not equality —
                # presumably meant to skip the query entity itself
                # (`e == qe`); confirm whether substring match is intended
                if e in qe:
                    continue
                if e not in h_e:
                    h_e[e] = []
                h_e[e].append(p)
        logging.info('qe [%s] [%d] nlss, [%d] tail e',
                     qe, len(l_this_nlss), len(h_e))
        self.ll_this_nlss.append(l_this_nlss)
        self.ll_this_nlss_lm.append(l_this_nlss_lm)
        self.l_h_e_nlss_idx.append(h_e)
    logging.info('q [%s] nlss cash constructed', q_info['qid'])
def extract_per_entity(self, q_info, ana, doc_info):
    """
    Extract all enabled nlss-based feature groups for one query entity
    against one document, rebuilding the per-query nlss cache when the
    query changes.
    :param q_info: query info dict ('qid' identifies the query)
    :param ana: entity annotation ('id' is the query entity)
    :param doc_info: document info dict ('docno' identifies the doc)
    :return: h_feature: '<field>_'-prefixed feature name -> score
    """
    h_feature = dict()
    qe = ana['id']
    qid = q_info['qid']
    logging.info('start extracting [%s]-[%s]-[%s]',
                 qid, qe, doc_info['docno'])
    if qid != self.current_qid:
        # new query: rebuild the nlss cache before any extraction
        self.current_qid = qid
        self._construct_e_nlss_cash_info(q_info, self.resource.l_h_nlss[0])
    for field in self.l_target_fields:
        l_field_ana = form_boe_per_field(doc_info, field)
        h_field_lm = text2lm(doc_info.get(field, ""), clean=True)
        l_extracted = []
        if 'emb_vote' in self.l_features:
            l_extracted.append(self._connected_emb_vote(qe, l_field_ana))
        if 'edge_cnt' in self.l_features:
            l_extracted.append(self._edge_cnt(qe, l_field_ana))
        if 'edge_retrieval' in self.l_features:
            l_extracted.append(
                self._edge_retrieval(qe, l_field_ana, h_field_lm, field))
        if 'local_grid' in self.l_features:
            l_extracted.append(
                self._local_grid(q_info, qe, l_field_ana, doc_info, field))
        if 'qe_grid' in self.l_features:
            l_extracted.append(self._qe_grid(q_info, qe, doc_info, field))
        if 'nlss_grid' in self.l_features:
            l_extracted.append(
                self._nlss_grid(q_info, qe, l_field_ana, doc_info, field))
        if 'ltr_base' in self.l_features:
            l_extracted.append(self._ltr_baseline(q_info, h_field_lm, field))
        if 'local_vote' in self.l_features:
            l_extracted.append(
                self._local_vote(q_info, qe, l_field_ana, doc_info, field))
        if 'grid_retrieval' in self.l_features:
            l_extracted.append(
                self._grid_retrieval(qe, h_field_lm, doc_info, field))
        if 'edge_grid' in self.l_features:
            l_extracted.append(self._edge_grid(qe, doc_info, field))
        for h_group in l_extracted:
            h_feature.update(add_feature_prefix(h_group, field + '_'))
    return h_feature
def _e_desp_retrieval(self, e, grid_lm):
    """
    Length-normalized coordinate and LM scores of the grid sentence
    against the entity's description.
    :param e: entity id (description looked up in resource.h_e_desp)
    :param grid_lm: LM of the grid sentence (the "query" side)
    :return: [['coor', score], ['lm', score]]
    """
    e_lm = text2lm(self.resource.h_e_desp.get(e, ""))
    r_m = RetrievalModel()
    r_m.set_from_raw(grid_lm, e_lm)
    # normalize by grid-sentence length (at least 1 to avoid div-by-zero)
    grid_len = float(max(sum([cnt for __, cnt in grid_lm.items()]), 1.0))
    return [
        ['coor', r_m.coordinate() / grid_len],
        ['lm', r_m.lm() / grid_len],
    ]
def _local_grid(self, q_info, qe, l_field_ana, doc_info, field):
    """
    only keep grids that
        1) include qe
        2) include qe->nlss->tail e
    and score the query against each of the two grid sets.
    :param q_info: query info
    :param qe: query entity id
    :param l_field_ana: entity annotations of the document field
    :param doc_info: document info dict
    :param field: target field name
    :return: h_feature: 'QEGrid_'- and 'NlssGrid_'-prefixed retrieval scores
    """
    p = self.h_qe_idx[qe]
    h_e_nlss_idx = self.l_h_e_nlss_idx[p]
    # tail entities of qe's nlss that actually appear in this field
    l_tail_e = [ana['id'] for ana in l_field_ana if ana['id'] in h_e_nlss_idx]
    l_qe_grid = []
    l_nlss_e_grid = []
    l_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
    for grid in l_grid:
        l_grid_e = [ana['id'] for ana in grid['spot']]
        s_grid_e = set(l_grid_e)
        if qe in s_grid_e:
            l_qe_grid.append(grid['sent'])
        # NOTE(review): tail-entity check is taken as independent of the
        # qe check (two separate grid sets, matching the docstring) —
        # confirm against the original indentation
        for tail_e in l_tail_e:
            if tail_e in s_grid_e:
                l_nlss_e_grid.append(grid['sent'])
                break
    logging.info('q [%s] e [%s] doc [%s] has [%d] qe grid, [%d] nlss grid',
                 q_info['qid'], qe, doc_info['docno'],
                 len(l_qe_grid), len(l_nlss_e_grid)
                 )
    qe_grid_lm = text2lm(' '.join(l_qe_grid), clean=True)
    nlss_e_grid_lm = text2lm(' '.join(l_nlss_e_grid), clean=True)
    q_lm = text2lm(q_info[QUERY_FIELD])
    h_feature = {}
    h_qe_grid_scores = dict(self._extract_retrieval_scores(q_lm, qe_grid_lm, field))
    h_nlss_grid_scores = dict(self._extract_retrieval_scores(q_lm, nlss_e_grid_lm, field))
    h_feature.update(add_feature_prefix(h_qe_grid_scores, 'QEGrid_'))
    h_feature.update(add_feature_prefix(h_nlss_grid_scores, 'NlssGrid_'))
    return h_feature
def _extract_per_entity(self, e_id, d_info, external_resource):
    """
    Retrieval similarities between each of the entity's text fields and
    each target document field.
    :param e_id: entity id
    :param d_info: document info dict
    :param external_resource: resource carrying entity fields + corpus stats
    :return: h_feature: feature name -> score
    """
    h_feature = dict()
    corpus_stat = external_resource.corpus_stat
    h_e_fields = external_resource.h_entity_fields.get(e_id, {})
    for field in self.l_entity_fields:
        # entity-side text acts as the "query"
        h_q_lm = text2lm(h_e_fields.get(field, ""), clean=True)
        for doc_field in TARGET_TEXT_FIELDS:
            h_d_lm = text2lm(d_info.get(doc_field, ""), clean=True)
            self.retrieval_model.set(h_q_lm, h_d_lm, doc_field, corpus_stat)
            for name, score in self.retrieval_model.scores():
                f_name = (self.feature_name_pre + field.title()
                          + doc_field.title() + name)
                h_feature[f_name] = score
    return h_feature
def _get_per_f(self, fname):
    """Collect the vocabulary of one tab-separated, json-per-line file.

    Each line's last tab field is parsed as JSON; for every target field
    present, the field text is converted to an LM and its terms appended.

    :param fname: path to the input file
    :return: l_w: list of terms (with repetitions across lines/fields)
    """
    l_w = []
    logging.info('started [%s]', fname)
    # 'with' closes the handle deterministically (the original leaked it)
    with open(fname) as f_in:
        for line in f_in:
            h = json.loads(line.split('\t')[-1])
            for field in self.l_target_fields:
                if field in h:
                    lm = text2lm(h[field], clean=True)
                    l_w.extend(lm.keys())
    logging.info('[%d] words get from [%s]', len(l_w), fname)
    return l_w
def _form_prf_field_lm(self, qid):
    """
    Build per-field document LMs from the query's top PRF documents.
    :param qid: query id (looked up in self.h_q_rank_info)
    :return: field -> list of doc LMs (one per top-ranked document)
    """
    l_top_rank = self.h_q_rank_info.get(qid, [])[:self.prf_d]
    h_field_l_doc_lm = {}
    for field in TARGET_TEXT_FIELDS:
        h_field_l_doc_lm[field] = [
            text2lm(h_info.get(field, ""), clean=True)
            for __, ___, h_info in l_top_rank
        ]
    return h_field_l_doc_lm
def _qe_grid(self, q_info, qe, doc_info, field):
    """Retrieval features from the grids (sentences) that mention qe.

    Concatenates every grid sentence containing the query entity into
    one pseudo-document and scores the query against it.

    :param q_info: query info
    :param qe: query entity id
    :param doc_info: document info dict
    :param field: target field name
    :return: h_feature: 'QEGrid_'-prefixed retrieval scores
    """
    # NOTE: unused copy-paste lookups of self.h_qe_idx / l_h_e_nlss_idx
    # (inherited from _local_grid) were removed — nothing here used them.
    l_qe_grid = []
    l_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
    for grid in l_grid:
        s_grid_e = set([ana['id'] for ana in grid['spot']])
        if qe in s_grid_e:
            l_qe_grid.append(grid['sent'])
    logging.info('q [%s] e [%s] doc [%s] has [%d] qe grid',
                 q_info['qid'], qe, doc_info['docno'], len(l_qe_grid))
    qe_grid_lm = text2lm(' '.join(l_qe_grid), clean=True)
    q_lm = text2lm(q_info[QUERY_FIELD])
    h_feature = {}
    h_qe_grid_scores = dict(self._extract_retrieval_scores(q_lm, qe_grid_lm, field))
    h_feature.update(add_feature_prefix(h_qe_grid_scores, 'QEGrid_'))
    return h_feature
def _extract_q_doc_e_textual_features(self, query, l_h_doc_e_lm, h_doc_e_texts):
    """
    Pooled query-vs-entity-text similarities, one feature set per text field.
    For each text field: score the query against every doc entity's field
    texts, then pool the per-entity scores (weighted by entity tf) via
    self._merge_entity_sim.
    :param query: raw query string
    :param l_h_doc_e_lm: per-text-field entity LMs (entity id -> tf),
        aligned with self.l_text_fields
    :param h_doc_e_texts: entity id -> {entity field -> text}
    :return: h_feature: feature name -> pooled score (empty dict when no
        entity texts are loaded)
    """
    if not self.h_entity_texts:
        return {}
    h_feature = {}
    q_lm = text2lm(query)
    for p in xrange(len(self.l_text_fields)):
        field = self.l_text_fields[p]
        if self.l_top_k:
            # per-field top-k override, aligned with l_text_fields
            self.top_k = self.l_top_k[p]
        h_doc_e_lm = l_h_doc_e_lm[p]
        total_df = self.h_corpus_stat[field]['total_df']
        avg_doc_len = self.h_corpus_stat[field]['average_len']
        h_doc_df = self.h_field_h_df[field]
        l_h_scores = []
        l_e_tf = []  # entity tf, used as pooling weight
        for e, e_tf in h_doc_e_lm.items():
            h_scores = {}
            l_e_tf.append(e_tf)
            h_e_texts = h_doc_e_texts.get(e, {})
            for e_field in self.l_entity_fields:
                text = h_e_texts.get(e_field, "")
                e_lm = text2lm(text, clean=True)
                term_stat = TermStat()
                term_stat.set_from_raw(q_lm, e_lm, h_doc_df, total_df, avg_doc_len)
                l_sim_score = term_stat.mul_scores()
                for sim, score in l_sim_score:
                    if sim in self.l_model:
                        h_scores[e_field.title() + sim.title()] = score
            l_h_scores.append(h_scores)
        # pool per-entity score dicts into one feature dict for this field
        h_pooled_scores = self._merge_entity_sim(l_h_scores, l_e_tf)
        for name, score in h_pooled_scores.items():
            h_feature[self.feature_name_pre + field.title() + name] = score
    logging.debug(json.dumps(h_feature))
    return h_feature
def _calc_esearch_per_pair(self, h_q_info, h_doc_info):
    """Rank the document's tagme entities by query likelihood of their descriptions.

    For each (entity, surface name) annotated in the body field, the
    entity description is scored with Dirichlet-smoothed LM against the
    query; the top 10 (entity, name, score) triples are returned.

    :param h_q_info: query info dict with raw text under 'query'
    :param h_doc_info: document info dict; must carry 'tagme' annotations
    :return: top-10 list of (entity id, name, lm_dir score), best first;
        empty list when the document has no tagme annotations
    """
    if 'tagme' not in h_doc_info:
        return []
    l_e_name = [(ana[0], ana[-1]) for ana in h_doc_info['tagme'][body_field]]
    q_lm = text2lm(h_q_info['query'], clean=True)
    total_df = self.h_corpus_stat[body_field]['total_df']
    avg_len = 100.0  # fixed average description length
    l_e_score = []
    for e, name in l_e_name:
        # unknown entities contribute an empty description instead of
        # raising KeyError (the original did an unguarded double lookup)
        desp = self.h_entity_texts.get(e, {}).get('desp', "")
        e_lm = text2lm(desp, clean=True)
        term_stat = TermStat()
        term_stat.set_from_raw(q_lm, e_lm, self.h_field_h_df[body_field],
                               total_df, avg_len)
        l_e_score.append((e, name, term_stat.lm_dir()))
    l_e_score.sort(key=lambda item: -item[-1])
    return l_e_score[:10]
def _grid_retrieval(self, qe, h_field_lm, doc_info, field):
    """
    Sum-pooled, grid-count-normalized retrieval scores of the qe-mentioning
    grid sentences against the field LM.
    :param qe: query entity id
    :param h_field_lm: LM of the whole document field
    :param doc_info: document info dict
    :param field: target field name
    :return: h_feature: 'grid_retrieval'-prefixed feature name -> score
    """
    l_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
    n_grid = float(len(l_grid))  # normalizer; only used when l_grid non-empty
    l_h_scores = []
    for grid in l_grid:
        s_grid_e = set([ana['id'] for ana in grid['spot']])
        if qe not in s_grid_e:
            continue
        sent_lm = text2lm(grid['sent'], clean=True)
        l_scores = self._extract_retrieval_scores(sent_lm, h_field_lm, field)
        l_h_scores.append(dict((name, score / n_grid)
                               for name, score in l_scores))
    return add_feature_prefix(sum_pool_feature(l_h_scores), 'grid_retrieval')
def _edge_grid(self, qe, doc_info, field):
    """
    For each grid sentence: mean-pool the similarities between the sentence
    and the nlss of the grid's tail entities; then sum-pool over all grids.
    :param qe: query entity id (must be in the per-query nlss cache)
    :param doc_info: document info dict
    :param field: target field name
    :return: h_feature: sum-pooled feature name -> score
    """
    qe_pos = self.h_qe_idx[qe]
    h_e_nlss_idx = self.l_h_e_nlss_idx[qe_pos]
    l_this_nlss_lm = self.ll_this_nlss_lm[qe_pos]
    l_h_grid_sim = []
    for grid in doc_info.get(E_GRID_FIELD, {}).get(field, []):
        grid_sent_lm = text2lm(grid['sent'], clean=True)
        l_h_sim = []
        for ana in grid['spot']:
            e = ana['id']
            if e not in h_e_nlss_idx:
                continue
            # every nlss sentence attached to this tail entity
            for nlss_pos in h_e_nlss_idx[e]:
                nlss_lm = l_this_nlss_lm[nlss_pos]
                l_h_sim.append(
                    dict(self._extract_simple_scores(nlss_lm, grid_sent_lm)))
        l_h_grid_sim.append(mean_pool_feature(l_h_sim, add_suffix=False))
    return sum_pool_feature(l_h_grid_sim)
def _calc_grid_scores(self, l_grid, doc_lm):
    """
    sent -> e scores
    include:
        frequency:
        emb_sim:
        desp_emb:
        desp_bow:
        gloss_emb:
        gloss_bow:
    Mutates each grid in place by attaching an 'e_score' list.
    :param l_grid: list of grid dicts with SPOT_FIELD annotations and 'sent'
    :param doc_lm: LM of the whole document field (used for the ESA score)
    :return: for grid->'entity'->['id': e id, 'name':score],
        grid_score = {name:score}
    """
    logging.info('start calculating grid scores')
    for grid in l_grid:
        l_e = [ana['id'] for ana in grid.get(SPOT_FIELD)]
        h_e_tf = term2lm(l_e)  # entity id -> frequency within this grid
        grid_sent = grid['sent']
        grid_lm = text2lm(grid_sent)
        grid_emb = avg_embedding(self.resource.embedding, grid_sent)
        l_e_score = []
        for e, tf in h_e_tf.items():
            h_e_score = {'id': e, 'freq': tf}
            h_e_score['uw_emb'] = self._e_grid_emb(e, grid_emb)
            # h_e_score['gloss_emb'] = self._e_gloss_emb(e, grid_emb)
            # h_e_score['gloss_bow'] = self._e_gloss_bow(e, grid_lm)
            h_e_score['desp_emb'] = self._e_desp_emb(e, grid_emb)
            h_e_score['desp_bow'] = self._e_desp_bow(e, grid_lm)
            # ESA scores description against the full doc LM, not the grid
            h_e_score['ESA'] = self._e_desp_bow(e, doc_lm)
            l_score = self._e_desp_retrieval(e, grid_lm)
            h_e_score.update(add_feature_prefix(dict(l_score), 'desp_'))
            l_e_score.append(h_e_score)
        grid['e_score'] = l_e_score
    return l_grid
def _e_desp_bow(self, e, grid_lm):
    """
    Bag-of-words cosine between the entity's description LM and the grid LM.
    :param e: entity id (description looked up in resource.h_e_desp)
    :param grid_lm: LM of the grid sentence
    :return: cosine similarity score
    """
    desp_lm = text2lm(self.resource.h_e_desp.get(e, ""))
    return lm_cosine(desp_lm, grid_lm)
def _e_gloss_bow(self, e, grid_lm):
    """
    Bag-of-words cosine between the entity's gloss and the grid LM.
    The gloss is the first self.gloss_len whitespace tokens of the
    entity description.
    :param e: entity id (description looked up in resource.h_e_desp)
    :param grid_lm: LM of the grid sentence
    :return: cosine similarity score
    """
    desp = self.resource.h_e_desp.get(e, "")
    l_gloss_tokens = desp.split()[:self.gloss_len]
    gloss_lm = text2lm(' '.join(l_gloss_tokens))
    return lm_cosine(gloss_lm, grid_lm)
def _ltr_baseline(self, q_info, h_field_lm, field):
    """
    Plain query-vs-field retrieval scores, used as baseline LTR features.
    :param q_info: query info dict (text under QUERY_FIELD)
    :param h_field_lm: LM of the document field
    :param field: target field name (selects corpus statistics)
    :return: h_feature: score name -> score
    """
    q_lm = text2lm(q_info[QUERY_FIELD])
    return dict(self._extract_retrieval_scores(q_lm, h_field_lm, field))