def _entity_passage_features(self, q_info, l_grid, field):
    l_grid_sent = [grid['sent'] for grid in l_grid]
    q_lm = text2lm(q_info['query'])
    h_feature = dict()

    # score the query against the concatenation of all grid sentences
    grid_lm = text2lm(' '.join(l_grid_sent))
    r_model = RetrievalModel()
    r_model.set_from_raw(
        q_lm, grid_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None)
    )
    h_score = dict(r_model.scores())
    h_feature.update(h_score)

    # per-sentence scoring with mean/max pooling, currently disabled:
    # l_grid_lm = [text2lm(sent) for sent in l_grid_sent]
    # l_scores = []
    # for grid_lm in l_grid_lm:
    #     r_model = RetrievalModel()
    #     r_model.set_from_raw(
    #         q_lm, grid_lm,
    #         self.resource.corpus_stat.h_field_df.get(field, None),
    #         self.resource.corpus_stat.h_field_total_df.get(field, None),
    #         self.resource.corpus_stat.h_field_avg_len.get(field, None)
    #     )
    #     l_scores.append(dict(r_model.scores()))
    #
    # h_feature.update(mean_pool_feature(l_scores))
    # h_feature.update(max_pool_feature(l_scores))

    h_feature = add_feature_prefix(h_feature, 'EntityPassage')
    return h_feature
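# Illustrative sketch only: text2lm and add_feature_prefix come from the
# project's utility module and their real implementations are not shown here.
# The assumption is that text2lm turns raw text into a term-frequency dict and
# add_feature_prefix namespaces feature keys. Minimal stand-ins, with
# hypothetical names so they do not shadow the real helpers:
def _sketch_text2lm(text, clean=False):
    # assumed behavior: lowercase, optionally strip punctuation, count terms
    if clean:
        text = ''.join(c if c.isalnum() or c.isspace() else ' ' for c in text)
    h_tf = {}
    for term in text.lower().split():
        h_tf[term] = h_tf.get(term, 0) + 1
    return h_tf


def _sketch_add_feature_prefix(h_feature, prefix):
    # assumed behavior: prepend the group name to every feature key
    return dict([(prefix + name, score) for name, score in h_feature.items()])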
def _extract_retrieval_scores(self, formed_q_lm, formed_doc_lm, field):
    r_model = RetrievalModel()
    r_model.set_from_raw(
        formed_q_lm, formed_doc_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None))
    # keep every retrieval score except the two-way language model score
    return [(k, v) for k, v in r_model.scores() if k != 'lm_twoway']
def _extract_simple_scores(self, formed_q_lm, formed_doc_lm):
    r_model = RetrievalModel()
    r_model.set_from_raw(formed_q_lm, formed_doc_lm)
    l_score = [
        ['cosine', lm_cosine(formed_q_lm, formed_doc_lm)],
        ['coordinate', r_model.coordinate()],
    ]
    return l_score
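# Illustrative sketch only: lm_cosine is assumed to compute the cosine
# similarity between two term-frequency dicts. The hypothetical stand-in
# below shows that assumed behavior without shadowing the real helper.
import math


def _sketch_lm_cosine(h_lm_a, h_lm_b):
    # dot product over shared terms, normalized by the two vector norms
    dot = sum(tf * h_lm_b.get(term, 0) for term, tf in h_lm_a.items())
    norm_a = math.sqrt(sum(tf * tf for tf in h_lm_a.values()))
    norm_b = math.sqrt(sum(tf * tf for tf in h_lm_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)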
def _e_desp_retrieval(self, e, grid_lm):
    desp = self.resource.h_e_desp.get(e, "")
    e_lm = text2lm(desp)
    r_m = RetrievalModel()
    r_m.set_from_raw(grid_lm, e_lm)
    # normalize by the total term count of the grid language model
    z = float(max(sum([item[1] for item in grid_lm.items()]), 1.0))
    coor = r_m.coordinate() / z
    lm = r_m.lm() / z
    l_score = [['coor', coor], ['lm', lm]]
    return l_score
def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
    """
    extract e-d features
    do: get the top k nlss via doc lm retrieval, score them individually and
        as a concatenated whole, and pool the scores into features
    :param q_info: query info
    :param ana: the target entity's annotation
    :param doc_info: document info
    :param l_qe_nlss: nlss (natural language support sentences) of the entity
    :return: h_feature: entity features for this nlss set
    """
    l_top_nlss = self._find_top_k_nlss_for_q(q_info, ana, l_qe_nlss)
    l_top_sent = [nlss[0] for nlss in l_top_nlss]
    if not l_top_sent:
        l_top_sent.append('')  # placeholder for an entity with no nlss
    l_top_sent.append(' '.join(l_top_sent))  # concatenation of all top sentences

    l_h_per_sent_feature = []
    l_field_doc_lm = [
        text2lm(doc_info.get(field, ""), clean=True)
        for field in self.l_target_fields
    ]
    for sent in l_top_sent:
        h_per_sent_feature = {}
        h_sent_lm = text2lm(sent, clean=True)
        for field, lm in zip(self.l_target_fields, l_field_doc_lm):
            r_model = RetrievalModel()
            r_model.set_from_raw(
                h_sent_lm, lm,
                self.resource.corpus_stat.h_field_df.get(field, None),
                self.resource.corpus_stat.h_field_total_df.get(field, None),
                self.resource.corpus_stat.h_field_avg_len.get(field, None))
            l_retrieval_score = r_model.scores()
            # normalize by the sentence's total term count to keep scores comparable
            q_len = float(max(sum([item[1] for item in h_sent_lm.items()]), 1))
            h_per_sent_feature.update(
                dict([(field + name, score / q_len)
                      for name, score in l_retrieval_score]))
        l_h_per_sent_feature.append(h_per_sent_feature)

    # max pool over the individual sentences; the last element is the concatenation
    h_max_feature = max_pool_feature(l_h_per_sent_feature[:-1])
    h_conca_feature = add_feature_prefix(l_h_per_sent_feature[-1], 'Conca')
    h_feature = h_max_feature
    h_feature.update(h_conca_feature)
    return h_feature
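# Illustrative sketch only: max_pool_feature (and the disabled
# mean_pool_feature above) are assumed to pool a list of feature dicts key by
# key. The exact key prefixes used by the real helpers are an assumption;
# hypothetical stand-ins showing the assumed behavior:
def _sketch_max_pool_feature(l_h_feature):
    # per feature name, keep the maximum value across all dicts
    h_pooled = {}
    for h_feature in l_h_feature:
        for name, score in h_feature.items():
            key = 'Max' + name
            h_pooled[key] = max(h_pooled.get(key, score), score)
    return h_pooled


def _sketch_mean_pool_feature(l_h_feature):
    # per feature name, take the mean value across all dicts
    h_sum = {}
    for h_feature in l_h_feature:
        for name, score in h_feature.items():
            h_sum[name] = h_sum.get(name, 0.0) + score
    n = float(max(len(l_h_feature), 1))
    return dict([('Mean' + name, score / n) for name, score in h_sum.items()])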
def _desp_passage_features(self, e_id, l_grid, field):
    l_grid_sent = [grid['sent'] for grid in l_grid]
    q_lm = text2lm(self.resource.h_e_desp.get(e_id, ""))
    grid_lm = text2lm(' '.join(l_grid_sent))
    r_model = RetrievalModel()
    r_model.set_from_raw(
        q_lm, grid_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None)
    )
    h_score = dict(r_model.scores())
    del h_score['lm_twoway']  # the two-way LM score is not used as a feature
    h_feature = add_feature_prefix(h_score, 'DespPassage')
    return h_feature
def _lm_nlss_filter(self, l_nlss, doc_info):
    logging.info('filter [%d] nlss via lm', len(l_nlss))
    l_nlss_lmscore = []
    h_d_lm = text2lm(doc_info.get(body_field, ""))
    for nlss in l_nlss:
        h_s_lm = text2lm(nlss[0])
        r_model = RetrievalModel()
        r_model.set_from_raw(h_s_lm, h_d_lm)
        lm = r_model.lm()
        l_nlss_lmscore.append((nlss, lm))
    # rank nlss by their language model score against the document body
    l_nlss_lmscore.sort(key=lambda item: item[1], reverse=True)
    l_this_nlss = [item[0] for item in l_nlss_lmscore]
    if l_nlss_lmscore:
        logging.info('best lm [%f]', l_nlss_lmscore[0][1])
    return l_this_nlss
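# A hypothetical usage sketch (the calling code and the 'top_k' cut-off are
# assumptions, not taken from this file): the filter returns all nlss ranked
# by LM score, so a caller would typically keep only the head of the list.
#
#   l_ranked_nlss = self._lm_nlss_filter(l_qe_nlss, doc_info)
#   l_kept_nlss = l_ranked_nlss[:top_k]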
def __init__(self, **kwargs):
    super(QeDTextMatchFeatureExtractor, self).__init__(**kwargs)
    logging.info('initializing QeDTextMatchFeatureExtractor')
    self.retrieval_model = RetrievalModel(**kwargs)
    logging.info(
        'QeDTextMatchFeatureExtractor init with target entity fields: %s',
        json.dumps(self.l_entity_fields))