def _entity_passage_features(self, q_info, l_grid, field):
    l_grid_sent = [grid['sent'] for grid in l_grid]
    q_lm = text2lm(q_info['query'])
    h_feature = dict()

    # score the query against all passage grids of this field, concatenated
    grid_lm = text2lm(' '.join(l_grid_sent))
    r_model = RetrievalModel()
    r_model.set_from_raw(
        q_lm, grid_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None)
    )
    h_score = dict(r_model.scores())
    h_feature.update(h_score)

    # per-grid scoring with mean/max pooling, currently disabled
    # l_grid_lm = [text2lm(sent) for sent in l_grid_sent]
    # l_scores = []
    # for grid_lm in l_grid_lm:
    #     r_model = RetrievalModel()
    #     r_model.set_from_raw(
    #         q_lm, grid_lm,
    #         self.resource.corpus_stat.h_field_df.get(field, None),
    #         self.resource.corpus_stat.h_field_total_df.get(field, None),
    #         self.resource.corpus_stat.h_field_avg_len.get(field, None)
    #     )
    #     l_scores.append(dict(r_model.scores()))
    #
    # h_feature.update(mean_pool_feature(l_scores))
    # h_feature.update(max_pool_feature(l_scores))

    h_feature = add_feature_prefix(h_feature, 'EntityPassage')
    return h_feature
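# add_feature_prefix is a shared helper in this repo; the sketch below is only
# an assumption of its behavior inferred from how it is used here (prepend a
# tag such as 'EntityPassage' to every feature name), not the actual code.
def add_feature_prefix_sketch(h_feature, prefix):
    # e.g. {'lm': -5.2} with prefix 'EntityPassage' -> {'EntityPassagelm': -5.2}
    return dict((prefix + name, score) for name, score in h_feature.items())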
def _extract_retrieval_scores(self, formed_q_lm, formed_doc_lm, field):
    r_model = RetrievalModel()
    r_model.set_from_raw(
        formed_q_lm, formed_doc_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None))
    return [(k, v) for k, v in r_model.scores() if 'lm_twoway' != k]
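# Purely illustrative: RetrievalModel.scores() is assumed (from its use above
# and in dict(r_model.scores()) elsewhere) to yield (name, value) pairs; the
# names and numbers below are made up to show what the 'lm_twoway' filter keeps.
l_example_scores = [('lm', -5.2), ('lm_twoway', -4.8), ('coordinate', 2.0)]
l_kept = [(k, v) for k, v in l_example_scores if 'lm_twoway' != k]
assert l_kept == [('lm', -5.2), ('coordinate', 2.0)]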
def _extract_simple_scores(self, formed_q_lm, formed_doc_lm):
    r_model = RetrievalModel()
    r_model.set_from_raw(
        formed_q_lm, formed_doc_lm,
    )
    l_score = [['cosine', lm_cosine(formed_q_lm, formed_doc_lm)],
               ['coordinate', r_model.coordinate()]]
    return l_score
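# lm_cosine is imported from the repo's language-model utilities; the function
# below is an assumed implementation (cosine similarity between two
# term-frequency dicts), shown only for orientation, not the repo's actual code.
import math

def lm_cosine_sketch(h_q_lm, h_d_lm):
    dot = sum(tf * h_d_lm.get(t, 0) for t, tf in h_q_lm.items())
    q_norm = math.sqrt(sum(tf * tf for tf in h_q_lm.values()))
    d_norm = math.sqrt(sum(tf * tf for tf in h_d_lm.values()))
    if q_norm == 0 or d_norm == 0:
        return 0.0
    return dot / (q_norm * d_norm)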
def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
    """
    extract e-d features
    do: get the top k nlss via doc lm retrieval, score them individually and
    as a whole (concatenated), then pool the per-sentence scores into features
    :param q_info: query info
    :param ana: entity annotation
    :param doc_info: doc info
    :param l_qe_nlss: nlss of the query entity
    :return: h_feature: entity features for this nlss set
    """
    l_top_nlss = self._find_top_k_nlss_for_q(q_info, ana, l_qe_nlss)
    l_top_sent = [nlss[0] for nlss in l_top_nlss]
    if not l_top_sent:
        l_top_sent.append('')  # place holder for entity with no nlss
    # last entry: all top sentences concatenated, scored as a whole
    l_top_sent.append(' '.join(l_top_sent))

    l_h_per_sent_feature = []
    l_field_doc_lm = [
        text2lm(doc_info.get(field, ""), clean=True)
        for field in self.l_target_fields
    ]
    for sent in l_top_sent:
        h_per_sent_feature = {}
        h_sent_lm = text2lm(sent, clean=True)
        for field, lm in zip(self.l_target_fields, l_field_doc_lm):
            r_model = RetrievalModel()
            r_model.set_from_raw(
                h_sent_lm, lm,
                self.resource.corpus_stat.h_field_df.get(field, None),
                self.resource.corpus_stat.h_field_total_df.get(field, None),
                self.resource.corpus_stat.h_field_avg_len.get(field, None))
            l_retrieval_score = r_model.scores()
            # normalize by sentence length so long nlss do not dominate
            q_len = float(max(sum([item[1] for item in h_sent_lm.items()]), 1))
            h_per_sent_feature.update(
                dict([(field + name, score / q_len)
                      for name, score in l_retrieval_score]))
        l_h_per_sent_feature.append(h_per_sent_feature)

    # max-pool over the individual sentences; the concatenated sentence
    # gets its own 'Conca' prefix
    h_max_feature = max_pool_feature(l_h_per_sent_feature[:-1])
    h_conca_feature = add_feature_prefix(l_h_per_sent_feature[-1], 'Conca')
    h_feature = h_max_feature
    h_feature.update(h_conca_feature)
    return h_feature
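# max_pool_feature / mean_pool_feature are shared pooling helpers in this repo;
# the sketch below is an assumption of max_pool_feature's behavior inferred
# from usage (element-wise max over a list of feature dicts); the 'Max' name
# prefix is also an assumption, and mean_pool_feature would be analogous.
def max_pool_feature_sketch(l_h_feature):
    h_pooled = {}
    for h_feature in l_h_feature:
        for name, score in h_feature.items():
            key = 'Max' + name
            h_pooled[key] = max(h_pooled.get(key, float('-inf')), score)
    return h_pooled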
def _desp_passage_features(self, e_id, l_grid, field):
    l_grid_sent = [grid['sent'] for grid in l_grid]
    # use the entity's description as the query side
    q_lm = text2lm(self.resource.h_e_desp.get(e_id, ""))
    grid_lm = text2lm(' '.join(l_grid_sent))
    r_model = RetrievalModel()
    r_model.set_from_raw(
        q_lm, grid_lm,
        self.resource.corpus_stat.h_field_df.get(field, None),
        self.resource.corpus_stat.h_field_total_df.get(field, None),
        self.resource.corpus_stat.h_field_avg_len.get(field, None)
    )
    h_score = dict(r_model.scores())
    h_score.pop('lm_twoway', None)  # drop lm_twoway; pop avoids a KeyError if it is absent
    h_feature = add_feature_prefix(h_score, 'DespPassage')
    return h_feature
def _lm_nlss_filter(self, l_nlss, doc_info):
    logging.info('filter [%d] nlss via lm', len(l_nlss))
    l_nlss_lmscore = []
    h_d_lm = text2lm(doc_info.get(body_field, ""))
    for nlss in l_nlss:
        h_s_lm = text2lm(nlss[0])
        r_model = RetrievalModel()
        r_model.set_from_raw(h_s_lm, h_d_lm)
        lm = r_model.lm()
        l_nlss_lmscore.append((nlss, lm))
    # sort nlss by their lm score against the doc body, best first
    l_nlss_lmscore.sort(key=lambda item: item[1], reverse=True)
    l_this_nlss = [item[0] for item in l_nlss_lmscore]
    if l_nlss_lmscore:
        logging.info('best lm [%f]', l_nlss_lmscore[0][1])
    return l_this_nlss
def _e_desp_retrieval(self, e, grid_lm):
    desp = self.resource.h_e_desp.get(e, "")
    e_lm = text2lm(desp)
    r_m = RetrievalModel()
    r_m.set_from_raw(grid_lm, e_lm)
    # normalize scores by the grid's total term count
    z = float(max(sum([item[1] for item in grid_lm.items()]), 1.0))
    coor = r_m.coordinate() / z
    lm = r_m.lm() / z
    l_score = [['coor', coor], ['lm', lm]]
    return l_score
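# text2lm is the repo's text -> language-model helper and, from its use here
# (summing item[1] over grid_lm.items()), is assumed to return a term -> count
# dict; the sketch below, including the guessed behavior of the clean flag,
# is only an illustration of that assumption, not the repo's implementation.
import re
from collections import Counter

def text2lm_sketch(text, clean=False):
    if clean:
        # assumed cleaning: lowercase and strip non-alphanumeric characters
        text = re.sub(r'[^a-z0-9 ]', ' ', text.lower())
    return dict(Counter(text.split()))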
class QeDTextMatchFeatureExtractor(Configurable):
    feature_name_pre = Unicode('QeD')
    l_entity_fields = List(Unicode, default_value=ENTITY_TEXT_FIELDS).tag(config=True)

    def __init__(self, **kwargs):
        super(QeDTextMatchFeatureExtractor, self).__init__(**kwargs)
        logging.info('initializing QeDTextMatchFeatureExtractor')
        self.retrieval_model = RetrievalModel(**kwargs)
        logging.info('QeDTextMatchFeatureExtractor init with target entity fields: %s',
                     json.dumps(self.l_entity_fields))

    @classmethod
    def class_print_help(cls, inst=None):
        super(QeDTextMatchFeatureExtractor, cls).class_print_help(inst)
        RetrievalModel.class_print_help(inst)

    def extract(self, q_info, d_info, external_resource):
        """
        :param q_info: grounded query info
        :param d_info: doc with textual fields
        :param external_resource: make sure h_entity_fields is loaded
        :return: matching features for each entity in the grounded fields,
            in the same tree structure
            h_match_info:
                qid:
                docno:
                match: tree structure for each entity in the grounded field
        """
        assert external_resource.h_entity_fields is not None
        h_match_info = dict()
        h_match_info['qid'] = q_info['qid']
        h_match_info['docno'] = d_info['docno']
        l_q_grounded = q_info[GROUND_FIELD]['query']
        l_q_matched_feature = []
        for grounded_sf in l_q_grounded:
            matched_sf = dict()
            matched_sf['surface'] = grounded_sf['surface']
            matched_sf['loc'] = grounded_sf['loc']
            l_matched_entities = []
            for grounded_e in grounded_sf['entities']:
                e_id = grounded_e['id']
                e_name = external_resource.h_entity_fields.get(e_id, {}).get(e_name_field, "")
                h_feature = self._extract_per_entity(e_id, d_info, external_resource)
                l_matched_entities.append(
                    {'id': e_id, 'f': h_feature, e_name_field: e_name})
            matched_sf['entities'] = l_matched_entities
            l_q_matched_feature.append(matched_sf)
        h_match_info[MATCH_FIELD] = l_q_matched_feature
        return h_match_info

    def _extract_per_entity(self, e_id, d_info, external_resource):
        h_feature = dict()
        h_e_fields = external_resource.h_entity_fields.get(e_id, {})
        l_e_text_fields = [(field, h_e_fields.get(field, ""))
                           for field in self.l_entity_fields]
        corpus_stat = external_resource.corpus_stat
        for field, text in l_e_text_fields:
            h_q_lm = text2lm(text, clean=True)
            for doc_field in TARGET_TEXT_FIELDS:
                doc_text = d_info.get(doc_field, "")
                h_d_lm = text2lm(doc_text, clean=True)
                self.retrieval_model.set(h_q_lm, h_d_lm, doc_field, corpus_stat)
                l_sim_scores = self.retrieval_model.scores()
                # feature name: prefix + entity field + doc field + score name
                l_feature = [(self.feature_name_pre + field.title() + doc_field.title() + name,
                              score)
                             for name, score in l_sim_scores]
                h_feature.update(dict(l_feature))
        return h_feature
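# Purely illustrative, with made-up values: how a single feature name from
# _extract_per_entity is composed. The concrete entity fields, doc fields, and
# score names depend on ENTITY_TEXT_FIELDS, TARGET_TEXT_FIELDS, and
# RetrievalModel in this repo, so 'desp', 'bodyText', and 'lm' below are
# assumptions used only to show the naming pattern.
feature_name_pre = 'QeD'
entity_field, doc_field, score_name = 'desp', 'bodyText', 'lm'
example_name = feature_name_pre + entity_field.title() + doc_field.title() + score_name
assert example_name == 'QeDDespBodytextlm'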