def _boe_nlss_filter(self, q_info, q_ana, l_nlss, doc_info):
    """
    Keep only the nlss whose entity list overlaps the document's bag-of-entities.

    An nlss is kept when at least one of its entities (nlss[1]) is annotated in
    any target field of the document and is not the query entity itself. When
    self.intermediate_out is set, the kept nlss (with the matched entity and its
    surface form) are dumped as one json line for inspection.
    :param q_info: query info dict, provides 'qid' for the intermediate dump
    :param q_ana: current query entity annotation, provides 'id'
    :param doc_info: document info dict, provides 'docno' and the target fields
    :param l_nlss: list of nlss; nlss[1] is the entity id list
    :return: the filtered nlss list
    """
    e_id = q_ana['id']
    logging.info('filter [%d] nlss via boe', len(l_nlss))
    l_ana = sum([form_boe_per_field(doc_info, field)
                 for field in self.l_target_fields], [])
    s_e = set([ana['id'] for ana in l_ana if ana['id'] != e_id])
    h_e_sf = dict([(ana['id'], ana['surface']) for ana in l_ana])
    l_keep_nlss = []
    l_keep_nlss_e = []
    for nlss in l_nlss:
        # entities of this nlss that the document also contains
        l_hit = [e for e in nlss[1] if e in s_e]
        if l_hit:
            l_keep_nlss.append(nlss)
            # record only the first match, as before
            l_keep_nlss_e.append({'matched_e': [l_hit[0], h_e_sf[l_hit[0]]]})
    if self.intermediate_out:
        h = {
            'qid': q_info['qid'],
            'ana': q_ana,
            'docno': doc_info['docno'],
            'boe_nlss': zip(l_keep_nlss_e, l_keep_nlss),
        }
        print >> self.intermediate_out, json.dumps(h)
    logging.info('[%s] boe filtered [%d]->[%d]',
                 e_id, len(l_nlss), len(l_keep_nlss))
    return l_keep_nlss
def _construct_e_nlss_cash_info(self, q_info, h_nlss):
    """
    Build the per-query-entity nlss caches for the current query.

    For each query entity qe found in the query field, caches:
      - its nlss list (sentence, [entity id, ...]) in self.ll_this_nlss
      - one language model per nlss sentence in self.ll_this_nlss_lm
      - an inverted index tail-entity -> [positions in l_this_nlss] in
        self.l_h_e_nlss_idx
    self.h_qe_idx maps each qe id to its position in these parallel lists.
    :param q_info: query info dict with 'qid' and the query text field
    :param h_nlss: qe id -> [(sentence, [entity id, ...]), ...]
    :return: None (fills the instance caches above)
    """
    logging.info('constructing nlss cash for q [%s]', q_info['qid'])
    l_q_ana = form_boe_per_field(q_info, QUERY_FIELD)
    l_qe = list(set([ana['id'] for ana in l_q_ana]))
    self.h_qe_idx = dict(zip(l_qe, range(len(l_qe))))
    self.ll_this_nlss = []
    self.ll_this_nlss_lm = []
    self.l_h_e_nlss_idx = []
    for qe in l_qe:
        logging.info('forming nlss cash for qe [%s]', qe)
        l_this_nlss = h_nlss.get(qe, [])
        l_this_nlss_lm = [text2lm(sent, clean=True)
                          for sent, __ in l_this_nlss]
        h_e = dict()
        for p in xrange(len(l_this_nlss)):
            l_e = l_this_nlss[p][1]
            for e in l_e:
                # bug fix: was "if e in qe", a SUBSTRING test between two id
                # strings, which wrongly skipped any tail entity whose id is
                # a substring of qe's id; only the query entity itself must
                # be excluded from the inverted index
                if e == qe:
                    continue
                if e not in h_e:
                    h_e[e] = []
                h_e[e].append(p)
        logging.info('qe [%s] [%d] nlss, [%d] tail e',
                     qe, len(l_this_nlss), len(h_e))
        self.ll_this_nlss.append(l_this_nlss)
        self.ll_this_nlss_lm.append(l_this_nlss_lm)
        self.l_h_e_nlss_idx.append(h_e)
    logging.info('q [%s] nlss cash constructed', q_info['qid'])
def extract_per_entity(self, q_info, ana, doc_info):
    """
    Extract all configured features for one (query entity, document) pair.

    Rebuilds the per-query nlss cache whenever the qid changes, then runs every
    extractor named in self.l_features over each target field, prefixing each
    feature group's names with the field name.
    :param q_info: query info dict with 'qid'
    :param ana: query entity annotation, provides 'id'
    :param doc_info: document info dict with 'docno' and the target fields
    :return: h_feature: feature name -> value
    """
    h_feature = dict()
    qe = ana['id']
    qid = q_info['qid']
    logging.info('start extracting [%s]-[%s]-[%s]',
                 qid, qe, doc_info['docno'])
    if qid != self.current_qid:
        # new query: refresh the per-query-entity nlss caches
        self.current_qid = qid
        self._construct_e_nlss_cash_info(q_info, self.resource.l_h_nlss[0])
    for field in self.l_target_fields:
        l_field_ana = form_boe_per_field(doc_info, field)
        h_field_lm = text2lm(doc_info.get(field, ""), clean=True)
        # feature switch name -> thunk producing that feature group; the
        # thunks close over this iteration's field/ana/lm and are invoked
        # immediately below, so late binding is not an issue
        l_extractor = [
            ('emb_vote',
             lambda: self._connected_emb_vote(qe, l_field_ana)),
            ('edge_cnt',
             lambda: self._edge_cnt(qe, l_field_ana)),
            ('edge_retrieval',
             lambda: self._edge_retrieval(qe, l_field_ana, h_field_lm, field)),
            ('local_grid',
             lambda: self._local_grid(q_info, qe, l_field_ana, doc_info, field)),
            ('qe_grid',
             lambda: self._qe_grid(q_info, qe, doc_info, field)),
            ('nlss_grid',
             lambda: self._nlss_grid(q_info, qe, l_field_ana, doc_info, field)),
            ('ltr_base',
             lambda: self._ltr_baseline(q_info, h_field_lm, field)),
            ('local_vote',
             lambda: self._local_vote(q_info, qe, l_field_ana, doc_info, field)),
            ('grid_retrieval',
             lambda: self._grid_retrieval(qe, h_field_lm, doc_info, field)),
            ('edge_grid',
             lambda: self._edge_grid(qe, doc_info, field)),
        ]
        for name, extract in l_extractor:
            if name in self.l_features:
                h_feature.update(
                    add_feature_prefix(extract(), field + '_'))
    return h_feature
def construct_per_doc(doc_info, l_target_field):
    """
    Attach one entity grid per target field to doc_info under 'e_grid'.

    Fields missing from doc_info are skipped; each present field's text and
    bag-of-entities are handed to construct_per_text.
    :param doc_info: document info dict (mutated in place)
    :param l_target_field: field names to build grids for
    :return: the same doc_info, with doc_info['e_grid'][field] filled
    """
    doc_info['e_grid'] = {}
    for field in [f for f in l_target_field if f in doc_info]:
        l_ana = form_boe_per_field(doc_info, field)
        doc_info['e_grid'][field] = construct_per_text(
            doc_info.get(field, ""), l_ana)
    return doc_info
def get_top_frequency(doc_info):
    """
    Rank the body-field entities of a document by term frequency.

    :param doc_info: document info dict holding the body field
    :return: (l_e_tf, top_e_name) where l_e_tf is [(entity id, tf)] sorted by
        tf descending, and top_e_name is the surface form of the most frequent
        entity, or "" when the body has no entity annotations
    """
    l_ana = form_boe_per_field(doc_info, body_field)
    h_e_name = dict([(ana['id'], ana['surface']) for ana in l_ana])
    h_e_tf = term2lm([ana['id'] for ana in l_ana])
    l_e_tf = sorted(h_e_tf.items(), key=lambda item: item[1], reverse=True)
    top_e_name = h_e_name[l_e_tf[0][0]] if l_e_tf else ""
    return l_e_tf, top_e_name
def form_boe(self, h_info):
    """
    Build the bag-of-entities for every target field present in h_info.

    for each field in h_info:
        l_e = [{id:, loc:, sf:}]
    :param h_info: info dict holding the text fields
    :return: h_field_boe: field -> list of entity annotations
    """
    return dict(
        (field, form_boe_per_field(h_info, field))
        for field in self.l_target_field
        if field in h_info
    )
def filter_to_title_entity(l_nlss, doc_info):
    """
    Keep only the nlss that mention at least one title entity of the document.

    :param l_nlss: list of nlss; the last element of each is its entity id list
    :param doc_info: document info dict holding the title field
    :return: the filtered nlss list
    """
    s_title_e = set([ana['id'] for ana
                     in form_boe_per_field(doc_info, title_field)])
    l_keep_nlss = [nlss for nlss in l_nlss
                   if any(e in s_title_e for e in nlss[-1])]
    logging.debug('filter to title entity related only: [%d]->[%d]',
                  len(l_nlss), len(l_keep_nlss))
    return l_keep_nlss
def check_title_e_rank(doc_info): """ get the rank of title entity in frequency :param doc_info: :return: """ l_e_tf, top_e_name = get_top_frequency(doc_info) h_e_rank = dict(zip([item[0] for item in l_e_tf], range(1, 1 + len(l_e_tf)))) l_ana = form_boe_per_field(doc_info, title_field) if not l_ana: return None title_e = l_ana[0]['id'] rank = h_e_rank.get(title_e, 0) if rank != 1: if l_e_tf: top_e, top_tf = l_e_tf[0] title_e, title_tf = l_e_tf[rank - 1] print doc_info[title_field] + '\t' + top_e_name + '\t%s\t%d\t%s\t%d' % (title_e, title_tf, top_e, top_tf) return rank
def dump_per_doc(in_name, out_name):
    """
    Dump the distinct entity ids and word vocabulary of a json-lines corpus.

    Writes out_name + '.entity' (one entity id per line, deduplicated) and
    out_name + '.vocab' (entity ids followed by deduplicated lower-cased
    words from the query and target text fields).
    :param in_name: json-lines input; each line is a dict holding QUERY_FIELD
        and TARGET_TEXT_FIELDS entries
    :param out_name: output path prefix
    :return: None
    """
    e_out = open(out_name + '.entity', 'w')
    v_out = open(out_name + '.vocab', 'w')
    l_e = []
    l_v = []
    for line in open(in_name):
        h = json.loads(line)
        for field in [QUERY_FIELD] + TARGET_TEXT_FIELDS:
            l_ana = form_boe_per_field(h, field)
            text = h.get(field, "").lower()
            l_e.extend([ana['id'] for ana in l_ana])
            l_v.extend(text.split())
    l_e = list(set(l_e))
    print >> e_out, '\n'.join(l_e)
    l_v = list(set(l_v))
    # NOTE(review): the vocab file receives entity ids AND words (l_e + l_v).
    # This looks deliberate (entity ids treated as vocabulary tokens), but
    # confirm it is not meant to be l_v alone.
    print >> v_out, '\n'.join(l_e + l_v)
    e_out.close()
    v_out.close()
    print '[%s] vocab to [%s.entity + .vocab]' % (in_name, out_name)
    return
def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
    """
    Count, per target field, how the document's entities relate to qe:
        # of e sharing an nlss with qe
        # of e connected to qe in Freebase
        # of e with emb cosine > 0.2 to qe
    plus the raw number of non-qe entities in the field.
    :param q_info: query info dict with 'qid' (used for the intermediate dump)
    :param ana: the current query ana e (provides 'id')
    :param doc_info: document info dict with 'docno' and the target fields
    :param l_qe_nlss: nlss of the query entity; nlss[1] is its entity id list
    :return: h_feature: feature name -> count
    """
    qe = ana['id']
    h_feature = {}
    for field in self.l_target_fields:
        logging.info('start extracting [%s] in [%s]', qe, field)
        l_e = form_boe_per_field(doc_info, field)
        # drop the query entity itself; keep only neighbour entity ids
        l_e = [e['id'] for e in l_e if e['id'] != qe]
        nlss_cnt = self._count_co_nlss(qe, l_e, l_qe_nlss)
        emb_sim_cnt = self._count_meaningful_emb_sim(qe, l_e)
        kg_edge_cnt = self._count_kg_edge(qe, l_e)
        # NOTE(review): keys concatenate field and name without a '_'
        # separator (e.g. 'bodynlss_cnt'), unlike the extractors that use
        # add_feature_prefix(..., field + '_') -- confirm downstream readers
        # expect these exact names before changing them
        h_feature[field + 'nlss_cnt'] = nlss_cnt
        h_feature[field + 'emb_sim_cnt'] = emb_sim_cnt
        h_feature[field + 'kg_edge_cnt'] = kg_edge_cnt
        h_feature[field + 'nb_e'] = len(l_e)
    if self.intermediate_data_out_name:
        # NOTE(review): guarded by intermediate_data_out_name but the write
        # goes to self.intermediate_out (the attribute other extractors both
        # test and use) -- verify both are set together or this can raise
        h_mid = {'qid': q_info['qid'],
                 'docno': doc_info['docno'],
                 'id': ana['id']}
        h_mid['nb_nlss'] = len(l_qe_nlss)
        # all entities mentioned across qe's nlss, deduplicated
        l_nlss_e = sum([nlss[1] for nlss in l_qe_nlss], [])
        s_nlss_e = set(l_nlss_e)
        h_mid['nb_nlss_e'] = len(s_nlss_e)
        h_mid.update(h_feature)
        print >> self.intermediate_out, json.dumps(h_mid)
    return h_feature