def _data_io(self, line):
    """
    convert data to the input for the model
    :param line: the json formatted data
    :return: v_e, v_w, v_label
        v_e: entities in the doc
        v_w: initial weight (L1-normalized TF)
        v_label: 1 if the entity is salient, else 0; 0 when no label is given
    """
    h = json.loads(line)
    l_e = h[self.spot_field].get(self.in_field, [])
    s_salient_e = set(h[self.spot_field].get(self.salience_field, []))

    # keep the top TF entities and L1-normalize their frequencies
    h_e_tf = term2lm(l_e)
    l_e_tf = sorted(h_e_tf.items(),
                    key=lambda item: -item[1])[:self.max_e_per_doc]
    l_e = [item[0] for item in l_e_tf]
    z = float(sum([item[1] for item in l_e_tf]))
    l_w = [item[1] / z for item in l_e_tf]
    l_label = [1 if e in s_salient_e else 0 for e in l_e]

    v_e = Variable(torch.LongTensor(l_e))
    v_w = Variable(torch.FloatTensor(l_w))
    # labels are class indices, so LongTensor on both devices
    # (the CPU branch previously built a FloatTensor by mistake)
    v_label = Variable(torch.LongTensor(l_label))
    if use_cuda:
        v_e, v_w, v_label = v_e.cuda(), v_w.cuda(), v_label.cuda()
    return v_e, v_w, v_label
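# A hypothetical input line for _data_io, assuming spot_field='spot',
# in_field='bodyText', and salience_field='abstract' (the field names are
# illustrative; entities are hashed integer ids):
line = '{"spot": {"bodyText": [3, 7, 3, 12], "abstract": [3]}}'
# -> v_e = [3, 7, 12] (TF-sorted), v_w = [0.5, 0.25, 0.25],
#    v_label = [1, 0, 0]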
def get_top_k_e(l_e, max_e_per_d):
    h_e_tf = term2lm(l_e)
    l_e_tf = sorted(h_e_tf.items(), key=lambda item: -item[1])[:max_e_per_d]
    l_e = [item[0] for item in l_e_tf]
    z = float(sum([item[1] for item in l_e_tf]))
    l_w = [item[1] / z for item in l_e_tf]
    return l_e, l_w
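# term2lm is used throughout but defined elsewhere; a minimal sketch that is
# consistent with how it is called here (term list -> term-frequency dict):
from collections import Counter

def term2lm(l_term):
    # unigram "language model" as raw counts: {term: frequency}
    return dict(Counter(l_term))

# usage sketch for get_top_k_e, with the assumed term2lm above:
l_e, l_w = get_top_k_e(['e1', 'e1', 'e1', 'e2'], 2)
# l_e == ['e1', 'e2'], l_w == [0.75, 0.25]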
def get_top_k_e(cls, l_term, max_number):
    h_e_tf = term2lm(l_term)
    l_e_tf = sorted(h_e_tf.items(), key=lambda item: -item[1])[:max_number]
    l_term = [item[0] for item in l_e_tf]
    z = float(sum([item[1] for item in l_e_tf]))
    l_w = [item[1] / z for item in l_e_tf]
    return l_term, l_w
def _filter_doc_e(self, l_doc_e):
    h_doc_e_tf = term2lm(l_doc_e)
    l_doc_e_tf = sorted(h_doc_e_tf.items(),
                        key=lambda item: -item[1])[:self.max_e_per_d]
    l_doc_e = [item[0] for item in l_doc_e_tf]
    z = float(sum([item[1] for item in l_doc_e_tf]))
    v_doc_e_w = np.array([item[1] / z for item in l_doc_e_tf])
    return l_doc_e, v_doc_e_w
def _get_salience_e_tf(self, h_info):
    h_salience_e = {}
    if not self.salience_field:
        return h_salience_e
    l_ana = h_info['spot'].get(self.salience_field, [])
    l_e_id = self._get_e_id_from_ana(l_ana)
    # hash raw entity ids; id 0 marks out-of-vocabulary entities
    l_hashed_e_id = [self.h_entity_id.get(e_id, 0) for e_id in l_e_id]
    h_salience_e = term2lm([eid for eid in l_hashed_e_id if eid != 0])
    return h_salience_e
def _form_doc_e_lm(self, h_doc_info):
    l_h_doc_e_lm = []
    for field in self.l_text_fields:
        l_e = []
        if field in h_doc_info[self.tagger]:
            l_e = [ana[0] for ana in h_doc_info[self.tagger][field]]
        h_lm = term2lm(l_e)
        l_h_doc_e_lm.append(h_lm)
    return l_h_doc_e_lm
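# Illustrative input for _form_doc_e_lm; the ana[0] access implies the tagger
# annotations are (entity, ...) tuples, e.g.:
# h_doc_info[self.tagger]['bodyText'] == [('e1', 0.9), ('e1', 0.7), ('e3', 0.5)]
# -> the field lm is term2lm(['e1', 'e1', 'e3']) == {'e1': 2, 'e3': 1}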
def get_top_frequency(doc_info):
    l_ana = form_boe_per_field(doc_info, body_field)
    l_e = [ana['id'] for ana in l_ana]
    l_name = [ana['surface'] for ana in l_ana]
    h_e_name = dict(zip(l_e, l_name))
    h_e_tf = term2lm(l_e)
    # sorted() instead of list.sort(): dict.items() is a view in Python 3
    l_e_tf = sorted(h_e_tf.items(), key=lambda item: item[1], reverse=True)
    top_e_name = ""
    if l_e_tf:
        top_e_name = h_e_name[l_e_tf[0][0]]
    return l_e_tf, top_e_name
def _form_prf_field_lm(self, qid):
    l_rank_info = self.h_q_rank_info.get(qid, [])
    h_field_l_doc_lm = {}
    for field in TARGET_TEXT_FIELDS:
        l_doc_lm = []
        for doc, score, h_info in l_rank_info[:self.prf_d]:
            l_ana = h_info.get(self.tagger, {}).get(field, [])
            l_e = [ana[0] for ana in l_ana]
            lm = term2lm(l_e)
            l_doc_lm.append(lm)
        h_field_l_doc_lm[field] = l_doc_lm
    return h_field_l_doc_lm
def load_facc1_dict(in_name):
    h_doc_ana = {}
    with open(in_name) as in_f:
        for line in in_f:
            docno, oid = line.strip().split('\t')
            h_doc_ana.setdefault(docno, []).append(oid)
    h_doc_olm = {}
    for docno, l_e in h_doc_ana.items():
        h_doc_olm[docno] = term2lm(l_e)
    logging.info('[%d] doc facc1 dict loaded', len(h_doc_olm))
    return h_doc_olm
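# Usage sketch for load_facc1_dict, assuming a two-column TSV of
# docno \t entity-id per line, as the parsing implies ('facc1.tsv' is a
# hypothetical path):
#   clueweb-doc-1   /m/02mjmr
#   clueweb-doc-1   /m/02mjmr
#   clueweb-doc-1   /m/09c7w0
h_doc_olm = load_facc1_dict('facc1.tsv')
# h_doc_olm['clueweb-doc-1'] == {'/m/02mjmr': 2, '/m/09c7w0': 1}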
def _edge_retrieval(self, qe, l_field_ana, h_field_lm, field):
    """
    for each edge (entity with NLSS) in this doc field:
        get each edge sentence's lm
        calc retrieval scores
        sum the retrieval scores into the final features
    :param qe: the query entity
    :param l_field_ana: annotations in this doc field
    :param h_field_lm: the field's lm
    :param field: the doc field name
    :return: h_feature
    """
    z = max(float(len(l_field_ana)), 1.0)
    h_feature = {}
    p = self.h_qe_idx[qe]
    h_e_nlss_idx = self.l_h_e_nlss_idx[p]
    l_this_nlss_lm = self.ll_this_nlss_lm[p]
    l_e = [ana['id'] for ana in l_field_ana if ana['id'] in h_e_nlss_idx]
    l_h_retrieval_scores = []
    l_h_avg_retrieval_scores = []
    h_e_tf = term2lm(l_e)
    avg_sent_per_e = 0
    for e, tf in h_e_tf.items():
        l_sent_lm = [l_this_nlss_lm[pos] for pos in h_e_nlss_idx[e]]
        avg_sent_per_e += len(l_sent_lm)
        l_this_e_h_scores = []
        for sent_lm in l_sent_lm:
            l_scores = self._extract_retrieval_scores(
                sent_lm, h_field_lm, field)
            # TF-weight each score and drop the two-way lm features
            l_scores = [(name, v * tf / z) for name, v in l_scores
                        if 'lm_twoway' not in name]
            h_retrieval_score = dict(l_scores)
            l_h_retrieval_scores.append(h_retrieval_score)
            l_this_e_h_scores.append(h_retrieval_score)
        h_this_e_avg_score = mean_pool_feature(l_this_e_h_scores)
        l_h_avg_retrieval_scores.append(h_this_e_avg_score)
    avg_sent_per_e /= float(max(len(h_e_tf), 1.0))
    avg_sent_per_e = max(avg_sent_per_e, 1.0)

    h_sum_retrieval_score = sum_pool_feature(l_h_retrieval_scores)
    h_sum_retrieval_score = dict(
        [(k, v / avg_sent_per_e) for k, v in h_sum_retrieval_score.items()])
    h_feature.update(h_sum_retrieval_score)
    h_feature.update(sum_pool_feature(l_h_avg_retrieval_scores))

    # clip very negative scores so feature values do not get too small
    h_feature = dict([(k, max(v, -100)) for k, v in h_feature.items()])
    return h_feature
def _hash_spots(self, h_info, h_hashed):
    h_hashed['spot'] = dict()
    h_salience_e = self._get_salience_e_tf(h_info)
    for field, l_ana in h_info['spot'].items():
        l_e_id = self._get_e_id_from_ana(l_ana)
        l_hashed_e_id = [self.h_entity_id.get(e_id, 0) for e_id in l_e_id]
        if not l_hashed_e_id:
            this_field_data = {
                "entities": [],
                "features": [],
                salience_gold: []
            }
            if self.with_position:
                this_field_data['loc'] = []
            h_hashed['spot'][field] = this_field_data
            continue
        # keep the top TF entities; id 0 (out of vocabulary) is dropped
        l_hashed_id_tf = sorted(
            term2lm([eid for eid in l_hashed_e_id if eid != 0]).items(),
            key=lambda item: -item[1])[:self.max_e_per_d]
        l_kept_hashed_e_id = [item[0] for item in l_hashed_id_tf]
        ll_feature = [[tf] for eid, tf in l_hashed_id_tf]
        if self.with_feature:
            ll_feature = self._add_node_features(
                l_ana, l_kept_hashed_e_id, ll_feature)
        l_salience = self._get_given_salience(l_ana, l_kept_hashed_e_id)
        l_field_salience = self._get_field_salience(
            l_kept_hashed_e_id, h_salience_e)
        l_salience = [
            max(item) for item in zip(l_salience, l_field_salience)
        ]
        this_field_data = {
            "entities": l_kept_hashed_e_id,
            "features": ll_feature,
            salience_gold: l_salience
        }
        if self.with_position:
            ll_position = self._add_entity_loc(l_ana, l_kept_hashed_e_id)
            this_field_data['loc'] = ll_position
        h_hashed['spot'][field] = this_field_data
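# Shape of the per-field record _hash_spots stores (values illustrative;
# salience_gold is a key-name constant defined elsewhere in the module):
# h_hashed['spot']['bodyText'] == {
#     'entities': [42, 7],    # hashed ids, TF-sorted, top max_e_per_d kept
#     'features': [[3], [1]],  # leading feature per entity is its raw TF
#     salience_gold: [1, 0],   # max of annotated and field-level salience
# }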
def _rm3_per_q(self, l_doc_score):
    """
    perform rm3 on one q's ranking
    :param l_doc_score: docno, ranking score
    :return: l_rm3_e
    """
    l_doc_score = l_doc_score[:self.top_k_doc]
    # softmax-normalize the top k ranking scores into doc weights
    z = float(sum([math.exp(score) for _, score in l_doc_score]))
    l_doc_score = [(item[0], math.exp(item[1]) / z) for item in l_doc_score]
    l_h_doc_tf = []
    for doc, _ in l_doc_score:
        doc_info = self.h_doc_info.get(doc, {})
        if not doc_info:
            l_h_doc_tf.append({})
            continue
        l_e = [item[0] for item in doc_info['tagme']['bodyText']]
        h_e_tf = term2lm(l_e)
        l_h_doc_tf.append(h_e_tf)
    l_rm3_e = rm3(l_doc_score, l_h_doc_tf, None, None, None, False)
    return l_rm3_e
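# The top_k_doc ranking scores are softmax-normalized into doc weights
# before rm3; a worked example of that step in isolation:
import math
l_doc_score = [('d1', 1.0), ('d2', 0.0)]
z = float(sum(math.exp(s) for _, s in l_doc_score))
print([(d, math.exp(s) / z) for d, s in l_doc_score])
# ~[('d1', 0.731), ('d2', 0.269)]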
def process_one_doc(h_doc):
    h_spot = h_doc.get('spot', {})
    l_abs_e = h_spot.get('abstract', [])
    l_body_e = h_spot.get('bodyText', [])
    l_title_e = h_spot.get('title', [])
    docno = h_doc['docno']

    s_a_e = set(l_abs_e)
    s_b_e = set(l_body_e)
    s_t_e = set(l_title_e)
    nb_abs_e = len(s_a_e)
    nb_abs_e_in_body = len([e for e in s_a_e if e in s_b_e])
    nb_abs_e_in_title = len([e for e in s_a_e if e in s_t_e])

    first_e_salient = 0
    freq_e_salient = 0
    if l_body_e:
        # a body entity is salient here if it also appears in the abstract
        first_e_salient = int(l_body_e[0] in s_a_e)
        h_b_e = term2lm(l_body_e)
        l_b_e_tf = sorted(h_b_e.items(), key=lambda item: -item[1])
        freq_e_salient = int(l_b_e_tf[0][0] in s_a_e)
    return (docno, nb_abs_e, nb_abs_e_in_title, nb_abs_e_in_body,
            freq_e_salient, first_e_salient)
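# Usage sketch for process_one_doc on a toy doc in the expected 'spot'
# layout (entity ids are illustrative):
h_doc = {
    'docno': 'd001',
    'spot': {
        'abstract': ['e1', 'e2'],
        'bodyText': ['e1', 'e1', 'e3'],
        'title': ['e1'],
    },
}
print(process_one_doc(h_doc))
# ('d001', 2, 1, 1, 1, 1): two abstract entities; e1 also appears in the
# title and body; the first and the most frequent body entity (e1) both
# appear in the abstract, so both salience flags are 1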
def _calc_grid_scores(self, l_grid, doc_lm):
    """
    sent -> e scores, including:
        frequency, uw_emb, desp_emb, desp_bow, ESA, and desp retrieval
        (gloss_emb and gloss_bow are currently disabled)
    :param l_grid: the grids; each grid's spotted entities look like
        [{'id': e id, ...}, ...]
    :param doc_lm: the doc's lm
    :return: l_grid, each grid with 'e_score' = [{score name: score}, ...]
    """
    logging.info('start calculating grid scores')
    for grid in l_grid:
        # default to [] so grids without spotted entities do not crash
        l_e = [ana['id'] for ana in grid.get(SPOT_FIELD, [])]
        h_e_tf = term2lm(l_e)
        grid_sent = grid['sent']
        grid_lm = text2lm(grid_sent)
        grid_emb = avg_embedding(self.resource.embedding, grid_sent)
        l_e_score = []
        for e, tf in h_e_tf.items():
            h_e_score = {'id': e, 'freq': tf}
            h_e_score['uw_emb'] = self._e_grid_emb(e, grid_emb)
            # h_e_score['gloss_emb'] = self._e_gloss_emb(e, grid_emb)
            # h_e_score['gloss_bow'] = self._e_gloss_bow(e, grid_lm)
            h_e_score['desp_emb'] = self._e_desp_emb(e, grid_emb)
            h_e_score['desp_bow'] = self._e_desp_bow(e, grid_lm)
            h_e_score['ESA'] = self._e_desp_bow(e, doc_lm)
            l_score = self._e_desp_retrieval(e, grid_lm)
            h_e_score.update(add_feature_prefix(dict(l_score), 'desp_'))
            l_e_score.append(h_e_score)
        grid['e_score'] = l_e_score
    return l_grid
def extract(self, qid, docno, h_q_info, h_doc_info):
    h_feature = {}
    emb_model = self.embedding
    emb_name = ""
    l_q_e = [
        ana['entities'][0]['id'] for ana in h_q_info[self.tagger]['query']
        if ana['entities'][0]['id'] in emb_model
    ]
    for field, l_ana in h_doc_info[self.tagger].items():
        if field not in self.l_target_fields:
            continue
        l_doc_e = [
            ana['entities'][0]['id'] for ana in l_ana
            if ana['entities'][0]['id'] in emb_model
        ]
        l_doc_e_weight = []
        if self.use_entity_weight:
            l_doc_e_weight = [
                ana['entities'][0]['score'] for ana in l_ana
                if ana['entities'][0]['id'] in emb_model
            ]
        elif self.use_entity_salience:
            l_doc_e_weight = [
                ana['entities'][0].get('salience', 1) for ana in l_ana
                if ana['entities'][0]['id'] in emb_model
            ]
            if self.salience_activation:
                assert self.salience_activation in self.act_func
                l_doc_e_weight = [
                    self.act_func[self.salience_activation](max(w, 1e-6))
                    for w in l_doc_e_weight
                ]
            if self.non_tf_salience:
                # divide out TF so each entity's salience is counted once
                h_e_tf = term2lm(l_doc_e)
                for p in range(len(l_doc_e)):
                    l_doc_e_weight[p] /= float(h_e_tf[l_doc_e[p]])
        l_sim_mtx = []
        m_sim_mtx = self.h_distance_func[self.distance](
            l_q_e, l_doc_e, emb_model)
        l_sim_mtx.append(m_sim_mtx)
        l_total_bin_score = []
        for d in range(len(l_sim_mtx)):
            l_this_bin_score = []
            m_sim_mtx = l_sim_mtx[d]
            for pool_name in self.pool_func:
                assert pool_name in self.h_pool_func
                l_this_bin_score.extend(
                    self.h_pool_func[pool_name](m_sim_mtx, []))
                if l_doc_e_weight:
                    l_this_bin_score.extend([
                        (item[0] + '_weight', item[1])
                        for item in self.h_pool_func[pool_name](
                            m_sim_mtx, l_doc_e_weight)
                    ])
            if len(l_sim_mtx) > 1:
                l_this_bin_score = [('D%03d' % d + item[0], item[1])
                                    for item in l_this_bin_score]
            l_total_bin_score.extend(l_this_bin_score)
        for bin_name, score in l_total_bin_score:
            feature_name = '_'.join([
                self.feature_name_pre, emb_name, field.title(),
                bin_name.title()
            ])
            h_feature[feature_name] = score
    return h_feature