Пример #1
0
 def _boe_nlss_filter(self, q_info, q_ana, l_nlss, doc_info):
     """
     Keep the nlss entries that mention an entity annotated in the document.

     Document entities are gathered from every field in
     self.l_target_fields via form_boe_per_field; the query entity itself
     (q_ana['id']) never counts as a match. When self.intermediate_out is
     set, one JSON line holding the kept entries and the entity each one
     matched is written to it.

     :param q_info: query info; only 'qid' is read, for the dump
     :param q_ana: query entity annotation; 'id' is the entity to skip
     :param l_nlss: candidate entries; item[1] is iterated as the entity
         ids of that entry
     :param doc_info: document info; 'docno' is read for the dump
     :return: the entries of l_nlss that matched, in their original order
     """
     e_id = q_ana['id']
     logging.info('filter [%d] nlss via boe', len(l_nlss))

     # annotations of all target fields, flattened into one list
     l_ana = [
         ana
         for field in self.l_target_fields
         for ana in form_boe_per_field(doc_info, field)
     ]

     # every entity keeps its surface form, but the query entity is left
     # out of the set used for matching
     s_e = set()
     h_e_sf = {}
     for ana in l_ana:
         h_e_sf[ana['id']] = ana['surface']
         if ana['id'] != e_id:
             s_e.add(ana['id'])

     l_keep_nlss = []
     l_keep_nlss_e = []
     for nlss in l_nlss:
         for e in nlss[1]:
             if e in s_e:
                 # only the first matching entity of an entry is recorded
                 l_keep_nlss.append(nlss)
                 l_keep_nlss_e.append({'matched_e': [e, h_e_sf[e]]})
                 break

     if self.intermediate_out:
         h = {
             'qid': q_info['qid'],
             'ana': q_ana,
             'docno': doc_info['docno'],
             'boe_nlss': zip(l_keep_nlss_e, l_keep_nlss),
         }
         print >> self.intermediate_out, json.dumps(h)

     logging.info('[%s] boe filtered [%d]->[%d]', e_id, len(l_nlss),
                  len(l_keep_nlss))
     return l_keep_nlss
Пример #2
0
 def _construct_e_nlss_cash_info(self, q_info, h_nlss):
     """
     Build the per-query nlss cache that is stored on self.

     For every distinct query entity the following lists get one item
     each, aligned with the position recorded in self.h_qe_idx:
         self.ll_this_nlss: the nlss list of that entity taken from h_nlss
         self.ll_this_nlss_lm: text2lm(sent, clean=True) of each nlss
         self.l_h_e_nlss_idx: tail e -> [pos in l_this_nlss], i.e. for
             every other entity the positions of the nlss mentioning it

     :param q_info: query info; 'qid' is logged and the QUERY_FIELD
         annotations provide the query entities
     :param h_nlss: query entity id -> list of (sentence, entity ids)
         pairs; an entity without an entry gets an empty list
     :return: None, the results are stored on self
     """
     logging.info('constructing nlss cash for q [%s]', q_info['qid'])
     l_q_ana = form_boe_per_field(q_info, QUERY_FIELD)
     l_qe = list(set([ana['id'] for ana in l_q_ana]))
     self.h_qe_idx = dict(zip(l_qe, range(len(l_qe))))
     self.ll_this_nlss = []
     self.ll_this_nlss_lm = []
     self.l_h_e_nlss_idx = []
     for qe in l_qe:
         logging.info('forming nlss cash for qe [%s]', qe)
         l_this_nlss = h_nlss.get(qe, [])
         l_this_nlss_lm = [text2lm(sent, clean=True) for sent, __ in l_this_nlss]
         h_e = dict()
         for p, nlss in enumerate(l_this_nlss):
             for e in nlss[1]:
                 # Skip the query entity itself. This must be an equality
                 # test: `e in qe` on id strings is a substring test and
                 # would also drop every entity whose id happens to be
                 # contained in the query entity id.
                 if e == qe:
                     continue
                 h_e.setdefault(e, []).append(p)
         logging.info('qe [%s] [%d] nlss, [%d] tail e', qe, len(l_this_nlss), len(h_e))
         self.ll_this_nlss.append(l_this_nlss)
         self.ll_this_nlss_lm.append(l_this_nlss_lm)
         self.l_h_e_nlss_idx.append(h_e)
     logging.info('q [%s] nlss cash constructed', q_info['qid'])
Пример #3
0
    def extract_per_entity(self, q_info, ana, doc_info):
        """
        Extract the enabled features of one query entity against a document.

        The nlss cache is rebuilt through _construct_e_nlss_cash_info
        whenever the query id differs from self.current_qid. For each field
        in self.l_target_fields, every feature group named in
        self.l_features is computed and merged into the result with its
        keys run through add_feature_prefix(..., field + '_').

        :param q_info: query info; 'qid' is read
        :param ana: the query entity annotation; 'id' is read
        :param doc_info: document info; 'docno' and the target fields are
            read
        :return: dict of the merged features
        """
        h_feature = dict()
        qe = ana['id']
        qid = q_info['qid']
        logging.info('start extracting [%s]-[%s]-[%s]',
                     qid, qe, doc_info['docno'])

        if qid != self.current_qid:
            self.current_qid = qid
            self._construct_e_nlss_cash_info(q_info, self.resource.l_h_nlss[0])

        for field in self.l_target_fields:
            l_field_ana = form_boe_per_field(doc_info, field)
            h_field_lm = text2lm(doc_info.get(field, ""), clean=True)
            prefix = field + '_'

            # (feature group, lazy extractor) pairs. The order matters
            # because a later group overwrites equal keys of an earlier
            # one; an extractor only runs when its group is enabled.
            l_extractor = [
                ('emb_vote',
                 lambda: self._connected_emb_vote(qe, l_field_ana)),
                ('edge_cnt',
                 lambda: self._edge_cnt(qe, l_field_ana)),
                ('edge_retrieval',
                 lambda: self._edge_retrieval(
                     qe, l_field_ana, h_field_lm, field)),
                ('local_grid',
                 lambda: self._local_grid(
                     q_info, qe, l_field_ana, doc_info, field)),
                ('qe_grid',
                 lambda: self._qe_grid(q_info, qe, doc_info, field)),
                ('nlss_grid',
                 lambda: self._nlss_grid(
                     q_info, qe, l_field_ana, doc_info, field)),
                ('ltr_base',
                 lambda: self._ltr_baseline(q_info, h_field_lm, field)),
                ('local_vote',
                 lambda: self._local_vote(
                     q_info, qe, l_field_ana, doc_info, field)),
                ('grid_retrieval',
                 lambda: self._grid_retrieval(
                     qe, h_field_lm, doc_info, field)),
                ('edge_grid',
                 lambda: self._edge_grid(qe, doc_info, field)),
            ]
            for name, extract in l_extractor:
                if name in self.l_features:
                    h_feature.update(add_feature_prefix(extract(), prefix))

        return h_feature
Пример #4
0
def construct_per_doc(doc_info, l_target_field):
    """
    Attach an entity grid per target field to doc_info.

    doc_info['e_grid'] is reset to an empty dict and then filled with
    field -> construct_per_text(field text, field annotations) for every
    target field that is present in doc_info.

    :param doc_info: document info, modified in place
    :param l_target_field: names of the fields to process
    :return: the same doc_info object
    """
    h_e_grid = {}
    # assigned before the loop so the key already exists while the
    # helpers below receive doc_info
    doc_info['e_grid'] = h_e_grid
    for field in l_target_field:
        if field in doc_info:
            l_field_ana = form_boe_per_field(doc_info, field)
            field_text = doc_info.get(field, "")
            h_e_grid[field] = construct_per_text(field_text, l_field_ana)
    return doc_info
Пример #5
0
def get_top_frequency(doc_info):
    """
    Rank the entities annotated in the body field.

    The entity ids of the body_field annotations are passed to term2lm and
    the items of the mapping it returns are ordered by value, largest
    first (presumably a term frequency; term2lm is defined elsewhere).

    :param doc_info: document info
    :return: (ordered list of (entity id, value) items, surface form of the
        first item or "" when there is no entity)
    """
    l_ana = form_boe_per_field(doc_info, body_field)
    # with repeated ids the surface form of the last annotation wins
    h_e_name = dict((ana['id'], ana['surface']) for ana in l_ana)
    h_e_tf = term2lm([ana['id'] for ana in l_ana])
    l_e_tf = sorted(h_e_tf.items(), key=lambda item: item[1], reverse=True)
    if not l_e_tf:
        return l_e_tf, ""
    return l_e_tf, h_e_name[l_e_tf[0][0]]
Пример #6
0
 def form_boe(self, h_info):
     """
     Collect the entity annotations of every target field found in h_info.

     :param h_info: info dict of a query or a document
     :return: h_field_boe, field -> form_boe_per_field(h_info, field) for
         each field of self.l_target_field that is a key of h_info
     """
     return dict(
         (field, form_boe_per_field(h_info, field))
         for field in self.l_target_field
         if field in h_info
     )
def filter_to_title_entity(l_nlss, doc_info):
    """
    Keep the nlss entries that mention at least one title entity.

    :param l_nlss: candidate entries; the last element of each entry is
        iterated as its entity ids
    :param doc_info: document info; its title_field annotations supply the
        title entities
    :return: the matching entries, in their original order
    """
    s_title_e = set(
        ana['id'] for ana in form_boe_per_field(doc_info, title_field)
    )
    l_keep_nlss = [
        nlss for nlss in l_nlss
        if any(e in s_title_e for e in nlss[-1])
    ]
    logging.debug('filter to title entity related only: [%d]->[%d]',
                  len(l_nlss), len(l_keep_nlss))
    return l_keep_nlss
Пример #8
0
def check_title_e_rank(doc_info):
    """
    Get the rank of the title entity in the body entity frequency ranking.

    Only the first annotation of the title field is considered. When that
    entity is not ranked first, and the ranking is not empty, one
    tab-separated line is printed: title text, surface form of the top
    entity, title entity id, its value, top entity id, its value.

    :param doc_info: document info
    :return: 1-based position of the title entity in the ranking returned
        by get_top_frequency, 0 when it does not appear there, None when
        the title field has no annotation
    """
    l_e_tf, top_e_name = get_top_frequency(doc_info)
    l_ana = form_boe_per_field(doc_info, title_field)
    if not l_ana:
        return None

    title_e = l_ana[0]['id']
    h_e_rank = dict((item[0], pos + 1) for pos, item in enumerate(l_e_tf))
    rank = h_e_rank.get(title_e, 0)

    if rank != 1 and l_e_tf:
        top_e, top_tf = l_e_tf[0]
        # rank 0 means the title entity is missing from the ranking.
        # Indexing with rank - 1 would then wrap around to the last item
        # and report an unrelated entity, so a value of 0 is reported for
        # the real title entity instead.
        title_tf = l_e_tf[rank - 1][1] if rank else 0
        print(doc_info[title_field] + '\t' + top_e_name
              + '\t%s\t%d\t%s\t%d' % (title_e, title_tf, top_e, top_tf))
    return rank
Пример #9
0
def dump_per_doc(in_name, out_name):
    """
    Dump the entity and word vocabularies of a json-lines file.

    Every line of in_name is parsed as a JSON object. For QUERY_FIELD and
    each of TARGET_TEXT_FIELDS, the annotated entity ids (via
    form_boe_per_field) and the lower-cased whitespace tokens of the field
    text are collected.

    Two files are written, one item per line, in no particular order:
        out_name + '.entity': the distinct entity ids
        out_name + '.vocab': the distinct entity ids followed by the
            distinct tokens

    The output files are only opened after the input has been read
    completely, so a failure while reading leaves them untouched.

    :param in_name: path of the json-lines input
    :param out_name: path prefix of the two output files
    :return: None
    """
    l_field = [QUERY_FIELD] + TARGET_TEXT_FIELDS
    s_e = set()
    s_v = set()
    # `with` closes each handle even when parsing or writing fails
    with open(in_name) as f_in:
        for line in f_in:
            h = json.loads(line)
            for field in l_field:
                s_e.update(ana['id'] for ana in form_boe_per_field(h, field))
                s_v.update(h.get(field, "").lower().split())

    l_e = list(s_e)
    l_v = list(s_v)
    with open(out_name + '.entity', 'w') as e_out:
        e_out.write('\n'.join(l_e) + '\n')
    with open(out_name + '.vocab', 'w') as v_out:
        v_out.write('\n'.join(l_e + l_v) + '\n')
    print('[%s] vocab to [%s.entity + .vocab]' % (in_name, out_name))
Пример #10
0
    def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
        """
        Count, per target field, how the field entities relate to a query
        entity.

        For each field the annotated entity ids other than the query entity
        are collected and four features are stored under field + name:
            nlss_cnt: result of self._count_co_nlss
            emb_sim_cnt: result of self._count_meaningful_emb_sim
            kg_edge_cnt: result of self._count_kg_edge
            nb_e: number of collected entity ids
        The counting rules live in those helpers, which are defined
        elsewhere.

        When self.intermediate_data_out_name is set, one JSON line with the
        ids, the nlss statistics and the features is written to
        self.intermediate_out.

        :param q_info: query info; 'qid' is read for the dump
        :param ana: the current query ana e
        :param doc_info: document info; 'docno' is read for the dump
        :param l_qe_nlss: nlss entries of the query entity; item[1] holds
            the entity ids of an entry
        :return: dict of the features
        """
        qe = ana['id']
        h_feature = {}
        for field in self.l_target_fields:
            logging.info('start extracting [%s] in [%s]', qe, field)
            l_e = [
                e_ana['id']
                for e_ana in form_boe_per_field(doc_info, field)
                if e_ana['id'] != qe
            ]

            nlss_cnt = self._count_co_nlss(qe, l_e, l_qe_nlss)
            emb_sim_cnt = self._count_meaningful_emb_sim(qe, l_e)
            kg_edge_cnt = self._count_kg_edge(qe, l_e)

            for name, value in (('nlss_cnt', nlss_cnt),
                                ('emb_sim_cnt', emb_sim_cnt),
                                ('kg_edge_cnt', kg_edge_cnt),
                                ('nb_e', len(l_e))):
                h_feature[field + name] = value

        if not self.intermediate_data_out_name:
            return h_feature

        h_mid = {'qid': q_info['qid'], 'docno': doc_info['docno'], 'id': ana['id']}
        h_mid['nb_nlss'] = len(l_qe_nlss)
        # distinct entities mentioned over all nlss of the query entity
        s_nlss_e = set()
        for nlss in l_qe_nlss:
            s_nlss_e.update(nlss[1])
        h_mid['nb_nlss_e'] = len(s_nlss_e)
        h_mid.update(h_feature)
        print >> self.intermediate_out, json.dumps(h_mid)
        return h_feature