Example #1
0
    def _data_io(self, line):
        """
        Convert one json-formatted line into model input tensors.
        :param line: the json formatted data
        :return: v_e, v_w, v_label
            v_e: entities in the doc (LongTensor)
            v_w: initial weight, TF normalized to sum to 1 (FloatTensor)
            v_label: 1 if the entity is salient else 0 (LongTensor);
                all 0 when no salience labels are given
        """
        h = json.loads(line)
        l_e = h[self.spot_field].get(self.in_field, [])
        s_salient_e = set(h[self.spot_field].get(self.salience_field, []))
        h_e_tf = term2lm(l_e)
        # keep only the self.max_e_per_doc most frequent entities
        l_e_tf = sorted(h_e_tf.items(),
                        key=lambda item: -item[1])[:self.max_e_per_doc]
        l_e = [item[0] for item in l_e_tf]
        z = float(sum([item[1] for item in l_e_tf]))
        l_w = [item[1] / z for item in l_e_tf]
        l_label = [1 if e in s_salient_e else 0 for e in l_e]

        v_e = Variable(torch.LongTensor(l_e))
        v_w = Variable(torch.FloatTensor(l_w))
        # FIX: the original built v_label as a LongTensor on the CUDA path but
        # a FloatTensor on the CPU path; use LongTensor consistently so the
        # label dtype does not depend on the device.
        v_label = Variable(torch.LongTensor(l_label))
        if use_cuda:
            v_e, v_w, v_label = v_e.cuda(), v_w.cuda(), v_label.cuda()

        return v_e, v_w, v_label
Example #2
0
def get_top_k_e(l_e, max_e_per_d):
    """Keep the max_e_per_d most frequent entities with normalized TF weights."""
    h_tf = term2lm(l_e)
    top = sorted(h_tf.items(), key=lambda kv: -kv[1])[:max_e_per_d]
    l_top_e = [e for e, _ in top]
    total = float(sum(tf for _, tf in top))
    l_w = [tf / total for _, tf in top]
    return l_top_e, l_w
Example #3
0
 def get_top_k_e(cls, l_term, max_number):
     """Return the max_number most frequent terms and their normalized TF weights."""
     h_tf = term2lm(l_term)
     l_top = sorted(h_tf.items(), key=lambda kv: kv[1], reverse=True)[:max_number]
     total = float(sum(tf for _, tf in l_top))
     l_w = [tf / total for _, tf in l_top]
     return [t for t, _ in l_top], l_w
Example #4
0
 def _filter_doc_e(self, l_doc_e):
     """Keep the self.max_e_per_d most frequent doc entities; weights sum to 1."""
     tf_items = sorted(term2lm(l_doc_e).items(),
                       key=lambda kv: kv[1], reverse=True)
     tf_items = tf_items[:self.max_e_per_d]
     l_kept_e = [e for e, _ in tf_items]
     total = float(sum(tf for _, tf in tf_items))
     v_doc_e_w = np.array([tf / total for _, tf in tf_items])
     return l_kept_e, v_doc_e_w
Example #5
0
 def _get_salience_e_tf(self, h_info):
     h_salience_e = {}
     if not self.salience_field:
         return h_salience_e
     l_ana = h_info['spot'].get(self.salience_field, [])
     l_e_id = self._get_e_id_from_ana(l_ana)
     l_hashed_e_id = [self.h_entity_id.get(e_id, 0) for e_id in l_e_id]
     h_salience_e = term2lm([eid for eid in l_hashed_e_id if eid != 0])
     return h_salience_e
Example #6
0
 def _form_doc_e_lm(self, h_doc_info):
     l_h_doc_e_lm = []
     for field in self.l_text_fields:
         l_e = []
         if field in h_doc_info[self.tagger]:
             l_e = [ana[0] for ana in h_doc_info[self.tagger][field]]
         h_lm = term2lm(l_e)
         l_h_doc_e_lm.append(h_lm)
     return l_h_doc_e_lm
Example #7
0
def get_top_frequency(doc_info):
    """
    Rank the body-field entities of a doc by frequency.
    :param doc_info: doc info dict accepted by form_boe_per_field
    :return: list of (entity id, tf) sorted by descending tf, and the
        surface name of the most frequent entity ('' if no entities)
    """
    l_ana = form_boe_per_field(doc_info, body_field)
    l_e = [ana['id'] for ana in l_ana]
    l_name = [ana['surface'] for ana in l_ana]
    h_e_name = dict(zip(l_e, l_name))
    h_e_tf = term2lm(l_e)
    # FIX: dict.items() is a view in Python 3 and has no .sort(); use
    # sorted() so this works on both Python 2 and 3.
    l_e_tf = sorted(h_e_tf.items(), key=lambda item: item[1], reverse=True)
    top_e_name = ""
    if l_e_tf:
        top_e_name = h_e_name[l_e_tf[0][0]]
    return l_e_tf, top_e_name
Example #8
0
    def _form_prf_field_lm(self, qid):
        """Entity LMs of the top self.prf_d ranked docs, per target text field."""
        l_rank_info = self.h_q_rank_info.get(qid, [])
        l_top = l_rank_info[:self.prf_d]
        h_field_l_doc_lm = {}
        for field in TARGET_TEXT_FIELDS:
            l_doc_lm = [
                term2lm([ana[0]
                         for ana in h_info.get(self.tagger, {}).get(field, [])])
                for _doc, _score, h_info in l_top
            ]
            h_field_l_doc_lm[field] = l_doc_lm

        return h_field_l_doc_lm
Example #9
0
def load_facc1_dict(in_name):
    """
    Load FACC1 annotations into per-doc entity language models.
    :param in_name: path to a tsv file with lines "docno\toid"
    :return: dict docno -> entity LM (TF dict) of its annotated object ids
    """
    h_doc_ana = {}
    # FIX: close the input file deterministically (original leaked the handle)
    with open(in_name) as in_f:
        for line in in_f:
            docno, oid = line.strip().split('\t')
            h_doc_ana.setdefault(docno, []).append(oid)

    h_doc_olm = {}
    for docno, l_e in h_doc_ana.items():
        h_doc_olm[docno] = term2lm(l_e)
    logging.info('[%d] doc facc1 dict loaded', len(h_doc_olm))
    return h_doc_olm
Example #10
0
    def _edge_retrieval(self, qe, l_field_ana, h_field_lm, field):
        """
        Score the edges between one query entity and the doc field entities.
        for each edge in this doc field
            get edge sent's lm
            calc retrieval scores
        sum up retrieval score to final feature
        :param qe: query entity id; must be a key of self.h_qe_idx
        :param l_field_ana: annotations of this doc field; each has an 'id'
        :param h_field_lm: language model of the doc field text
        :param field: name of the doc field being scored
        :return: h_feature, feature name -> pooled retrieval score
        """
        # normalize by field length; at least 1 to avoid division by zero
        z = max(float(len(l_field_ana)), 1.0)
        h_feature = {}
        p = self.h_qe_idx[qe]
        # per-entity nlss sentence indices and lms for this query entity
        # (nlss presumably = natural-language support sentences — verify)
        h_e_nlss_idx = self.l_h_e_nlss_idx[p]
        l_this_nlss_lm = self.ll_this_nlss_lm[p]
        # keep only doc entities that have nlss sentences with this qe
        l_e = [ana['id'] for ana in l_field_ana if ana['id'] in h_e_nlss_idx]

        l_h_retrieval_scores = []
        l_h_avg_retrieval_scores = []
        h_e_tf = term2lm(l_e)
        avg_sent_per_e = 0
        for e, tf in h_e_tf.items():
            l_sent_lm = [l_this_nlss_lm[pos] for pos in h_e_nlss_idx[e]]
            avg_sent_per_e += len(l_sent_lm)
            l_this_e_h_scores = []
            for sent_lm in l_sent_lm:
                l_scores = self._extract_retrieval_scores(sent_lm, h_field_lm, field)
                # weight each score by this entity's tf share of the field,
                # dropping the two-way lm features
                l_scores = [(name, v * tf / z) for name, v in l_scores if 'lm_twoway' not in name]
                h_retrieval_score = dict(l_scores)
                l_h_retrieval_scores.append(h_retrieval_score)
                l_this_e_h_scores.append(h_retrieval_score)

            # average this entity's scores over its sentences
            h_this_e_avg_score = mean_pool_feature(l_this_e_h_scores)
            l_h_avg_retrieval_scores.append(h_this_e_avg_score)
        avg_sent_per_e /= float(max(len(h_e_tf), 1.0))
        avg_sent_per_e = max(avg_sent_per_e, 1.0)
        # sum over all (entity, sentence) scores, normalized by the average
        # number of sentences per entity
        h_sum_retrieval_score = sum_pool_feature(l_h_retrieval_scores)
        h_sum_retrieval_score = dict([(k, v / avg_sent_per_e)
                                      for k, v in h_sum_retrieval_score.items()])
        h_feature.update(h_sum_retrieval_score)
        h_feature.update(sum_pool_feature(l_h_avg_retrieval_scores))

        """
        make sure not too small values
        """
        h_feature = dict([(k, max(v, -100)) for k, v in h_feature.items()])
        return h_feature
Example #11
0
    def _hash_spots(self, h_info, h_hashed):
        """
        Hash the spotted entities of every field into h_hashed['spot'].
        For each field: map entity ids via self.h_entity_id (0 = unknown,
        dropped), keep the self.max_e_per_d most frequent, and attach
        features and salience labels (plus locations if self.with_position).
        :param h_info: raw doc info with a 'spot' field
        :param h_hashed: output dict, mutated in place
        """
        h_hashed['spot'] = dict()
        h_salience_e = self._get_salience_e_tf(h_info)
        for field, l_ana in h_info['spot'].items():
            l_e_id = self._get_e_id_from_ana(l_ana)
            l_hashed_e_id = [self.h_entity_id.get(e_id, 0) for e_id in l_e_id]
            if not l_hashed_e_id:
                # no entities in this field: emit empty placeholders
                this_field_data = {
                    "entities": [],
                    "features": [],
                    salience_gold: []
                }
                if self.with_position:
                    this_field_data['loc'] = []
                h_hashed['spot'][field] = this_field_data
                continue

            # FIX: dict.items() is a view in Python 3 and has no .sort();
            # use sorted() so this works on both Python 2 and 3.
            l_hashed_id_tf = sorted(
                term2lm([eid for eid in l_hashed_e_id if eid != 0]).items(),
                key=lambda item: -item[1])
            l_hashed_id_tf = l_hashed_id_tf[:self.max_e_per_d]
            l_kepted_hashed_e_id = [item[0] for item in l_hashed_id_tf]

            # base feature per kept entity is its term frequency
            ll_feature = [[tf] for eid, tf in l_hashed_id_tf]
            if self.with_feature:
                ll_feature = self._add_node_features(l_ana,
                                                     l_kepted_hashed_e_id,
                                                     ll_feature)

            # combine per-annotation salience with field-level salience
            l_salience = self._get_given_salience(l_ana, l_kepted_hashed_e_id)
            l_field_salience = self._get_field_salience(
                l_kepted_hashed_e_id, h_salience_e)
            l_salience = [
                max(item) for item in zip(l_salience, l_field_salience)
            ]

            this_field_data = {
                "entities": l_kepted_hashed_e_id,
                "features": ll_feature,
                salience_gold: l_salience
            }
            if self.with_position:
                ll_position = self._add_entity_loc(l_ana, l_kepted_hashed_e_id)
                this_field_data['loc'] = ll_position
            h_hashed['spot'][field] = this_field_data
Example #12
0
    def _rm3_per_q(self, l_doc_score):
        """
        Perform RM3 expansion on one query's ranking.
        :param l_doc_score: list of (docno, ranking score)
        :return: RM3 expansion entities
        """
        l_top = l_doc_score[:self.top_k_doc]
        # softmax-normalize the ranking scores of the kept docs
        z = float(sum(math.exp(s) for _, s in l_top))
        l_doc_score = [(doc, math.exp(s) / z) for doc, s in l_top]

        l_h_doc_tf = []
        for doc, _ in l_doc_score:
            doc_info = self.h_doc_info.get(doc, {})
            if doc_info:
                l_e = [item[0] for item in doc_info['tagme']['bodyText']]
                l_h_doc_tf.append(term2lm(l_e))
            else:
                l_h_doc_tf.append({})
        return rm3(l_doc_score, l_h_doc_tf, None, None, None, False)
def process_one_doc(h_doc):
    """
    Compute salience statistics for one doc's spotted entities, treating
    the abstract entities as the salient gold set.
    :param h_doc: doc dict with 'docno' and a 'spot' field holding
        'abstract', 'bodyText' and 'title' entity lists
    :return: docno, #unique abstract entities, how many of them appear in
        title / body, whether the most frequent body entity is salient,
        whether the first body entity is salient
    """
    h_spot = h_doc.get('spot', {})
    l_abs_e = h_spot.get('abstract', [])
    l_body_e = h_spot.get('bodyText', [])
    l_title_e = h_spot.get('title', [])
    docno = h_doc['docno']
    s_a_e = set(l_abs_e)
    s_b_e = set(l_body_e)
    s_t_e = set(l_title_e)

    nb_abs_e = len(s_a_e)
    nb_abs_e_in_body = len(s_a_e & s_b_e)
    nb_abs_e_in_title = len(s_a_e & s_t_e)
    first_e_salient = 0
    freq_e_salience = 0
    if l_body_e:
        first_e_salient = int(l_body_e[0] in s_a_e)
        # FIX: dict.items() is a view in Python 3 and has no .sort(); use
        # sorted() so this works on both Python 2 and 3.
        l_b_e_tf = sorted(term2lm(l_body_e).items(), key=lambda item: -item[1])
        freq_e_salience = int(l_b_e_tf[0][0] in s_a_e)

    return docno, nb_abs_e, nb_abs_e_in_title, nb_abs_e_in_body, freq_e_salience, first_e_salient
Example #14
0
    def _calc_grid_scores(self, l_grid, doc_lm):
        """
        sent -> e scores
        include:
            frequency:
            emb_sim:
            desp_emb:
            desp_bow:
            gloss_emb:
            gloss_bow:
        :param l_grid:
        :return: for grid->'entity'->['id': e id, 'name':score], grid_score = {name:score}
        """
        logging.info('start calculating grid scores')
        for grid in l_grid:
            l_e = [ana['id'] for ana in grid.get(SPOT_FIELD)]
            h_e_tf = term2lm(l_e)
            grid_sent = grid['sent']
            grid_lm = text2lm(grid_sent)
            grid_emb = avg_embedding(self.resource.embedding, grid_sent)

            l_e_score = []
            for e, tf in h_e_tf.items():
                h_e_score = {'id': e, 'freq': tf}
                h_e_score['uw_emb'] = self._e_grid_emb(e, grid_emb)
                # h_e_score['gloss_emb'] = self._e_gloss_emb(e, grid_emb)
                # h_e_score['gloss_bow'] = self._e_gloss_bow(e, grid_lm)
                h_e_score['desp_emb'] = self._e_desp_emb(e, grid_emb)
                h_e_score['desp_bow'] = self._e_desp_bow(e, grid_lm)
                h_e_score['ESA'] = self._e_desp_bow(e, doc_lm)
                l_score = self._e_desp_retrieval(e, grid_lm)
                h_e_score.update(add_feature_prefix(dict(l_score), 'desp_'))
                l_e_score.append(h_e_score)
            grid['e_score'] = l_e_score

        return l_grid
Example #15
0
    def extract(self, qid, docno, h_q_info, h_doc_info):
        """
        Extract embedding translation features between query entities and
        the entities of each target doc field.
        :param qid: query id (kept for signature symmetry; unused)
        :param docno: doc id (kept for signature symmetry; unused)
        :param h_q_info: query info with tagger annotations under 'query'
        :param h_doc_info: doc info with tagger annotations per field
        :return: h_feature, feature name -> score
        """
        h_feature = {}
        emb_model = self.embedding
        emb_name = ""
        # query entities that exist in the embedding model
        l_q_e = [
            ana['entities'][0]['id'] for ana in h_q_info[self.tagger]['query']
            if ana['entities'][0]['id'] in emb_model
        ]
        for field, l_ana in h_doc_info[self.tagger].items():
            if field not in self.l_target_fields:
                continue
            l_doc_e = [
                ana['entities'][0]['id'] for ana in l_ana
                if ana['entities'][0]['id'] in emb_model
            ]
            l_doc_e_weight = []
            if self.use_entity_weight:
                l_doc_e_weight = [
                    ana['entities'][0]['score'] for ana in l_ana
                    if ana['entities'][0]['id'] in emb_model
                ]
            elif self.use_entity_salience:
                l_doc_e_weight = [
                    ana['entities'][0].get('salience', 1) for ana in l_ana
                    if ana['entities'][0]['id'] in emb_model
                ]
                if self.salience_activation:
                    assert self.salience_activation in self.act_func
                    l_doc_e_weight = [
                        self.act_func[self.salience_activation](max(w, 1e-6))
                        for w in l_doc_e_weight
                    ]
                    if self.non_tf_salience:
                        # divide out each entity's tf so salience is per-entity
                        h_e_tf = term2lm(l_doc_e)
                        # FIX: xrange does not exist in Python 3; range is
                        # behavior-identical here on both versions.
                        for p in range(len(l_doc_e)):
                            l_doc_e_weight[p] /= float(h_e_tf[l_doc_e[p]])

            l_sim_mtx = []
            m_sim_mtx = self.h_distance_func[self.distance](l_q_e, l_doc_e,
                                                            emb_model)
            l_sim_mtx.append(m_sim_mtx)

            l_total_bin_score = []
            # FIX: xrange -> range (Python 3 compatibility)
            for d in range(len(l_sim_mtx)):
                l_this_bin_score = []
                m_sim_mtx = l_sim_mtx[d]
                for pool_name in self.pool_func:
                    assert pool_name in self.h_pool_func
                    l_this_bin_score.extend(self.h_pool_func[pool_name](
                        m_sim_mtx, []))
                    if l_doc_e_weight:
                        l_this_bin_score.extend([
                            (item[0] + '_weight', item[1])
                            for item in self.h_pool_func[pool_name](
                                m_sim_mtx, l_doc_e_weight)
                        ])
                if len(l_sim_mtx) > 1:
                    l_this_bin_score = [('D%03d' % d + item[0], item[1])
                                        for item in l_this_bin_score]
                l_total_bin_score.extend(l_this_bin_score)

            for bin_name, score in l_total_bin_score:
                feature_name = '_'.join([
                    self.feature_name_pre, emb_name,
                    field.title(),
                    bin_name.title()
                ])
                h_feature[feature_name] = score

        return h_feature