Example #1
0
    def _entity_passage_features(self, q_info, l_grid, field):
        """Score the query against the concatenated passage (grid) sentences.

        Builds one language model from all grid sentences joined together and
        scores the query LM against it with this field's corpus statistics.

        :param q_info: query info dict; 'query' holds the raw query text
        :param l_grid: list of grid dicts, each with a 'sent' text field
        :param field: document field whose corpus statistics to use
        :return: dict of retrieval-model scores, keys prefixed 'EntityPassage'
        """
        l_grid_sent = [grid['sent'] for grid in l_grid]
        q_lm = text2lm(q_info['query'])
        grid_lm = text2lm(' '.join(l_grid_sent))
        r_model = RetrievalModel()
        r_model.set_from_raw(
            q_lm, grid_lm,
            self.resource.corpus_stat.h_field_df.get(field, None),
            self.resource.corpus_stat.h_field_total_df.get(field, None),
            self.resource.corpus_stat.h_field_avg_len.get(field, None)
        )
        h_feature = dict(r_model.scores())
        return add_feature_prefix(h_feature, 'EntityPassage')
Example #2
0
 def _extract_retrieval_scores(self, formed_q_lm, formed_doc_lm, field):
     """Compute field-aware retrieval scores between query and doc LMs.

     Drops the 'lm_twoway' score from the model's output.

     :return: list of (name, score) pairs
     """
     corpus_stat = self.resource.corpus_stat
     model = RetrievalModel()
     model.set_from_raw(
         formed_q_lm,
         formed_doc_lm,
         corpus_stat.h_field_df.get(field, None),
         corpus_stat.h_field_total_df.get(field, None),
         corpus_stat.h_field_avg_len.get(field, None))
     return [(name, score)
             for name, score in model.scores()
             if name != 'lm_twoway']
Example #3
0
 def _extract_simple_scores(self, formed_q_lm, formed_doc_lm):
     """Compute scores that need no corpus statistics.

     :return: list of [name, score] pairs: 'cosine' and 'coordinate'
     """
     model = RetrievalModel()
     model.set_from_raw(formed_q_lm, formed_doc_lm)
     cos = lm_cosine(formed_q_lm, formed_doc_lm)
     return [['cosine', cos], ['coordinate', model.coordinate()]]
Example #4
0
 def _e_desp_retrieval(self, e, grid_lm):
     """Score entity e's description text against the grid language model.

     Scores are normalized by the grid LM's total term count so longer
     grids do not dominate.

     :param e: entity id, used to look up its description
     :param grid_lm: grid-side language model (term -> count mapping)
     :return: list of [name, score] pairs: normalized 'coor' and 'lm'
     """
     desp = self.resource.h_e_desp.get(e, "")
     e_lm = text2lm(desp)
     r_m = RetrievalModel()
     r_m.set_from_raw(grid_lm, e_lm)
     # Normalize by total grid term count; floor at 1 to avoid div-by-zero.
     z = float(max(sum(grid_lm.values()), 1.0))
     coor = r_m.coordinate() / z
     lm = r_m.lm() / z
     return [['coor', coor], ['lm', lm]]
Example #5
0
    def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
        """
        extract e-d features

        do:
            get top k nlss
            form doc lm
            retrieval, as a whole or individually
            sum up to features
        :param q_info: query info
        :param ana: query entity annotation
        :param doc_info: document fields (read via self.l_target_fields)
        :param l_qe_nlss: candidate nlss for this query entity
        :return: h_feature: entity features for this nlss set
        """

        l_top_nlss = self._find_top_k_nlss_for_q(q_info, ana, l_qe_nlss)

        l_top_sent = [nlss[0] for nlss in l_top_nlss]
        # The last element is the concatenation of all top sentences. With no
        # nlss this appends '' so the list is never empty, which makes the
        # [:-1] / [-1] slicing below safe in every case.
        l_top_sent.append(' '.join(l_top_sent))
        l_h_per_sent_feature = []
        # Build each field's doc LM once, outside the per-sentence loop.
        l_field_doc_lm = [
            text2lm(doc_info.get(field, ""), clean=True)
            for field in self.l_target_fields
        ]
        for sent in l_top_sent:
            h_per_sent_feature = {}
            h_sent_lm = text2lm(sent, clean=True)
            # Sentence length is invariant across fields; floor at 1 so the
            # normalization never divides by zero.
            q_len = float(max(sum(h_sent_lm.values()), 1))
            for field, lm in zip(self.l_target_fields, l_field_doc_lm):
                r_model = RetrievalModel()
                r_model.set_from_raw(
                    h_sent_lm, lm,
                    self.resource.corpus_stat.h_field_df.get(field, None),
                    self.resource.corpus_stat.h_field_total_df.get(
                        field, None),
                    self.resource.corpus_stat.h_field_avg_len.get(field, None))
                # Length-normalized scores, keyed by field + score name.
                h_per_sent_feature.update(
                    dict([(field + name, score / q_len)
                          for name, score in r_model.scores()]))
            l_h_per_sent_feature.append(h_per_sent_feature)

        # Max-pool over the individual sentences (all but the concatenation),
        # then add the concatenated-sentence features with the 'Conca' prefix.
        h_feature = max_pool_feature(l_h_per_sent_feature[:-1])
        h_feature.update(
            add_feature_prefix(l_h_per_sent_feature[-1], 'Conca'))
        return h_feature
Example #6
0
 def _desp_passage_features(self, e_id, l_grid, field):
     """Score the entity's description against concatenated grid sentences.

     :param e_id: entity id; its description text is the query side
     :param l_grid: list of grid dicts, each with a 'sent' text field
     :param field: document field whose corpus statistics to use
     :return: dict of retrieval scores (without 'lm_twoway'),
         keys prefixed 'DespPassage'
     """
     l_grid_sent = [grid['sent'] for grid in l_grid]
     q_lm = text2lm(self.resource.h_e_desp.get(e_id, ""))
     grid_lm = text2lm(' '.join(l_grid_sent))
     r_model = RetrievalModel()
     r_model.set_from_raw(
         q_lm, grid_lm,
         self.resource.corpus_stat.h_field_df.get(field, None),
         self.resource.corpus_stat.h_field_total_df.get(field, None),
         self.resource.corpus_stat.h_field_avg_len.get(field, None)
     )
     h_score = dict(r_model.scores())
     # Drop the two-way LM score; pop() stays safe if the model ever stops
     # emitting it, where the original `del` would raise KeyError.
     h_score.pop('lm_twoway', None)
     return add_feature_prefix(h_score, 'DespPassage')
Example #7
0
    def _lm_nlss_filter(self, l_nlss, doc_info):
        """Rank nlss by their LM score against the document body field.

        :param l_nlss: list of nlss; nlss[0] is the sentence text
        :param doc_info: document fields; body text is read via body_field
        :return: the nlss sorted by descending LM score
        """
        logging.info('filter [%d] nlss via boe', len(l_nlss))
        h_d_lm = text2lm(doc_info.get(body_field, ""))

        scored = []
        for nlss in l_nlss:
            model = RetrievalModel()
            model.set_from_raw(text2lm(nlss[0]), h_d_lm)
            scored.append((nlss, model.lm()))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        if scored:
            logging.info('best lm [%f]', scored[0][1])
        return [pair[0] for pair in scored]
Example #8
0
 def __init__(self, **kwargs):
     """Set up the extractor and its underlying retrieval model."""
     super(QeDTextMatchFeatureExtractor, self).__init__(**kwargs)
     logging.info('initializing QeDTextMatchFeatureExtractor')
     self.retrieval_model = RetrievalModel(**kwargs)
     fields_json = json.dumps(self.l_entity_fields)
     logging.info('QeDTextMatchFeatureExtractor init with target entity fields: %s',
                  fields_json)