Example #1
    def _entity_passage_features(self, q_info, l_grid, field):
        l_grid_sent = [grid['sent'] for grid in l_grid]
        q_lm = text2lm(q_info['query'])
        h_feature = dict()
        grid_lm = text2lm(' '.join(l_grid_sent))
        r_model = RetrievalModel()
        r_model.set_from_raw(
            q_lm, grid_lm,
            self.resource.corpus_stat.h_field_df.get(field, None),
            self.resource.corpus_stat.h_field_total_df.get(field, None),
            self.resource.corpus_stat.h_field_avg_len.get(field, None)
        )
        h_score = dict(r_model.scores())
        h_feature.update(h_score)

        # Alternative (disabled): score each sentence individually and
        # pool the per-sentence scores instead of scoring the whole passage.
        # l_grid_lm = [text2lm(sent) for sent in l_grid_sent]
        # l_scores = []
        # for grid_lm in l_grid_lm:
        #     r_model = RetrievalModel()
        #     r_model.set_from_raw(
        #         q_lm, grid_lm,
        #         self.resource.corpus_stat.h_field_df.get(field, None),
        #         self.resource.corpus_stat.h_field_total_df.get(field, None),
        #         self.resource.corpus_stat.h_field_avg_len.get(field, None)
        #     )
        #     l_scores.append(dict(r_model.scores()))
        # h_feature.update(mean_pool_feature(l_scores))
        # h_feature.update(max_pool_feature(l_scores))

        h_feature = add_feature_prefix(h_feature, 'EntityPassage')
        return h_feature
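
The pooling and prefix helpers used above are not shown in these examples. Below is a minimal sketch of plausible implementations, assuming each feature set is a plain dict of name -> float; the 'Max'/'Mean' name prefixes are assumptions, and the real helpers may name pooled features differently:

def add_feature_prefix(h_feature, prefix):
    # prepend a group prefix to every feature name
    return dict([(prefix + name, score) for name, score in h_feature.items()])


def max_pool_feature(l_h_feature):
    # per-feature maximum over a list of feature dicts
    h_pooled = dict()
    for h_feature in l_h_feature:
        for name, score in h_feature.items():
            key = 'Max' + name
            h_pooled[key] = max(h_pooled.get(key, score), score)
    return h_pooled


def mean_pool_feature(l_h_feature):
    # per-feature mean over a list of feature dicts
    # (assumes all dicts share the same feature names)
    h_sum = dict()
    for h_feature in l_h_feature:
        for name, score in h_feature.items():
            h_sum[name] = h_sum.get(name, 0.0) + score
    z = float(max(len(l_h_feature), 1))
    return dict([('Mean' + name, score / z) for name, score in h_sum.items()])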
Example #2
    def _extract_retrieval_scores(self, formed_q_lm, formed_doc_lm, field):
        r_model = RetrievalModel()
        r_model.set_from_raw(
            formed_q_lm, formed_doc_lm,
            self.resource.corpus_stat.h_field_df.get(field, None),
            self.resource.corpus_stat.h_field_total_df.get(field, None),
            self.resource.corpus_stat.h_field_avg_len.get(field, None))
        # keep all scores except the two-way LM
        return [(k, v) for k, v in r_model.scores() if 'lm_twoway' != k]
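
text2lm is assumed throughout to build an unsmoothed unigram language model as a term -> count dict (the sum-over-counts length computations in Examples #4 and #7 rely on this shape). A minimal sketch, with the clean flag hypothetically lower-casing and stripping punctuation:

import re
from collections import Counter


def text2lm(text, clean=False):
    # bag-of-words "language model": term -> raw count
    if clean:
        text = re.sub(r'[^a-z0-9 ]', ' ', text.lower())
    return dict(Counter(text.split()))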
Example #3
    def _extract_simple_scores(self, formed_q_lm, formed_doc_lm):
        r_model = RetrievalModel()
        r_model.set_from_raw(
            formed_q_lm,
            formed_doc_lm,
        )
        l_score = [['cosine', lm_cosine(formed_q_lm, formed_doc_lm)],
                   ['coordinate', r_model.coordinate()]]
        return l_score
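
lm_cosine here is presumably the cosine similarity between the two term-count dicts; a sketch under that assumption:

import math


def lm_cosine(h_q_lm, h_d_lm):
    # cosine similarity between two term -> count dicts
    dot = sum(cnt * h_d_lm.get(term, 0) for term, cnt in h_q_lm.items())
    q_norm = math.sqrt(sum(cnt * cnt for cnt in h_q_lm.values()))
    d_norm = math.sqrt(sum(cnt * cnt for cnt in h_d_lm.values()))
    if not q_norm or not d_norm:
        return 0.0
    return dot / (q_norm * d_norm)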
Example #4
    def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
        """
        extract e-d features

        do:
            get top k nlss
            form doc lm
            retrieval, as a whole of individually
            sum up to features
        :param q_info: query info
        :param ana:
        :param doc_info:
        :param l_qe_nlss:
        :return: h_feature: entity features for this nlss set
        """

        l_top_nlss = self._find_top_k_nlss_for_q(q_info, ana, l_qe_nlss)

        l_top_sent = [nlss[0] for nlss in l_top_nlss]
        if not l_top_sent:
            l_top_sent.append('')  # placeholder for an entity with no nlss
        l_top_sent.append(' '.join(l_top_sent))  # concatenation, scored as one passage
        l_h_per_sent_feature = []
        l_field_doc_lm = [
            text2lm(doc_info.get(field, ""), clean=True)
            for field in self.l_target_fields
        ]
        for sent in l_top_sent:
            h_per_sent_feature = {}
            h_sent_lm = text2lm(sent, clean=True)
            for field, lm in zip(self.l_target_fields, l_field_doc_lm):
                r_model = RetrievalModel()
                r_model.set_from_raw(
                    h_sent_lm, lm,
                    self.resource.corpus_stat.h_field_df.get(field, None),
                    self.resource.corpus_stat.h_field_total_df.get(
                        field, None),
                    self.resource.corpus_stat.h_field_avg_len.get(field, None))
                l_retrieval_score = r_model.scores()
                q_len = float(max(sum(h_sent_lm.values()), 1))

                h_per_sent_feature.update(
                    dict([(field + name, score / q_len)
                          for name, score in l_retrieval_score]))
            l_h_per_sent_feature.append(h_per_sent_feature)

        h_max_feature = max_pool_feature(l_h_per_sent_feature[:-1])
        h_conca_feature = add_feature_prefix(l_h_per_sent_feature[-1], 'Conca')

        h_feature = h_max_feature
        h_feature.update(h_conca_feature)
        return h_feature
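
Note the two complementary views: max pooling keeps, per feature, the single best-matching support sentence, while the 'Conca' features score the concatenation of all top sentences as one pseudo-query against each document field.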
Example #5
    def _desp_passage_features(self, e_id, l_grid, field):
        l_grid_sent = [grid['sent'] for grid in l_grid]
        q_lm = text2lm(self.resource.h_e_desp.get(e_id, ""))
        grid_lm = text2lm(' '.join(l_grid_sent))
        r_model = RetrievalModel()
        r_model.set_from_raw(
            q_lm, grid_lm,
            self.resource.corpus_stat.h_field_df.get(field, None),
            self.resource.corpus_stat.h_field_total_df.get(field, None),
            self.resource.corpus_stat.h_field_avg_len.get(field, None)
        )
        h_score = dict(r_model.scores())
        h_score.pop('lm_twoway', None)  # drop the two-way LM score if present
        h_feature = add_feature_prefix(h_score, 'DespPassage')
        return h_feature
Example #6
    def _lm_nlss_filter(self, l_nlss, doc_info):
        logging.info('filter [%d] nlss via lm', len(l_nlss))
        l_nlss_lmscore = []
        h_d_lm = text2lm(doc_info.get(body_field, ""))

        for nlss in l_nlss:
            h_s_lm = text2lm(nlss[0])
            r_model = RetrievalModel()
            r_model.set_from_raw(h_s_lm, h_d_lm)
            lm = r_model.lm()
            l_nlss_lmscore.append((nlss, lm))
        l_nlss_lmscore.sort(key=lambda item: item[1], reverse=True)
        l_this_nlss = [item[0] for item in l_nlss_lmscore]
        if l_nlss_lmscore:
            logging.info('best lm [%f]', l_nlss_lmscore[0][1])
        return l_this_nlss
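
Despite its name, _lm_nlss_filter drops nothing: it reorders the nlss by their language-model score against the document body, best first, and leaves any cut-off to the caller.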
Example #7
    def _e_desp_retrieval(self, e, grid_lm):
        desp = self.resource.h_e_desp.get(e, "")
        e_lm = text2lm(desp)
        r_m = RetrievalModel()
        r_m.set_from_raw(grid_lm, e_lm)
        z = float(max(sum(grid_lm.values()), 1.0))
        coor = r_m.coordinate() / z
        lm = r_m.lm() / z
        l_score = [['coor', coor], ['lm', lm]]
        return l_score
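
Both scores are normalized by the total term count of grid_lm (floored at 1 to avoid division by zero), so grids of different lengths yield comparable feature values.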
Example #8
    @classmethod
    def class_print_help(cls, inst=None):
        super(QeDTextMatchFeatureExtractor, cls).class_print_help(inst)
        RetrievalModel.class_print_help(inst)
Example #9
    def __init__(self, **kwargs):
        super(QeDTextMatchFeatureExtractor, self).__init__(**kwargs)
        logging.info('initializing QeDTextMatchFeatureExtractor')
        self.retrieval_model = RetrievalModel(**kwargs)
        logging.info('QeDTextMatchFeatureExtractor init with target entity fields: %s',
                     json.dumps(self.l_entity_fields))
Example #10
class QeDTextMatchFeatureExtractor(Configurable):
    feature_name_pre = Unicode('QeD')
    l_entity_fields = List(Unicode, default_value=ENTITY_TEXT_FIELDS).tag(config=True)

    def __init__(self, **kwargs):
        super(QeDTextMatchFeatureExtractor, self).__init__(**kwargs)
        logging.info('initializing QeDTextMatchFeatureExtractor')
        self.retrieval_model = RetrievalModel(**kwargs)
        logging.info('QeDTextMatchFeatureExtractor init with target entity fields: %s',
                     json.dumps(self.l_entity_fields))

    @classmethod
    def class_print_help(cls, inst=None):
        super(QeDTextMatchFeatureExtractor, cls).class_print_help(inst)
        RetrievalModel.class_print_help(inst)

    def extract(self, q_info, d_info, external_resource):
        """

        :param q_info: grounded query info
        :param d_info: doc with textual fields
        :param external_resource: make sure h_entity_fields is loaded
        :return: matching features for each entities in the grounded fields, in same tree structure
        h_match_info:
            qid:
            docno:
            match: tree structure for each entities in grounded field
        """
        assert external_resource.h_entity_fields is not None

        h_match_info = dict()
        h_match_info['qid'] = q_info['qid']
        h_match_info['docno'] = d_info['docno']
        l_q_grounded = q_info[GROUND_FIELD]['query']
        l_q_matched_feature = []
        for grounded_sf in l_q_grounded:
            matched_sf = dict()
            matched_sf['surface'] = grounded_sf['surface']
            matched_sf['loc'] = grounded_sf['loc']
            l_matched_entities = []
            for grounded_e in grounded_sf['entities']:
                e_id = grounded_e['id']
                e_name = external_resource.h_entity_fields.get(e_id, {}).get(e_name_field, "")
                h_feature = self._extract_per_entity(e_id, d_info, external_resource)
                l_matched_entities.append({'id': e_id, 'f': h_feature, e_name_field: e_name})
            matched_sf['entities'] = l_matched_entities
            l_q_matched_feature.append(matched_sf)
        h_match_info[MATCH_FIELD] = l_q_matched_feature
        return h_match_info

    def _extract_per_entity(self, e_id, d_info, external_resource):
        h_feature = dict()
        h_e_fields = external_resource.h_entity_fields.get(e_id, {})
        l_e_text_fields = [(field, h_e_fields.get(field, ""))
                           for field in self.l_entity_fields]
        corpus_stat = external_resource.corpus_stat
        for field, text in l_e_text_fields:
            h_q_lm = text2lm(text, clean=True)
            for doc_field in TARGET_TEXT_FIELDS:
                doc_text = d_info.get(doc_field, "")
                h_d_lm = text2lm(doc_text, clean=True)
                self.retrieval_model.set(h_q_lm, h_d_lm, doc_field, corpus_stat)
                l_sim_scores = self.retrieval_model.scores()

                l_feature = [(self.feature_name_pre + field.title()
                              + doc_field.title() + name, score)
                             for name, score in l_sim_scores]
                h_feature.update(dict(l_feature))

        return h_feature
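
A minimal usage sketch, assuming traitlets-style configuration (implied by the Configurable base and class_print_help above); q_info, d_info, and external_resource stand in for objects prepared elsewhere:

from traitlets.config import Config

config = Config()
# hypothetical override; the default is ENTITY_TEXT_FIELDS
config.QeDTextMatchFeatureExtractor.l_entity_fields = ['name', 'desp']

extractor = QeDTextMatchFeatureExtractor(config=config)
h_match_info = extractor.extract(q_info, d_info, external_resource)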