Example #1
    def _find_top_k_similar_entities(self, query, h_doc_e_texts):
        """
        find top k most similar entities in h_doc_e_texts, judged by each entity fields
        just use lm score
        :param query:
        :param h_doc_e_texts:
        :return:
        """
        q_lm = text2lm(query)
        h_field_top_k_entities = {}

        for e_field in self.l_entity_fields:
            l_e_score = []
            for e, h_field_texts in h_doc_e_texts.items():
                e_text = h_field_texts.get(e_field, "")
                if not e_text:
                    continue
                h_e_lm = text2lm(e_text.lower())
                term_stat = TermStat()
                # No corpus statistics are needed for the plain LM score.
                term_stat.set_from_raw(q_lm, h_e_lm, {})
                lm_score = term_stat.lm()
                l_e_score.append((e, lm_score))
            # Rank entities by LM score, descending.
            l_e_score.sort(key=lambda item: -item[1])
            h_field_top_k_entities[e_field] = [
                item[0] for item in l_e_score[:self.top_k]
            ]
        return h_field_top_k_entities
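
The helpers text2lm and TermStat are not shown in these examples. As a minimal sketch of the behavior the code above relies on, assume text2lm turns raw text into a term-frequency language model (term -> count) and TermStat.lm() scores the query by log likelihood under the entity text's LM; the names, tokenization, and smoothing below are assumptions, not the original implementation.

import math
from collections import Counter

def text2lm_sketch(text):
    # Hypothetical stand-in for text2lm: whitespace tokenization
    # plus term-frequency counting.
    return Counter(text.lower().split())

def lm_score_sketch(q_lm, d_lm, epsilon=1e-10):
    # Hypothetical stand-in for TermStat.lm(): query log likelihood
    # under the document's maximum-likelihood LM, with a small floor
    # for unseen terms so the log stays finite.
    doc_len = float(sum(d_lm.values()))
    score = 0.0
    for term, q_tf in q_lm.items():
        p = d_lm.get(term, 0) / doc_len if doc_len else 0.0
        score += q_tf * math.log(max(p, epsilon))
    return score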
Example #2
    def _extract_q_doc_e_topk_merged_text_sim(self, query,
                                              h_field_top_k_entities,
                                              h_doc_e_texts):
        """
        form an expanded documents with top k entities from each_e_field
        calc textual similarities between q and the expanded documents
        :param query:
        :param h_field_top_k_entities: top k most similar entities in each e fields
        :param h_doc_e_texts: entities' texts
        :return:
        """
        h_feature = {}

        l_field_expanded_texts = []
        for e_field, l_topk_e in h_field_top_k_entities.items():
            # Concatenate the top-k entities' texts for this field into one
            # expanded pseudo-document.
            text = ' '.join(
                h_doc_e_texts.get(e, {}).get(e_field, "") for e in l_topk_e
            )
            l_field_expanded_texts.append((e_field, text))

        q_lm = text2lm(query)
        total_df = self.h_corpus_stat[body_field]['total_df']
        # Expanded pseudo-documents have no stored length statistics,
        # so a fixed average length is used as a prior.
        avg_doc_len = 100.0
        h_doc_df = self.h_field_h_df[body_field]
        for e_field, text in l_field_expanded_texts:
            exp_lm = text2lm(text, clean=True)
            term_stat = TermStat()
            term_stat.set_from_raw(q_lm, exp_lm, h_doc_df, total_df,
                                   avg_doc_len)
            l_sim_score = term_stat.mul_scores()
            for sim, score in l_sim_score:
                if sim in self.l_model:
                    h_feature[self.feature_name_pre + 'Exp' + e_field.title() +
                              sim.title()] = score
        return h_feature
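
TermStat.mul_scores() is used throughout these examples but is not shown. A plausible reading, given its inputs (query LM, document LM, document frequencies, corpus size, average document length), is that it scores the pair under several standard retrieval models and returns (model name, score) pairs. The sketch below is an assumption covering three common choices; the original may use different models and parameters.

import math

def mul_scores_sketch(q_lm, d_lm, h_df, total_df, avg_doc_len,
                      mu=2500, k1=1.2, b=0.75):
    # Hypothetical stand-in for TermStat.mul_scores().
    doc_len = float(sum(d_lm.values()))
    lm_dir, bm25, coord = 0.0, 0.0, 0
    for term, q_tf in q_lm.items():
        tf = d_lm.get(term, 0)
        df = h_df.get(term, 0)
        # Dirichlet-smoothed LM, with the corpus term probability
        # approximated from document frequencies.
        p_c = max(df, 1) / float(max(total_df, 1))
        lm_dir += q_tf * math.log((tf + mu * p_c) / (doc_len + mu))
        if tf > 0:
            coord += 1  # coordinate match: number of matched query terms
            idf = math.log(1 + (total_df - df + 0.5) / (df + 0.5))
            norm = tf + k1 * (1 - b + b * doc_len / avg_doc_len)
            bm25 += idf * tf * (k1 + 1) / norm
    return [('lm_dir', lm_dir), ('bm25', bm25), ('coordinate', coord)]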
Example #3
    def _extract_per_e(self, h_q_info, e):
        """
        Extract textual similarity features between the query and entity e's
        text in each entity field.
        """
        h_feature = {}
        total_df = self.h_corpus_stat['bodyText']['total_df']
        avg_doc_len = self.h_corpus_stat['bodyText']['average_len']
        h_doc_df = self.h_field_h_df['bodyText']
        q_lm = text2lm(h_q_info['query'])

        for e_field in self.l_entity_fields:
            e_text = self.h_entity_texts.get(e, {}).get(e_field, "")
            if isinstance(e_text, list):
                e_text = ' '.join(e_text)
            e_lm = text2lm(e_text, clean=True)
            term_stat = TermStat()
            term_stat.set_from_raw(q_lm, e_lm, h_doc_df, total_df, avg_doc_len)
            l_sim_score = term_stat.mul_scores()
            for sim, score in l_sim_score:
                if sim in self.s_model:
                    h_feature[self.feature_name_pre + e_field.title() + sim.title()] = score

        return h_feature
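
Feature keys across these extractors follow one naming convention: a shared prefix, then each component (entity field, text field, similarity model) title-cased. A small illustration, with a hypothetical prefix value:

feature_name_pre = 'QE'   # hypothetical; the real prefix is set elsewhere
e_field, sim = 'desp', 'lm_dir'
print(feature_name_pre + e_field.title() + sim.title())
# -> QEDespLm_Dir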
Example #4
    def extract(self, qid, docno, h_q_info, h_doc_info):
        """
        Extract query-entity vs document-field textual similarity features:
        score each query entity's texts against each document field, then
        average over the query entities.
        """
        h_feature = {}

        # Ids of the entities the tagger annotated on the query.
        l_q_e = [
            ana['entities'][0]['id'] for ana in h_q_info[self.tagger]['query']
        ]

        for field in self.l_text_fields:
            total_df = self.h_corpus_stat[field]['total_df']
            avg_doc_len = self.h_corpus_stat[field]['average_len']
            h_doc_df = self.h_field_h_df[field]
            h_doc_tf = {}
            if field in h_doc_info:
                h_doc_tf = text2lm(h_doc_info[field].lower())

            for e_field in self.l_entity_fields:
                h_sim_score = {}
                cnt = 0
                for e in l_q_e:
                    if e not in self.h_entity_texts:
                        continue
                    if e_field not in self.h_entity_texts[e]:
                        continue
                    e_text = self.h_entity_texts[e][e_field]
                    if isinstance(e_text, list):
                        e_text = ' '.join(e_text)
                    h_tf = text2lm(e_text, clean=True)
                    cnt += 1
                    term_stat = TermStat()
                    term_stat.set_from_raw(h_tf, h_doc_tf, h_doc_df, total_df,
                                           avg_doc_len)
                    for sim, score in term_stat.mul_scores():
                        if sim in self.s_model:
                            # Accumulate per-entity scores; averaged below.
                            h_sim_score[sim] = h_sim_score.get(sim, 0) + score

                if cnt:
                    # Average over the query entities that contributed text.
                    for sim in h_sim_score:
                        h_sim_score[sim] /= cnt
                for sim, score in h_sim_score.items():
                    h_feature[self.feature_name_pre + e_field.title() +
                              field.title() + sim.title()] = score

        return h_feature
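
The query annotation shape this method expects can be read off the lookups above: h_q_info[self.tagger]['query'] is a list of annotations, each carrying a ranked 'entities' list whose first entry's 'id' is used. A hypothetical minimal instance, assuming the tagger is named 'tagme' as in the last example:

h_q_info = {
    'query': 'barack obama family tree',
    'tagme': {
        'query': [
            # one annotation per linked span; only the top entity is used
            {'entities': [{'id': '/m/02mjmr'}]},  # hypothetical Freebase id
        ],
    },
}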
Example #5
    def extract_for_text(self, query, docno, h_q_info, h_doc_info):
        """
        Extract classic IR fusion features: score the query against each
        document text field under multiple retrieval models.
        """
        h_feature = {}

        h_tf = text2lm(query.lower())
        for field in self.l_text_fields:
            total_df = self.h_corpus_stat[field]['total_df']
            avg_doc_len = self.h_corpus_stat[field]['average_len']
            h_doc_df = self.h_field_h_df[field]
            h_doc_tf = {}
            if field in h_doc_info:
                h_doc_tf = text2lm(h_doc_info[field].lower(), clean=True)

            term_stat = TermStat()
            term_stat.set_from_raw(h_tf, h_doc_tf, h_doc_df, total_df,
                                   avg_doc_len)
            l_sim_score = term_stat.mul_scores()
            for sim, score in l_sim_score:
                if sim in self.s_model:
                    feature_name = (self.feature_name_pre + sim.title() +
                                    field.title())
                    h_feature[feature_name] = score

        return h_feature
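
All of these extractors share the same corpus statistics, whose shape follows from the lookups: per-field document counts and average lengths, plus per-field document-frequency tables. A hypothetical minimal instance (values invented for illustration):

h_corpus_stat = {
    'bodyText': {
        'total_df': 50000000,   # number of documents in the collection
        'average_len': 1234.5,  # mean field length in tokens
    },
}
h_field_h_df = {
    'bodyText': {'obama': 120000, 'family': 800000},  # term -> doc frequency
}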
Example #6
    def _extract_q_doc_e_textual_features(self, query, l_h_doc_e_lm,
                                          h_doc_e_texts):
        """
        Extract query-entity textual similarity features per text field,
        pooled over the entities annotated in that field.
        """
        if not self.h_entity_texts:
            return {}
        h_feature = {}
        q_lm = text2lm(query)
        for p, field in enumerate(self.l_text_fields):
            if self.l_top_k:
                # Each text field may use its own top-k cutoff.
                self.top_k = self.l_top_k[p]
            h_doc_e_lm = l_h_doc_e_lm[p]
            total_df = self.h_corpus_stat[field]['total_df']
            avg_doc_len = self.h_corpus_stat[field]['average_len']
            h_doc_df = self.h_field_h_df[field]
            l_h_scores = []
            l_e_tf = []
            for e, e_tf in h_doc_e_lm.items():
                h_scores = {}
                l_e_tf.append(e_tf)
                h_e_texts = h_doc_e_texts.get(e, {})
                for e_field in self.l_entity_fields:
                    text = h_e_texts.get(e_field, "")
                    e_lm = text2lm(text, clean=True)
                    term_stat = TermStat()
                    term_stat.set_from_raw(q_lm, e_lm, h_doc_df, total_df,
                                           avg_doc_len)
                    l_sim_score = term_stat.mul_scores()
                    for sim, score in l_sim_score:
                        if sim in self.l_model:
                            h_scores[e_field.title() + sim.title()] = score

                l_h_scores.append(h_scores)

            h_pooled_scores = self._merge_entity_sim(l_h_scores, l_e_tf)

            for name, score in h_pooled_scores.items():
                h_feature[self.feature_name_pre + field.title() + name] = score
        logging.debug(json.dumps(h_feature))
        return h_feature
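
_merge_entity_sim is not shown in these examples. Its inputs are a list of per-entity score dicts plus each entity's frequency in the field, which suggests pooling each score name over the entities. The sketch below is an assumption, not the original code, showing two common pooling choices:

def merge_entity_sim_sketch(l_h_scores, l_e_tf):
    # Hypothetical pooling over per-entity score dicts: for each score
    # name, keep the max over entities and a tf-weighted mean.
    h_pooled = {}
    total_tf = float(sum(l_e_tf)) or 1.0
    for h_scores, e_tf in zip(l_h_scores, l_e_tf):
        for name, score in h_scores.items():
            h_pooled['Max' + name] = max(
                h_pooled.get('Max' + name, float('-inf')), score)
            h_pooled['Mean' + name] = (h_pooled.get('Mean' + name, 0.0) +
                                       score * e_tf / total_tf)
    return h_pooled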
Example #7
    def _calc_esearch_per_pair(self, h_q_info, h_doc_info):
        """
        Rank the document's tagme-annotated entities by the Dirichlet-smoothed
        LM score of their descriptions against the query; return the top 10.
        """
        if 'tagme' not in h_doc_info:
            return []
        # Each annotation carries the entity id first and its surface name last.
        l_e_name = [(ana[0], ana[-1])
                    for ana in h_doc_info['tagme'][body_field]]
        query = h_q_info['query']
        q_lm = text2lm(query, clean=True)
        total_df = self.h_corpus_stat[body_field]['total_df']
        avg_len = 100.0  # fixed length prior for entity descriptions
        l_e_score = []
        for e, name in l_e_name:
            # Guard against entities without a stored description.
            desp = self.h_entity_texts.get(e, {}).get('desp', "")
            e_lm = text2lm(desp, clean=True)
            term_stat = TermStat()
            term_stat.set_from_raw(q_lm, e_lm, self.h_field_h_df[body_field],
                                   total_df, avg_len)
            lm_dir = term_stat.lm_dir()
            l_e_score.append((e, name, lm_dir))

        l_e_score.sort(key=lambda item: -item[-1])

        return l_e_score[:10]
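
The tagme annotation format assumed by the list comprehension above keeps the entity id first and the surface name last; whatever sits in between (offsets, confidence) is not used here and is invented below for illustration. body_field is a module-level constant not shown in these examples:

body_field = 'bodyText'  # assumed value
h_doc_info = {
    'tagme': {
        body_field: [
            # [entity id, ..., surface name]
            ['/m/02mjmr', 0, 12, 0.9, 'Barack Obama'],
        ],
    },
}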