Пример #1
0
def dynamic_load(trec, qrel, q_info, doc_info):
    """Resolve each argument to loaded data, reading from disk when given a path.

    Every argument may be either a file path (a string) or an already
    loaded object. Strings are handed to the matching loader; anything
    else is returned untouched.

    :param trec: path for load_trec_ranking_with_score, or a loaded ranking
    :param qrel: path for load_trec_labels_dict, or loaded labels
    :param q_info: path for load_json_info (keyed by 'qid'), or loaded info
    :param doc_info: path for load_json_info (keyed by 'docno'), or loaded info
    :return: tuple (l_q_rank, h_qrel, h_q_info, h_doc_info)
    """
    # `unicode` only exists on Python 2. Resolving it lazily keeps the path
    # check from raising NameError on Python 3, where `str` covers all text.
    try:
        string_types = (str, unicode)  # noqa: F821
    except NameError:
        string_types = (str,)

    # isinstance (rather than an exact type match) also accepts str subclasses.
    if isinstance(trec, string_types):
        l_q_rank = load_trec_ranking_with_score(trec)
    else:
        l_q_rank = trec

    if isinstance(qrel, string_types):
        h_qrel = load_trec_labels_dict(qrel)
    else:
        h_qrel = qrel

    if isinstance(q_info, string_types):
        h_q_info = load_json_info(q_info, 'qid')
    else:
        h_q_info = q_info

    if isinstance(doc_info, string_types):
        h_doc_info = load_json_info(doc_info, 'docno')
    else:
        h_doc_info = doc_info

    return l_q_rank, h_qrel, h_q_info, h_doc_info
Пример #2
0
 def __init__(self, **kwargs):
     """Build the K-NRM model and, for raw IO, preload its text resources.

     The keyword arguments are forwarded unchanged to the parent
     constructor, to the model class selected by ``self.model_name`` and
     to ``HyperParameter``.
     """
     super(KNRMCenter, self).__init__(**kwargs)
     model_cls = self.h_model[self.model_name]
     self.k_nrm = model_cls(**kwargs)
     self.hyper_para = HyperParameter(**kwargs)

     if not self.embedding_npy_in:
         logging.info('model [%s] not using embedding', self.model_name)
     else:
         logging.info('loading embedding for model [%s]', self.model_name)
         self.k_nrm.set_embedding(np.load(self.embedding_npy_in))

     # build() hands back the ranking model and the pairwise training model
     self.ranker, self.learner = self.k_nrm.build()
     logging.info('built ranking model:')
     self.ranker.summary()
     logging.info('pairwise training model:')
     self.learner.summary()

     # query/doc info and qrels are only loaded here for the 'raw' io format
     if self.io_format != 'raw':
         return
     self.h_q_info = load_json_info(self.q_info_in, 'qid')
     self.h_doc_info = load_json_info(self.doc_info_in, 'docno')
     self.h_qrel = load_trec_labels_dict(self.qrel_in)
Пример #3
0
    def _load_data(self):
        """Load qrels, query info and document info from the configured paths.

        Populates ``self.h_qrel``, ``self.h_q_info`` and ``self.h_doc_info``
        and logs progress after each step.
        """
        qrel_path = self.qrel_in
        self.h_qrel = load_trec_labels_dict(qrel_path)
        logging.info('loaded qrel [%s]', qrel_path)

        logging.info('loading q info')
        q_info_path = self.q_info_in
        # NOTE(review): unpack=True presumably yields parsed JSON dicts per
        # qid -- confirm against load_json_info.
        self.h_q_info = load_json_info(q_info_path, 'qid', unpack=True)
        logging.info(
            'loaded [%d] q info [%s]', len(self.h_q_info), q_info_path
        )

        logging.info('loading doc info')
        doc_info_path = self.doc_info_in
        # NOTE(review): unpack=False presumably keeps the raw JSON line per
        # docno instead of parsing it -- confirm against load_json_info.
        self.h_doc_info = load_json_info(
            doc_info_path, 'docno', unpack=False
        )
        logging.info(
            'loaded [%d] doc info [%s]', len(self.h_doc_info), doc_info_path
        )
Пример #4
0
    def _load_data(self):
        """
        Pre-load the data used for feature extraction from the configured paths.

        Fills ``_h_qrel`` and ``_h_qid_q_info``, then stores for every query
        its first ``rank_top_k`` candidate entries as a dict in
        ``_h_q_doc_score``.
        :return: None
        """
        self._h_qrel = load_trec_labels_dict(self.qrel_in)
        self._h_qid_q_info = load_json_info(self.q_info_in, 'qid')

        candidate_rankings = load_trec_ranking_with_score(
            self.q_doc_candidate_in
        )
        for qid, doc_scores in candidate_rankings:
            # keep only the head of the candidate list for this query
            kept = dict(doc_scores[:self.rank_top_k])
            self._h_q_doc_score[qid] = kept
            logging.debug('q [%s] [%d] candidate docs', qid, len(kept))

        logging.info('feature extraction data pre loaded')
Пример #5
0
    def _load_data(self):
        """
        Pre-load the data used for feature extraction from the configured paths.

        Fills ``_h_qrel`` and ``_h_qid_q_info``, optionally the external base
        ranking scores in ``h_ext_base`` (keyed by ``query<TAB>doc``), and for
        every query its first ``rank_top_k`` candidate entries as a dict in
        ``_h_q_doc_score``.
        :return: None
        """
        self._h_qrel = load_trec_labels_dict(self.qrel_in)
        self._h_qid_q_info = load_json_info(self.q_info_in, key_field='qid')

        candidate_rankings = load_trec_ranking_with_score(
            self.q_doc_candidate_in
        )

        if self.ext_base_rank:
            ext_rankings = load_trec_ranking_with_score(self.ext_base_rank)
            for query, ranked_docs in ext_rankings:
                for doc, score in ranked_docs:
                    pair_key = query + '\t' + doc
                    self.h_ext_base[pair_key] = score
            logging.info(
                'external base ranking scores loaded [%s]', self.ext_base_rank
            )

        for qid, doc_scores in candidate_rankings:
            # keep only the head of the candidate list for this query
            kept = dict(doc_scores[:self.rank_top_k])
            self._h_q_doc_score[qid] = kept
            logging.debug('q [%s] [%d] candidate docs', qid, len(kept))

        logging.info('feature extraction data pre loaded')
Пример #6
0
combine two info with same key (!)
input:
    doc info 1 + 2
output:
    update doc 1 with doc 2
"""

import json
from knowledge4ir.utils import load_json_info
import sys

# Usage: <script> <json info 1> <json info 2> <out name>
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
if 4 != len(sys.argv):
    print("merge two info file")
    print("3 para: json info 1 + 2 + out name")
    sys.exit()

h_info_a = load_json_info(sys.argv[1], 'docno')
print("[%d] in [%s]" % (len(h_info_a), sys.argv[1]))
h_info_b = load_json_info(sys.argv[2], 'docno')
print("[%d] in [%s]" % (len(h_info_b), sys.argv[2]))

# Fields from the second file overwrite same-named fields of the first.
# Keys that only exist in the second file are not added to the output.
for key, info in h_info_a.items():
    info.update(h_info_b.get(key, {}))
print("combined, dumping...")

# `with` closes (and flushes) the output file even if serialisation fails.
with open(sys.argv[3], 'w') as out:
    for value in h_info_a.values():
        # one JSON object per line
        out.write(json.dumps(value) + '\n')
print("finished")
 def __init__(self, **kwargs):
     """Initialise the parent, then index the query meta file by 'qid'.

     ``self.q_meta_in`` must be truthy by the time the parent constructor
     returns; otherwise an AssertionError is raised.
     """
     super(RankEvaAtQMeta, self).__init__(**kwargs)
     q_meta_path = self.q_meta_in
     assert q_meta_path
     self.h_q_meta = load_json_info(q_meta_path, key_field='qid')
Пример #8
0
 def _load_edge(self):
     """Load entity edges into ``self.h_e_edge`` when a path is configured.

     Does nothing when ``self.entity_edge_path`` is empty or unset.
     """
     edge_path = self.entity_edge_path
     if edge_path:
         logging.info('loading entity edges from [%s]', edge_path)
         self.h_e_edge = load_json_info(edge_path, 'id')
         logging.info("[%d] entities's edge loaded", len(self.h_e_edge))