def fetch_doc_text(trec_rank_in, doc_text_in, out_name):
    l_q_ranking = load_trec_ranking_with_score(trec_rank_in)
    ll_docno = [[docno for docno, __ in rank] for __, rank in l_q_ranking]
    s_target_docno = set(sum(ll_docno, []))
    logging.info('[%d] target docno', len(s_target_docno))
    err_cnt = 0
    cnt = 0
    out = open(out_name, 'w')
    for line in open(doc_text_in):
        cols = line.strip().split('\t')
        if len(cols) < 2:
            logging.warning('text format error %s', json.dumps(cols))
            err_cnt += 1
            continue
        docno, text = cols[0], cols[-1]
        if docno in s_target_docno:
            logging.info('find [%s]', docno)
            h = dict()
            h['docno'] = docno
            h['bodyText'] = text
            h['title'] = ' '.join(text.split()[:10])
            print >> out, docno + '\t' + json.dumps(h)
            cnt += 1
    out.close()
    logging.info('finished [%s], found [%d], err [%d]',
                 doc_text_in, cnt, err_cnt)
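# A minimal sketch of the inputs fetch_doc_text assumes (the file names are
# hypothetical, not from the repo): load_trec_ranking_with_score is taken to
# return [[qid, [[docno, score], ...]], ...] pairs, and doc_text_in is a TSV
# with docno in the first column and body text in the last:
#     l_q_ranking = [
#         ['q1', [['clueweb-d1', 12.3], ['clueweb-d2', 11.8]]],
#         ['q2', [['clueweb-d2', 9.4], ['clueweb-d7', 8.1]]],
#     ]
#     fetch_doc_text('run.trec', 'corpus.tsv', 'target_doc_text.json')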
def pipe_extract(self, trec_rank_in, out_name):
    """
    the main pipe run
    :param trec_rank_in: trec rank format input; candidate q-document pairs
        to extract features for
    :param out_name: the extracted matching features, one json per line
    :return:
    """
    l_q_ranking = load_trec_ranking_with_score(trec_rank_in)
    out = open(out_name, 'w')
    for q, ranking in l_q_ranking:
        logging.info('start extracting for [%s]', q)
        q_info = self.h_q_info[q]
        for docno, score in ranking:
            logging.info('with doc [%s-%s]', q, docno)
            d_info = self.h_d_info.get(docno, {'docno': docno})
            h_matched_feature = dict()
            for this_extractor in self.l_feature_extractor:
                h_this_matched_feature = this_extractor.extract(
                    q_info, d_info, self.resource)
                h_matched_feature = self._mul_update(
                    h_matched_feature, h_this_matched_feature)
            h_matched_feature['base_score'] = score
            print >> out, json.dumps(h_matched_feature)
            logging.info('[%s-%s] match feature extracted', q, docno)
        logging.info('q [%s] match features extracted', q)
    out.close()  # close before the final log so the output is fully flushed
    logging.info('ranking pairs [%s] matching features extracted to [%s]',
                 trec_rank_in, out_name)
    return
def __init__(self, **kwargs):
    super(LeToRQDocERefRankFeatureExtractorC, self).__init__(**kwargs)
    self.h_corpus_stat = {}
    self.h_field_df = {}
    self.l_h_q_ref_ranking = [
        dict(load_trec_ranking_with_score(ranking_in))
        for ranking_in in self.l_ref_rank
    ]
def test_data_reader(self, in_name, s_target_qid=None):
    if self.io_format == 'raw':
        l_q_rank = load_trec_ranking_with_score(in_name)
        x, y = pointwise_reader(
            l_q_rank, self.h_qrel, self.h_q_info,
            self.doc_info_in, s_target_qid)
    else:
        x, y = load_data(
            os.path.join(in_name, 'pointwise'),
            self.k_nrm.s_target_inputs, s_target_qid)
    return x, y
def facc1_prf(trec_in, facc1_in, out_name):
    l_q_rank = load_trec_ranking_with_score(trec_in)
    h_doc_olm = load_facc1_dict(facc1_in)
    l_q_e_rank = []
    for q, rank in l_q_rank:
        l_e_rank = prf(rank, h_doc_olm)
        l_q_e_rank.append([q, l_e_rank])
    dump_trec_ranking_with_score(l_q_e_rank, out_name)
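# A hedged sketch of what prf() above might do; the repo's actual prf is not
# shown here. This version does RM1-style pseudo relevance feedback over the
# FACC1 entity language models in h_doc_olm ({docno: {entity: p(e|d)}} is an
# assumed shape), weighting each top document's entity distribution by a
# softmax of its retrieval score. All names and the softmax choice are
# assumptions, not the repo's implementation.
import math

def prf_sketch(rank, h_doc_olm, top_k=20):
    l_top = rank[:top_k]
    if not l_top:
        return []
    m = max(score for __, score in l_top)  # shift scores for stable exp()
    z = sum(math.exp(score - m) for __, score in l_top)
    h_e_score = {}
    for docno, score in l_top:
        p_d = math.exp(score - m) / z  # document weight via score softmax
        for e, p_e in h_doc_olm.get(docno, {}).items():
            h_e_score[e] = h_e_score.get(e, 0) + p_d * p_e
    # entities ranked by descending feedback score
    return sorted(h_e_score.items(), key=lambda item: -item[1])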
def _load_candidate_doc(self):
    l_q_rank = [
        [q, rank[:self.doc_per_q]]
        for q, rank in load_trec_ranking_with_score(self.trec_rank)
    ]
    self.h_q_rank = dict(l_q_rank)
    for q, rank in l_q_rank:
        self.h_q_meta[q] = {
            'nb_d': len(rank),
            'avg_doc_len': 0,
        }
    for q, rank in l_q_rank:
        for d, score in rank:
            if d not in self.h_d_l_q:
                self.h_d_l_q[d] = []
            self.h_d_l_q[d].append(q)
    logging.info('load candidate doc done')
def _load_data(self):
    """
    load data from the initialized data paths:
    h_qrel, h_qid_q_info, and h_q_doc_score
    :return:
    """
    self._h_qrel = load_trec_labels_dict(self.qrel_in)
    self._h_qid_q_info = load_json_info(self.q_info_in, key_field='qid')
    l_q_ranking_score = load_trec_ranking_with_score(
        self.q_doc_candidate_in)
    if self.ext_base_rank:
        l_q_ext_base = load_trec_ranking_with_score(self.ext_base_rank)
        for q, l_rank in l_q_ext_base:
            for doc, score in l_rank:
                self.h_ext_base[q + '\t' + doc] = score
        logging.info('external base ranking scores loaded [%s]',
                     self.ext_base_rank)
    for qid, ranking_score in l_q_ranking_score:
        self._h_q_doc_score[qid] = dict(ranking_score[:self.rank_top_k])
        logging.debug('q [%s] [%d] candidate docs',
                      qid, len(self._h_q_doc_score[qid]))
    logging.info('feature extraction data preloaded')
    return
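# Keys in h_ext_base above are 'qid\tdocno' strings, so a later lookup for
# the external base score of a (q, docno) pair would be, e.g.:
#     score = self.h_ext_base.get(q + '\t' + docno, 0)
# (the 0 default is an assumption; the class may handle misses differently).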
def mul_load_candidate_doc(in_dir):
    l_name, l_s_id = [], []
    logging.info('load doc partitions')
    for dir_name, __, f_names in os.walk(in_dir):
        for f_name in f_names:
            l_name.append(f_name)
            l_q_rank = load_trec_ranking_with_score(
                os.path.join(dir_name, f_name))
            s_doc = set(sum(
                [[doc for doc, score in rank] for q, rank in l_q_rank], []))
            s_qid = set([q for q, __ in l_q_rank])
            s_id = s_doc.union(s_qid)
            l_s_id.append(s_id)
            logging.info('[%s][%d] doc', f_name, len(s_id))
    return l_name, l_s_id
def dynamic_load(trec, qrel, q_info, doc_info):
    if isinstance(trec, (str, unicode)):
        l_q_rank = load_trec_ranking_with_score(trec)
    else:
        l_q_rank = trec
    if isinstance(qrel, (str, unicode)):
        h_qrel = load_trec_labels_dict(qrel)
    else:
        h_qrel = qrel
    if isinstance(q_info, (str, unicode)):
        h_q_info = load_json_info(q_info, 'qid')
    else:
        h_q_info = q_info
    if isinstance(doc_info, (str, unicode)):
        h_doc_info = load_json_info(doc_info, 'docno')
    else:
        h_doc_info = doc_info
    return l_q_rank, h_qrel, h_q_info, h_doc_info
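# Usage sketch for dynamic_load: each argument may be a file path or an
# already-loaded object, so a driver can load once and reuse. The paths here
# are hypothetical:
#     l_q_rank, h_qrel, h_q_info, h_doc_info = dynamic_load(
#         'run.trec', 'qrel.txt', 'q_info.json', 'doc_info.json')
#     # a second call reuses the loaded dicts, only re-reading the ranking
#     l_q_rank2, __, __, __ = dynamic_load(
#         'run2.trec', h_qrel, h_q_info, h_doc_info)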
def _load_data(self):
    """
    load data from the initialized data paths:
    h_qrel, h_qid_q_info, and h_q_doc_score
    :return:
    """
    self._h_qrel = load_trec_labels_dict(self.qrel_in)
    self._h_qid_q_info = load_json_info(self.q_info_in, 'qid')
    l_q_ranking_score = load_trec_ranking_with_score(
        self.q_doc_candidate_in)
    for qid, ranking_score in l_q_ranking_score:
        self._h_q_doc_score[qid] = dict(ranking_score[:self.rank_top_k])
        logging.debug('q [%s] [%d] candidate docs',
                      qid, len(self._h_q_doc_score[qid]))
    logging.info('feature extraction data preloaded')
    return
def extract(self):
    l_q_rank = load_trec_ranking_with_score(self.trec_rank_in)
    l_qid = []
    l_docno = []
    l_h_feature = []
    l_label = []
    for q, ranking in l_q_rank:
        q_info = self.h_q_info[q]
        logging.info('start extracting q [%s]', q)
        for docno, base_score in ranking:
            doc_info = self.h_doc_info.get(docno, {'docno': docno})
            if isinstance(doc_info, str):
                doc_info = json.loads(doc_info)
            label = self.h_qrel.get(q, {}).get(docno, 0)
            h_feature = dict()
            h_feature['base'] = base_score
            for extractor in self.l_extractor:
                h_feature.update(extractor.extract_pair(q_info, doc_info))
            l_qid.append(q)
            l_docno.append(docno)
            l_h_feature.append(h_feature)
            l_label.append(label)
            logging.debug('[%s][%s] feature %s',
                          q, docno, json.dumps(h_feature))
    logging.info('extraction finished, dumping...')
    h_name = dump_svm_from_raw(
        self.out_name, l_qid, l_docno, l_label, l_h_feature)
    logging.info('ranking features dumped to [%s]', self.out_name)
    json.dump(h_name, open(self.out_name + '_name.json', 'w'), indent=1)
    logging.info('ranking name dumped to [%s_name.json]', self.out_name)
    self._close_extractor()
    return
def __init__(self, **kwargs):
    super(BoeRm3, self).__init__(**kwargs)
    self.l_q_rank = load_trec_ranking_with_score(self.trec_rank_in)
    self.h_doc_info = load_doc_info_json(self.doc_info_in)
"""
split a trec ranking file into multiple parts
input: trec, out pre, q number per file
output: outpre.xx
"""
from knowledge4ir.utils import load_trec_ranking_with_score, dump_trec_ranking_with_score
import sys
import math

if 4 != len(sys.argv):
    print "3 para: trec + out pre + q per file"
    sys.exit(-1)

ll_qid_rank = load_trec_ranking_with_score(sys.argv[1])
q_per_file = int(sys.argv[3])
total_cnt = int(math.ceil(float(len(ll_qid_rank)) / q_per_file))
out_pre = sys.argv[2]
l_name = ['%d' % i for i in xrange(1, total_cnt + 1)]
max_len = len(l_name[-1])
l_name = [
    out_pre + '.' + '0' * (max_len - len(name)) + name
    for name in l_name
]
st = 0
for name in l_name:
    dump_trec_ranking_with_score(ll_qid_rank[st:st + q_per_file], name)
    st += q_per_file
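# Worked example of the splitting arithmetic above: with 103 queries and
# q_per_file = 10, total_cnt = ceil(103 / 10.0) = 11, the raw names run
# '1'..'11', max_len = 2, so the zero-padded outputs are outpre.01 through
# outpre.11; the first ten files hold 10 queries each and the last holds 3.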
def _load_data(self):
    logging.info('start loading data')
    self.h_qrel = load_trec_labels_dict(self.qrel_in)
    self.h_q_rank = dict(load_trec_ranking_with_score(self.q_rank_in))
    self.h_doc_info = load_doc_info_json(self.doc_info_in)
    logging.info('data loaded')
def _load_boe_rm3(self):
    if not self.boe_rm3_path:
        return
    l_q_e_score = load_trec_ranking_with_score(self.boe_rm3_path)
    self.h_q_boe_rm3 = dict(l_q_e_score)
    return
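# dict() over the loaded ranking, used here and in several loaders above,
# works because load_trec_ranking_with_score appears to return
# (qid, ranking) pairs; the result is a qid -> ranking lookup, e.g.
# (hypothetical ids and scores):
#     {'q1': [['e12', 0.51], ['e7', 0.33]], 'q2': [['e3', 0.42]]}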