def __init__(self, **kwargs):
    super(CrossValidator, self).__init__(**kwargs)
    assert self.model_name in h_model_name
    logging.info('using model [%s] with conf in [%s]',
                 self.model_name, self.model_conf)
    conf = Config()
    if self.model_conf:
        conf = load_py_config(self.model_conf)
    self.model = h_model_name[self.model_name](config=conf)
    logging.info('ranking model initialized')
    self.l_hyper_para = []
    self._load_hyper_para()
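# A minimal sketch of the traitlets-style .py config that load_py_config is
# expected to parse for this class; the values are hypothetical, only the
# trait names model_name / model_conf come from the code above.
#
# c.CrossValidator.model_name = 'some_model'              # must be a key of h_model_name
# c.CrossValidator.model_conf = '/path/to/model_conf.py'  # optional nested model config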
    List,
)
from knowledge4ir.utils import load_py_config, set_basic_log

set_basic_log()


class OffsetConvertPara(Configurable):
    in_name = Unicode(help='tagme json results').tag(config=True)
    out_name = Unicode(help='out name').tag(config=True)
    spot_field = Unicode(SPOT_FIELD,
                         help='boe fields: spot|tagme').tag(config=True)
    l_target_fields = List(
        Unicode,
        default_value=TARGET_TEXT_FIELDS,
        help='target fields to convert').tag(config=True)


if 2 != len(sys.argv):
    print "convert offset from char to token in tagme's ana"
    print "1 para: config"
    OffsetConvertPara.class_print_help()
    sys.exit(-1)

para = OffsetConvertPara(config=load_py_config(sys.argv[1]))
out = open(para.out_name, 'w')
for p, line in enumerate(open(para.in_name)):
    if not p % 100:
        print "converted [%d] lines" % p
    h = json.loads(line)
    h = convert_offset(h, para)
    print >> out, json.dumps(h)
out.close()
print "done"
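# Illustrative input/output of convert_offset (the field layout here is an
# assumption, not the project's exact schema): an annotation's character
# span is rewritten as a token span.
#
# before: {"bodyText": "Obama visited Paris",
#          "spot": {"bodyText": [{"surface": "Paris", "loc": [14, 19]}]}}
# after:  {"bodyText": "Obama visited Paris",
#          "spot": {"bodyText": [{"surface": "Paris", "loc": [2, 3]}]}}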
def _load_hyper_para(self):
    self.l_hyper_para = [
        HyperParameter(config=load_py_config(para_in))
        for para_in in self.l_hyper_para_in
    ]
    logging.info('[%d] hyper parameters loaded', len(self.l_hyper_para))
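# A hypothetical config fragment for the sweep: l_hyper_para_in lists one
# traitlets-style config file per hyper-parameter setting, each parsed into
# a HyperParameter instance. Paths are illustrative.
#
# c.CrossValidator.l_hyper_para_in = [
#     '/path/to/hyper_para_1.py',
#     '/path/to/hyper_para_2.py',
# ]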
        logging.info('query meta prepared, dumping...')
        out = open(self.out_name, 'w')
        for q, h_meta in self.h_q_meta.items():
            h_meta['qid'] = q
            print >> out, json.dumps(h_meta)
        out.close()
        logging.info('results at [%s]', self.out_name)


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )

    set_basic_log()
    if 2 != len(sys.argv):
        print "prep query meta, now only avg doc len"
        QueryMetaPrep.class_print_help()
        sys.exit(-1)

    preper = QueryMetaPrep(config=load_py_config(sys.argv[1]))
    preper.process()
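# One illustrative output line per query; avg_doc_len is inferred from the
# usage string above, and other meta fields may also be present:
#
# {"qid": "101", "avg_doc_len": 763.4}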
            logging.info('[%d] doc [%d] pair', p, pair_cnt)
            h = json.loads(line)
            h_ana = h.get('spot', {})
            l_source_ana = h_ana.get(self.source_field, [])
            l_target_ana = h_ana.get(self.target_field, [])
            pair_cnt += len(l_source_ana) * len(l_target_ana)
            l_s_e = [ana['entities'][0]['id'] for ana in l_source_ana]
            l_t_e = [ana['entities'][0]['id'] for ana in l_target_ana]
            for s_e in l_s_e:
                for t_e in l_t_e:
                    print >> out, s_e + ' ' + t_e
        out.close()
        logging.info('finished with [%d] pair', pair_cnt)


if __name__ == '__main__':
    from knowledge4ir.utils import load_py_config, set_basic_log
    import sys

    set_basic_log()
    if 2 != len(sys.argv):
        print "dump field context"
        print "1 para"
        FieldContext.class_print_help()
        sys.exit(-1)

    runner = FieldContext(config=load_py_config(sys.argv[1]))
    runner.process()
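# The dump is one "source_entity target_entity" id pair per line, the cross
# product of the two fields' annotations; the ids below are illustrative
# Freebase-style ids:
#
# /m/02mjmr /m/05qtj
# /m/02mjmr /m/0f8l9c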
        l_ndcg = []
        for q, eva in l_q_eva:
            if q not in self.h_q_meta:
                logging.warn('q [%s] has no meta data', q)
                continue  # skip queries without meta, otherwise KeyError below
            l_ndcg.append(eva[0])
            l_avg_doc_len.append(self.h_q_meta[q]['avg_doc_len'])
        l_bin_res, l_bin_range = bin_score(l_avg_doc_len, l_ndcg, self.nb_bin)
        h_res = {
            'avg_doc_len_bin': l_bin_res,
            'avg_doc_len_bin_range': l_bin_range,
        }
        json.dump(h_res, open(out_name, 'w'), indent=1)
        logging.info('finished, results at [%s]', out_name)


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )

    set_basic_log()
    if 4 != len(sys.argv):
        print "3 para: config + eva in + out"
        RankEvaAtQMeta.class_print_help()
        sys.exit(-1)

    aligner = RankEvaAtQMeta(config=load_py_config(sys.argv[1]))
    aligner.process(*sys.argv[2:])
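# bin_score is imported from the project's utils; below is a minimal sketch
# of its plausible behavior, assuming equal-width bins over the observed x
# range (the actual implementation may bin differently):
def bin_score_sketch(l_x, l_y, nb_bin):
    # bucket each y by its paired x value into nb_bin equal-width bins,
    # then average the y values inside each bin
    lo, hi = min(l_x), max(l_x)
    width = (hi - lo) / float(nb_bin)
    l_sum, l_cnt = [0.0] * nb_bin, [0] * nb_bin
    for x, y in zip(l_x, l_y):
        b = min(int((x - lo) / width), nb_bin - 1) if width else 0
        l_sum[b] += y
        l_cnt[b] += 1
    l_bin_res = [s / c if c else 0.0 for s, c in zip(l_sum, l_cnt)]
    l_bin_range = [[lo + i * width, lo + (i + 1) * width]
                   for i in range(nb_bin)]
    return l_bin_res, l_bin_range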
            if not doc_info:
                l_h_doc_tf.append({})
                continue
            l_e = [item[0] for item in doc_info['tagme']['bodyText']]
            h_e_tf = term2lm(l_e)
            l_h_doc_tf.append(h_e_tf)
        l_rm3_e = rm3(l_doc_score, l_h_doc_tf, None, None, None, False)
        return l_rm3_e

    def process(self):
        ll_qid_rm3 = []
        for qid, l_doc_score in self.l_q_rank:
            l_rm3_e = self._rm3_per_q(l_doc_score)
            ll_qid_rm3.append([qid, l_rm3_e])
            logging.info('qid [%s] processed with [%d] prf entity',
                         qid, len(l_rm3_e))
        dump_trec_ranking_with_score(ll_qid_rm3, self.out_name)
        logging.info('finished')


if __name__ == '__main__':
    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "perform RM3 on BOE"
        print "1 para: config"
        BoeRm3.class_print_help()
        sys.exit(-1)

    prf_worker = BoeRm3(config=load_py_config(sys.argv[1]))
    prf_worker.process()
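# rm3 is imported from the project; below is a minimal sketch of the
# relevance-model style expansion the call above performs, with the disabled
# arguments (None, None, None, False) dropped. Treating the retrieval score
# as a log-probability (hence exp) is an assumption:
def rm_expansion_sketch(l_doc_score, l_h_doc_tf, top_k=20):
    import math
    h_e_score = {}
    for (docno, score), h_tf in zip(l_doc_score, l_h_doc_tf):
        z = float(sum(h_tf.values()))
        if not z:
            continue
        w = math.exp(score)  # assumed log-scale retrieval score
        for e, tf in h_tf.items():
            h_e_score[e] = h_e_score.get(e, 0.0) + w * tf / z
    # top expansion entities, as (entity, weight) pairs
    return sorted(h_e_score.items(), key=lambda item: -item[1])[:top_k]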
        :param qid:
        :param query:
        :param docno:
        :param score:
        :param h_spot_sent:
        :return:
        """
        l_res = []
        sent_p = 0
        for spot, l_sent in h_spot_sent.items():
            for sent in l_sent:
                sent_no = docno + '_%04d' % sent_p
                sent_p += 1
                line = '\t'.join(
                    [qid, query, docno, "%d" % score, spot, sent_no, sent])
                l_res.append(line)
        return l_res


if __name__ == '__main__':
    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "fetch spot support sentences from PRF"
        print "1 para: config"
        SpotSentence.class_print_help()
        sys.exit(-1)

    spot = SpotSentence(config=load_py_config(sys.argv[1]))
    spot.process()
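# One illustrative output line, fields joined by '\t' in the order built
# above (all values hypothetical):
#
# 101  obama family tree  clueweb09-en0000-01-00000  1  Obama  clueweb09-en0000-01-00000_0000  Barack Obama was born in Honolulu .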
    def process(self):
        logging.info('start aligning eval and e att results')
        out = open(self.out_name, 'w')
        l_key_l_res = []
        for qid in self.l_h_qid_e_att[0].keys():
            key_ndcg, l_lines = self._form_one_q(qid)
            if key_ndcg is None:
                continue
            l_key_l_res.append((key_ndcg, l_lines))
            logging.info('q [%s] results get', qid)
        logging.info('sort...')
        l_key_l_res.sort(key=lambda item: item[0])
        for key, l_lines in l_key_l_res:
            print >> out, '\n'.join(l_lines)
        out.close()
        logging.info('finished')


if __name__ == '__main__':
    from knowledge4ir.utils import (load_py_config, set_basic_log)

    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "get e att aligned results for manual analysis"
        PrettyCompEAtt.class_print_help()
        sys.exit(-1)

    ana = PrettyCompEAtt(config=load_py_config(sys.argv[1]))
    ana.process()
        l_h_feature = [json.load(open(f_name)) for f_name in l_f_name]
        h_name = l_h_feature[0]
        for h_feature in l_h_feature[1:]:
            # all shards must share the same feature-name mapping
            assert h_name == h_feature

    def _combine(self, l_svm_in, out_name):
        lines = sum([open(svm_in).read().splitlines()
                     for svm_in in l_svm_in], [])
        out = open(out_name, 'w')
        print >> out, '\n'.join(lines)
        out.close()
        h = json.load(open(l_svm_in[0] + self.feature_name_suffix))
        json.dump(h, open(out_name + self.feature_name_suffix, 'w'), indent=1)
        return


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import load_py_config, set_basic_log

    set_basic_log()
    if 3 > len(sys.argv):
        print "combine svm"
        print "2+ para: in prefix + out prefix + config (can be default)"
        CombineSVMFeature.class_print_help()
        sys.exit(-1)

    if len(sys.argv) > 3:
        combiner = CombineSVMFeature(config=load_py_config(sys.argv[3]))
    else:
        combiner = CombineSVMFeature()
    combiner.process(sys.argv[1], sys.argv[2])
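# _combine concatenates per-shard svmlight-style ranking files and copies
# the shared feature-name json; illustrative svm lines (format assumed, the
# standard "label qid:q fid:val ... # docno" layout):
#
# 1 qid:101 1:0.53 2:1.20 # clueweb09-en0000-01-00000
# 0 qid:101 1:0.11 2:0.07 # clueweb09-en0000-01-00001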
            return None, None
        l_pos, ll_nlss_words = [], []
        for support_info in h['supports']:
            e_id = support_info['id']
            l_sent = support_info['sentences']
            if e_id not in self.h_e_id:
                continue
            e_pos = self.h_e_id[e_id]
            for sent in l_sent:
                l_words = tokenize_and_remove_punctuation(sent.lower())
                # map words to ids (0 = out of vocabulary), cap the nlss length
                l_w_id = [self.h_w_id.get(w, 0) for w in l_words][:self.max_nlss_len]
                l_pos.append(e_pos)
                ll_nlss_words.append(l_w_id)
        return l_pos, ll_nlss_words


if __name__ == '__main__':
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )

    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "1 para, config"
        ExtSemanticPrep.class_print_help()
        sys.exit(-1)

    prep = ExtSemanticPrep(config=load_py_config(sys.argv[1]))
    prep.process()
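# Illustrative word-hashing step (vocab hypothetical): with
# h_w_id = {'obama': 17, 'paris': 42} and max_nlss_len = 3, the sentence
# "Obama visited Paris" tokenizes to ['obama', 'visited', 'paris'] and maps
# to [17, 0, 42]; out-of-vocabulary words get id 0.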
                h_qid_sent[qid] = [(sentno, sent, score)]
            else:
                h_qid_sent[qid].append((sentno, sent, score))

        # sort each item
        # keep only top 100 to disk
        # out = open(out_name, 'w')
        l = h_qid_sent.keys()
        l.sort(key=lambda item: int(item))
        for qid in l:
            h_qid_sent[qid].sort(key=lambda item: -item[-1])
            h_qid_sent[qid] = h_qid_sent[qid][:100]
            # print >> out, '%s\t%s' % (qid, json.dumps(h_qid_sent[qid]))
        # out.close()
        logging.info(
            'qid -> prf sentences prepared, start dumping to one json dict')
        json.dump(h_qid_sent, open(out_name, 'w'), indent=1)
        logging.info('prf sentence json dict dumped to [%s]', out_name)


if __name__ == '__main__':
    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "rank sent via cosine embedding"
        print "1 para: config"
        SpotSentAttention.class_print_help()
        sys.exit(-1)

    atter = SpotSentAttention(config=load_py_config(sys.argv[1]))
    atter.generate_ranking()
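# Illustrative shape of the dumped json dict (values hypothetical): qid ->
# its top-100 (sent_no, sentence, score) triples, highest score first:
#
# {"101": [["d1_0003", "Obama was born in Honolulu .", 0.91],
#          ["d7_0010", "He served as the 44th president .", 0.87]]}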
class SplitConf(Configurable):
    in_name = Unicode(help='input conf').tag(config=True)
    out_pre = Unicode(help='output conf prefix').tag(config=True)
    place_holder = Unicode('##', help='string to replace').tag(config=True)
    l_target_str = List(
        Unicode,
        default_value=["01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
                       "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
                       "21", "22", "23", "24", "25", "26", "27", "28", "29", "30"]
    ).tag(config=True)

    def process(self):
        lines = open(self.in_name).read().splitlines()
        for suf in self.l_target_str:
            out = open(self.out_pre + '.' + suf, 'w')
            new_lines = [line.replace(self.place_holder, suf) for line in lines]
            print >> out, '\n'.join(new_lines)
            out.close()
            print "[%s] done" % (self.out_pre + '.' + suf)


if __name__ == '__main__':
    from knowledge4ir.utils import load_py_config

    if 2 != len(sys.argv):
        print "split conf, 1 para: config"
        SplitConf.class_print_help()
        sys.exit(-1)

    conf_spliter = SplitConf(config=load_py_config(sys.argv[1]))
    conf_spliter.process()
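# Illustrative run (paths hypothetical): with the default place_holder '##',
# an input conf line
#   c.SomeClass.data_in = '/path/fold##/train.svm'
# is written to <out_pre>.07 as
#   c.SomeClass.data_in = '/path/fold07/train.svm'
# producing one conf per target string: <out_pre>.01 ... <out_pre>.30.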
            h_stat = self._get_stats(h_doc)
            l_h_stat.append(h_stat)
            score = h_doc.get('eval', {}).get(self.target_metric, 0)
            l_score.append(score)
        logging.info('all results loaded, start binning')
        h_stat_bin = dict()
        for stat in self.l_target_stat:
            logging.info('binning [%s]', stat)
            l_stat = [h[stat] for h in l_h_stat]
            l_bin_res, l_bin_range = bin_score(l_stat, list(l_score), self.nb_bin)
            h_stat_bin[stat] = l_bin_res
            h_stat_bin[stat + '_range'] = l_bin_range
            logging.info('[%s] bin %s', stat, json.dumps(l_bin_res))
        json.dump(h_stat_bin, open(out_name, 'w'), indent=1)
        logging.info('finished')


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import load_py_config, set_basic_log

    set_basic_log()
    if 4 > len(sys.argv):
        print "3 para: config + predicted results to analyze + out name"
        EvaVsStat.class_print_help()
        sys.exit(-1)

    ana = EvaVsStat(config=load_py_config(sys.argv[1]))
    ana.process(*sys.argv[2:])
            set_word.update(h_res['set_word'])
            set_e.update(h_res['set_e'])
            for key in h_total_set:
                h_total_set[key] += h_res.get(key, 0)

        h_total_set['word_vocab'] = len(set_word)
        h_total_set['entity_vocab'] = len(set_e)
        # turn the count totals into per-document means
        nb_d = float(h_total_set['d_cnt'])
        h_total_set['word_cnt'] /= nb_d
        h_total_set['e_cnt'] /= nb_d
        h_total_set['salience_e_cnt'] /= nb_d
        json.dump(h_total_set, open(out_name, 'w'), indent=1)
        print "finished"
        return


if __name__ == '__main__':
    from knowledge4ir.utils import load_py_config
    import sys

    if 2 > len(sys.argv):
        print "1+ para: config + in + out (the last two can be in config)"
        DatasetStat.class_print_help()
        sys.exit(-1)

    processor = DatasetStat(config=load_py_config(sys.argv[1]))
    in_name, out_name = (None, None) if len(sys.argv) < 4 else (sys.argv[2], sys.argv[3])
    processor.process(in_name, out_name)
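# Illustrative output (numbers hypothetical): the *_cnt fields are averaged
# into per-document means, d_cnt stays a total, and the vocabularies are
# set sizes:
#
# {"d_cnt": 50000, "word_cnt": 412.7, "e_cnt": 31.2,
#  "salience_e_cnt": 4.8, "word_vocab": 180345, "entity_vocab": 22410}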
    trec_in = Unicode(help='candidate ranking').tag(config=True)
    out_dir = Unicode(help='out_dir').tag(config=True)
    testing = Bool(False, help='testing').tag(config=True)
    with_att = Bool(False, help='whether to dump attention').tag(config=True)
    att_dim = Int(7, help='attention feature dimension').tag(config=True)


set_basic_log(logging.INFO)
if 2 != len(sys.argv):
    print "convert raw json ts to pairwise and pointwise training data"
    print "1 para, config:"
    MainPara.class_print_help()
    sys.exit(-1)

para = MainPara(config=load_py_config(sys.argv[1]))
global att_dim
att_dim = para.att_dim
s_qid = None
if para.testing:
    s_qid = set(['%s' % i for i in range(1, 11)])  # testing

pair_x, pair_y = pairwise_reader(para.trec_in, para.qrel_in, para.q_info_in,
                                 para.doc_info_in, s_qid, para.with_att)
logging.info('dumping pairwise x, y')
dump_data(pair_x, pair_y, os.path.join(para.out_dir, 'pairwise'))
point_x, point_y = pointwise_reader(para.trec_in, para.qrel_in, para.q_info_in,
                                    para.doc_info_in, s_qid, para.with_att)
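# For orientation (inferred from the reader names, not shown here): the
# pointwise reader yields one feature vector plus relevance label per
# (query, doc), while the pairwise reader yields (relevant, non-relevant)
# doc pairs per query, the usual hinge-loss LeToR setup. The exact tensor
# layout lives in pairwise_reader / pointwise_reader.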
            l_label.append(label)
            logging.debug('[%s][%s] feature %s', q, docno, json.dumps(h_feature))
        logging.info('extraction finished, dumping...')
        h_name = dump_svm_from_raw(self.out_name, l_qid, l_docno,
                                   l_label, l_h_feature)
        logging.info('ranking features dumped to [%s]', self.out_name)
        json.dump(h_name, open(self.out_name + '_name.json', 'w'), indent=1)
        logging.info('ranking name dumped to [%s_name.json]', self.out_name)
        self._close_extractor()
        return

    def _close_extractor(self):
        for extractor in self.l_extractor:
            extractor.close_resource()


if __name__ == '__main__':
    import sys

    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "1 para: config"
        BoeLeToRFeatureExtractCenter.class_print_help()
        sys.exit(-1)

    center = BoeLeToRFeatureExtractCenter(config=load_py_config(sys.argv[1]))
    center.extract()
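# dump_svm_from_raw returns the feature-name -> svm-dimension mapping saved
# to <out_name>_name.json; illustrative shape only, the feature names here
# are hypothetical:
#
# {"TopEntityTF": 1, "EntityDescBM25": 2}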
        l_sent = h_e_nlss.get(tail, [])
        l_aligned.append([edge, tail, {'nlss_sent': l_sent}])
    return l_aligned


def process(resource, out_name):
    out = open(out_name, 'w')
    for e, h_edge_info in resource.h_e_edge.items():
        l_edge = h_edge_info.get('edges', [])
        logging.info('aligning for [%s]', e)
        l_nlss = resource.l_h_nlss[0].get(e, [])
        h_aligned_info = align_per_entity(e, l_nlss, l_edge)
        print >> out, json.dumps(h_aligned_info)
    out.close()
    logging.info('finished')


if __name__ == '__main__':
    from knowledge4ir.utils import set_basic_log, load_py_config

    set_basic_log()
    if 3 != len(sys.argv):
        print "2 para: resource config + output"
        JointSemanticResource.class_print_help()
        sys.exit(-1)

    resource = JointSemanticResource(config=load_py_config(sys.argv[1]))
    process(resource, sys.argv[2])
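# One illustrative dumped line (shape read off align_per_entity above; the
# edge name, ids, and sentence are hypothetical):
#
# [["people.person.spouse_s", "/m/025s5v9",
#   {"nlss_sent": ["Obama married Michelle Robinson in 1992 ."]}]]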
""" overfit a data """ from knowledge4ir.model.cross_validator import CrossValidator import sys import logging from knowledge4ir.utils import (load_py_config, set_basic_log) set_basic_log(logging.INFO) if len(sys.argv) < 4: print "overfit data" print "3 para: config in + data in + out dir" print "config:" CrossValidator.class_print_help() sys.exit(-1) conf = load_py_config(sys.argv[1]) in_name = sys.argv[2] out_dir = sys.argv[3] cv = CrossValidator(config=conf) cv.train_test_files(in_name, in_name, out_dir)
                         self.feature_names_split[dim])
        for (dim, h_total_eva), name in zip(enumerate(l_h_total_eva),
                                            self.feature_names_split):
            # average the running eva totals over the p docs predicted
            h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
            l_mean_eva = sorted(h_mean_eva.items(), key=lambda item: item[0])
            logging.info('finished predicted [%d] docs, eva %s',
                         p, json.dumps(l_mean_eva))
            with open(self.test_out + "_" + name.replace(" ", "_") + '.eval',
                      'w') as o:
                json.dump(l_mean_eva, o, indent=1)
        for out in outs:
            out.close()


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (load_py_config, set_basic_log)

    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "feature based baseline, 1 para, config:"
        FeatureBasedBaseline.class_print_help()
        sys.exit(-1)

    runner = FeatureBasedBaseline(config=load_py_config(sys.argv[1]))
    runner.process()
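# add_svm_feature / mutiply_svm_feature (sic) come from the project's utils;
# minimal sketches of their plausible semantics, inferred from their use in
# these baselines as a running total turned into a mean with weight 1/p:
def add_svm_feature_sketch(h_a, h_b):
    # element-wise sum of two sparse {name: value} dicts
    h_out = dict(h_a)
    for k, v in h_b.items():
        h_out[k] = h_out.get(k, 0.0) + v
    return h_out


def mutiply_svm_feature_sketch(h, w):
    # scale every value by w; with w = 1.0 / p this yields the mean eva
    return dict((k, v * w) for k, v in h.items())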
            # reciprocal-rank baseline score: the entity at rank r gets 1/r
            prediction[eid] = 1.0 / rank
        eva = self.evaluator.evaluate(prediction, labels)
        h_out = {
            'docno': data['docno'],
            body_field: {
                'predict': zip(l_e, prediction),
            },
            'eval': eva,
        }
        h_total_eva = add_svm_feature(h_total_eva, eva)
        h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
        print >> out, json.dumps(h_out)
        if not p % 1000:
            logging.info('predicted [%d] docs, eva %s', p, json.dumps(h_mean_eva))


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (load_py_config, set_basic_log)

    set_basic_log(logging.INFO)
    runner = SummarizationBaseline(config=load_py_config(sys.argv[1]))
    runner.process()
                continue
            l_e_score = h_prediction[field][self.predict_field]
            h_e_score = dict(l_e_score)
            l_e = h_hashed_info['spot'][field]['entities']
            # align predicted scores back to the corpus entity order
            l_score = [h_e_score[e] for e in l_e]
            h_hashed_info['spot'][field][self.predict_field] = l_score
        return h_hashed_info


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )

    set_basic_log(logging.INFO)
    if 4 > len(sys.argv):
        print "align predicted res with raw or hashed corpus"
        print "3+ para: corpus_in + predicted in + out name + config (opt)"
        AlignPredicted.class_print_help()
        sys.exit(-1)

    if 5 <= len(sys.argv):
        aligner = AlignPredicted(config=load_py_config(sys.argv[4]))
    else:
        aligner = AlignPredicted()
    corpus_in, predicted_in, out_name = sys.argv[1:4]
    aligner.align_predict_to_corpus(corpus_in, predicted_in, out_name)