# Standard-library and third-party imports used by the functions below; the
# project-specific helpers (file2name, get_model_param, train_test_years,
# year_qids, qid_year, qrelfdir, get_qrelf, read_qrel, create_docpairs,
# get_epoch_from_val, read_run, eval_docpair_predaccuracy, print_rerun,
# read_eval_res, get_rank, fold_names, perlf, trec_run_basedir,
# eval_trec_run_basedir, df, PATH_MAX, ...) are assumed to come from the
# project's own modules.
import importlib
import os
import subprocess
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel


def main(_log, _config):
    p = _config
    modelname = file2name[p['modelfn']]
    mod_model = importlib.import_module('models.%s' % p['modelfn'])
    model_cls = getattr(mod_model, modelname)
    model_params_raw = {k: v for k, v in p.items()
                        if k in model_cls.params or k == 'modelfn'}
    list_of_model_params = get_model_param(model_params_raw)
    expids = list()
    for model_params in list_of_model_params:
        model = model_cls(model_params, rnd_seed=p['seed'])
        expid = model.params_to_string(model_params, True)
        expids.append(expid)
    raw_expid = model.params_to_string(model_params_raw, True)
    for train_years in train_test_years:
        for i in range(len(train_test_years[train_years])):
            # each of the two held-out years serves once as test and once as validation
            test_year, val_year = (train_test_years[train_years][i],
                                   train_test_years[train_years][1 - i])
            pred_dirs, val_dirs = list(), list()
            for expid in expids:
                pred_dir = '%s/train_%s/%s/predict_per_epoch/test_%s/%s' % (
                    p['parentdir'], train_years, p['expname'], test_year, expid)
                val_dir = '%s/train_%s/%s/predict_per_epoch/test_%s/%s' % (
                    p['parentdir'], train_years, p['expname'], val_year, expid)
                if not os.path.isdir(pred_dir) or not os.path.isdir(val_dir):
                    warnings.warn('No such dir {0}/{1}'.format(pred_dir, val_dir),
                                  RuntimeWarning)
                    continue
                pred_dirs.append(pred_dir)
                val_dirs.append(val_dir)
            output_file = '%s/train_%s/%s/evaluations/statdocpair/%s_v-%s_t-%s/%s' % (
                p['outdir'], train_years, p['expname'],
                '-'.join(train_years.split('_')), val_year[2:], test_year[2:], raw_expid)
            try:
                if not os.path.isdir(os.path.dirname(output_file)):
                    os.makedirs(os.path.dirname(output_file))
            except OSError:
                pass  # the directory may already exist or have been created concurrently
            _log.info('evaluate {0} on {1} based on val {2} '
                      'over docpairs benchmark and output to {3}'.format(
                          expid, test_year, val_year, output_file))
            test_qids = year_qids[test_year]
            qrelf = get_qrelf(qrelfdir, test_year)
            qid_cwid_label = read_qrel(qrelf, test_qids, include_spam=False)
            year_pkey_docpairs = create_docpairs(qid_cwid_label, test_qids, qid_year)
            # pick the epoch that performed best on the validation year
            best_pred_dir, argmax_epoch, argmax_run, argmax_ndcg, argmax_err = \
                get_epoch_from_val(pred_dirs, val_dirs)
            qid_cwid_invrank, _, runid = read_run(os.path.join(best_pred_dir, argmax_run))
            pkey_qid_acc = eval_docpair_predaccuracy(qid_cwid_invrank,
                                                     year_pkey_docpairs, test_year)
            sorted_data = sorted(pkey_qid_acc)
            print("pkey_qid_acc : ", sorted_data)
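# Illustration only, not the project's implementation: eval_docpair_predaccuracy
# is a project helper, but the doc-pair benchmark it serves boils down to counting
# how many qrel-labelled document pairs (more relevant vs. less relevant) the
# chosen run orders correctly. A minimal sketch of that idea, assuming
# qid_cwid_invrank maps qid -> cwid -> inverse rank (larger = ranked higher) and
# docpairs maps qid -> [(better_cwid, worse_cwid), ...]; the real helper also
# breaks accuracy down per pair key and query, which this sketch skips.
def _docpair_accuracy_sketch(qid_cwid_invrank, docpairs):
    correct, total = 0, 0
    for qid, pairs in docpairs.items():
        ranks = qid_cwid_invrank.get(qid, {})
        for better, worse in pairs:
            if better in ranks and worse in ranks:
                total += 1
                correct += int(ranks[better] > ranks[worse])
    return correct / total if total else float('nan')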
def trec_run_predscore(_log, trec_run_dir, qid_cwid_pred):
    """Replace the scores in every TREC run under trec_run_dir with the model's
    predicted scores; documents without a prediction are pushed to the bottom."""
    sysn_qid_cwid_score = dict()
    for f in os.listdir(trec_run_dir):
        _, qid_cwid_score, trecsys_name = read_run(os.path.join(trec_run_dir, f))
        for qid in qid_cwid_score:
            if qid not in qid_cwid_pred:
                _log.error('{0} is not included in qid_cwid_pred'.format(qid))
                continue
            for cwid in qid_cwid_score[qid]:
                if cwid in qid_cwid_pred[qid]:
                    qid_cwid_score[qid][cwid] = qid_cwid_pred[qid][cwid]
                else:
                    qid_cwid_score[qid][cwid] = -float('Inf')
        sysn_qid_cwid_score[trecsys_name] = dict(qid_cwid_score)
    return sysn_qid_cwid_score
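# A shape-level example of the substitution trec_run_predscore performs (toy qid
# and document ids, not project data): documents the model scored take the model
# score, unscored documents drop to -inf, and each run keeps its original
# document set.
#
#     qid_cwid_score = {'201': {'clueweb-A': 12.3, 'clueweb-B': 10.1}}
#     qid_cwid_pred  = {'201': {'clueweb-A': 0.87}}
#     # after substitution: {'201': {'clueweb-A': 0.87, 'clueweb-B': -inf}}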
def main(_log, _config):
    p = _config
    modelname = file2name[p['modelfn']]
    mod_model = importlib.import_module('models.%s' % p['modelfn'])
    model_cls = getattr(mod_model, modelname)
    model_params_raw = {k: v for k, v in p.items()
                        if k in model_cls.params or k == 'modelfn'}
    list_of_model_params = get_model_param(model_params_raw)
    expids = list()
    for model_params in list_of_model_params:
        model = model_cls(model_params, rnd_seed=p['seed'])
        expid = model.params_to_string(model_params, True)
        expids.append(expid)
    raw_expid = model.params_to_string(model_params_raw, True)
    for train_years in train_test_years:
        for i in range(len(train_test_years[train_years])):
            test_year, val_year = (train_test_years[train_years][i],
                                   train_test_years[train_years][1 - i])
            pred_dirs, val_dirs = list(), list()
            for expid in expids:
                pred_dir = '%s/train_%s/%s/predict_per_epoch/test_%s/%s' % (
                    p['parentdir'], train_years, p['expname'], test_year, expid)
                val_dir = '%s/train_%s/%s/predict_per_epoch/test_%s/%s' % (
                    p['parentdir'], train_years, p['expname'], val_year, expid)
                if not os.path.isdir(pred_dir) or not os.path.isdir(val_dir):
                    warnings.warn('No such dir {0}/{1}'.format(pred_dir, val_dir),
                                  RuntimeWarning)
                    continue
                pred_dirs.append(pred_dir)
                val_dirs.append(val_dir)
            output_ql = '%s/train_%s/%s/evaluations/rerank-ql/%s_v-%s_t-%s/%s' % (
                p['outdir'], train_years, p['expname'],
                '-'.join(train_years.split('_')), val_year[2:], test_year[2:], raw_expid)
            output_rrall = '%s/train_%s/%s/evaluations/rerank-all/%s_v-%s_t-%s/%s' % (
                p['outdir'], train_years, p['expname'],
                '-'.join(train_years.split('_')), val_year[2:], test_year[2:], raw_expid)
            reranked_run_dir = '%s/train_%s/%s/reranking/trecrun/v%s-t%s_%s' % (
                p['outdir'], train_years, p['expname'], val_year, test_year, raw_expid)
            reranked_metric_dir = '%s/train_%s/%s/reranking/ndcgerr/v%s-t%s_%s' % (
                p['outdir'], train_years, p['expname'], val_year, test_year, raw_expid)
            for d in (reranked_run_dir, reranked_metric_dir,
                      os.path.dirname(output_ql), os.path.dirname(output_rrall)):
                try:
                    if not os.path.isdir(d):
                        os.makedirs(d)
                except OSError as e:
                    print(e)
            _log.info('evaluate {0} on {1} based on val {2} '
                      'over docpairs benchmark and output to {3}, {4}'.format(
                          expid, test_year, val_year, output_ql, output_rrall))
            trec_run_dir = '{0}/{1}'.format(trec_run_basedir, test_year)
            eval_trecrun_dir = '{0}/{1}'.format(eval_trec_run_basedir, test_year)
            test_qids = year_qids[test_year]
            qrelf = get_qrelf(qrelfdir, test_year)
            qid_cwid_label = read_qrel(qrelf, test_qids, include_spam=False)
            best_pred_dir, argmax_epoch, argmax_run, argmax_ndcg, argmax_err = \
                get_epoch_from_val(pred_dirs, val_dirs)

            # create the re-ranked runs from the best validation epoch
            qid_cwid_pred, _, pred_expid = read_run(os.path.join(best_pred_dir, argmax_run))
            sysn_qid_cwid_pred = trec_run_predscore(_log, trec_run_dir, qid_cwid_pred)
            print_rerun(sysn_qid_cwid_pred, reranked_run_dir, pred_expid, val_year, test_year)

            # eval re-rank: score each re-ranked run with the ndcg/err script
            for runfile in os.listdir(reranked_run_dir):
                outfile = '%s/%s.ndcg_err' % (reranked_metric_dir, runfile[:-3])
                with open(outfile, 'w') as outf:
                    subprocess.Popen([perlf, qrelf, '%s/%s' % (reranked_run_dir, runfile)],
                                     stdout=outf)
                _log.info('finished {0} train on {1}, val on {2}, test on {3}'.format(
                    runfile, train_years, val_year, test_year))

            # read in eval and generate results; wait until the asynchronous
            # subprocesses have produced one metric file per run
            trecrun_qid_ndcgerr = read_eval_res(eval_trecrun_dir)
            while True:
                rr_trecrun_qid_ndcgerr = read_eval_res(reranked_metric_dir)
                if len(trecrun_qid_ndcgerr) == len(rr_trecrun_qid_ndcgerr):
                    break
                _log.error('mismatched #run {0} != {1}'.format(
                    len(trecrun_qid_ndcgerr), len(rr_trecrun_qid_ndcgerr)))
                # latency for subprocess.Popen
                time.sleep(2)

            # each entry: orig_rank, orig_score, qid scores, rr_rank, rr_score, qid scores
            orig_rr_ndcg_rank, orig_rr_err_rank = get_rank(rr_trecrun_qid_ndcgerr,
                                                           trecrun_qid_ndcgerr)
            if test_year in fold_names:
                # query likelihood benchmark
                cols = ['QL-Variants', 'Measures', 'TREC', 'Trec-Rank', 'Rerank',
                        'Rerank-Rank', 'Comparison', 'p-value']
                tabledict = dict()
                measure_ind = {'ERR': 1, 'nDCG': 0}
                for j, col in enumerate(cols):
                    tabledict[col] = list()
                    for method in ['cwindri']:
                        for measure in ['ERR', 'nDCG']:
                            if j == 0:
                                tabledict[col].append(method)
                            elif j == 1:
                                tabledict[col].append(measure)
                            elif j == 2:
                                # original trec score
                                if measure == 'ERR':
                                    tabledict[col].append('%.3f' % orig_rr_err_rank[method][1])
                                elif measure == 'nDCG':
                                    tabledict[col].append('%.3f' % orig_rr_ndcg_rank[method][1])
                            elif j == 3:
                                # original trec rank
                                if measure == 'ERR':
                                    tabledict[col].append(orig_rr_err_rank[method][0])
                                elif measure == 'nDCG':
                                    tabledict[col].append(orig_rr_ndcg_rank[method][0])
                            elif j == 4:
                                # reranked score
                                if measure == 'ERR':
                                    tabledict[col].append('%.3f' % orig_rr_err_rank[method][4])
                                elif measure == 'nDCG':
                                    tabledict[col].append('%.3f' % orig_rr_ndcg_rank[method][4])
                            elif j == 5:
                                # reranked rank
                                if measure == 'ERR':
                                    tabledict[col].append(orig_rr_err_rank[method][3])
                                elif measure == 'nDCG':
                                    tabledict[col].append(orig_rr_ndcg_rank[method][3])
                            elif j == 6:
                                # comparison: (reranked - trec) / trec * 100%
                                if measure == 'ERR':
                                    comp = (orig_rr_err_rank[method][4] -
                                            orig_rr_err_rank[method][1]) / orig_rr_err_rank[method][1]
                                    tabledict[col].append('%.0f%%' % (comp * 100))
                                elif measure == 'nDCG':
                                    comp = (orig_rr_ndcg_rank[method][4] -
                                            orig_rr_ndcg_rank[method][1]) / orig_rr_ndcg_rank[method][1]
                                    tabledict[col].append('%.0f%%' % (comp * 100))
                            elif j == 7:
                                # comparison: p-value of the paired t-test over per-query scores
                                if measure == 'ERR':
                                    _, p_err_diff = ttest_rel(orig_rr_err_rank[method][2],
                                                              orig_rr_err_rank[method][5])
                                    tabledict[col].append('%.3f' % p_err_diff)
                                elif measure == 'nDCG':
                                    _, p_ndcg_diff = ttest_rel(orig_rr_ndcg_rank[method][2],
                                                               orig_rr_ndcg_rank[method][5])
                                    tabledict[col].append('%.3f' % p_ndcg_diff)
                dftable = pd.DataFrame(tabledict, columns=cols, index=None)
                _log.info('\n' + dftable.to_string())
                dftable.to_csv(output_ql + '.csv', float_format='%.3f', header=True,
                               index=False, sep=',', mode='w')
                _log.info('finished ql benchmark {0} {1} {2} {3}'.format(
                    expid, train_years, val_year, test_year))

            # re-rank all benchmark
            def comparison(orig_rr_rank):
                count = 0
                percents = list()
                for r in orig_rr_rank:
                    orig_rank, orig_score, orig_scores, rr_rank, rr_score, rr_scores = orig_rr_rank[r]
                    if rr_rank < orig_rank:
                        count += 1
                    # compute micro avg of the relative change
                    qid_chg = (rr_score - orig_score) / orig_score
                    percents.append(qid_chg)
                return count, np.mean(percents), np.median(percents)

            cols = ['Measures', '#Total Runs', '#Improved', 'Avg', 'Median']
            orig_rr_ranks = [orig_rr_ndcg_rank, orig_rr_err_rank]
            tabledict = list()
            for i, measure in enumerate(['nDCG', 'ERR']):
                tabledict.append(dict())
                count, avg_chg, median_chg = comparison(orig_rr_ranks[i])
                for j, col in enumerate(cols):
                    if j == 0:
                        tabledict[i][col] = measure
                    elif j == 1:
                        tabledict[i][col] = len(orig_rr_ranks[i])
                    elif j == 2:
                        tabledict[i][col] = count
                    elif j == 3:
                        tabledict[i][col] = '%.0f%%' % (avg_chg * 100)
                    elif j == 4:
                        tabledict[i][col] = '%.0f%%' % (median_chg * 100)
            dftable = pd.DataFrame(tabledict, columns=cols, index=None)
            _log.info('\n' + dftable.to_string())
            dftable.to_csv(output_rrall + '.csv', float_format='%.3f', header=True,
                           index=False, sep=',', mode='w')
            _log.info('finished rerank all benchmark {0} {1} {2} {3}'.format(
                expid, train_years, val_year, test_year))

            # rank vs. percentage of change
            def rank_improve(orig_rr_rank):
                oscore_percent = list()
                for r in orig_rr_rank:
                    orig_rank, orig_score, orig_scores, rr_rank, rr_score, rr_scores = orig_rr_rank[r]
                    percent = (rr_score - orig_score) / orig_score
                    oscore_percent.append((orig_score, percent))
                return [p for s, p in sorted(oscore_percent, key=lambda s_p: s_p[0],
                                             reverse=True)]

            def plot_curve(ranks, ndcg_ps, err_ps, outfilename):
                fig, ax = plt.subplots()
                rects1 = ax.scatter(ranks, ndcg_ps, s=25, c='b', marker="^", lw=0)
                rects2 = ax.scatter(ranks, err_ps, s=25, c='r', marker="o", lw=0)
                vals = ax.get_yticks()
                ax.set_yticklabels(['{:3.0f}%'.format(x * 100) for x in vals])
                ax.set_xlabel('Rank of runs from TREC sorted by corresponding measures')
                ax.set_ylabel('Relative improvement based on ERR/nDCG')
                ax.legend((rects1, rects2),
                          ('Improvements based on nDCG', 'Improvements based on ERR'))
                plt.grid(b=False, linestyle='--')
                fig.savefig(outfilename + '.pdf', format='pdf')
                plt.close()

            ndcg_ps = rank_improve(orig_rr_ndcg_rank)
            err_ps = rank_improve(orig_rr_err_rank)
            ranks = range(1, len(ndcg_ps) + 1)
            plot_curve(ranks, ndcg_ps, err_ps, output_rrall)
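# Context for the p-value column above: ttest_rel from scipy.stats is the paired
# t-test, and per the tuple layout noted earlier (orig_rank, orig_score, qid
# scores, rr_rank, rr_score, qid scores), indices [2] and [5] hold the per-query
# measure values of the original and reranked run in the same query order. A
# standalone toy example with made-up nDCG values (not project data):
#
#     from scipy.stats import ttest_rel
#     orig_ndcg = [0.21, 0.34, 0.18, 0.40]
#     rerank_ndcg = [0.25, 0.36, 0.17, 0.45]
#     t_stat, p_value = ttest_rel(orig_ndcg, rerank_ndcg)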
def main(_log, _config):
    p = _config
    modelname = file2name[p['modelfn']]
    mod_model = importlib.import_module('models.%s' % p['modelfn'])
    model_cls = getattr(mod_model, modelname)
    model_params_raw = {k: v for k, v in p.items()
                        if k in model_cls.params or k == 'modelfn'}
    list_of_model_params = get_model_param(model_params_raw)
    expids = list()
    for model_params in list_of_model_params:
        model = model_cls(model_params, rnd_seed=p['seed'])
        expid = model.params_to_string(model_params, True)
        expids.append(expid)
    raw_expid = model.params_to_string(model_params_raw, True)
    for train_years in train_test_years:
        for i in range(len(train_test_years[train_years])):
            test_year, val_year = (train_test_years[train_years][i],
                                   train_test_years[train_years][1 - i])
            pred_dirs, val_dirs = list(), list()
            for expid in expids:
                default_dir = lambda year: '%s/train_%s/%s/predict_per_epoch/test_%s/%s' % \
                    (p['parentdir'], train_years, p['expname'], year, expid)
                # prompt interactively if the expected prediction dirs are missing
                pred_dir = default_dir(test_year)
                while not os.path.isdir(pred_dir):
                    print(pred_dir, 'is not a valid pred_dir')
                    pred_dir = input('Enter new pred_dir: ')
                val_dir = default_dir(val_year)
                while not os.path.isdir(val_dir):
                    print(val_dir, 'is not a valid val_dir')
                    val_dir = input('Enter new val_dir: ')
                pred_dirs.append(pred_dir)
                val_dirs.append(val_dir)
            # avoid an over-long output path by truncating it to PATH_MAX
            output_file_long = '%s/train_%s/%s/evaluations/statdocpair/%s_v-%s_t-%s/%s' % (
                p['outdir'], train_years, p['expname'],
                '-'.join(train_years.split('_')), val_year[2:], test_year[2:], raw_expid)
            output_file = output_file_long[:PATH_MAX]
            if not os.path.isdir(os.path.dirname(output_file)):
                os.makedirs(os.path.dirname(output_file))
            _log.info('evaluate {0} on {1} based on val {2} '
                      'over docpairs benchmark and output to {3}'.format(
                          expid, test_year, val_year, output_file))
            test_qids = year_qids[test_year]
            qrelf = get_qrelf(qrelfdir, test_year)
            qid_cwid_label = read_qrel(qrelf, test_qids, include_spam=False)
            year_pkey_docpairs = create_docpairs(qid_cwid_label, test_qids, qid_year)
            best_pred_dir, argmax_epoch, argmax_run, argmax_ndcg, argmax_err = \
                get_epoch_from_val(pred_dirs, val_dirs)
            qid_cwid_invrank, _, runid = read_run(os.path.join(best_pred_dir, argmax_run))
            pkey_qid_acc = eval_docpair_predaccuracy(qid_cwid_invrank,
                                                     year_pkey_docpairs, test_year)
            # df is the project's DataFrame constructor (presumably pandas.DataFrame)
            dftable = df(pkey_qid_acc,
                         index=sorted(list(qid_cwid_invrank.keys())) + [0, -1])
            _log.info('\n' + dftable.to_string())
            dftable.to_csv(output_file + '.csv', float_format='%.3f', header=True,
                           index=True, sep=',', mode='w')
            _log.info('finished {0} {1} {2} {3}'.format(
                expid, train_years, val_year, test_year))
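# PATH_MAX is assumed to be defined elsewhere in the project. If it is not, one
# possible way to obtain it on POSIX systems (falling back to the common Linux
# limit of 4096) would be:
#
#     try:
#         PATH_MAX = os.pathconf('/', 'PC_PATH_MAX')
#     except (AttributeError, ValueError, OSError):  # e.g. os.pathconf missing on Windows
#         PATH_MAX = 4096
#
# Note that very long expids usually hit the 255-byte per-component NAME_MAX
# limit before the full-path limit, so truncating the whole path may not suffice.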