def test_graph(hyperparams, nodes, config=None):
    nodes = scoring_and_counting(hyperparams, nodes, config=config)
    metric_values = {
        "MRR_movie": metrics.mrr(nodes["movie_higher_values"]),
        "HITS@10_movie": metrics.hits_n(nodes["movie_higher_values"], 10),
        "HITS@3_movie": metrics.hits_n(nodes["movie_higher_values"], 3),
        "HITS@1_movie": metrics.hits_n(nodes["movie_higher_values"], 1),
        "MRR_r": metrics.mrr(nodes["rating_higher_values"]),
        "HITS@5_r": metrics.hits_n(nodes["rating_higher_values"], 5),
        "HITS@3_r": metrics.hits_n(nodes["rating_higher_values"], 3),
        "HITS@2_r": metrics.hits_n(nodes["rating_higher_values"], 2),
        "HITS@1_r": metrics.hits_n(nodes["rating_higher_values"], 1)
    }
    nodes.update(metric_values)
    summaries = [tf.summary.scalar(k, v) for k, v in metric_values.items()] + [
        tf.summary.histogram("rating score rankings", nodes["rating_higher_values"]),
        tf.summary.histogram("movie score rankings", nodes["movie_higher_values"])
    ]
    nodes["test_summary"] = tf.summary.merge(summaries)
    return nodes
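# Hedged sketch (not part of the original code base): NumPy versions of rank-based
# MRR and HITS@k, assuming each entry of "movie_higher_values" / "rating_higher_values"
# holds the number of candidates scored above the true item, so the 1-based rank is
# that count plus one. The graph above presumably computes the same quantities with
# TensorFlow ops.
import numpy as np

def mrr_from_higher_counts(higher_counts):
    """Mean reciprocal rank, where rank = (#items scored higher) + 1."""
    ranks = np.asarray(higher_counts) + 1.0
    return float(np.mean(1.0 / ranks))

def hits_n_from_higher_counts(higher_counts, n):
    """Fraction of examples whose true item lands in the top n."""
    ranks = np.asarray(higher_counts) + 1.0
    return float(np.mean(ranks <= n))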
def eval_reranker(res_fname="svm.test.res",
                  pred_fname="svm.train.pred",
                  format="trec",
                  th=50,
                  verbose=False,
                  reranking_th=-100.0,
                  ignore_noanswer=False,
                  ignore_allanswer=False):
    ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose,
                                  reranking_th=reranking_th,
                                  ignore_noanswer=ignore_noanswer,
                                  ignore_allanswer=ignore_allanswer)
    # evaluate IR
    prec_se = metrics.recall_of_1(ir, th)
    acc_se = metrics.accuracy(ir, th)
    acc_se1 = metrics.accuracy1(ir, th)
    acc_se2 = metrics.accuracy2(ir, th)

    # evaluate SVM
    prec_svm = metrics.recall_of_1(svm, th)
    acc_svm = metrics.accuracy(svm, th)
    acc_svm1 = metrics.accuracy1(svm, th)
    acc_svm2 = metrics.accuracy2(svm, th)

    mrr_se = metrics.mrr(ir, th)
    mrr_svm = metrics.mrr(svm, th)
    map_se = metrics.map(ir)
    map_svm = metrics.map(svm)

    avg_acc1_svm = metrics.avg_acc1(svm, th)
    avg_acc1_ir = metrics.avg_acc1(ir, th)
    '''
    print "%13s %5s" % ("IR", "SVM")
    print "MRR: %5.2f %5.2f" % (mrr_se, mrr_svm)
    print "MAP: %5.4f %5.4f" % (map_se, map_svm)
    print "AvgRec: %5.2f %5.2f" % (avg_acc1_ir, avg_acc1_svm)
    print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM")
    '''
    rec1_se = -10
    rec1_svm = -10
    for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(
            zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1):
        # print "REC-1@%02d: %6.2f %6.2f  ACC@%02d: %6.2f %6.2f  AC1@%02d: %6.2f %6.2f  AC2@%02d: %4.0f %4.0f" % (i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)
        if rec1_se < -5:
            rec1_se = p_se
            rec1_svm = p_svm
    '''
    print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)"
    print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    print "AC2   - the absolute number of correct answers at @X"
    '''
    print "Table view"
    print "          MRR    MAP    P@1"
    print "REF_FILE %5.2f %5.2f %5.2f" % (mrr_se, map_se * 100, rec1_se)
    print "SVM      %5.2f %5.2f %5.2f" % (mrr_svm, map_svm * 100, rec1_svm)
def eval_reranker(res_fname="svm.test.res",
                  pred_fname="svm.train.pred",
                  format="trec",
                  th=10,
                  verbose=False,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose,
                                               reranking_th=reranking_th,
                                               ignore_noanswer=ignore_noanswer)
    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (
        conf_matrix['true']['true'] + conf_matrix['false']['false'] +
        conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * conf_matrix['true']['true'] / (
            conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * conf_matrix['true']['true'] / (
            conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    # evaluate IR
    prec_se = metrics.recall_of_1(ir, th)
    acc_se = metrics.accuracy(ir, th)
    acc_se1 = metrics.accuracy1(ir, th)
    acc_se2 = metrics.accuracy2(ir, th)

    # evaluate SVM
    prec_svm = metrics.recall_of_1(svm, th)
    acc_svm = metrics.accuracy(svm, th)
    acc_svm1 = metrics.accuracy1(svm, th)
    acc_svm2 = metrics.accuracy2(svm, th)

    mrr_se = metrics.mrr(ir, th)
    mrr_svm = metrics.mrr(svm, th)
    map_se = metrics.map(ir, th)
    map_svm = metrics.map(svm, th)

    avg_acc1_svm = metrics.avg_acc1(svm, th)
    avg_acc1_ir = metrics.avg_acc1(ir, th)

    print "acc\tf1\tMAP\tMRR\tAvgRec"
    print "%.4f %4.4f %4.4f %4.4f %4.4f" % (acc, f1, map_svm, mrr_svm, avg_acc1_svm)
def eval_reranker(resPredIterable,
                  th=10,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm, conf_matrix = read_res_pred_files(resPredIterable,
                                               reranking_th=reranking_th,
                                               ignore_noanswer=ignore_noanswer)
    # Calculate standard P, R, F1, Acc
    acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (
        conf_matrix['true']['true'] + conf_matrix['false']['false'] +
        conf_matrix['true']['false'] + conf_matrix['false']['true'])
    p = 0
    if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0:
        p = 1.0 * conf_matrix['true']['true'] / (
            conf_matrix['true']['true'] + conf_matrix['false']['true'])
    r = 0
    if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0:
        r = 1.0 * conf_matrix['true']['true'] / (
            conf_matrix['true']['true'] + conf_matrix['true']['false'])
    f1 = 0
    if (p + r) > 0:
        f1 = 2.0 * p * r / (p + r)

    mrr_svm = metrics.mrr(svm, th)
    map_svm = metrics.map(svm, th)

    scores = {}
    scores['map'] = map_svm
    scores['accuracy'] = acc
    scores['precision'] = p
    scores['recall'] = r
    scores['f1'] = f1
    scores['mrr'] = mrr_svm
    return scores
def eval_search_engine(res_fname, format, th=10):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)

    mrr = metrics.mrr(ir, th)

    print("%13s" % "IR")
    print("MRRof1: %5.2f" % mrr)
    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print("REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f" %
              (i, r, i, a, i, a1, i, a2))
    print()
    print("REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)")
    print("ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions")
    print("AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)")
    print("AC2   - the absolute number of correct answers at @X")
def eval_search_engine(res_fname, format, th=50):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)
    mrr = metrics.mrr(ir, th)

    # MAP
    map_ir = metrics.map(ir)

    print "%10s" % "IR"
    print "MRR: %5.2f" % mrr
    print "MAP: %5.2f" % map_ir
    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print "REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f" % (i, r, i, a, i, a1, i, a2)
    print
    print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)"
    print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    print "AC2   - the absolute number of correct answers at @X"
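# Hedged sketch (assumption, not the project's metrics module): possible
# implementations of recall_of_1 and accuracy consistent with the descriptions
# printed above, where each value of `ir` is a query's ranked list of
# "true"/"false" relevance labels. recall_of_1(ir, th)[k-1] would be the
# percentage of queries with at least one correct answer in the top k;
# accuracy(ir, th)[k-1] would be the number of correct answers in the top k,
# normalized by k and by the number of queries. The 0-100 scaling is an
# assumption based on the "%6.2f" print formats.
def recall_of_1_sketch(ranked_labels_by_query, th):
    n_queries = len(ranked_labels_by_query)
    out = []
    for k in range(1, th + 1):
        hits = sum(1 for labels in ranked_labels_by_query.values()
                   if "true" in labels[:k])
        out.append(100.0 * hits / n_queries)
    return out

def accuracy_sketch(ranked_labels_by_query, th):
    n_queries = len(ranked_labels_by_query)
    out = []
    for k in range(1, th + 1):
        correct = sum(labels[:k].count("true")
                      for labels in ranked_labels_by_query.values())
        out.append(100.0 * correct / (k * n_queries))
    return out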
def get_cv_evaluation_results(qid_aid_label_list, y_pred):
    predictions_dict = get_cv_ranked_predictions_dict(qid_aid_label_list, y_pred)

    logging.debug("Num of questions: %d" % (len(predictions_dict)))

    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]

    return mrr_score, map_score, p1_score
def infer(train_data, test_data, user_size, item_size): config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: ############################### CREATE MODEL ############################# iterator = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes) model = NCF.NCF(FLAGS.embedding_size, user_size, item_size, FLAGS.lr, FLAGS.optim, FLAGS.initializer, FLAGS.loss_func, FLAGS.activation, FLAGS.regularizer, iterator, FLAGS.topK, FLAGS.dropout, is_training=True) model.build() # train_init_op = iterator.make_initializer(train_data) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) if ckpt: print("Reading model parameters from %s" % ckpt.model_checkpoint_path) model.saver.restore(sess, ckpt.model_checkpoint_path) else: print("model files do not exist") exit(1) ############################### Training #################################### total_time = 0 count = 0 for epoch in range(FLAGS.epochs): ################################ EVALUATION ################################## sess.run(model.iterator.make_initializer(test_data)) model.is_training = False HR, MRR, NDCG = [], [], [] start_time = time.time() try: while True: prediction, label = model.step(sess, None) count = count + 1 label = int(label[0]) HR.append(metrics.hit(label, prediction)) MRR.append(metrics.mrr(label, prediction)) NDCG.append(metrics.ndcg(label, prediction)) except tf.errors.OutOfRangeError: hr = np.array(HR).mean() mrr = np.array(MRR).mean() ndcg = np.array(NDCG).mean() print("Epoch %d testing " %epoch + "Took: " + time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time))) print("HR is %.3f, MRR is %.3f, NDCG is %.3f" %(hr, mrr, ndcg)) total_time += time.time() - start_time print("Total Epochs: %d on inference " %(epoch+1)) print("Total recommendations: %d" % (count * FLAGS.batch_size)) print("Approximate accelerator time in seconds is: %.2f" % total_time) print("Approximate accelerator performance in recommendations/second is: %.2f" % (float(count * FLAGS.batch_size)/float(total_time)))
def eval_reranker(res_fname="svm.test.res",
                  pred_fname="svm.train.pred",
                  format="trec",
                  th=50,
                  verbose=False,
                  reranking_th=0.0,
                  ignore_noanswer=False):
    ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose,
                                  reranking_th=reranking_th,
                                  ignore_noanswer=ignore_noanswer)
    # evaluate IR
    prec_se = metrics.recall_of_1(ir, th)
    acc_se = metrics.accuracy(ir, th)
    acc_se1 = metrics.accuracy1(ir, th)
    acc_se2 = metrics.accuracy2(ir, th)

    # evaluate SVM
    prec_svm = metrics.recall_of_1(svm, th)
    acc_svm = metrics.accuracy(svm, th)
    acc_svm1 = metrics.accuracy1(svm, th)
    acc_svm2 = metrics.accuracy2(svm, th)

    mrr_se = metrics.mrr(ir, th)
    mrr_svm = metrics.mrr(svm, th)
    map_se = metrics.map(ir)
    map_svm = metrics.map(svm)

    avg_acc1_svm = metrics.avg_acc1(svm, th)
    avg_acc1_ir = metrics.avg_acc1(ir, th)

    print "%13s %5s" % ("IR", "SVM")
    print "MRR: %5.2f %5.2f" % (mrr_se, mrr_svm)
    print "MAP: %5.4f %5.4f" % (map_se, map_svm)
    print "AvgRec: %5.2f %5.2f" % (avg_acc1_ir, avg_acc1_svm)
    print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM")
    for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(
            zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1):
        print "REC-1@%02d: %6.2f %6.2f  ACC@%02d: %6.2f %6.2f  AC1@%02d: %6.2f %6.2f  AC2@%02d: %4.0f %4.0f" % (i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)
    print
    print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)"
    print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    print "AC2   - the absolute number of correct answers at @X"
def get_evaluation_results(df, y_pred, skip_all_positives_and_all_negatives=True):
    predictions_dict = get_ranked_predictions_dict(
        df, y_pred,
        skip_all_positives_and_all_negatives=skip_all_positives_and_all_negatives)

    logging.debug("Num of questions: %d" % (len(predictions_dict)))

    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]

    return mrr_score, map_score, p1_score
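# Hedged sketch (assumption): dictionary-based MRR and MAP in the style of the
# m.mrr / m.map calls above, where predictions_dict maps a question id to its
# ranked list of "true"/"false" labels. MRR is assumed to be on a 0-100 scale
# and MAP on a 0-1 scale, which would explain the `* 100` applied only to MAP.
def mrr_sketch(predictions_dict, th):
    reciprocal_ranks = []
    for labels in predictions_dict.values():
        rr = 0.0
        for rank, label in enumerate(labels[:th], 1):
            if label == "true":
                rr = 100.0 / rank
                break
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

def map_sketch(predictions_dict):
    average_precisions = []
    for labels in predictions_dict.values():
        num_correct, precision_sum = 0, 0.0
        for rank, label in enumerate(labels, 1):
            if label == "true":
                num_correct += 1
                precision_sum += float(num_correct) / rank
        if num_correct:
            average_precisions.append(precision_sum / num_correct)
    return sum(average_precisions) / len(average_precisions)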
def calc_metrics(file):
    all_predictions = []
    all_labels = []
    impressions = read_impressions_file(file)
    if sample_size > 0:
        impressions = random.sample(impressions, sample_size)
    for i, impression in enumerate(impressions):
        preds, labels = calc_impression(impression)
        all_predictions.append(preds)
        all_labels.append(labels)
        if i % 100 == 99:
            print("Completed {} / {}".format(i + 1, len(impressions)))
    metrics = {
        "auc": group_auc(all_predictions, all_labels),
        "mrr": mrr(all_predictions, all_labels),
        "ndcg@5": ndcg(all_predictions, all_labels, 5),
        "ndcg@10": ndcg(all_predictions, all_labels, 10)
    }
    return metrics
def eval_model(model, data_loader, sample_prob=1.0, train=False):
    sample_data = data_loader.sample_valid_data(sample_prob, train=train)
    with torch.no_grad():
        all_predictions = []
        all_labels = []
        for impression in sample_data:
            user_ids, news_ids, _, _, _, labels = impression
            prediction = model(user_ids, news_ids).view(-1)
            all_predictions.append(prediction.detach().numpy())
            all_labels.append(labels.detach().numpy())
        metrics = {
            "auc": group_auc(all_predictions, all_labels),
            "mrr": mrr(all_predictions, all_labels),
            "ndcg@5": ndcg(all_predictions, all_labels, 5),
            "ndcg@10": ndcg(all_predictions, all_labels, 10)
        }
        print(metrics)
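# Hedged sketch (assumption): impression-level MRR and nDCG@k compatible with
# the mrr/ndcg calls above, where all_predictions and all_labels are parallel
# lists of per-impression score arrays and 0/1-label arrays; the reported value
# is the mean over impressions.
import numpy as np

def mrr_per_impression(all_predictions, all_labels):
    scores = []
    for preds, labels in zip(all_predictions, all_labels):
        order = np.argsort(preds)[::-1]               # best-scored candidate first
        ranked_labels = np.take(labels, order)
        rr = np.sum(ranked_labels / (np.arange(len(ranked_labels)) + 1.0))
        scores.append(rr / np.sum(ranked_labels))     # assumes >= 1 positive label
    return float(np.mean(scores))

def ndcg_per_impression(all_predictions, all_labels, k):
    scores = []
    for preds, labels in zip(all_predictions, all_labels):
        order = np.argsort(preds)[::-1]
        gains = np.take(labels, order[:k])
        dcg = np.sum(gains / np.log2(np.arange(2, len(gains) + 2)))
        ideal = np.sort(labels)[::-1][:k]
        idcg = np.sum(ideal / np.log2(np.arange(2, len(ideal) + 2)))
        scores.append(dcg / idcg if idcg > 0 else 0.0)
    return float(np.mean(scores))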
def cal_score(ref_lines, probs):
    reranking_th = 0.0
    line_count = 0
    pred_lines = defaultdict(list)
    for ref_line in ref_lines:
        qid, aid, lbl = ref_line[0], ref_line[1], ref_line[2]
        pred_lines[qid].append((lbl, probs[line_count][0], aid))
        line_count += 1

    # for qid in pred_lines.keys():
    #     candidates = pred_lines[qid]
    #     if all(relevant == "false" for relevant, _, _ in candidates):
    #         del pred_lines[qid]

    for qid in pred_lines.keys():
        pred_sorted = pred_lines[qid]
        max_score = max([score for rel, score, aid in pred_sorted])
        if max_score >= reranking_th:
            pred_sorted = sorted(pred_sorted, key=itemgetter(1), reverse=True)
        pred_lines[qid] = [rel for rel, score, aid in pred_sorted]

    MAP = metrics.map(pred_lines, 10)
    MRR = metrics.mrr(pred_lines, 10)
    return MAP, MRR
def test(model, sess, test_data, all_items_idx, user_bought):
    model.is_training = False
    model.test_first = True
    all_items_embed = []
    HR, MRR, NDCG = [], [], []

    ########################## GET ALL ITEM EMBEDDINGS ONCE ######################
    for sample in test_data.get_all_test():
        item_embed = model.step(sess, sample, None, None)
        all_items_embed.append(item_embed[0][0])

    model.test_first = False
    all_items_embed = np.array(all_items_embed)

    ########################## TEST FOR EACH USER-QUERY PAIR #####################
    for sample in test_data.get_instance():
        item_indices = model.step(sess, sample, all_items_embed, None)[0]
        itemID = sample[3]
        reviewerID = sample[4]

        ranking_list = all_items_idx[item_indices].tolist()

        top_idx = []
        u_bought = user_bought[reviewerID] if reviewerID in user_bought else []
        while len(top_idx) < FLAGS.topK:
            # skip items already bought by the user
            candidate_item = ranking_list.pop()
            if candidate_item not in u_bought or candidate_item == itemID:
                top_idx.append(candidate_item)
        top_idx = np.array(top_idx)

        HR.append(metrics.hit(itemID, top_idx))
        MRR.append(metrics.mrr(itemID, top_idx))
        NDCG.append(metrics.ndcg(itemID, top_idx))

    hr = np.array(HR).mean()
    mrr = np.array(MRR).mean()
    ndcg = np.array(NDCG).mean()
    print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg))
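# Hedged sketch (assumption): single-ground-truth metrics matching the
# metrics.hit / metrics.mrr / metrics.ndcg calls above, where `gt_item` is the
# held-out item id and `pred_items` is the ranked top-K recommendation list.
import numpy as np

def hit_sketch(gt_item, pred_items):
    return 1.0 if gt_item in pred_items else 0.0

def mrr_single_sketch(gt_item, pred_items):
    if gt_item in pred_items:
        rank = list(pred_items).index(gt_item) + 1    # 1-based rank
        return 1.0 / rank
    return 0.0

def ndcg_single_sketch(gt_item, pred_items):
    if gt_item in pred_items:
        rank = list(pred_items).index(gt_item) + 1
        return 1.0 / np.log2(rank + 1)                # DCG with a single relevant item
    return 0.0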
def train(train_data,test_data,user_size,item_size): with tf.Session() as sess: iterator = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes) model = NCF.NCF(FLAGS.embedding_size, user_size, item_size, FLAGS.lr, FLAGS.optim, FLAGS.initializer, FLAGS.loss_func, FLAGS.activation, FLAGS.regularizer, iterator, FLAGS.topK, FLAGS.dropout, is_training=True) model.build() ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) if ckpt: print("Reading model parameters from %s" % ckpt.model_checkpoint_path) model.saver.restore(sess, ckpt.model_checkpoint_path) else: print("Creating model with fresh parameters.") sess.run(tf.global_variables_initializer()) count = 0 for epoch in range(FLAGS.epochs): sess.run(model.iterator.make_initializer(train_data)) model.is_training = True model.get_data() start_time = time.time() try: while True: model.step(sess, count) count += 1 except tf.errors.OutOfRangeError: print("Epoch %d training " % epoch + "Took: " + time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time))) sess.run(model.iterator.make_initializer(test_data)) model.is_training = False model.get_data() start_time = time.time() HR,MRR,NDCG = [],[],[] prediction, label = model.step(sess, None) try: while True: prediction, label = model.step(sess, None) label = int(label[0]) HR.append(metrics.hit(label, prediction)) MRR.append(metrics.mrr(label, prediction)) NDCG.append(metrics.ndcg(label, prediction)) except tf.errors.OutOfRangeError: hr = np.array(HR).mean() mrr = np.array(MRR).mean() ndcg = np.array(NDCG).mean() print("Epoch %d testing " % epoch + "Took: " + time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time))) print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg)) ################################## SAVE MODEL ################################ checkpoint_path = os.path.join(FLAGS.model_dir, "NCF.ckpt") model.saver.save(sess, checkpoint_path)
def stats_cv(path=".", format="trec", prefix="svm", th=50, verbose=False): mrrs_se = [] mrrs_svm = [] abs_mrrs = [] rel_mrrs = [] maps_se = [] maps_svm = [] abs_maps = [] rel_maps = [] recalls1_se = [] recalls1_svm = [] abs_recalls = [] rel_recalls = [] oracle_mrrs = [] oracle_maps = [] oracle_recs1 = [] num_folds = 0 print "%13s %5s %7s %7s" %("IR", "SVM", "(abs)", "(rel)") for fold in sorted(os.listdir(path)): currentFold = os.path.join(path, fold) if not os.path.isdir(currentFold): continue if not fold.startswith("fold"): logging.warn("Directories containing CV folds should start with 'fold'") continue print fold # Relevancy file res_fname = os.path.join(currentFold, "%s.test.res" % prefix) if not os.path.exists(res_fname): logging.error("Relevancy file not found: %s", res_fname) sys.exit(1) # Predictions file pred_fname = os.path.join(currentFold, "%s.pred" % prefix) if not os.path.exists(pred_fname): logging.error("SVM prediction file not found: %s", pred_fname) sys.exit(1) try: ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose) except: logging.error("Failed to process input files: %s %s", res_fname, pred_fname) logging.error("Check that the input file format is correct") sys.exit(1) # MRR mrr_se = metrics.mrr(ir, th) or 1 mrr_svm = metrics.mrr(svm, th) mrrs_se.append(mrr_se) mrrs_svm.append(mrr_svm) # improvement abs_mrr_diff = mrr_svm - mrr_se rel_mrr_diff = (mrr_svm - mrr_se)*100/mrr_se abs_mrrs.append(abs_mrr_diff) rel_mrrs.append(rel_mrr_diff) print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff) # MAP map_se = metrics.map(ir) or 1 map_svm = metrics.map(svm) maps_se.append(map_se) maps_svm.append(map_svm) # improvement abs_map_diff = map_svm - map_se rel_map_diff = (map_svm - map_se)*100/map_se abs_maps.append(abs_map_diff) rel_maps.append(rel_map_diff) print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (map_se, map_svm, abs_map_diff, rel_map_diff) # Recall-of-1@1 rec_se = metrics.recall_of_1(ir, th)[0] or 1 rec_svm = metrics.recall_of_1(svm, th)[0] recalls1_se.append(rec_se) recalls1_svm.append(rec_svm) # improvement abs_rec_diff = rec_svm - rec_se rel_rec_diff = (rec_svm - rec_se)*100/rec_se abs_recalls.append(abs_rec_diff) rel_recalls.append(rel_rec_diff) print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (rec_se, rec_svm, abs_rec_diff, rel_rec_diff) num_folds += 1 ''' mrr_oracle = metrics.oracle_mrr(ir, th) map_oracle = metrics.oracle_map(ir) prec_oracle = metrics.oracle_precision(ir, th)[0] rec1_oracle = metrics.oracle_recall_of_1(ir, th)[0] oracle_mrrs.append(mrr_oracle) oracle_maps.append(map_oracle) oracle_recs1.append(rec1_oracle) print "Oracle MRR: %5.2f, Oracle MAP: %5.2f, Oracle prec: %5.2f, Oracle rec@1: %5.2f" % (mrr_oracle, map_oracle, prec_oracle, rec1_oracle) ''' # mrrs avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se) avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm) avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs) avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs) #oracle_avg_mrr, std_oracle_avg_mrr = mean_and_std(oracle_mrrs) # maps avg_map_se, std_map_se = mean_and_std(maps_se) avg_map_svm, std_map_svm = mean_and_std(maps_svm) avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps) avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps) #oracle_avg_map, std_oracle_avg_map = mean_and_std(oracle_maps) # recall avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se) # se avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm) # svm avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std(abs_recalls) # 
absolute avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std(rel_recalls) # relative #oracle_avg_rec1, std_oracle_avg_rec1 = mean_and_std(oracle_recs1) FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f" #ORACLE_FMT = u"Oracle MRR: %5.2f \u00B1 %4.2f, Oracle MAP: %5.2f \u00B1 %4.2f, Oracle P@1: %5.2f \u00B1 %4.2f" print print "Averaged over %s folds" % num_folds print "%17s %12s %14s %14s" %("IR", "SVM", "(abs)", "(rel)") print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm, avg_abs_impr_mrr, std_abs_impr_mrr, avg_rel_impr_mrr, std_rel_impr_mrr) print FMT % ("MAP", avg_map_se, std_map_se, avg_map_svm, std_map_svm, avg_abs_impr_map, std_abs_impr_map, avg_rel_impr_map, std_rel_impr_map) print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm, avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1, std_rel_impr_rec1)
def stats_cv(path=".", format="trec", prefix="svm", th=50, suf="", verbose=False, truth_file=None, ignore_noanswer=False, cut_truth_map_at_N=None): mrrs_se = [] mrrs_svm = [] abs_mrrs = [] rel_mrrs = [] maps_se = [] maps_svm = [] abs_maps = [] rel_maps = [] recalls1_se = [] recalls1_svm = [] abs_recalls = [] rel_recalls = [] num_folds = 0 truth = read_truth_file(truth_file, format, cut_truth_map_at_N) print "%13s %5s %7s %7s" % ("IR", "SVM", "(abs)", "(rel)") for fold in sorted(os.listdir(path)): currentFold = os.path.join(path, fold) if not os.path.isdir(currentFold): continue if not fold.startswith("fold"): logging.warn( "Directories containing CV folds should start with 'fold'") continue print fold # Relevancy file res_fname = os.path.join(currentFold, "%s.relevancy" % prefix) if not os.path.exists(res_fname): logging.error("Relevancy file not found: %s", res_fname) sys.exit(1) # Predictions file pred_fname = os.path.join(currentFold, "%s.pred" % (prefix + suf)) if not os.path.exists(pred_fname): logging.error("SVM prediction file not found: %s", pred_fname) sys.exit(1) try: ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, ignore_noanswer=ignore_noanswer, truth_map=truth) except: logging.error("Failed to process input files: %s %s", res_fname, pred_fname) logging.error("Check that the input file format is correct") sys.exit(1) # MRR mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) mrrs_se.append(mrr_se) mrrs_svm.append(mrr_svm) # improvement abs_mrr_diff = mrr_svm - mrr_se rel_mrr_diff = (mrr_svm - mrr_se) * 100 / mrr_se abs_mrrs.append(abs_mrr_diff) rel_mrrs.append(rel_mrr_diff) print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff) # MAP map_se = metrics.map(ir) map_svm = metrics.map(svm) maps_se.append(map_se) maps_svm.append(map_svm) # improvement abs_map_diff = map_svm - map_se rel_map_diff = (map_svm - map_se) * 100 / map_se abs_maps.append(abs_map_diff) rel_maps.append(rel_map_diff) print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( map_se * 100, map_svm * 100, abs_map_diff, rel_map_diff) # Recall-of-1@1 rec_se = metrics.recall_of_1(ir, th)[0] rec_svm = metrics.recall_of_1(svm, th)[0] recalls1_se.append(rec_se) recalls1_svm.append(rec_svm) # improvement abs_rec_diff = rec_svm - rec_se rel_rec_diff = (rec_svm - rec_se) * 100 / rec_se abs_recalls.append(abs_rec_diff) rel_recalls.append(rel_rec_diff) print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( rec_se, rec_svm, abs_rec_diff, rel_rec_diff) num_folds += 1 # mrrs avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se) avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm) avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs) avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs) # maps avg_map_se, std_map_se = mean_and_std(maps_se) avg_map_svm, std_map_svm = mean_and_std(maps_svm) avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps) avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps) # recall avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se) # se avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm) # svm avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std( abs_recalls) # absolute avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std( rel_recalls) # relative FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f" print print "Averaged over %s folds" % num_folds print "%17s %12s %14s %14s" % ("IR", "SVM", "(abs)", "(rel)") print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm, avg_abs_impr_mrr, 
std_abs_impr_mrr, avg_rel_impr_mrr, std_rel_impr_mrr) print FMT % ("MAP", avg_map_se * 100, std_map_se, avg_map_svm * 100, std_map_svm, avg_abs_impr_map, std_abs_impr_map, avg_rel_impr_map, std_rel_impr_map) print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm, avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1, std_rel_impr_rec1) print "Table view" print " MRR MAP P@1" print u"IR %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f" % ( avg_mrr_se, std_mrr_se, avg_map_se * 100, std_map_se * 100, avg_rec1_se, std_rec1_se) print u"SVM %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f" % ( avg_mrr_svm, std_mrr_svm, avg_map_svm * 100, std_map_svm * 100, avg_rec1_svm, std_rec1_svm)
def evaluate(): """Run evaluation on dev or test data.""" add_inverse_edge = FLAGS.model in \ ["source_rel_attention", "source_path_attention"] if FLAGS.clueweb_data: train_graph = clueweb_text_graph.CWTextGraph( text_kg_file=FLAGS.clueweb_data, embeddings_file=FLAGS.clueweb_embeddings, sentence_vocab_file=FLAGS.clueweb_sentences, skip_new=True, kg_file=FLAGS.kg_file, add_reverse_graph=not add_inverse_edge, add_inverse_edge=add_inverse_edge, subsample=FLAGS.subsample_text_rels ) elif FLAGS.text_kg_file: train_graph = text_graph.TextGraph( text_kg_file=FLAGS.text_kg_file, skip_new=True, max_text_len=FLAGS.max_text_len, max_vocab_size=FLAGS.max_vocab_size, min_word_freq=FLAGS.min_word_freq, kg_file=FLAGS.kg_file, add_reverse_graph=not add_inverse_edge, add_inverse_edge=add_inverse_edge, max_path_length=FLAGS.max_path_length ) else: train_graph = graph.Graph( kg_file=FLAGS.kg_file, add_reverse_graph=not add_inverse_edge, add_inverse_edge=add_inverse_edge, max_path_length=FLAGS.max_path_length ) # train_graph, _ = read_graph_data( # kg_file=FLAGS.kg_file, # add_reverse_graph=(FLAGS.model != "source_rel_attention"), # add_inverse_edge=(FLAGS.model == "source_rel_attention"), # mode="train", num_epochs=FLAGS.num_epochs, batchsize=FLAGS.batchsize, # max_neighbors=FLAGS.max_neighbors, # max_negatives=FLAGS.max_negatives # ) val_graph = None if FLAGS.dev_kg_file: val_graph, eval_data = read_graph_data( kg_file=FLAGS.dev_kg_file, add_reverse_graph=not add_inverse_edge, add_inverse_edge=add_inverse_edge, # add_reverse_graph=False, # add_inverse_edge=False, mode="dev", num_epochs=1, batchsize=FLAGS.test_batchsize, max_neighbors=FLAGS.max_neighbors, max_negatives=FLAGS.max_negatives, train_graph=train_graph, text_kg_file=FLAGS.text_kg_file ) if FLAGS.test_kg_file: _, eval_data = read_graph_data( kg_file=FLAGS.test_kg_file, add_reverse_graph=not add_inverse_edge, add_inverse_edge=add_inverse_edge, # add_reverse_graph=False, # add_inverse_edge=False, mode="test", num_epochs=1, batchsize=FLAGS.test_batchsize, max_neighbors=FLAGS.max_neighbors, max_negatives=None, train_graph=train_graph, text_kg_file=FLAGS.text_kg_file, val_graph=val_graph ) if not FLAGS.dev_kg_file and not FLAGS.test_kg_file: raise ValueError("Evalution without a dev or test file!") iterator = eval_data.dataset.make_initializable_iterator() candidate_scores, candidates, labels, model, is_train_ph, inputs = \ create_model(train_graph, iterator) # Create eval metrics # if FLAGS.dev_kg_file: batch_rr = metrics.mrr(candidate_scores, candidates, labels) mrr, mrr_update = tf.metrics.mean(batch_rr) mrr_summary = tf.summary.scalar("MRR", mrr) all_hits, all_hits_update, all_hits_summaries = [], [], [] for k in [1, 3, 10]: batch_hits = metrics.hits_at_k(candidate_scores, candidates, labels, k=k) hits, hits_update = tf.metrics.mean(batch_hits) hits_summary = tf.summary.scalar("Hits_at_%d" % k, hits) all_hits.append(hits) all_hits_update.append(hits_update) all_hits_summaries.append(hits_summary) hits = tf.group(*all_hits) hits_update = tf.group(*all_hits_update) global_step = tf.Variable(0, name="global_step", trainable=False) current_step = tf.Variable(0, name="current_step", trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) incr_current_step = tf.assign_add(current_step, 1) reset_current_step = tf.assign(current_step, 0) slim.get_or_create_global_step(graph=tf.get_default_graph()) # best_hits = tf.Variable(0., trainable=False) # best_step = tf.Variable(0, trainable=False) # with tf.control_dependencies([hits]): # update_best_hits = 
tf.cond(tf.greater(hits, best_hits), # lambda: tf.assign(best_hits, hits), # lambda: 0.) # update_best_step = tf.cond(tf.greater(hits, best_hits), # lambda: tf.assign(best_step, global_step), # lambda: 0) # best_hits_summary = tf.summary.scalar("Best Hits@10", best_hits) # best_step_summary = tf.summary.scalar("Best Step", best_step) nexamples = eval_data.data_graph.tuple_store.shape[0] if eval_data.data_graph.add_reverse_graph: nexamples *= 2 num_batches = math.ceil(nexamples / float(FLAGS.test_batchsize)) local_init_op = tf.local_variables_initializer() if FLAGS.analyze: entity_names = utils.read_entity_name_mapping(FLAGS.entity_names_file) session = tf.Session() # summary_writer = tf.summary.FileWriter(FLAGS.output_dir, session.graph) init_op = tf.global_variables_initializer() session.run(init_op) session.run(local_init_op) saver = tf.train.Saver(tf.trainable_variables()) ckpt_path = FLAGS.model_path + "/model.ckpt-%d" % FLAGS.global_step attention_probs = model["attention_encoder"].get_from_collection( "attention_probs" ) if FLAGS.clueweb_data: s, nbrs_s, text_nbrs_s, text_nbrs_s_emb, r, candidates, _ = inputs elif FLAGS.text_kg_file: s, nbrs_s, text_nbrs_s, r, candidates, _ = inputs else: s, nbrs_s, r, candidates, _ = inputs saver.restore(session, ckpt_path) session.run(iterator.initializer) num_attention = 5 nsteps = 0 outf_correct = open(FLAGS.output_dir + "/analyze_correct.txt", "w+") outf_incorrect = open( FLAGS.output_dir + "/analyze_incorrect.txt", "w+" ) ncorrect = 0 analyze_outputs = [candidate_scores, s, nbrs_s, r, candidates, labels, attention_probs] if FLAGS.text_kg_file: analyze_outputs.append(text_nbrs_s) while True: try: analyze_vals = session.run(analyze_outputs, {is_train_ph: False}) if FLAGS.text_kg_file: cscores, se, nbrs, qr, cands, te, nbr_attention_probs, text_nbrs = \ analyze_vals else: cscores, se, nbrs, qr, cands, te, nbr_attention_probs = analyze_vals # import pdb; pdb.set_trace() pred_ids = cscores.argmax(1) for i in range(se.shape[0]): sname = train_graph.inverse_entity_vocab[se[i]] if sname in entity_names: sname = entity_names[sname] rname = train_graph.inverse_relation_vocab[qr[i]] pred_target = cands[i, pred_ids[i]] pred_name = train_graph.inverse_entity_vocab[pred_target] if pred_name in entity_names: pred_name = entity_names[pred_name] tname = train_graph.inverse_entity_vocab[te[i][0]] if tname in entity_names: tname = entity_names[tname] if te[i][0] == pred_target: outf = outf_correct ncorrect += 1 else: outf = outf_incorrect outf.write("\n(%d) %s, %s, ? 
\t Pred: %s \t Target: %s" % (nsteps+i+1, sname, rname, pred_name, tname)) top_nbrs_index = np.argsort(nbr_attention_probs[i, :])[::-1] outf.write("\nTop Nbrs:") for j in range(num_attention): nbr_index = top_nbrs_index[j] if nbr_index < FLAGS.max_neighbors: nbr_id = nbrs[i, nbr_index, :] nbr_name = "" for k in range(0, nbrs.shape[-1], 2): ent_name = train_graph.inverse_entity_vocab[nbr_id[k+1]] if ent_name in entity_names: ent_name = entity_names[ent_name] rel_name = train_graph.inverse_relation_vocab[nbr_id[k]] nbr_name += "(%s, %s)" % (rel_name, ent_name) else: # Text Relation text_nbr_ids = text_nbrs[i, nbr_index - FLAGS.max_neighbors, :] text_nbr_ent = text_nbr_ids[0] ent_name = train_graph.inverse_entity_vocab[text_nbr_ent] if ent_name in entity_names: ent_name = entity_names[ent_name] rel_name = train_graph.get_relation_text(text_nbr_ids[1:]) nbr_name = "(%s, %s)" % (rel_name, ent_name) outf.write("\n\t\t %s Prob: %.4f" % (nbr_name, nbr_attention_probs[i, nbr_index])) nsteps += se.shape[0] tf.logging.info("Current hits@1: %.3f", ncorrect * 1.0 / (nsteps)) except tf.errors.OutOfRangeError: break outf_correct.close() outf_incorrect.close() return class DataInitHook(tf.train.SessionRunHook): def after_create_session(self, sess, coord): sess.run(iterator.initializer) sess.run(reset_current_step) if FLAGS.test_only: ckpt_path = FLAGS.model_path + "/model.ckpt-%d" % FLAGS.global_step slim.evaluation.evaluate_once( master=FLAGS.master, checkpoint_path=ckpt_path, logdir=FLAGS.output_dir, variables_to_restore=tf.trainable_variables() + [global_step], initial_op=tf.group(local_init_op, iterator.initializer), # initial_op=iterator.initializer, num_evals=num_batches, eval_op=tf.group(mrr_update, hits_update, incr_current_step), eval_op_feed_dict={is_train_ph: False}, final_op=tf.group(mrr, hits), final_op_feed_dict={is_train_ph: False}, summary_op=tf.summary.merge([mrr_summary]+ all_hits_summaries), hooks=[DataInitHook(), tf.train.LoggingTensorHook( {"mrr": mrr, "hits": hits, "step": current_step}, every_n_iter=1 )] ) else: slim.evaluation.evaluation_loop( master=FLAGS.master, checkpoint_dir=FLAGS.model_path, logdir=FLAGS.output_dir, variables_to_restore=tf.trainable_variables() + [global_step], initial_op=tf.group(local_init_op, iterator.initializer), # initial_op=iterator.initializer, num_evals=num_batches, eval_op=tf.group(mrr_update, hits_update, incr_current_step), eval_op_feed_dict={is_train_ph: False}, final_op=tf.group(mrr, hits), final_op_feed_dict={is_train_ph: False}, summary_op=tf.summary.merge([mrr_summary] + all_hits_summaries), max_number_of_evaluations=None, eval_interval_secs=60, hooks=[DataInitHook(), tf.train.LoggingTensorHook( {"mrr": mrr, "hits": hits, "step": current_step}, every_n_iter=1 )] )
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print("") print("*** Official score (MAP for SYS): %5.4f" % (map_svm)) print("") print("") print("******************************") print("*** Classification results ***") print("******************************") print("") print("Acc = %5.4f" % (acc)) print("P = %5.4f" % (p)) print("R = %5.4f" % (r)) print("F1 = %5.4f" % (f1)) print("") print("") print("********************************") print("*** Detailed ranking results ***") print("********************************") print("") print("IR -- Score for the output of the IR system (baseline).") print("SYS -- Score for the output of the tested system.") print("") print("%13s %5s" % ("IR", "SYS")) print("MAP : %5.4f %5.4f" % (map_se, map_svm)) print("AvgRec: %5.4f %5.4f" % (avg_acc1_ir, avg_acc1_svm)) print("MRR : %6.2f %6.2f" % (mrr_se, mrr_svm)) print("%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate( zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" % (i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print() print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)" ) print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" ) print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" ) print("AC2 - the absolute number of correct answers at @X")
def train(train_data, test_data, n_user, n_item):
    with tf.Session() as sess:
        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)

        model = NCF.NCF(FLAGS.embedding_size, n_user, n_item, FLAGS.lr,
                        FLAGS.optim, FLAGS.initializer, FLAGS.loss_func,
                        FLAGS.activation, FLAGS.regularizer, iterator,
                        FLAGS.topK, FLAGS.dropout, is_training=True)
        model.build()

        # Restore parameters if a checkpoint exists; otherwise train from scratch.
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            # Load the saved model parameters.
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(tf.global_variables_initializer())

        count = 0
        # Train on the training set for FLAGS.epochs epochs.
        for epoch in range(FLAGS.epochs):
            # Iterator over the training set.
            sess.run(model.iterator.make_initializer(train_data))
            model.is_training = True
            model.get_data()
            start_time = time.time()

            try:
                while True:
                    # Run until the iterator is exhausted, i.e. one full pass over the training data.
                    model.step(sess, count)
                    count += 1
            except tf.errors.OutOfRangeError:
                # Report how long the training epoch took.
                print("Epoch %d training " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))

            # Iterator over the test set.
            sess.run(model.iterator.make_initializer(test_data))
            model.is_training = False
            model.get_data()
            start_time = time.time()
            HR, MRR, NDCG = [], [], []
            pred_item, gt_item = model.step(sess, None)

            try:
                while True:
                    # Run until the iterator is exhausted, i.e. one full pass over the test data.
                    pred_item, gt_item = model.step(sess, None)
                    # Every example in a test batch shares the same ground-truth item, so take the first.
                    gt_item = int(gt_item[0])
                    HR.append(metrics.hit(gt_item, pred_item))
                    MRR.append(metrics.mrr(gt_item, pred_item))
                    NDCG.append(metrics.ndcg(gt_item, pred_item))
            except tf.errors.OutOfRangeError:
                # Average the evaluation metrics.
                hr = np.array(HR).mean()
                mrr = np.array(MRR).mean()
                ndcg = np.array(NDCG).mean()
                print("Epoch %d testing " % epoch + "Took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)))
                print("HR is %.3f, MRR is %.3f, NDCG is %.3f" % (hr, mrr, ndcg))

        # Save the model parameters.
        checkpoint_path = os.path.join(FLAGS.model_dir, "NCF.ckpt")
        model.saver.save(sess, checkpoint_path)
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) #print "" #print "*** Official score (MAP for SYS): %5.4f" %(map_svm) #print "" #print "" #print "******************************" #print "*** Classification results ***" #print "******************************" #print "" #print "Acc = %5.4f" %(acc) #print "P = %5.4f" %(p) #print "R = %5.4f" %(r) #print "F1 = %5.4f" %(f1) #print "" #print "" #print "********************************" #print "*** Detailed ranking results ***" #print "********************************" #print "" #print "IR -- Score for the output of the IR system (baseline)." #print "SYS -- Score for the output of the tested system." #print "" #print "%13s %5s" %("IR", "SYS") #print "MAP : %5.4f %5.4f" %(map_se, map_svm) #print "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm) #print "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm) print "MAP : %5.4f\tMRR : %5.4f\tAvgRec: %5.4f" % (map_svm, mrr_svm, avg_acc1_svm) #print "Acc : %5.4f" %(acc) #print "P : %5.4f" %(p) #print "R : %5.4f" %(r) #print "F1 : %5.4f" %(f1) """
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print ("") print ("*** Official score (MAP for SYS): %5.4f" %(map_svm)) print ("") print ("") print( "******************************") print( "*** Classification results ***") print( "******************************") print( "") print( "Acc = %5.4f" %(acc)) print( "P = %5.4f" %(p)) print( "R = %5.4f" %(r)) print( "F1 = %5.4f" %(f1)) print( "") print( "") print( "********************************") print( "*** Detailed ranking results ***") print( "********************************") print( "") print( "IR -- Score for the output of the IR system (baseline).") print( "SYS -- Score for the output of the tested system.") print( "") print( "%13s %5s" %("IR", "SYS")) print( "MAP : %5.4f %5.4f" %(map_se, map_svm)) print( "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm)) print( "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm)) print( "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)") print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions") print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)") print( "AC2 - the absolute number of correct answers at @X") return map_svm
ranks = []
for i in rel_df.idx.unique():
    ddf = rel_df[rel_df.idx == i]
    ranked = ddf.sort_values(by='total', ascending=False)
    r = 1
    for label in ranked.is_gold:
        if label:
            ranks.append(r)
            r -= 1  # based on accepted eval method
        r += 1

if alpha == 0.0:
    agg_zero_ranks.extend(ranks)
if alpha == 0.5:
    agg_half_ranks.extend(ranks)
if alpha == 1.0:
    agg_one_ranks.extend(ranks)

amrr = mrr(ranks)
if amrr > best_mrr:
    rel_best_ranks = ranks
    best_mrr = amrr
    best_arg_mrr = alpha  # just for reporting

hat10 = h_at_n(ranks, n=10)
hat3 = h_at_n(ranks, n=3)
hat1 = h_at_n(ranks, n=1)
if int(alpha * 100) % 20 == 0:
    print('{}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}'.format(rl, alpha, amrr, hat10, hat3, hat1))

agg_best_ranks.extend(rel_best_ranks)
saved_name = None
N = assoc_model.vocab_size
for ep in range(opts.epochs):
    # report
    if opts.v > 0:
        timeprint('starting epoch {}'.format(ep + 1))
    iteration_losses.append(
        train_iteration(opts, assoc_model, trainer, ep % 5 == 4, log_file))
    if opts.early_stopping:
        timeprint('evaluating after epoch {}'.format(ep + 1))
        insts, all_s_ranks, all_t_ranks = eval(
            assoc_model, tr_graphs, te_graphs, opts, N)
        # save model with epoch count and remove previous if exists
        ep_mrr = mrr(all_s_ranks + all_t_ranks)
        ep_h10 = h_at_n(all_s_ranks + all_t_ranks)
        ep_h1 = h_at_n(all_s_ranks + all_t_ranks, n=1)
        timeprint('mrr: {:.4f}, h@10: {:.4f}, h@1: {:.4f}'.format(
            ep_mrr, ep_h10, ep_h1))
        if len(dev_mrrs) < 1 or ep_mrr > min(dev_mrrs[-2:]):
            if len(dev_mrrs) < 1 or ep_mrr > max(dev_mrrs):
                best_insts = insts
                best_all_s_ranks = all_s_ranks
                best_all_t_ranks = all_t_ranks
            last_saved_name = saved_name
            saved_name = '{}-ep-{:02d}.dyn'.format(opts.model_out, ep + 1)
            timeprint('saving trained model to {}'.format(saved_name))
            assoc_model.save(saved_name)
ranks = []
for i in rel_df.idx.unique():
    ddf = rel_df[rel_df.idx == i]
    ranked = ddf.sort_values(by='total', ascending=False)
    r = 1
    for label in ranked.is_gold:
        if label:
            ranks.append(r)
            r -= 1  # based on accepted eval method
        r += 1

if alpha == 0.0:
    agg_zero_ranks.extend(ranks)
if alpha == 0.5:
    agg_half_ranks.extend(ranks)
if alpha == 1.0:
    agg_one_ranks.extend(ranks)

amrr = mrr(ranks)
if amrr > best_mrr:
    rel_best_ranks = ranks
    best_mrr = amrr
    best_arg_mrr = alpha  # just for reporting

hat10 = h_at_n(ranks, n=10)
hat3 = h_at_n(ranks, n=3)
hat1 = h_at_n(ranks, n=1)
if int(alpha * 100) % 20 == 0:
    print('{}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}'.format(
        rl, alpha, amrr, hat10, hat3, hat1))
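# Hedged sketch (assumption): rank-list metrics matching the mrr / h_at_n calls
# above, where `ranks` is a list of 1-based gold ranks, one per test instance.
def mrr(ranks):
    return sum(1.0 / r for r in ranks) / len(ranks)

def h_at_n(ranks, n=10):
    return sum(1 for r in ranks if r <= n) / float(len(ranks))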
log_file.write('====\n')
iteration_losses = []  # will hold loss averages
dev_mrrs = []
saved_name = None
N = assoc_model.vocab_size
for ep in range(opts.epochs):
    # report
    if opts.v > 0:
        timeprint('starting epoch {}'.format(ep + 1))
    iteration_losses.append(train_iteration(opts, assoc_model, trainer, ep % 5 == 4, log_file))
    if opts.early_stopping:
        timeprint('evaluating after epoch {}'.format(ep + 1))
        insts, all_s_ranks, all_t_ranks = eval(assoc_model, tr_graphs, te_graphs, opts, N)
        # save model with epoch count and remove previous if exists
        ep_mrr = mrr(all_s_ranks + all_t_ranks)
        ep_h10 = h_at_n(all_s_ranks + all_t_ranks)
        ep_h1 = h_at_n(all_s_ranks + all_t_ranks, n=1)
        timeprint('mrr: {:.4f}, h@10: {:.4f}, h@1: {:.4f}'.format(ep_mrr, ep_h10, ep_h1))
        if len(dev_mrrs) < 1 or ep_mrr > min(dev_mrrs[-2:]):
            if len(dev_mrrs) < 1 or ep_mrr > max(dev_mrrs):
                best_insts = insts
                best_all_s_ranks = all_s_ranks
                best_all_t_ranks = all_t_ranks
            last_saved_name = saved_name
            saved_name = '{}-ep-{:02d}.dyn'.format(opts.model_out, ep + 1)
            timeprint('saving trained model to {}'.format(saved_name))
            assoc_model.save(saved_name)
            # remove previous model(s) if exists
            if last_saved_name is not None:
                os.remove(last_saved_name)
def eval(prev_graphs, graphs, ergm, opts, N, log_file, rerank_file): writing = log_file is not None caches = (copy.deepcopy(ergm.cache), copy.deepcopy(ergm.feature_vals)) rel_all_ranks = {} # for final results rel_pre_ranks = {} # for improvement analysis rel_erg_ranks = {} # for ergm-alone analysis all_pre_ranks = [] all_all_ranks = [] all_erg_ranks = [] insts = Counter() total_misses = Counter() overrides = Counter() rerank_ups = Counter() rerank_downs = Counter() erg_ups = Counter() erg_downs = Counter() rerank_diff = Counter() erg_diff = Counter() change_idx = 1 rels_order = list(graphs.items()) for rel, te_gr in rels_order: if rel == 'co_hypernym': continue # set up if writing: timeprint('testing relation {}'.format(rel)) log_file.write('relation: {}\n'.format(rel)) # add incrementally, eval each edge, revert tr_gr = prev_graphs[rel] # to filter known connections s_assoc_cache = ergm.source_ranker_cache(rel) t_assoc_cache = ergm.target_ranker_cache(rel) override_rel = opts.rule_override and rel in SYMMETRIC_RELATIONS all_ranks = [] pre_ranks = [] erg_ranks = [] if override_rel and writing: log_file.write('RELATION OVERRIDE\n') node_order = list(range(N)) # DO NOT RANDOMIZE THIS - NEED TO PREDICT BOTH SIDES for node in tqdm(node_order): s_trues, s_unch_loc_ranks, s_loc_gold_ranks, s_gold_reranked, s_gold_ergs, s_pls, change_idx = \ node_loop(change_idx, ergm, rel, node, s_assoc_cache, caches, tr_gr, te_gr, override_rel, opts.rerank, True, log_file, rerank_file) t_trues, t_unch_loc_ranks, t_loc_gold_ranks, t_gold_reranked, t_gold_ergs, t_pls, change_idx = \ node_loop(change_idx, ergm, rel, node, t_assoc_cache, caches, tr_gr, te_gr, override_rel, opts.rerank, False, log_file, rerank_file) total_trues = s_trues + t_trues insts[rel] += (total_trues) if override_rel: overrides[rel] += total_trues ulr = s_unch_loc_ranks + t_unch_loc_ranks lgr = s_loc_gold_ranks + t_loc_gold_ranks grr = s_gold_reranked + t_gold_reranked ger = s_gold_ergs + t_gold_ergs total_misses[rel] += (len(ulr)) pre_ranks.extend(lgr) if override_rel: erg_ranks.extend(lgr) all_ranks.extend(lgr) else: all_ranks.extend(ulr + grr) erg_ranks.extend(ulr + ger) for pl in s_pls + t_pls: if pl[3] < pl[2]: rerank_ups[rel] += 1 if pl[3] > pl[2]: rerank_downs[rel] += 1 if pl[4] < pl[2]: erg_ups[rel] += 1 if pl[4] > pl[2]: erg_downs[rel] += 1 rerank_diff[rel] += (pl[2] - pl[3]) erg_diff[rel] += (pl[2] - pl[4]) rel_all_ranks[rel] = all_ranks rel_pre_ranks[rel] = pre_ranks rel_erg_ranks[rel] = erg_ranks all_all_ranks.extend(all_ranks) all_pre_ranks.extend(pre_ranks) all_erg_ranks.extend(erg_ranks) if writing: log_file.write('\nper relation:\n') for rel in list(graphs.keys()): if insts[rel] > 0 and insts[rel] - total_misses[rel] > 0: log_file.write('\n{}:\n'.format(rel)) log_file.write('{} instances, {} misses\n'.format(insts[rel], total_misses[rel])) log_file.write('reranks: {} up, {} down\n'.format(rerank_ups[rel], rerank_downs[rel])) log_file.write('ERGM only: {} up, {} down\n'.format(erg_ups[rel], erg_downs[rel])) log_file.write('rank diff: {}, ERGM only: {}\n'.format(rerank_diff[rel], erg_diff[rel])) log_file.write('metrics: pre-rank\trerank\tERGM only\n') log_file.write('average rank: {:.5f}\t{:.5f}\t{:.5f}\n'.format(np.average(rel_pre_ranks[rel]), np.average(rel_all_ranks[rel]), np.average(rel_erg_ranks[rel]))) log_file.write('mrr: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mrr(rel_pre_ranks[rel]), mrr(rel_all_ranks[rel]), mrr(rel_erg_ranks[rel]))) log_file.write( 'mq: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mq(rel_pre_ranks[rel], N), 
mq(rel_all_ranks[rel], N), mq(rel_erg_ranks[rel], N))) log_file.write('h@100: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(rel_pre_ranks[rel], n=100), h_at_n(rel_all_ranks[rel], n=100), h_at_n(rel_erg_ranks[rel], n=100))) log_file.write( 'h@10: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(rel_pre_ranks[rel]), h_at_n(rel_all_ranks[rel]), h_at_n(rel_erg_ranks[rel]))) log_file.write('h@1: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(rel_pre_ranks[rel], n=1), h_at_n(rel_all_ranks[rel], n=1), h_at_n(rel_erg_ranks[rel], n=1))) log_file.write('\ntotals:\n') log_file.write('total number of instances: {}\n'.format(sum(insts.values()))) log_file.write('total misses: {}\n'.format(sum(total_misses.values()))) log_file.write('overrides: {}\n'.format(sum(overrides.values()))) log_file.write( 'rerank improvements: {}; regressions: {}\n'.format(sum(rerank_ups.values()), sum(rerank_downs.values()))) log_file.write( 'only ERGM improvements: {}; regressions: {}\n'.format(sum(erg_ups.values()), sum(erg_downs.values()))) log_file.write( 'total rank diffs: rerank {}, only ERGM {}\n'.format(sum(rerank_diff.values()), sum(erg_diff.values()))) log_file.write('metrics: pre-rank\trerank\tERGM only\n') log_file.write( 'average rank: {:.5f}\t{:.5f}\t{:.5f}\n'.format(np.average(all_pre_ranks), np.average(all_all_ranks), np.average(all_erg_ranks))) log_file.write( 'mrr: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mrr(all_pre_ranks), mrr(all_all_ranks), mrr(all_erg_ranks))) log_file.write( 'mq: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mq(all_pre_ranks, N), mq(all_all_ranks, N), mq(all_erg_ranks, N))) log_file.write( 'h@100: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(all_pre_ranks, n=100), h_at_n(all_all_ranks, n=100), h_at_n(all_erg_ranks, n=100))) log_file.write('h@10: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(all_pre_ranks), h_at_n(all_all_ranks), h_at_n(all_erg_ranks))) log_file.write('h@1: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(all_pre_ranks, n=1), h_at_n(all_all_ranks, n=1), h_at_n(all_erg_ranks, n=1))) print('number of instances:', sum(insts.values())) print('total misses:', sum(total_misses.values())) print('overrides:', sum(overrides.values())) print('average rank:', np.average(all_all_ranks)) print('mrr: {:.4f}'.format(mrr(all_all_ranks))) print('mq:', mq(all_all_ranks, N)) print('h@100: {:.5f}'.format(h_at_n(all_all_ranks, n=100))) print('h@10: {:.5f}'.format(h_at_n(all_all_ranks))) print('h@1: {:.5f}'.format(h_at_n(all_all_ranks, n=1))) return mrr(all_all_ranks), h_at_n(all_all_ranks, n=10), h_at_n(all_all_ranks, n=3), h_at_n(all_all_ranks, n=1)
def train(train_data, test_data, user_size, item_size): config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: ############################### CREATE MODEL ############################# iterator = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes) model = NCF.NCF(FLAGS.embedding_size, user_size, item_size, FLAGS.lr, FLAGS.optim, FLAGS.initializer, FLAGS.loss_func, FLAGS.activation, FLAGS.regularizer, iterator, FLAGS.topK, FLAGS.dropout, is_training=True) model.build() # train_init_op = iterator.make_initializer(train_data) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) if ckpt: print("Reading model parameters from %s" % ckpt.model_checkpoint_path) model.saver.restore(sess, ckpt.model_checkpoint_path) else: print("Creating model with fresh parameters.") sess.run(tf.global_variables_initializer()) ############################### Training #################################### count = 0 for epoch in range(FLAGS.epochs): sess.run(model.iterator.make_initializer(train_data)) model.is_training = True start_time = time.time() try: while True: model.step(sess, count) count += 1 except tf.errors.OutOfRangeError: print("Epoch %d training " %epoch + "Took: " + time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time))) ################################ EVALUATION ################################## sess.run(model.iterator.make_initializer(test_data)) model.is_training = False start_time = time.time() HR, MRR, NDCG = [], [], [] try: while True: prediction, label = model.step(sess, None) label = int(label[0]) HR.append(metrics.hit(label, prediction)) MRR.append(metrics.mrr(label, prediction)) NDCG.append(metrics.ndcg(label, prediction)) except tf.errors.OutOfRangeError: hr = np.array(HR).mean() mrr = np.array(MRR).mean() ndcg = np.array(NDCG).mean() print("Epoch %d testing " %epoch + "Took: " + time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time))) print("HR is %.3f, MRR is %.3f, NDCG is %.3f" %(hr, mrr, ndcg)) ################################## SAVE MODEL ################################ checkpoint_path = os.path.join(FLAGS.model_dir, "NCF.ckpt") model.saver.save(sess, checkpoint_path)