output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                         "output_null_log_odds_file_{}.json".format(step))
# time5 = time()
write_predictions(val_examples, all_output_features, all_results,
                  FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.do_lower_case,
                  output_prediction_file, output_nbest_file, output_null_log_odds_file)
# time6 = time()
# print('write all val predictions', time6 - time5)
val_total_loss_value = np.average(val_total_loss)

# call the official evaluation script
val_summary = tf.Summary()
# time7 = time()
val_eval_res = external_call(val_file_json, output_prediction_file)
# time8 = time()
# print('external call', time8 - time7)
val_f1 = val_eval_res['f1']
val_followup = val_eval_res['followup']
val_yesno = val_eval_res['yes/no']
val_heq = val_eval_res['HEQ']
val_dheq = val_eval_res['DHEQ']
heq_list.append(val_heq)
dheq_list.append(val_dheq)
yesno_list.append(val_yesno)
followup_list.append(val_followup)
val_summary.value.add(tag="followup", simple_value=val_followup)
val_summary.value.add(tag="val_yesno", simple_value=val_yesno)
print('batch dropped because too large!')
output_prediction_file = os.path.join(FLAGS.output_dir, "predictions_{}.json".format(step))
output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions_{}.json".format(step))
write_predictions(val_examples, all_selected_features, all_results,
                  FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.do_lower_case,
                  output_prediction_file, output_nbest_file)
val_total_loss_value = np.average(val_total_loss)

# call the official evaluation script
val_summary = tf.Summary()
val_eval_res = external_call(val_file_json, output_prediction_file)
val_f1 = val_eval_res['f1']
val_followup = val_eval_res['followup']
val_yesno = val_eval_res['yes/no']
val_heq = val_eval_res['HEQ']
val_dheq = val_eval_res['DHEQ']
heq_list.append(val_heq)
dheq_list.append(val_dheq)
val_summary.value.add(tag="followup", simple_value=val_followup)
val_summary.value.add(tag="val_yesno", simple_value=val_yesno)
val_summary.value.add(tag="val_heq", simple_value=val_heq)
val_summary.value.add(tag="val_dheq", simple_value=val_dheq)
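
# Hedged sketch, not in the original excerpts: both TF fragments above build a
# tf.Summary proto and add per-metric values, but the excerpts are cut off
# before the proto is written anywhere. In TF1-style code the usual pattern is
# to flush it to a tf.summary.FileWriter; `summary_writer` here is an assumed
# name standing in for whatever writer the full script actually creates.
#
#     val_summary.value.add(tag="val_f1", simple_value=val_f1)
#     val_summary.value.add(tag="val_total_loss", simple_value=val_total_loss_value)
#     summary_writer = tf.summary.FileWriter(FLAGS.output_dir)  # assumed writer
#     summary_writer.add_summary(val_summary, step)
#     summary_writer.flush()
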
# imports needed by evaluate(); the surrounding module presumably already has
# these (SummaryWriter may come from tensorboardX in the original repo)
import os
import json
from os import listdir
from os.path import isdir, join

import numpy as np
import torch
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter


def evaluate(dev_file, tokenizer):
    """ Adapted from the original TF implementation's eval script """
    val_summary_writer = SummaryWriter()  # unused in this excerpt
    val_total_loss = []
    all_results = []
    all_output_features = []
    f1_list = []
    heq_list = []
    dheq_list = []
    yesno_list, followup_list = [], []
    attention_dict = {}  # referenced below but never initialized in the original

    if args.eval_checkpoint:
        checkpoints_to_evaluate = ['checkpoint-' + args.eval_checkpoint]
    elif args.eval_all_checkpoints:
        checkpoints_to_evaluate = listdir(args.output_dir + 'saved_checkpoints/')
    else:
        # choose the checkpoint directory ending in the highest number (i.e. the last saved checkpoint)
        checkpoints_to_evaluate = ['checkpoint-' + str(
            max([int(f.split('-')[1]) for f in listdir(args.output_dir + 'saved_checkpoints/')
                 if isdir(join(args.output_dir + 'saved_checkpoints/', f))]))]

    for checkpoint in checkpoints_to_evaluate:
        state_dict_path = '{}saved_checkpoints/{}/state_dict.pt'.format(args.output_dir, checkpoint)
        model = MTLModel(args)
        model.load_state_dict(torch.load(state_dict_path))

        dev_features, dev_example_tracker, dev_variation_tracker, dev_example_features_nums, \
            dev_num_batches, dev_examples = load_data(dev_file, tokenizer)

        print("***** Running evaluation *****")
        print("  Num orig examples = ", len(dev_examples))
        print("  Num dev_features = ", len(dev_features))
        print("  Num dev batches = ", dev_num_batches)
        print("  Batch size = ", args.batch_size)

        set_seed(args)
        dev_batches = cqa_gen_example_aware_batches_v2(
            dev_features, dev_example_tracker, dev_variation_tracker,
            dev_example_features_nums, batch_size=args.batch_size,
            num_epoches=1, shuffle=False)
        dev_iterator = tqdm(dev_batches, desc="Iteration", disable=False, total=dev_num_batches)

        for step, batch in enumerate(dev_iterator):
            model.eval()
            batch_results = []
            batch_features, batch_slice_mask, batch_slice_num, output_features = batch
            all_output_features.extend(output_features)

            fd = convert_features_to_feed_dict(args, batch_features)  # feed_dict
            fd_output = convert_features_to_feed_dict(args, output_features)
            turn_features = get_turn_features(fd['metadata'])
            fd['history_answer_marker'] = fix_history_answer_marker_for_bhae(
                fd['history_answer_marker'].cpu(), turn_features)

            with torch.no_grad():
                inputs = {
                    "fd": fd,
                    "batch_slice_mask": batch_slice_mask,
                    "batch_slice_num": batch_slice_num,
                }
                if args.do_MTL:
                    (start_logits, end_logits), yesno_logits, followup_logits, attention_weights = model(**inputs)
                else:
                    # note: the result-collection loop below assumes the MTL heads
                    # exist; without do_MTL there are no yesno/followup logits
                    start_logits, end_logits, attention_weights = model(**inputs)

            key = (tuple([dev_examples[f.example_index].qas_id for f in output_features]), step)
            attention_dict[key] = {'batch_slice_mask': batch_slice_mask,
                                   'attention_weights_res': attention_weights,
                                   'batch_slice_num': batch_slice_num,
                                   'len_batch_features': len(batch_features),
                                   'len_output_features': len(output_features)}

            for each_unique_id, each_start_logits, each_end_logits, each_yesno_logits, each_followup_logits \
                    in zip(fd_output['unique_ids'], start_logits, end_logits, yesno_logits, followup_logits):
                each_unique_id = int(each_unique_id)
                each_start_logits = [float(x) for x in each_start_logits.tolist()]
                each_end_logits = [float(x) for x in each_end_logits.tolist()]
                each_yesno_logits = [float(x) for x in each_yesno_logits.tolist()]
                each_followup_logits = [float(x) for x in each_followup_logits.tolist()]
                batch_results.append(RawResult(unique_id=each_unique_id,
                                               start_logits=each_start_logits,
                                               end_logits=each_end_logits,
                                               yesno_logits=each_yesno_logits,
                                               followup_logits=each_followup_logits))
            all_results.extend(batch_results)

            output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(step))
            output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(step))
            output_null_log_odds_file = os.path.join(args.output_dir,
                                                     "output_null_log_odds_file_{}.json".format(step))
            write_predictions(dev_examples, all_output_features, all_results,
                              args.n_best_size, args.max_answer_length, args.do_lower_case,
                              output_prediction_file, output_nbest_file, output_null_log_odds_file)
            val_total_loss_value = np.average(val_total_loss)

            # call the official evaluation script
            val_file_json = json.load(open(dev_file, 'r'))['data']
            val_eval_res = external_call(val_file_json, output_prediction_file)
            val_f1 = val_eval_res['f1']
            val_followup = val_eval_res['followup']
            val_yesno = val_eval_res['yes/no']
            val_heq = val_eval_res['HEQ']
            val_dheq = val_eval_res['DHEQ']
            heq_list.append(val_heq)
            dheq_list.append(val_dheq)
            yesno_list.append(val_yesno)
            followup_list.append(val_followup)

            print('evaluation: {}, total_loss: {}, f1: {}, followup: {}, yesno: {}, heq: {}, dheq: {}\n'.format(
                step, val_total_loss_value, val_f1, val_followup, val_yesno, val_heq, val_dheq))
            with open(args.output_dir + 'step_result.txt', 'a') as fout:
                fout.write('{},{},{},{},{},{},{}\n'.format(step, val_f1, val_heq, val_dheq,
                                                           val_yesno, val_followup, args.output_dir))
            f1_list.append(val_f1)