def evaluate(): """Evaluate the model on validation dataset. """ log.info('Loading dev data...') if version_2: dev_data = SQuAD('dev', version='2.0') else: dev_data = SQuAD('dev', version='1.1') if args.debug: sampled_data = [dev_data[0], dev_data[1], dev_data[2]] dev_data = mx.gluon.data.SimpleDataset(sampled_data) log.info('Number of records in dev data:{}'.format(len(dev_data))) dev_dataset = dev_data.transform(SQuADTransform( copy.copy(tokenizer), max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=False, is_training=False)._transform, lazy=False) dev_data_transform, _ = preprocess_dataset( dev_data, SQuADTransform(copy.copy(tokenizer), max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=False, is_training=False)) log.info('The number of examples after preprocessing:{}'.format( len(dev_data_transform))) dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform, batchify_fn=batchify_fn, num_workers=4, batch_size=test_batch_size, shuffle=False, last_batch='keep') log.info('start prediction') all_results = collections.defaultdict(list) epoch_tic = time.time() total_num = 0 for data in dev_dataloader: example_ids, inputs, token_types, valid_length, _, _ = data total_num += len(inputs) out = net( inputs.astype('float32').as_in_context(ctx), token_types.astype('float32').as_in_context(ctx), valid_length.astype('float32').as_in_context(ctx)) output = mx.nd.split(out, axis=2, num_outputs=2) example_ids = example_ids.asnumpy().tolist() pred_start = output[0].reshape((0, -3)).asnumpy() pred_end = output[1].reshape((0, -3)).asnumpy() for example_id, start, end in zip(example_ids, pred_start, pred_end): all_results[example_id].append(PredResult(start=start, end=end)) epoch_toc = time.time() log.info('Time cost={:.2f} s, Thoughput={:.2f} samples/s'.format( epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic))) log.info('Get prediction results...') all_predictions = collections.OrderedDict() for features in dev_dataset: results = all_results[features[0].example_id] example_qas_id = features[0].qas_id prediction, _ = predict( features=features, results=results, tokenizer=nlp.data.BERTBasicTokenizer(lower=lower), max_answer_length=max_answer_length, null_score_diff_threshold=null_score_diff_threshold, n_best_size=n_best_size, version_2=version_2) all_predictions[example_qas_id] = prediction with io.open(os.path.join(output_dir, 'predictions.json'), 'w', encoding='utf-8') as fout: data = json.dumps(all_predictions, ensure_ascii=False) fout.write(data) if version_2: log.info( 'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0' ) else: F1_EM = get_F1_EM(dev_data, all_predictions) log.info(F1_EM)
def evaluate(): """Evaluate the model on validation dataset. """ log.info('Start Evaluation') all_results = collections.defaultdict(list) if VERIFIER_ID == 2: all_pre_na_prob = collections.defaultdict(list) epoch_tic = time.time() total_num = 0 for data in dev_dataloader: example_ids, inputs, token_types, valid_length, _, _ = data total_num += len(inputs) cls_mask = mx.nd.zeros(token_types.shape) sep_mask_1 = mx.nd.zeros(token_types.shape) sep_mask_2 = mx.nd.zeros(token_types.shape) cls_mask[:, 0] = 1. range_row_index = mx.nd.array(np.arange(len(example_ids))) valid_query_length = (1 - token_types).sum(axis=1) sep_mask_1[range_row_index, valid_query_length - 1] = 1. sep_mask_2[range_row_index, valid_length - 1] = 1. additional_masks = (cls_mask.astype('float32').as_in_context(ctx), sep_mask_1.astype('float32').as_in_context(ctx), sep_mask_2.astype('float32').as_in_context(ctx)) out, bert_out = net( inputs.astype('float32').as_in_context(ctx), token_types.astype('float32').as_in_context(ctx), valid_length.astype('float32').as_in_context(ctx), additional_masks) if VERIFIER_ID == 2: has_answer_tmp = verifier.evaluate(dev_features, example_ids, out, token_types, bert_out).asnumpy().tolist() output = mx.nd.split(out, axis=2, num_outputs=2) example_ids = example_ids.asnumpy().tolist() pred_start = output[0].reshape((0, -3)).asnumpy() pred_end = output[1].reshape((0, -3)).asnumpy() for example_id, start, end in zip(example_ids, pred_start, pred_end): all_results[example_id].append(PredResult(start=start, end=end)) if VERIFIER_ID == 2: for example_id, has_ans_prob in zip(example_ids, has_answer_tmp): all_pre_na_prob[example_id].append(has_ans_prob) epoch_toc = time.time() log.info('Time cost={:.2f} s, Thoughput={:.2f} samples/s'.format( epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic))) log.info('Get prediction results...') all_predictions = collections.OrderedDict() for features in dev_dataset: results = all_results[features[0].example_id] example_qas_id = features[0].qas_id # prediction2 is likely to be empty when in version_2 prediction, score_diff, best_pred = predict( features=features, results=results, tokenizer=nlp.data.BERTBasicTokenizer(lower=lower), max_answer_length=max_answer_length, n_best_size=n_best_size, version_2=version_2, offsets=offsets) # print(score_diff, null_score_diff_threshold, features[0].is_impossible) # debug # verifier if version_2 and prediction != '': # threshold serves as the basic verifier if score_diff > null_score_diff_threshold: answerable = 0. else: answerable = 1. if VERIFIER_ID == 0: best_pred_score = 1. if best_pred else 0. has_ans_prob = verifier.evaluate(score_diff, best_pred_score) # print(features[0].is_impossible) elif VERIFIER_ID == 1: has_ans_prob = verifier.evaluate(features, prediction) elif VERIFIER_ID == 2: has_ans_prob_list = all_pre_na_prob[features[0].example_id] has_ans_prob = sum(has_ans_prob_list) / max( len(has_ans_prob_list), 1) else: has_ans_prob = 1. 
if args.verifier_mode == "takeover": answerable = has_ans_prob elif args.verifier_mode == "joint": answerable = answerable * has_ans_prob elif args.verifier_mode == "all": answerable = (answerable + has_ans_prob) * 0.5 if answerable < answerable_threshold: prediction = "" all_predictions[example_qas_id] = prediction # the form of hashkey - answer string with io.open(os.path.join(output_dir, 'predictions.json'), 'w', encoding='utf-8') as fout: data = json.dumps(all_predictions, ensure_ascii=False) fout.write(data) if version_2: log.info( 'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0' ) else: F1_EM = get_F1_EM(dev_data, all_predictions) log.info(F1_EM)
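# A standalone sketch of how the three verifier modes above combine the
# threshold decision (`answerable`) with the verifier probability
# (`has_ans_prob`). `combine_scores` is a hypothetical helper written for
# illustration; it is not part of the script itself.
def combine_scores(answerable, has_ans_prob, mode):
    """Combine threshold-based and verifier-based answerability scores."""
    if mode == 'takeover':  # trust the verifier alone
        return has_ans_prob
    if mode == 'joint':     # both signals must agree (product)
        return answerable * has_ans_prob
    if mode == 'all':       # average the two signals
        return (answerable + has_ans_prob) * 0.5
    return answerable       # fall back to the threshold decision

# e.g. combine_scores(1., 0.3, 'joint') == 0.3, which would fall below a
# typical answerable threshold of 0.5 and empty the prediction.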
def evaluate(): """Evaluate the model on validation dataset. """ log.info('Loading dev data...') if version_2: dev_data = SQuAD('dev', version='2.0') else: dev_data = SQuAD('dev', version='1.1') log.info('Number of records in Train data:{}'.format(len(dev_data))) dev_dataset = dev_data.transform( SQuADTransform(berttoken, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=False, is_training=False)._transform) dev_data_transform, _ = preprocess_dataset( dev_data, SQuADTransform(berttoken, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=False, is_training=False)) log.info('The number of examples after preprocessing:{}'.format( len(dev_data_transform))) dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform, batchify_fn=batchify_fn, num_workers=4, batch_size=test_batch_size, shuffle=False, last_batch='keep') log.info('Start predict') _Result = collections.namedtuple( '_Result', ['example_id', 'start_logits', 'end_logits']) all_results = {} epoch_tic = time.time() total_num = 0 for data in dev_dataloader: example_ids, inputs, token_types, valid_length, _, _ = data total_num += len(inputs) out = net( inputs.astype('float32').as_in_context(ctx), token_types.astype('float32').as_in_context(ctx), valid_length.astype('float32').as_in_context(ctx)) output = nd.split(out, axis=2, num_outputs=2) start_logits = output[0].reshape((0, -3)).asnumpy() end_logits = output[1].reshape((0, -3)).asnumpy() for example_id, start, end in zip(example_ids, start_logits, end_logits): example_id = example_id.asscalar() if example_id not in all_results: all_results[example_id] = [] all_results[example_id].append( _Result(example_id, start.tolist(), end.tolist())) if args.test_mode: log.info('Exit early in test mode') break epoch_toc = time.time() log.info('Time cost={:.2f} s, Thoughput={:.2f} samples/s'.format( epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic))) log.info('Get prediction results...') all_predictions, all_nbest_json, scores_diff_json = predictions( dev_dataset=dev_dataset, all_results=all_results, tokenizer=nlp.data.BERTBasicTokenizer(lower=lower), max_answer_length=max_answer_length, null_score_diff_threshold=null_score_diff_threshold, n_best_size=n_best_size, version_2=version_2, test_mode=args.test_mode) with open(os.path.join(output_dir, 'predictions.json'), 'w', encoding='utf-8') as all_predictions_write: all_predictions_write.write(json.dumps(all_predictions)) with open(os.path.join(output_dir, 'nbest_predictions.json'), 'w', encoding='utf-8') as all_predictions_write: all_predictions_write.write(json.dumps(all_nbest_json)) if version_2: with open(os.path.join(output_dir, 'null_odds.json'), 'w', encoding='utf-8') as all_predictions_write: all_predictions_write.write(json.dumps(scores_diff_json)) else: log.info(get_F1_EM(dev_data, all_predictions))
def evaluate(): """Evaluate the model on validation dataset. """ log.info('Loading dev data...') if version_2: dev_data = SQuAD('dev', version='2.0') else: dev_data = SQuAD('dev', version='1.1') if args.debug: sampled_data = dev_data[:10] # [dev_data[0], dev_data[1], dev_data[2]] dev_data = mx.gluon.data.SimpleDataset(sampled_data) log.info('Number of records in dev data:{}'.format(len(dev_data))) dev_dataset = dev_data.transform(SQuADTransform( copy.copy(tokenizer), max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=True, is_training=True)._transform, lazy=False) dev_data_transform, _ = preprocess_dataset( dev_data, SQuADTransform(copy.copy(tokenizer), max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=True, is_training=True)) # refer to evaluation process # for feat in train_dataset: # print(feat[0].example_id) # print(feat[0].tokens) # print(feat[0].token_to_orig_map) # input() # exit(0) dev_features = { features[0].example_id: features for features in dev_dataset } #for line in train_data_transform: # print(line) # input() dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform, batchify_fn=batchify_fn, batch_size=test_batch_size, num_workers=4, shuffle=True) ''' dev_dataset = dev_data.transform( SQuADTransform( copy.copy(tokenizer), max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=False, is_training=False)._transform, lazy=False) # for feat in dev_dataset: # print(feat[0].example_id) # print(feat[0].tokens) # print(feat[0].token_to_orig_map) # input() # exit(0) dev_features = {features[0].example_id: features for features in dev_dataset} dev_data_transform, _ = preprocess_dataset( dev_data, SQuADTransform( copy.copy(tokenizer), max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_pad=False, is_training=False)) log.info('The number of examples after preprocessing:{}'.format( len(dev_data_transform))) dev_dataloader = mx.gluon.data.DataLoader( dev_data_transform, batchify_fn=batchify_fn, num_workers=4, batch_size=test_batch_size, shuffle=False, last_batch='keep') ''' log.info('start prediction') all_results = collections.defaultdict(list) if args.verify and VERIFIER_ID in [2, 3]: all_pre_na_prob = collections.defaultdict(list) else: all_pre_na_prob = None epoch_tic = time.time() total_num = 0 for data in dev_dataloader: example_ids, inputs, token_types, valid_length, _, _ = data total_num += len(inputs) out = net( inputs.astype('float32').as_in_context(ctx), token_types.astype('float32').as_in_context(ctx), valid_length.astype('float32').as_in_context(ctx)) if all_pre_na_prob is not None: has_answer_tmp = verifier.evaluate(dev_features, example_ids, out).asnumpy().tolist() output = mx.nd.split(out, axis=2, num_outputs=2) example_ids = example_ids.asnumpy().tolist() pred_start = output[0].reshape((0, -3)).asnumpy() pred_end = output[1].reshape((0, -3)).asnumpy() for example_id, start, end in zip(example_ids, pred_start, pred_end): all_results[example_id].append(PredResult(start=start, end=end)) if all_pre_na_prob is not None: for example_id, has_ans_prob in zip(example_ids, has_answer_tmp): all_pre_na_prob[example_id].append(has_ans_prob) epoch_toc = time.time() log.info('Time cost={:.2f} s, Thoughput={:.2f} samples/s'.format( epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic))) log.info('Get prediction results...') all_predictions = collections.OrderedDict() for features in dev_dataset: results = 
all_results[features[0].example_id] example_qas_id = features[0].qas_id if all_pre_na_prob is not None: has_ans_prob_list = all_pre_na_prob[features[0].example_id] has_ans_prob = sum(has_ans_prob_list) / max( len(has_ans_prob_list), 1) if has_ans_prob < 0.5: prediction = "" all_predictions[example_qas_id] = prediction continue prediction, _ = predict( features=features, results=results, tokenizer=nlp.data.BERTBasicTokenizer(lower=lower), max_answer_length=max_answer_length, null_score_diff_threshold=null_score_diff_threshold, n_best_size=n_best_size, version_2=version_2) if args.verify and VERIFIER_ID == 1: if len(prediction) > 0: has_answer = verifier.evaluate(features, prediction) if not has_answer: prediction = "" all_predictions[example_qas_id] = prediction # the form of hashkey - answer string with io.open(os.path.join(output_dir, 'predictions.json'), 'w', encoding='utf-8') as fout: data = json.dumps(all_predictions, ensure_ascii=False) fout.write(data) if version_2: log.info( 'Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0' ) else: F1_EM = get_F1_EM(dev_data, all_predictions) log.info(F1_EM)
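# A minimal sketch of the verifier interface this variant assumes for
# VERIFIER_ID 2: given the indexed features, batch example ids, and model
# output, return one answerability score per row. Everything below is an
# assumption for illustration, not the project's actual verifier.
import mxnet as mx


class ThresholdVerifier:
    """Toy verifier: scores answerability from the span/null logit gap."""

    def __init__(self, threshold=0.):
        self.threshold = threshold

    def evaluate(self, dev_features, example_ids, out):
        # out: (batch, seq_len, 2) start/end logits. Use position 0
        # ([CLS]) as the "no answer" score, as in BERT-style QA heads.
        null_score = out[:, 0, 0] + out[:, 0, 1]
        best_score = out[:, :, 0].max(axis=1) + out[:, :, 1].max(axis=1)
        # A larger span score relative to the null score -> answerable.
        return (best_score - null_score > self.threshold).astype('float32')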