def validate_official(args, data_loader, model, global_stats,
                      offsets, texts, answers):
    """Run one full official validation. Uses exact spans and the same
    exact match/F1 score computation as in the SQuAD script.

    Extra arguments:
        offsets: The character start/end indices for the tokens in each context.
        texts: Map of qid --> raw text of the example's context (matches offsets).
        answers: Map of qid --> list of accepted answers.
    """
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    # Run through examples
    examples = 0
    for ex in data_loader:
        ex_id, batch_size = ex[-1], ex[0].size(0)
        chosen_offset = ex[-2]
        pred_s, pred_e, _ = model.predict(ex)

        for i in range(batch_size):
            if args.use_sentence_selector:
                s_offset = chosen_offset[i][pred_s[i][0]][0]
                e_offset = chosen_offset[i][pred_e[i][0]][1]
            else:
                s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
            prediction = texts[ex_id[i]][s_offset:e_offset]

            # Compute metrics
            ground_truths = answers[ex_id[i]]
            exact_match.update(utils.metric_max_over_ground_truths(
                utils.exact_match_score, prediction, ground_truths))
            f1.update(utils.metric_max_over_ground_truths(
                utils.f1_score, prediction, ground_truths))

        examples += batch_size

    logger.info('dev valid official: Epoch = %d | EM = %.2f | ' %
                (global_stats['epoch'], exact_match.avg * 100) +
                'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                (f1.avg * 100, examples, eval_time.time()))

    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
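# Illustrative sketch (not part of the original module): how per-token character
# offsets map a predicted token span back to a substring of the raw context, as
# done in validate_official above. The helper name and toy data are assumptions.

def _span_from_offsets(text, offsets, start_tok, end_tok):
    """Slice `text` using the character offsets of the start/end token indices."""
    s_offset = offsets[start_tok][0]   # char start of the first predicted token
    e_offset = offsets[end_tok][1]     # char end of the last predicted token
    return text[s_offset:e_offset]


if __name__ == '__main__':
    context = "The quick brown fox jumps"
    token_offsets = [(0, 3), (4, 9), (10, 15), (16, 19), (20, 25)]
    # Predicting tokens 1..3 recovers "quick brown fox".
    print(_span_from_offsets(context, token_offsets, 1, 3))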
def compute_paragraph_score(sample):
    """For each paragraph, compute its F1 score against the question.

    :param sample:
    :return:
    """
    scores = []
    question = sample['segmented_question']  # tokenized question (text tokens, not ids)
    for doc in sample['documents']:
        doc['segmented_paragraphs_scores'] = []  # add a field holding each paragraph's match score
        for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']):
            # iterate over each (tokenized) paragraph of the document
            if len(question) > 0:
                related_score = metric_max_over_ground_truths(f1_score,
                                                              para_tokens,
                                                              [question])
            else:
                related_score = 0.0
            doc['segmented_paragraphs_scores'].append(related_score)  # score of this paragraph vs. the question
            scores.append(related_score)  # collect the similarity score of every paragraph
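# Illustrative only: a minimal DuReader-style sample showing the input shape
# compute_paragraph_score expects. Field names come from the function above;
# the token values are made up, and f1_score / metric_max_over_ground_truths
# are assumed to operate on token lists.

sample = {
    'segmented_question': ['北京', '奥运会', '哪一年', '举办'],
    'documents': [
        {'segmented_paragraphs': [
            ['北京', '奥运会', '于', '2008', '年', '举办'],
            ['这', '是', '一个', '无关', '段落'],
        ]},
    ],
}
compute_paragraph_score(sample)
# Each document now carries 'segmented_paragraphs_scores', one question/paragraph
# F1 value per paragraph.
print(sample['documents'][0]['segmented_paragraphs_scores'])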
def validate_adversarial(args, model, global_stats, mode="dev"):
    # Create a dataloader for each adversarial dev set, load its json,
    # and run the official evaluation on it.
    for idx, dataset_file in enumerate(args.adv_dev_json):
        predictions = {}
        logger.info("Validating Adversarial Dataset %s" % dataset_file)
        exs = utils.load_data(args, args.adv_dev_file[idx])
        logger.info('Num dev examples = %d' % len(exs))

        # Create dataloader
        dev_dataset = data.ReaderDataset(exs, model, single_answer=False)
        if args.sort_by_len:
            dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                                  args.test_batch_size,
                                                  shuffle=False)
        else:
            dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)

        # if args.use_sentence_selector:
        #     batching_function = vector.batchify_sentences
        # else:
        batching_function = vector.batchify
        dev_loader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=args.test_batch_size,
            sampler=dev_sampler,
            num_workers=args.data_workers,
            collate_fn=batching_function,
            pin_memory=args.cuda,
        )

        texts = utils.load_text(dataset_file)
        offsets = {ex['id']: ex['offsets'] for ex in exs}
        answers = utils.load_answers(dataset_file)

        eval_time = utils.Timer()
        f1 = utils.AverageMeter()
        exact_match = utils.AverageMeter()

        examples = 0
        bad_examples = 0
        for ex in dev_loader:
            ex_id, batch_size = ex[-1], ex[0].size(0)
            chosen_offset = ex[-2]
            pred_s, pred_e, _ = model.predict(ex)

            for i in range(batch_size):
                if (pred_s[i][0] >= len(offsets[ex_id[i]]) or
                        pred_e[i][0] >= len(offsets[ex_id[i]])):
                    bad_examples += 1
                    continue
                if args.use_sentence_selector:
                    s_offset = chosen_offset[i][pred_s[i][0]][0]
                    e_offset = chosen_offset[i][pred_e[i][0]][1]
                else:
                    s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                    e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
                prediction = texts[ex_id[i]][s_offset:e_offset]
                predictions[ex_id[i]] = prediction

                ground_truths = answers[ex_id[i]]
                exact_match.update(utils.metric_max_over_ground_truths(
                    utils.exact_match_score, prediction, ground_truths))
                f1.update(utils.metric_max_over_ground_truths(
                    utils.f1_score, prediction, ground_truths))

            examples += batch_size

        logger.info('dev valid official for dev file %s : Epoch = %d | EM = %.2f | ' %
                    (dataset_file, global_stats['epoch'], exact_match.avg * 100) +
                    'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                    (f1.avg * 100, examples, eval_time.time()))

        orig_f1_score = 0.0
        orig_exact_match_score = 0.0
        adv_f1_scores = {}  # Map from original ID to F1 score
        adv_exact_match_scores = {}  # Map from original ID to exact match score
        adv_ids = {}
        all_ids = set()  # Set of all original IDs
        f1 = exact_match = 0

        dataset = json.load(open(dataset_file))['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    orig_id = qa['id'].split('-')[0]
                    all_ids.add(orig_id)
                    if qa['id'] not in predictions:
                        message = 'Unanswered question ' + qa['id'] + ' will receive score 0.'
                        # logger.info(message)
                        continue
                    ground_truths = list(map(lambda x: x['text'], qa['answers']))
                    prediction = predictions[qa['id']]
                    cur_exact_match = utils.metric_max_over_ground_truths(
                        utils.exact_match_score, prediction, ground_truths)
                    cur_f1 = utils.metric_max_over_ground_truths(
                        utils.f1_score, prediction, ground_truths)
                    if orig_id == qa['id']:
                        # This is an original example
                        orig_f1_score += cur_f1
                        orig_exact_match_score += cur_exact_match
                        if orig_id not in adv_f1_scores:
                            # Haven't seen an adversarial example yet, so use the original as the adversary
                            adv_ids[orig_id] = orig_id
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
                    else:
                        # This is an adversarial example
                        if (orig_id not in adv_f1_scores
                                or adv_ids[orig_id] == orig_id
                                or adv_f1_scores[orig_id] > cur_f1):
                            # Always override if the adversary is currently using orig_id
                            adv_ids[orig_id] = qa['id']
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match

        orig_f1 = 100.0 * orig_f1_score / len(all_ids)
        orig_exact_match = 100.0 * orig_exact_match_score / len(all_ids)
        adv_exact_match = 100.0 * sum(adv_exact_match_scores.values()) / len(all_ids)
        adv_f1 = 100.0 * sum(adv_f1_scores.values()) / len(all_ids)
        logger.info("For the file %s Original Exact Match : %.4f ; Original F1 : %.4f | " %
                    (dataset_file, orig_exact_match, orig_f1) +
                    "Adversarial Exact Match : %.4f ; Adversarial F1 : %.4f " %
                    (adv_exact_match, adv_f1))
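# A toy, standalone sketch (assumed data, not from the original repo) of the
# aggregation rule used above: adversarial variants share the prefix of the
# original question id (e.g. 'qid123' vs 'qid123-turk0'), and for every original
# id the reported adversarial F1 is the score of the worst-scoring variant,
# falling back to the original question when no variant was answered.

toy_scores = {            # qa id -> F1 the model achieved on that question
    'qid123': 0.9,        # original question
    'qid123-turk0': 0.7,  # adversarial variant
    'qid123-turk1': 0.4,  # harder adversarial variant
}
adv_f1, adv_id = {}, {}
for qid, cur_f1 in toy_scores.items():
    orig_id = qid.split('-')[0]
    if qid == orig_id:
        # original question: only used as a fallback until a variant is seen
        if orig_id not in adv_f1:
            adv_id[orig_id], adv_f1[orig_id] = qid, cur_f1
    elif (orig_id not in adv_f1
          or adv_id[orig_id] == orig_id      # currently only the fallback
          or adv_f1[orig_id] > cur_f1):      # found a harder variant
        adv_id[orig_id], adv_f1[orig_id] = qid, cur_f1
print(adv_f1)  # {'qid123': 0.4} -> minimum F1 over the adversarial variants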
def eval_end2end(args):
    out_file = args.out_file
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    prediction_file = args.prediction_file
    answer_file = args.answer_file
    match_fn = exact_match_score if args.no_regex else regex_match_score
    data_dir = os.path.dirname(prediction_file)
    model_file = args.model_file or os.path.join(
        data_dir, '{}.xgb'.format(args.classifier))
    bst = xgboost.Booster()
    bst.load_model(model_file)

    stop_count = 0
    stop_correct = 0
    processed = 0
    with open(out_file, 'w', encoding=ENCODING) as of:
        for answer_line, prediction_line in zip(
                open(answer_file, encoding=ENCODING),
                open(prediction_file, encoding=ENCODING)):
            answer_data = json.loads(answer_line)
            answer = [normalize(a) for a in answer_data['answer']]

            out_predictions = []
            all_spans = []
            all_a_scores = []
            all_a_zscores = []
            repeats = 0

            prediction = json.loads(prediction_line)
            for i, entry in enumerate(
                    sorted(prediction, key=lambda k: k['doc_score'], reverse=True)):
                out_predictions.append(entry)
                doc_score = entry['doc_score']
                ans_score = entry['span_score']
                span = entry['span']
                if span in all_spans:
                    repeats += 1
                all_spans.append(span)

                # Calculate the sample z-score (t statistic) for the answer score
                if all_a_scores == [] or len(all_a_scores) == 1:
                    # don't use the a_zscore feature at the beginning or when we only have one score
                    a_zscore = 0
                else:
                    # Take the sample mean/std of the previous scores and compute the
                    # z-score of the current answer score with respect to them
                    sample_mean = np.mean(all_a_scores)
                    sample_std = np.std(all_a_scores)
                    if sample_std <= 0.0:
                        a_zscore = 0
                    else:
                        a_zscore = (ans_score - sample_mean) / sample_std
                all_a_zscores.append(a_zscore)
                max_zscore = max(all_a_zscores)
                # accumulate answer scores so later z-scores are computed against
                # the running history, as in process_record
                all_a_scores.append(ans_score)

                past20 = 1 if i >= 20 else 0
                x = [max_zscore, ans_score, doc_score, repeats, past20]
                # DMatrix expects a 2-D array: one row of features
                feature_mat = xgboost.DMatrix(np.array([x]))
                stop_prob = bst.predict(feature_mat)[0]
                if stop_prob > args.stop_threshold:
                    if metric_max_over_ground_truths(match_fn, normalize(span), answer):
                        stop_correct += 1
                    stop_count += 1
                    print(stop_prob, 'stopped at:', i + 1, stop_count, processed)
                    break

            processed += 1
            of.write(json.dumps(out_predictions) + '\n')

    print('processed', stop_correct, stop_count, processed)
def process_record(data_line_, prediction_line_, neg_gap_, match_fn):
    records_ = []
    stop_count_ = 0
    data = json.loads(data_line_)
    answer = [normalize(a) for a in data['answer']]
    prediction = json.loads(prediction_line_)

    # MAKE SURE REVERSE IS TRUE
    ranked_prediction = sorted(prediction, key=lambda k: k['doc_score'], reverse=True)
    correct_rank = get_rank(prediction, answer, match_fn)
    if correct_rank > 150:
        return records_, stop_count_

    all_p_scores = []
    all_a_scores = []
    all_a_zscores = []
    all_spans = []
    repeats = 0
    for i, entry in enumerate(ranked_prediction):
        doc_score = entry['doc_score']
        ans_score = entry['span_score']
        span = entry['span']
        if span in all_spans:
            repeats += 1
        all_spans.append(span)

        # Calculate the sample z-score (t statistic) for the answer score
        if all_a_scores == [] or len(all_a_scores) == 1:
            # don't use the a_zscore feature at the beginning or when we only have one score
            a_zscore = 0
        else:
            # Take the sample mean/std of the previous scores and compute the
            # z-score of the current answer score with respect to them
            sample_mean = np.mean(all_a_scores)
            sample_std = np.std(all_a_scores)
            if sample_std <= 0.0:
                a_zscore = 0
            else:
                a_zscore = (ans_score - sample_mean) / sample_std

        all_a_zscores.append(a_zscore)
        max_zscore = max(all_a_zscores)

        all_p_scores.append(doc_score)
        all_a_scores.append(ans_score)

        record = OrderedDict()
        record['max_zscore'] = max_zscore
        record['corr_doc_score'] = doc_score

        repeats_2 = 1 if repeats == 2 else 0
        repeats_3 = 1 if repeats == 3 else 0
        repeats_4 = 1 if repeats == 4 else 0
        repeats_5 = 1 if repeats >= 5 else 0
        past20 = 1 if i >= 20 else 0
        record['repeats_2'] = repeats_2
        record['repeats_3'] = repeats_3
        record['repeats_4'] = repeats_4
        record['repeats_5'] = repeats_5
        record['past20'] = past20
        record['repeats'] = repeats

        match = metric_max_over_ground_truths(match_fn, normalize(span), answer)
        if match:
            # Positive example: the reader's span matches a gold answer here.
            record['stop'] = 1
            stop_count_ += 1
            should_return = False
            write_record = True
        else:
            # Negative example: keep only every neg_gap_-th rank to limit imbalance.
            should_return = False
            if i % neg_gap_ == 0:
                record['stop'] = 0
                write_record = True
            else:
                write_record = False

        if write_record:
            records_.append(record)
        if should_return:
            return records_, stop_count_

    return records_, stop_count_
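# Illustrative sketch, not the repo's actual training script: one way the records
# emitted by process_record (feature dict plus a binary 'stop' label) could be
# turned into an xgboost booster compatible with the bst.load_model call in
# eval_end2end. The function name, feature ordering, and hyperparameters are
# assumptions.

import numpy as np
import xgboost


def train_stop_classifier(records, model_path, num_rounds=200):
    """Fit a binary 'should we stop reading?' booster from process_record output."""
    feature_names = [k for k in records[0] if k != 'stop']
    X = np.array([[float(r[k]) for k in feature_names] for r in records])
    y = np.array([r['stop'] for r in records])
    dtrain = xgboost.DMatrix(X, label=y)
    params = {'objective': 'binary:logistic', 'eta': 0.1, 'max_depth': 4}
    bst = xgboost.train(params, dtrain, num_boost_round=num_rounds)
    bst.save_model(model_path)  # later loaded via bst.load_model in eval_end2end
    return bst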