def validate_dataset(model, split, tokenizer, topk=1):
    assert split in ('dev', 'test')
    dataloader = get_dataloader('xlnet', split, tokenizer, bwd=False,
                                batch_size=16, num_workers=16)
    em, f1, count = 0, 0, 0
    model.start_n_top = topk
    model.end_n_top = topk
    model.eval()
    for batch in dataloader:
        input_ids, attention_mask, token_type_ids, input_tokens_no_unk, answers = batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        start_index = outputs[1]
        end_index = outputs[3].view(-1, model.end_n_top,
                                    model.start_n_top).permute([0, 2, 1])[:, :, 0]
        for i, answer in enumerate(answers):
            preds = []
            for k in range(model.start_n_top):
                pred_tokens = input_tokens_no_unk[i][
                    start_index[i][k]:end_index[i][k] + 1]
                preds.append(tokenizer.convert_tokens_to_string(pred_tokens))
            norm_preds_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(pred) for pred in preds
            ]
            norm_preds = [
                norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                for norm_pred_tokens in norm_preds_tokens
            ]
            norm_answer_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(ans) for ans in answer
            ]
            norm_answer = [
                norm_tokenizer.convert_tokens_to_string(ans_tokens)
                for ans_tokens in norm_answer_tokens
            ]
            em += max(
                metric_max_over_ground_truths(exact_match_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            f1 += max(
                metric_max_over_ground_truths(f1_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            count += 1
    del dataloader
    return em, f1, count
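# The validation functions in this file assume SQuAD-style scoring helpers
# (exact_match_score, f1_score, metric_max_over_ground_truths) plus a
# norm_tokenizer, get_dataloader and device defined at module level. Below is
# a minimal sketch of the scoring helpers, following the official SQuAD v1.1
# evaluation script; the repo may import them from its own evaluate module, so
# treat this as a reference implementation rather than the repo's exact code.
import re
import string
from collections import Counter


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    return max(metric_fn(prediction, ground_truth) for ground_truth in ground_truths)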
def evaluate_q_types(dataset, predictions):
    q_one_grams = {}
    q_two_grams = {}
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                qa_split = qa['question'].split()
                first_word = qa_split[0]
                first_2_words = first_word + " " + qa_split[1]
                if first_word not in q_one_grams:
                    q_one_grams[first_word] = {'f1': 0.0, 'em': 0.0, 'count': 0}
                if first_2_words not in q_two_grams:
                    q_two_grams[first_2_words] = {'f1': 0.0, 'em': 0.0, 'count': 0}
                q_one_grams[first_word]['count'] += 1
                q_two_grams[first_2_words]['count'] += 1
                if qa['id'] not in predictions:
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                em = metric_max_over_ground_truths(exact_match_score,
                                                   prediction, ground_truths)
                f1 = metric_max_over_ground_truths(f1_score,
                                                   prediction, ground_truths)
                q_one_grams[first_word]['f1'] += f1
                q_one_grams[first_word]['em'] += em
                q_two_grams[first_2_words]['f1'] += f1
                q_two_grams[first_2_words]['em'] += em
    results_1 = {}
    for key in q_one_grams:
        val = q_one_grams[key]
        results_1[key] = {'f1': 100.0 * val['f1'] / val['count'],
                          'em': 100.0 * val['em'] / val['count'],
                          'count': val['count']}
    # Python 3 removed tuple parameter unpacking in lambdas, so sort on item[1].
    sorted_results_1 = sorted(results_1.items(),
                              key=lambda item: item[1]['count'], reverse=True)
    results_2 = {}
    for key in q_two_grams:
        val = q_two_grams[key]
        results_2[key] = {'f1': 100.0 * val['f1'] / val['count'],
                          'em': 100.0 * val['em'] / val['count'],
                          'count': val['count']}
    sorted_results_2 = sorted(results_2.items(),
                              key=lambda item: item[1]['count'], reverse=True)
    return {'one grams': sorted_results_1[:20], 'two grams': sorted_results_2[:20]}
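# Hypothetical driver for evaluate_q_types (the file names are placeholders,
# not from the repo): load a SQuAD-format dataset and a predictions map keyed
# by question id, then print the per-question-type breakdown.
import json


def run_q_type_breakdown(dataset_path='dev-v1.1.json',
                         predictions_path='predictions.json'):
    with open(dataset_path) as f:
        dataset = json.load(f)['data']
    with open(predictions_path) as f:
        predictions = json.load(f)
    print(json.dumps(evaluate_q_types(dataset, predictions), indent=2))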
def evaluation_devresult(pre_result, target_result):
    '''Compute average exact-match and F1 over parallel lists of predictions
    and ground-truth answer lists.'''
    f1 = exact_match = total = 0
    for i in range(len(pre_result)):
        total += 1
        prediction = pre_result[i]
        ground_truths = target_result[i]
        exact_match += evaluate.metric_max_over_ground_truths(
            evaluate.exact_match_score, prediction, ground_truths)
        f1 += evaluate.metric_max_over_ground_truths(
            evaluate.f1_score, prediction, ground_truths)
    exact_match = exact_match / total
    f1 = f1 / total
    return exact_match, f1
def validate_dataset(model, split, tokenizer, topk=1, prefix=None):
    assert split in ('dev', 'test')
    fwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=False,
                                    batch_size=16, num_workers=16, prefix=prefix)
    bwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=True,
                                    batch_size=16, num_workers=16, prefix=prefix)
    em, f1, count = 0, 0, 0
    model.eval()
    for fwd_batch, bwd_batch in zip(fwd_dataloader, bwd_dataloader):
        # FWD
        input_ids, attention_mask, token_type_ids, margin_mask, fwd_input_tokens_no_unks, answers = fwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        margin_mask = margin_mask.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        start_logits, end_logits = outputs[0], outputs[1]
        start_logits += margin_mask
        end_logits += margin_mask
        start_logits = start_logits.cpu().clone()
        fwd_end_logits = end_logits.cpu().clone()
        start_probs = start_logits  # softmax(start_logits, dim=1)
        fwd_start_probs, fwd_start_index = start_probs.topk(topk * 5, dim=1)

        # BWD
        input_ids, attention_mask, token_type_ids, margin_mask, bwd_input_tokens_no_unks, answers = bwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        margin_mask = margin_mask.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        start_logits, end_logits = outputs[0], outputs[1]
        start_logits += margin_mask
        end_logits += margin_mask
        start_logits = start_logits.cpu().clone()
        bwd_end_logits = end_logits.cpu().clone()
        start_probs = start_logits  # softmax(start_logits, dim=1)
        bwd_start_probs, bwd_start_index = start_probs.topk(topk * 5, dim=1)

        # FWD-BWD
        for i, answer in enumerate(answers):
            preds, probs = [], []
            for n in range(topk * 5):
                # FWD
                start_prob = fwd_start_probs[i][n].item()
                start_ind = fwd_start_index[i][n].item()
                beam_end_logits = fwd_end_logits[i].clone().unsqueeze(0)
                end_probs = beam_end_logits  # softmax(beam_end_logits, dim=1)
                end_probs[0, :start_ind] += -1e10
                end_probs[0, start_ind + 20:] += -1e10
                end_probs, end_index = end_probs.topk(topk * 5, dim=1)
                # topk*topk combination
                for m in range(topk * 5):
                    end_prob = end_probs[0][m].item()
                    end_ind = end_index[0][m].item()
                    prob = start_prob + end_prob  # log prob, i.e. logits
                    span_tokens = fwd_input_tokens_no_unks[i][start_ind:end_ind + 1]
                    pred = convert_tokens_to_string(span_tokens)
                    if pred == tokenizer.sep_token or pred == '':
                        pass
                    elif pred and pred not in preds:
                        probs.append(prob)
                        preds.append(pred)
                    elif pred and pred in preds:
                        pred_idx = preds.index(pred)
                        if prob > probs[pred_idx]:
                            probs[pred_idx] = prob
                    else:
                        pass

                # BWD
                start_prob = bwd_start_probs[i][n].item()
                start_ind = bwd_start_index[i][n].item()
                beam_end_logits = bwd_end_logits[i].clone().unsqueeze(0)
                end_probs = beam_end_logits  # softmax(beam_end_logits, dim=1)
                end_probs[0, :start_ind] += -1e10
                end_probs[0, start_ind + 20:] += -1e10
                end_probs, end_index = end_probs.topk(topk * 5, dim=1)
                # topk*topk combination
                for m in range(topk * 5):
                    end_prob = end_probs[0][m].item()
                    end_ind = end_index[0][m].item()
                    prob = start_prob + end_prob  # log prob, i.e. logits
                    span_tokens = bwd_input_tokens_no_unks[i][start_ind:end_ind + 1]
                    pred = convert_tokens_to_string(span_tokens)
                    if pred == tokenizer.sep_token or pred == '':
                        pass
                    elif pred and pred not in preds:
                        probs.append(prob)
                        preds.append(pred)
                    elif pred and pred in preds:
                        # Bug fix: was `pred.index(pred)`; look the string up in `preds`.
                        pred_idx = preds.index(pred)
                        if prob > probs[pred_idx]:
                            probs[pred_idx] = prob
                    else:
                        pass

            count += 1
            if len(preds) > 0:
                sorted_probs_preds = list(reversed(sorted(zip(probs, preds))))
                probs, preds = map(list, zip(*sorted_probs_preds))
                probs, preds = probs[:topk], preds[:topk]
                norm_preds_tokens = [
                    norm_tokenizer.basic_tokenizer.tokenize(pred) for pred in preds
                ]
                norm_preds = [
                    norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                    for norm_pred_tokens in norm_preds_tokens
                ]
                norm_answer_tokens = [
                    norm_tokenizer.basic_tokenizer.tokenize(ans) for ans in answer
                ]
                norm_answer = [
                    norm_tokenizer.convert_tokens_to_string(ans_tokens)
                    for ans_tokens in norm_answer_tokens
                ]
                em += max(
                    metric_max_over_ground_truths(exact_match_score, norm_pred, norm_answer)
                    for norm_pred in norm_preds)
                f1 += max(
                    metric_max_over_ground_truths(f1_score, norm_pred, norm_answer)
                    for norm_pred in norm_preds)
    del fwd_dataloader, bwd_dataloader
    return em, f1, count
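# Standalone sketch (not from the repo) of the de-duplicating beam merge used
# above: each candidate string keeps the best (maximum) log-probability seen
# across the forward and backward beams, then candidates are ranked by score.
def merge_beams(candidates):
    """candidates: iterable of (log_prob, prediction_string) pairs."""
    best = {}
    for log_prob, pred in candidates:
        if pred and (pred not in best or log_prob > best[pred]):
            best[pred] = log_prob
    # Highest-scoring predictions first.
    return sorted(best.items(), key=lambda item: item[1], reverse=True)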
def validate_dataset(model, split, tokenizer, dataset, topk=1):
    assert split in ('dev', 'test')
    fwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=False,
                                    batch_size=16, num_workers=16, prefix=dataset)
    bwd_dataloader = get_dataloader('bert', split, tokenizer, bwd=True,
                                    batch_size=16, num_workers=16, prefix=dataset)
    em, f1, count = 0, 0, 0
    model.eval()
    for fwd_batch, bwd_batch in zip(fwd_dataloader, bwd_dataloader):
        # Forward
        input_ids, attention_mask, token_type_ids, fwd_input_tokens_no_unk, answers = fwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        start_logits, end_logits = outputs[0], outputs[1]
        start_probs = softmax(start_logits, dim=1)
        end_probs = softmax(end_logits, dim=1)
        fwd_start_probs, fwd_start_index = start_probs.topk(topk, dim=1)
        fwd_end_probs, fwd_end_index = end_probs.topk(topk, dim=1)

        # Backward
        input_ids, attention_mask, token_type_ids, bwd_input_tokens_no_unk, answers = bwd_batch
        input_ids = input_ids.cuda(device=device)
        attention_mask = attention_mask.cuda(device=device)
        token_type_ids = token_type_ids.cuda(device=device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        start_logits, end_logits = outputs[0], outputs[1]
        start_probs = softmax(start_logits, dim=1)
        end_probs = softmax(end_logits, dim=1)
        bwd_start_probs, bwd_start_index = start_probs.topk(topk, dim=1)
        bwd_end_probs, bwd_end_index = end_probs.topk(topk, dim=1)

        for i, answer in enumerate(answers):
            preds = []
            if topk <= 1:
                span_tokens = fwd_input_tokens_no_unk[i][
                    fwd_start_index[i][0]:fwd_end_index[i][0] + 1]
                preds.append(tokenizer.convert_tokens_to_string(span_tokens))
                span_tokens = bwd_input_tokens_no_unk[i][
                    bwd_start_index[i][0]:bwd_end_index[i][0] + 1]
                preds.append(tokenizer.convert_tokens_to_string(span_tokens))
            else:
                joint_probs, joint_index = (
                    fwd_start_probs[i].unsqueeze(1) *
                    fwd_end_probs[i].unsqueeze(0)).view(topk * topk).topk(topk)
                for n in range(topk):
                    smap = joint_index[n] // topk
                    emap = joint_index[n] - smap * topk
                    span_tokens = fwd_input_tokens_no_unk[i][
                        fwd_start_index[i][smap]:fwd_end_index[i][emap] + 1]
                    preds.append(tokenizer.convert_tokens_to_string(span_tokens))
                joint_probs, joint_index = (
                    bwd_start_probs[i].unsqueeze(1) *
                    bwd_end_probs[i].unsqueeze(0)).view(topk * topk).topk(topk)
                for n in range(topk):
                    smap = joint_index[n] // topk
                    emap = joint_index[n] - smap * topk
                    span_tokens = bwd_input_tokens_no_unk[i][
                        bwd_start_index[i][smap]:bwd_end_index[i][emap] + 1]
                    preds.append(tokenizer.convert_tokens_to_string(span_tokens))
            norm_preds_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(pred) for pred in preds
            ]
            norm_preds = [
                norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                for norm_pred_tokens in norm_preds_tokens
            ]
            norm_answer_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(ans) for ans in answer
            ]
            norm_answer = [
                norm_tokenizer.convert_tokens_to_string(ans_tokens)
                for ans_tokens in norm_answer_tokens
            ]
            em += max(
                metric_max_over_ground_truths(exact_match_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            f1 += max(
                metric_max_over_ground_truths(f1_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            count += 1
    del fwd_dataloader, bwd_dataloader
    return em, f1, count
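# Standalone sanity check (illustrative values only) for the joint top-k span
# scoring used above: the (topk x topk) outer product of start/end
# probabilities is flattened, the topk best combinations are taken, and the
# start/end positions are recovered with integer division and remainder.
import torch


def _joint_topk_demo(topk=3):
    start_p = torch.tensor([0.7, 0.2, 0.1])
    end_p = torch.tensor([0.6, 0.3, 0.1])
    joint_probs, joint_index = (
        start_p.unsqueeze(1) * end_p.unsqueeze(0)).view(topk * topk).topk(topk)
    for n in range(topk):
        smap = joint_index[n] // topk        # index into the top-k start positions
        emap = joint_index[n] - smap * topk  # index into the top-k end positions
        print(smap.item(), emap.item(), joint_probs[n].item())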
def validate_dataset(model, split, tokenizer, topk=5):
    assert split in ('dev', 'test')
    dataloader = get_dataloader('xlnet', split, tokenizer, bwd=False,
                                batch_size=8, num_workers=8)
    em, f1, count = 0, 0, 0
    model.start_n_top = topk
    model.end_n_top = topk
    model.eval()
    for batch in dataloader:
        batch = (*(tensor.cuda(device) for tensor in batch[:-2]), *batch[-2:])
        input_ids, attention_mask, token_type_ids, cls_index, input_tokens_no_unk, answers = batch
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            cls_index=cls_index)
        start_index = outputs[1]
        end_index = outputs[3][:, :, 0]
        op_types = outputs[4]
        for i, answer in enumerate(answers):
            preds = []
            for k in range(model.start_n_top):
                op_type = op_types[i][k].argmax().item()
                if op_type == 0:
                    pred_tokens = input_tokens_no_unk[i][
                        start_index[i][k]:end_index[i][k] + 1]
                    pred = tokenizer.convert_tokens_to_string(pred_tokens)
                elif op_type == 1:
                    pred = arithmetic_op(tokenizer, num_match_re,
                                         input_tokens_no_unk[i],
                                         start_index[i][k], end_index[i][k],
                                         plus=True)
                elif op_type == 2:
                    pred = arithmetic_op(tokenizer, num_match_re,
                                         input_tokens_no_unk[i],
                                         start_index[i][k], end_index[i][k],
                                         plus=False)
                elif op_type == 3:
                    pred = date_duration_op(tokenizer, date_re, dur_re, tn,
                                            input_tokens_no_unk[i],
                                            start_index[i][k], end_index[i][k],
                                            plus=True)
                elif op_type == 4:
                    pred = date_duration_op(tokenizer, date_re, dur_re, tn,
                                            input_tokens_no_unk[i],
                                            start_index[i][k], end_index[i][k],
                                            plus=False)
                preds.append(pred)
            norm_preds_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(pred) for pred in preds
            ]
            norm_preds = [
                norm_tokenizer.convert_tokens_to_string(norm_pred_tokens)
                for norm_pred_tokens in norm_preds_tokens
            ]
            norm_answer_tokens = [
                norm_tokenizer.basic_tokenizer.tokenize(ans) for ans in answer
            ]
            norm_answer = [
                norm_tokenizer.convert_tokens_to_string(ans_tokens)
                for ans_tokens in norm_answer_tokens
            ]
            em += max(
                metric_max_over_ground_truths(exact_match_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            f1 += max(
                metric_max_over_ground_truths(f1_score, norm_pred, norm_answer)
                for norm_pred in norm_preds)
            count += 1
    del dataloader
    return em, f1, count
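# Hypothetical reporting helper (not part of the repo): all validate_dataset
# variants above return running sums of EM and F1 plus the example count, so
# final percentage scores can be computed as below, e.g.
#   report_scores(*validate_dataset(model, 'dev', tokenizer, topk=5))
def report_scores(em, f1, count):
    return {'exact_match': 100.0 * em / count, 'f1': 100.0 * f1 / count}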