def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4',
                     is_gt_op=False, is_gt_p_state=False, is_gt_gen=False):
    """Evaluate a state-operation DST model over *test_data* and report metrics.

    The previous dialogue state is carried over from the model's own
    predictions turn by turn, unless an oracle flag substitutes ground truth:
    ``is_gt_p_state`` feeds the gold previous state, ``is_gt_op`` the gold
    state operations and ``is_gt_gen`` the gold generated values.

    Returns a dict with joint accuracy, slot accuracy/F1, operation
    accuracy/F1 and last-turn slot F1 for the given ``epoch``.
    Side effects: prints a metric report and writes ``preds_<epoch>.json``.
    """
    model.eval()
    op2id = OP_SET[op_code]
    id2op = {v: k for k, v in op2id.items()}
    # NOTE: unused locals `id2domain` and `op_F1` from the original were removed.

    # Accumulators over all turns; the "final_*" variants only count each
    # dialogue's last turn.
    slot_turn_acc, joint_acc, slot_F1_pred, slot_F1_count = 0, 0, 0, 0
    final_joint_acc, final_count, final_slot_F1_pred, final_slot_F1_count = 0, 0, 0, 0
    op_acc = 0
    op_F1_count = {k: 0 for k in op2id}      # true-positive (hit) count per op
    all_op_F1_count = {k: 0 for k in op2id}  # gold occurrence count per op
    tp_dic = {k: 0 for k in op2id}
    fn_dic = {k: 0 for k in op2id}
    fp_dic = {k: 0 for k in op2id}

    results = {}
    last_dialog_state = {}
    wall_times = []
    for di, i in enumerate(test_data):
        if i.turn_id == 0:  # new dialogue: reset the carried-over state
            last_dialog_state = {}

        if is_gt_p_state is False:
            i.last_dialog_state = deepcopy(last_dialog_state)
            i.make_instance(tokenizer, word_dropout=0.)
        else:  # ground-truth previous dialogue state
            last_dialog_state = deepcopy(i.gold_p_state)
            i.last_dialog_state = deepcopy(last_dialog_state)
            i.make_instance(tokenizer, word_dropout=0.)

        input_ids = torch.LongTensor([i.input_id]).to(device)
        input_mask = torch.LongTensor([i.input_mask]).to(device)
        segment_ids = torch.LongTensor([i.segment_id]).to(device)
        state_position_ids = torch.LongTensor([i.slot_position]).to(device)

        d_gold_op, _, _ = make_turn_label(slot_meta, last_dialog_state,
                                          i.gold_state, tokenizer, op_code,
                                          dynamic=True)
        gold_op_ids = torch.LongTensor([d_gold_op]).to(device)

        start = time.perf_counter()
        MAX_LENGTH = 9  # decoder length budget per generated value
        with torch.no_grad():
            # Feed ground-truth state operations only in the oracle setting.
            gold_op_inputs = gold_op_ids if is_gt_op else None
            d, s, g = model(input_ids=input_ids,
                            token_type_ids=segment_ids,
                            state_positions=state_position_ids,
                            attention_mask=input_mask,
                            max_value=MAX_LENGTH,
                            op_ids=gold_op_inputs)

        _, op_ids = s.view(-1, len(op2id)).max(-1)

        if g.size(1) > 0:  # nothing to decode when no slot requires generation
            generated = g.squeeze(0).max(-1)[1].tolist()
        else:
            generated = []

        if is_gt_op:
            pred_ops = [id2op[a] for a in gold_op_ids[0].tolist()]
        else:
            pred_ops = [id2op[a] for a in op_ids.tolist()]
        gold_ops = [id2op[a] for a in d_gold_op]

        if is_gt_gen:
            # ground-truth generation: "domain-slot" -> gold value
            gold_gen = {'-'.join(ii.split('-')[:2]): ii.split('-')[-1]
                        for ii in i.gold_state}
        else:
            gold_gen = {}
        generated, last_dialog_state = postprocessing(slot_meta, pred_ops,
                                                      last_dialog_state,
                                                      generated, tokenizer,
                                                      op_code, gold_gen)
        end = time.perf_counter()
        wall_times.append(end - start)

        pred_state = ['-'.join([k, v]) for k, v in last_dialog_state.items()]

        if set(pred_state) == set(i.gold_state):
            joint_acc += 1
        key = str(i.id) + '_' + str(i.turn_id)
        results[key] = [pred_state, i.gold_state]

        # Compute prediction slot accuracy
        slot_turn_acc += compute_acc(set(i.gold_state), set(pred_state),
                                     slot_meta)

        # Compute prediction F1 score
        temp_f1, temp_r, temp_p, count = compute_prf(i.gold_state, pred_state)
        slot_F1_pred += temp_f1
        slot_F1_count += count

        # Compute operation accuracy
        op_acc += sum(1 if p == g else 0
                      for p, g in zip(pred_ops, gold_ops)) / len(pred_ops)

        if i.is_last_turn:
            final_count += 1
            if set(pred_state) == set(i.gold_state):
                final_joint_acc += 1
            final_slot_F1_pred += temp_f1
            final_slot_F1_count += count

        # Compute operation F1 score
        for p, g in zip(pred_ops, gold_ops):
            all_op_F1_count[g] += 1
            if p == g:
                tp_dic[g] += 1
                op_F1_count[g] += 1
            else:
                fn_dic[g] += 1
                fp_dic[p] += 1

    joint_acc_score = joint_acc / len(test_data)
    turn_acc_score = slot_turn_acc / len(test_data)
    slot_F1_score = slot_F1_pred / slot_F1_count
    op_acc_score = op_acc / len(test_data)
    final_joint_acc_score = final_joint_acc / final_count
    final_slot_F1_score = final_slot_F1_pred / final_slot_F1_count
    latency = np.mean(wall_times) * 1000

    # Per-operation F1 from the tp/fp/fn tallies (0 when undefined).
    op_F1_score = {}
    for k in op2id.keys():
        tp = tp_dic[k]
        fn = fn_dic[k]
        fp = fp_dic[k]
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        F1 = 2 * precision * recall / float(precision + recall) \
            if (precision + recall) != 0 else 0
        op_F1_score[k] = F1

    print("------------------------------")
    print('op_code: %s, is_gt_op: %s, is_gt_p_state: %s, is_gt_gen: %s' %
          (op_code, str(is_gt_op), str(is_gt_p_state), str(is_gt_gen)))
    print("Epoch %d joint accuracy : " % epoch, joint_acc_score)
    print("Epoch %d slot turn accuracy : " % epoch, turn_acc_score)
    print("Epoch %d slot turn F1: " % epoch, slot_F1_score)
    print("Epoch %d op accuracy : " % epoch, op_acc_score)
    print("Epoch %d op F1 : " % epoch, op_F1_score)
    print("Epoch %d op hit count : " % epoch, op_F1_count)
    print("Epoch %d op all count : " % epoch, all_op_F1_count)
    print("Final Joint Accuracy : ", final_joint_acc_score)
    print("Final slot turn F1 : ", final_slot_F1_score)
    print("Latency Per Prediction : %f ms" % latency)
    print("-----------------------------\n")

    # FIX: close the predictions file deterministically; the original left an
    # unclosed open() handle to the garbage collector.
    with open('preds_%d.json' % epoch, 'w') as f:
        json.dump(results, f)
    per_domain_join_accuracy(results, slot_meta)

    scores = {'epoch': epoch, 'joint_acc': joint_acc_score,
              'slot_acc': turn_acc_score, 'slot_f1': slot_F1_score,
              'op_acc': op_acc_score, 'op_f1': op_F1_score,
              'final_slot_f1': final_slot_F1_score}
    return scores
def evaluate(self, test_data_raw, tokenizer, ontology, slot_meta, epoch, device):
    """Evaluate an ontology-based (picklist) DST model over *test_data_raw*.

    For every turn and every slot the model first classifies the slot
    operation (update / none / dontcare); for "update" slots it then scores
    each candidate value in *ontology* and keeps the argmax.

    Returns ``(scores, res_per_domain, results)``: aggregated metrics for
    ``epoch``, per-domain joint accuracy, and a map from
    "<dialogue id>_<turn id>" to the [predicted, gold] state pair.
    Also prints a metric report.
    """
    # NOTE(review): `slots` / `values` are filled but never read below —
    # this loop looks like dead work; confirm before removing.
    slots, values = [], []
    for slot in ontology:
        for value in ontology[slot]:
            slots.append(slot)
            values.append(value)
    op2id = {'update': 0, 'none': 2, 'dontcare': 1}
    id2op = {v: k for k, v in op2id.items()}
    # Turn-level accumulators; "final_*" only counts a dialogue's last turn.
    slot_turn_acc, joint_acc, slot_F1_pred, slot_F1_count = 0, 0, 0, 0
    final_joint_acc, final_count, final_slot_F1_pred, final_slot_F1_count = 0, 0, 0, 0
    op_acc, op_F1, op_F1_count = 0, {k: 0 for k in op2id
                                     }, {k: 0 for k in op2id}
    all_op_F1_count = {k: 0 for k in op2id}
    tp_dic = {k: 0 for k in op2id}
    fn_dic = {k: 0 for k in op2id}
    fp_dic = {k: 0 for k in op2id}
    wall_times = []
    results = {}
    # batch_size = 32
    for step in tqdm(range(len(test_data_raw)), desc="Evaluation"):
        instance = test_data_raw[step]
        # Gold "domain-slot" -> value (value is the last '-'-separated field).
        gold_slot_value = {
            '-'.join(ii.split('-')[:-1]): ii.split('-')[-1]
            for ii in instance.gold_state
        }
        gold_op = []
        pred_op = []
        pred_state = set()
        for j, slot in enumerate(slot_meta):
            # Gold operation label for this slot.
            if slot not in gold_slot_value:
                gold_op.append("none")
            else:
                gold_op.append("update" if gold_slot_value[slot] !=
                               "dontcare" else "dontcare")
            start = time.perf_counter()
            # prediction: classify the op from slot name + dialogue context
            context_inp = [[
                slot.replace("-", " "),
                instance.dialog_history + instance.turn_utter
            ]]
            context_tokens = tokenizer(context_inp,
                                       padding=True,
                                       return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = self.forward(context_tokens)
            _op = id2op[np.argmax(outputs.cpu().data.numpy())]
            pred_op.append(_op)
            if _op == "none":
                # NOTE(review): this `continue` also skips the timing append
                # below, so latency reflects only non-"none" slots — confirm
                # that is intended.
                continue
            elif _op == "dontcare":
                pred_state.add(slot + "-" + _op)
            else:
                # "update": score every ontology value and take the argmax.
                slot_pred = []
                for value in ontology[slot]:
                    value_tokens = tokenizer(
                        [value], padding=True,
                        return_tensors="pt").to(device)
                    score = self.forward(context_tokens, value_tokens)
                    slot_pred.append(score)
                pred_state.add(slot + "-" +
                               ontology[slot][np.argmax(slot_pred)])
            end = time.perf_counter()
            wall_times.append(end - start)
        if set(pred_state) == set(instance.gold_state):
            joint_acc += 1
        key = str(instance.id) + '_' + str(instance.turn_id)
        results[key] = [list(pred_state), instance.gold_state]
        # Compute prediction slot accuracy
        temp_acc = compute_acc(set(instance.gold_state), set(pred_state),
                               slot_meta)
        slot_turn_acc += temp_acc
        # Compute prediction F1 score
        temp_f1, temp_r, temp_p, count = compute_prf(
            instance.gold_state, pred_state)
        slot_F1_pred += temp_f1
        slot_F1_count += count
        # Compute operation accuracy
        temp_acc = sum(
            [1 if p == g else 0
             for p, g in zip(pred_op, gold_op)]) / len(pred_op)
        op_acc += temp_acc
        if instance.is_last_turn:
            final_count += 1
            if set(pred_state) == set(instance.gold_state):
                final_joint_acc += 1
            final_slot_F1_pred += temp_f1
            final_slot_F1_count += count
        # Compute operation F1 score
        for p, g in zip(pred_op, gold_op):
            all_op_F1_count[g] += 1
            if p == g:
                tp_dic[g] += 1
                op_F1_count[g] += 1
            else:
                fn_dic[g] += 1
                fp_dic[p] += 1
    # Aggregate scores over the full test set.
    joint_acc_score = joint_acc / len(test_data_raw)
    turn_acc_score = slot_turn_acc / len(test_data_raw)
    slot_F1_score = slot_F1_pred / slot_F1_count
    op_acc_score = op_acc / len(test_data_raw)
    final_joint_acc_score = final_joint_acc / final_count
    final_slot_F1_score = final_slot_F1_pred / final_slot_F1_count
    latency = np.mean(wall_times) * 1000
    # Per-operation F1 from the tp/fp/fn tallies (0 when undefined).
    op_F1_score = {}
    for k in op2id.keys():
        tp = tp_dic[k]
        fn = fn_dic[k]
        fp = fp_dic[k]
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        F1 = 2 * precision * recall / float(precision + recall) if (
            precision + recall) != 0 else 0
        op_F1_score[k] = F1
    # Report.
    print("------------------------------")
    print("Epoch %d joint accuracy : " % epoch, joint_acc_score)
    print("Epoch %d slot turn accuracy : " % epoch, turn_acc_score)
    print("Epoch %d slot turn F1: " % epoch, slot_F1_score)
    print("Epoch %d op accuracy : " % epoch, op_acc_score)
    print("Epoch %d op F1 : " % epoch, op_F1_score)
    print("Epoch %d op hit count : " % epoch, op_F1_count)
    print("Epoch %d op all count : " % epoch, all_op_F1_count)
    print("Final Joint Accuracy : ", final_joint_acc_score)
    print("Final slot turn F1 : ", final_slot_F1_score)
    print("Latency Per Prediction : %f ms" % latency)
    print("-----------------------------\n")
    res_per_domain = per_domain_join_accuracy(results, slot_meta)
    scores = {
        'epoch': epoch,
        'joint_acc_score': joint_acc_score,
        'turn_acc_score': turn_acc_score,
        'slot_F1_score': slot_F1_score,
        'op_acc_score': op_acc_score,
        'op_F1_score': op_F1_score,
        'op_F1_count': op_F1_count,
        'all_op_F1_count': all_op_F1_count,
        'final_joint_acc_score': final_joint_acc_score,
        'final_slot_F1_score': final_slot_F1_score,
        'latency': latency
    }
    return scores, res_per_domain, results
def model_evaluation(model, test_data, tokenizer, slot_meta, epoch, op_code='4',
                     is_gt_op=False, is_gt_p_state=False, is_gt_gen=False,
                     use_full_slot=False, use_dt_only=False, no_dial=False,
                     use_cls_only=False, n_gpu=0, submission=False,
                     use_wandb=False):
    """Evaluate a state-operation DST model (multi-GPU-aware variant).

    Same oracle flags as the simpler ``model_evaluation``; additionally
    supports DataParallel models (``n_gpu > 1`` routes through
    ``model.module``), a ``submission`` mode that writes predictions to
    ``<epoch>-output.csv`` instead of computing a score dict, and optional
    metric logging to wandb.

    Returns the score dict ({} in submission mode).
    Side effects: prints a metric report; writes the prediction file.
    """
    device = torch.device('cuda' if n_gpu else 'cpu')
    model.eval()
    op2id = OP_SET[op_code]
    id2op = {v: k for k, v in op2id.items()}
    # NOTE: unused locals `id2domain` and `op_F1` from the original were removed.

    # Accumulators over all turns; "final_*" variants only count last turns.
    slot_turn_acc, joint_acc, slot_F1_pred, slot_F1_count = 0, 0, 0, 0
    final_joint_acc, final_count, final_slot_F1_pred, final_slot_F1_count = 0, 0, 0, 0
    op_acc = 0
    op_F1_count = {k: 0 for k in op2id}      # true-positive (hit) count per op
    all_op_F1_count = {k: 0 for k in op2id}  # gold occurrence count per op
    tp_dic = {k: 0 for k in op2id}
    fn_dic = {k: 0 for k in op2id}
    fp_dic = {k: 0 for k in op2id}

    results = {}
    last_dialog_state = {}
    wall_times = []
    if submission:
        _submission = {}

    start_time = time.time()
    for di, i in enumerate(test_data):
        if (di + 1) % 1000 == 0:  # progress heartbeat every 1000 turns
            print("{:}, {:.1f}min".format(di, (time.time() - start_time) / 60))
            sys.stdout.flush()

        if i.turn_id == 0:  # new dialogue: reset the carried-over state
            last_dialog_state = {}

        if is_gt_p_state is False:
            i.last_dialog_state = deepcopy(last_dialog_state)
            i.make_instance(tokenizer, word_dropout=0.)
        else:  # ground-truth previous dialogue state
            last_dialog_state = deepcopy(i.gold_p_state)
            i.last_dialog_state = deepcopy(last_dialog_state)
            i.make_instance(tokenizer, word_dropout=0.)

        # Token ids of "domain slot -" per slot index, e.g. ['attraction', 'area'].
        # FIX: loop variable renamed from `id` (shadowed the builtin) to `sid`.
        # NOTE(review): id2ds is never read afterwards — looks like dead
        # per-turn work; kept for behavior parity, confirm before removing.
        id2ds = {}
        for sid, s in enumerate(i.slot_meta):
            k = s.split('-')
            id2ds[sid] = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(' '.join(k + ['-'])))

        tensor_list = wrap_into_tensor(
            [i],
            pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
            slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0])[:4]
        tensor_list = [t.to(device) for t in tensor_list]
        input_ids_p, segment_ids_p, input_mask_p, state_position_ids = tensor_list

        d_gold_op, _, _ = make_turn_label(slot_meta, last_dialog_state,
                                          i.gold_state, tokenizer, op_code,
                                          dynamic=True)
        gold_op_ids = torch.LongTensor([d_gold_op]).to(device)

        start = time.perf_counter()
        MAX_LENGTH = 9  # decoder length budget per generated value
        # DataParallel wraps the real model in `.module`; resolve it once
        # instead of duplicating every call site.
        net = model.module if n_gpu > 1 else model
        # just ask the decoder to generate at least a token ([SEP] included)
        net.decoder.min_len = 1

        with torch.no_grad():
            # Feed ground-truth state operations only in the oracle setting.
            gold_op_inputs = gold_op_ids if is_gt_op else None
            d, s, generated = net.output(input_ids_p, segment_ids_p,
                                         input_mask_p, state_position_ids,
                                         i.diag_len,
                                         op_ids=gold_op_inputs,
                                         gen_max_len=MAX_LENGTH,
                                         use_full_slot=use_full_slot,
                                         use_dt_only=use_dt_only,
                                         diag_1_len=i.diag_1_len,
                                         no_dial=no_dial,
                                         use_cls_only=use_cls_only,
                                         i_dslen_map=i.i_dslen_map)

        _, op_ids = s.view(-1, len(op2id)).max(-1)

        if is_gt_op:
            pred_ops = [id2op[a] for a in gold_op_ids[0].tolist()]
        else:
            pred_ops = [id2op[a] for a in op_ids.tolist()]
        gold_ops = [id2op[a] for a in d_gold_op]

        if is_gt_gen:
            # ground-truth generation: "domain-slot" -> gold value
            gold_gen = {'-'.join(ii.split('-')[:2]): ii.split('-')[-1]
                        for ii in i.gold_state}
        else:
            gold_gen = {}
        generated, last_dialog_state = postprocessing(slot_meta, pred_ops,
                                                      last_dialog_state,
                                                      generated, tokenizer,
                                                      op_code, gold_gen)
        end = time.perf_counter()
        wall_times.append(end - start)

        pred_state = ['-'.join([k, v]) for k, v in last_dialog_state.items()]

        if set(pred_state) == set(i.gold_state):
            joint_acc += 1
        key = str(i.id) + '_' + str(i.turn_id)
        results[key] = [pred_state, i.gold_state]
        if submission:
            key_sub = str(i.id) + '-' + str(i.turn_id)
            _submission[key_sub] = pred_state

        # Compute prediction slot accuracy
        slot_turn_acc += compute_acc(set(i.gold_state), set(pred_state),
                                     slot_meta)

        # Compute prediction F1 score
        temp_f1, temp_r, temp_p, count = compute_prf(i.gold_state, pred_state)
        slot_F1_pred += temp_f1
        slot_F1_count += count

        # Compute operation accuracy
        op_acc += sum(1 if p == g else 0
                      for p, g in zip(pred_ops, gold_ops)) / len(pred_ops)

        if i.is_last_turn:
            final_count += 1
            if set(pred_state) == set(i.gold_state):
                final_joint_acc += 1
            final_slot_F1_pred += temp_f1
            final_slot_F1_count += count

        # Compute operation F1 score
        for p, g in zip(pred_ops, gold_ops):
            all_op_F1_count[g] += 1
            if p == g:
                tp_dic[g] += 1
                op_F1_count[g] += 1
            else:
                fn_dic[g] += 1
                fp_dic[p] += 1

    joint_acc_score = joint_acc / len(test_data)
    turn_acc_score = slot_turn_acc / len(test_data)
    slot_F1_score = slot_F1_pred / slot_F1_count
    op_acc_score = op_acc / len(test_data)
    final_joint_acc_score = final_joint_acc / final_count
    final_slot_F1_score = final_slot_F1_pred / final_slot_F1_count
    latency = np.mean(wall_times) * 1000

    # Per-operation F1 from the tp/fp/fn tallies (0 when undefined).
    op_F1_score = {}
    for k in op2id.keys():
        tp = tp_dic[k]
        fn = fn_dic[k]
        fp = fp_dic[k]
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        F1 = 2 * precision * recall / float(precision + recall) \
            if (precision + recall) != 0 else 0
        op_F1_score[k] = F1

    print("------------------------------")
    print('op_code: %s, is_gt_op: %s, is_gt_p_state: %s, is_gt_gen: %s' %
          (op_code, str(is_gt_op), str(is_gt_p_state), str(is_gt_gen)))
    print("Epoch %d joint accuracy : " % epoch, joint_acc_score)
    print("Epoch %d slot turn accuracy : " % epoch, turn_acc_score)
    print("Epoch %d slot turn F1: " % epoch, slot_F1_score)
    print("Epoch %d op accuracy : " % epoch, op_acc_score)
    print("Epoch %d op F1 : " % epoch, op_F1_score)
    print("Epoch %d op hit count : " % epoch, op_F1_count)
    print("Epoch %d op all count : " % epoch, all_op_F1_count)
    print("Final Joint Accuracy : ", final_joint_acc_score)
    print("Final slot turn F1 : ", final_slot_F1_score)
    print("Latency Per Prediction : %f ms" % latency)
    print("-----------------------------\n")

    if submission:
        # FIX: close the output file deterministically (was an unclosed open()).
        # NOTE(review): the file is named .csv but contains JSON — confirm the
        # downstream consumer expects this.
        with open(f"{epoch}-output.csv", "w") as f:
            json.dump(_submission, f, indent=2, ensure_ascii=False)
        scores = {}
    else:
        # FIX: same deterministic close for the predictions file.
        with open('preds_%d.json' % epoch, 'w') as f:
            json.dump(results, f)
        if use_wandb:
            wandb.log({
                "joint_goal_accuracy": joint_acc_score,
                "turn_slot_accuracy": turn_acc_score,
                "turn_slot_f1": slot_F1_score
            })
        per_domain_join_accuracy(results, slot_meta)
        scores = {'epoch': epoch, 'joint_acc': joint_acc_score,
                  'slot_acc': turn_acc_score, 'slot_f1': slot_F1_score,
                  'op_acc': op_acc_score, 'op_f1': op_F1_score,
                  'final_slot_f1': final_slot_F1_score}
    return scores
def evaluation(infile, tokenizer, model, device, epoch):
    """Run span/gate state extraction over *infile* and report turn metrics.

    Batches come from ``DataGenerator``; the model's pointer/gate/span
    outputs are decoded by ``get_state`` into per-turn state lists, which
    are compared against the gold states.

    Returns a dict with joint accuracy, slot accuracy and slot F1 for the
    given ``epoch``.  Side effect: prints a metric report.
    """
    model.eval()
    generator = DataGenerator(infile, tokenizer, device)

    n_turns = 0          # total turns seen (denominator for averages)
    joint_hits = 0       # turns where the full predicted state matches gold
    acc_sum = 0          # running slot-accuracy sum
    f1_sum, f1_count = 0, 0
    timings = []

    for batch in generator.batchIter(eval_batch_size):
        content_ids = batch["batch_content_ids"]
        token_type_ids = batch["batch_token_type_ids"]
        attention_mask = batch["batch_attention_mask"]
        gold_states = batch["batch_gold_state"]

        tic = time.perf_counter()
        with torch.no_grad():
            # Only the pointer/gate/span heads are needed for decoding.
            (_, _, _, slot_pointer, slot_gate,
             start_prob, end_prob) = model(content_ids, token_type_ids,
                                           attention_mask)
            predictions = get_state(content_ids, slot_pointer, slot_gate,
                                    start_prob, end_prob)
        timings.append(time.perf_counter() - tic)

        for predicted, gold in zip(predictions, gold_states):
            n_turns += 1
            joint_hits += int(set(predicted) == set(gold))
            # Compute prediction slot accuracy
            acc_sum += compute_acc(set(gold), set(predicted), SLOT)
            # Compute prediction F1 score
            f1, _, _, cnt = compute_prf(gold, predicted)
            f1_sum += f1
            f1_count += cnt

    joint_acc_score = joint_hits / n_turns
    turn_acc_score = acc_sum / n_turns
    slot_F1_score = f1_sum / f1_count
    latency = np.mean(timings) * 1000

    print("------------------------------")
    print("Epoch %d joint accuracy : " % epoch, joint_acc_score)
    print("Epoch %d slot turn accuracy : " % epoch, turn_acc_score)
    print("Epoch %d slot turn F1: " % epoch, slot_F1_score)
    print("Latency Per Prediction : %f ms" % latency)
    print("-----------------------------\n")

    return {
        'epoch': epoch,
        'joint_acc': joint_acc_score,
        'slot_acc': turn_acc_score,
        'slot_f1': slot_F1_score
    }