def test_eval(self):
    """Evaluate the token-span / domain-slot model on ``test.json``.

    Loads the fine-tuned checkpoint from ``self.output_dir``, runs one
    label-free forward pass per batch, concatenates per-batch logits and
    gold labels, and prints the F1 metrics produced by
    ``compute_jointGoal_domainslot_1_``.
    """
    data = DATAMultiWOZ(debug=False, data_dir=self.data_dir)
    test_examples = data.read_examples(
        os.path.join(self.data_dir, 'test.json'))
    print('eval_examples的数量', len(test_examples))
    dialogueID = [x.guid for x in test_examples]
    test_features = data.convert_examples_to_features(
        test_examples, self.tokenizer, self.max_seq_length)

    def _field(name):
        # One padded per-example feature field -> LongTensor.
        return torch.tensor(data.select_field(test_features, name),
                            dtype=torch.long)

    test_input_ids = _field('input_ids')
    test_input_mask = _field('input_mask')
    test_segment_ids = _field('segment_ids')
    test_utterance_mask = _field('utterance_mask')
    test_domainslot_mask = _field('domainslot_mask')
    test_label_tokens_start = torch.tensor(
        [f.label_tokens_start for f in test_features], dtype=torch.long)
    test_label_tokens_end = torch.tensor(
        [f.label_tokens_end for f in test_features], dtype=torch.long)
    test_label_sentence_domainslot = torch.tensor(
        [f.label_sentence_domainslot for f in test_features],
        dtype=torch.long)
    test_label_tokens_domainslot = torch.tensor(
        [f.label_tokens_domainslot for f in test_features],
        dtype=torch.long)
    test_hist_tokens = [f.hist_token for f in test_features]

    test_data = TensorDataset(
        test_input_ids, test_input_mask, test_segment_ids,
        test_utterance_mask, test_domainslot_mask,
        test_label_tokens_start, test_label_tokens_end,
        test_label_sentence_domainslot, test_label_tokens_domainslot)
    # Run prediction for full data; sequential order keeps batches
    # aligned with dialogueID / test_hist_tokens.
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=self.eval_batch_size)

    config = BertConfig.from_pretrained(self.model_name_or_path)
    model = BertForTokenClassification.from_pretrained(
        os.path.join(self.output_dir, "pytorch_model.bin"),
        self.args, config=config)
    model.to(self.device)
    model.eval()

    gold_labels_tokens_start = []
    gold_labels_tokens_end = []
    gold_label_sentence_domainslot = []
    gold_label_tokens_domainslot = []
    scores_tokens_start = []
    scores_tokens_end = []
    scores_sentence_domainslot = []
    scores_tokens_domainslot = []
    for (input_ids, input_mask, segment_ids,
         utterance_mask, domainslot_mask,
         label_tokens_start, label_tokens_end,
         label_sentence_domainslot,
         label_tokens_domainslot) in test_dataloader:
        input_ids = input_ids.to(self.device)
        input_mask = input_mask.to(self.device)
        segment_ids = segment_ids.to(self.device)
        utterance_mask = utterance_mask.to(self.device)
        domainslot_mask = domainslot_mask.to(self.device)
        label_tokens_start = label_tokens_start.to(self.device)
        label_tokens_end = label_tokens_end.to(self.device)
        label_sentence_domainslot = label_sentence_domainslot.to(
            self.device)
        label_tokens_domainslot = label_tokens_domainslot.to(self.device)
        with torch.no_grad():
            # Single label-free forward pass. The original code also ran
            # a second forward with labels whose four loss outputs were
            # never used, doubling inference cost for nothing.
            (logits_tokens_start, logits_tokens_end,
             logits_sentence_domainslot, logits_tokens_domainslot) = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                utterance_mask=utterance_mask,
                domainslot_mask=domainslot_mask)
        logits_tokens_start = logits_tokens_start.view(-1, 2).cpu().numpy()
        logits_tokens_end = logits_tokens_end.view(-1, 2).cpu().numpy()
        logits_tokens_domainslot = logits_tokens_domainslot.view(
            -1, 2).detach().cpu().numpy()
        logits_sentence_domainslot = logits_sentence_domainslot.view(
            -1, 2).cpu().numpy()
        label_tokens_start = label_tokens_start.view(-1).to('cpu').numpy()
        label_tokens_end = label_tokens_end.view(-1).to('cpu').numpy()
        label_sentence_domainslot = label_sentence_domainslot.to(
            'cpu').numpy()
        label_tokens_domainslot = label_tokens_domainslot.to('cpu').numpy()
        scores_tokens_start.append(logits_tokens_start)
        scores_tokens_end.append(logits_tokens_end)
        scores_sentence_domainslot.append(logits_sentence_domainslot)
        scores_tokens_domainslot.append(logits_tokens_domainslot)
        gold_labels_tokens_start.append(label_tokens_start)
        gold_labels_tokens_end.append(label_tokens_end)
        gold_label_sentence_domainslot.append(label_sentence_domainslot)
        gold_label_tokens_domainslot.append(label_tokens_domainslot)

    gold_labels_tokens_start = np.concatenate(gold_labels_tokens_start, 0)
    gold_labels_tokens_end = np.concatenate(gold_labels_tokens_end, 0)
    gold_label_sentence_domainslot = np.concatenate(
        gold_label_sentence_domainslot, 0)
    gold_label_tokens_domainslot = np.concatenate(
        gold_label_tokens_domainslot, 0)
    scores_tokens_start = np.concatenate(scores_tokens_start, 0)
    scores_tokens_end = np.concatenate(scores_tokens_end, 0)
    scores_sentence_domainslot = np.concatenate(
        scores_sentence_domainslot, 0)
    scores_tokens_domainslot = np.concatenate(scores_tokens_domainslot, 0)

    # Compute evaluation metrics.
    (eval_F1_tokenstart, eval_F1_tokenend,
     F1_sentence_domainslot,
     F1_token_domainslot) = compute_jointGoal_domainslot_1_(
        dialogueID,
        test_hist_tokens,
        scores_tokens_start,
        scores_tokens_end,
        scores_sentence_domainslot,
        scores_tokens_domainslot,
        gold_labels_tokens_start,
        gold_labels_tokens_end,
        gold_label_sentence_domainslot,
        gold_label_tokens_domainslot)
    print('F1_token_domainslot', F1_token_domainslot,
          'F1_sentence_domainslot', F1_sentence_domainslot,
          'eval_F1_tokenstart', eval_F1_tokenstart,
          'eval_F1_tokenend', eval_F1_tokenend)
def create_dataloader(self):
    """Build the train and eval DataLoaders from train.json / test.json.

    The original body copy-pasted the identical feature-to-tensor
    conversion twice; it is factored into the local ``_build`` helper.

    Returns:
        tuple: ``(train_dataloader, eval_dataloader, train_examples,
        eval_examples, all_hist_tokens, eval_hist_tokens)``.
    """
    data = DATAMultiWOZ(
        debug=False,
        data_dir=self.data_dir,
    )

    def _build(features):
        # Convert one feature list into (TensorDataset, hist_tokens).
        # Shared by the train and eval branches.
        def field(name):
            return torch.tensor(data.select_field(features, name),
                                dtype=torch.long)

        dataset = TensorDataset(
            field('input_ids'),
            field('input_mask'),
            field('segment_ids'),
            field('utterance_mask'),
            field('domainslot_mask'),
            torch.tensor([f.label_tokens_start for f in features],
                         dtype=torch.long),
            torch.tensor([f.label_tokens_end for f in features],
                         dtype=torch.long),
            torch.tensor([f.label_sentence_domainslot for f in features],
                         dtype=torch.long),
            torch.tensor([f.label_tokens_domainslot for f in features],
                         dtype=torch.long))
        return dataset, [f.hist_token for f in features]

    train_examples = data.read_examples(
        os.path.join(self.data_dir, 'train.json'))
    train_features = data.convert_examples_to_features(
        train_examples, self.tokenizer, self.max_seq_length)
    train_data, all_hist_tokens = _build(train_features)
    # Training shuffles; evaluation below stays sequential.
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=self.train_batch_size)

    eval_examples = data.read_examples(
        os.path.join(self.data_dir, 'test.json'))
    eval_features = data.convert_examples_to_features(
        eval_examples, self.tokenizer, self.max_seq_length)
    eval_data, eval_hist_tokens = _build(eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=self.eval_batch_size)

    return (train_dataloader, eval_dataloader, train_examples,
            eval_examples, all_hist_tokens, eval_hist_tokens)
def test_eval(self):
    """Evaluate the domain / dependency / domain-slot classifier heads.

    Loads the fine-tuned checkpoint from ``self.output_dir``, scores
    every batch of ``test.json``, and prints domain accuracy/F1,
    dependency accuracy/F1, and the joint-goal metric.

    NOTE(review): another ``test_eval`` is defined earlier in this file;
    if both belong to the same class, this later definition silently
    shadows the first — confirm which one is intended.
    """
    data = DATAMultiWOZ(debug=False, data_dir=self.data_dir)
    test_examples = data.read_examples(
        os.path.join(self.data_dir, 'test.json'))
    print('eval_examples的数量', len(test_examples))
    dialogueID = [x.guid for x in test_examples]
    utterance_text = [x.text_history for x in test_examples]
    test_features = data.convert_examples_to_features(
        test_examples, self.tokenizer, self.max_seq_length)
    all_input_ids = torch.tensor(
        data.select_field(test_features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(
        data.select_field(test_features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(
        data.select_field(test_features, 'segment_ids'), dtype=torch.long)
    # Sentence-level domain-slot targets are float — presumably for a
    # BCE-style multi-label head; the other two heads take class indices.
    eval_labels_domainslot = torch.tensor(
        [f.labels_domainslot for f in test_features], dtype=torch.float)
    eval_labels_domain = torch.tensor(
        [f.labels_domain for f in test_features], dtype=torch.long)
    eval_labels_dependcy = torch.tensor(
        [f.labels_dependcy for f in test_features], dtype=torch.long)
    test_data = TensorDataset(
        all_input_ids, all_input_mask, all_segment_ids,
        eval_labels_domainslot, eval_labels_domain, eval_labels_dependcy)
    # Run prediction for full data; sequential order keeps batches
    # aligned with dialogueID / utterance_text.
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=self.eval_batch_size)

    config = BertConfig.from_pretrained(self.model_name_or_path)
    model = BertForTokenClassification.from_pretrained(
        os.path.join(self.output_dir, "pytorch_model.bin"),
        self.args, config=config)
    model.to(self.device)
    model.eval()

    gold_labels_domain = []
    gold_labels_dependcy = []
    gold_labels_domainslot = []
    scores_domainslot = []
    scores_domain = []
    scores_dependcy = []
    for (input_ids, input_mask, segment_ids, label_domainslot,
         label_domain, label_dependcy) in test_dataloader:
        input_ids = input_ids.to(self.device)
        input_mask = input_mask.to(self.device)
        segment_ids = segment_ids.to(self.device)
        label_domainslot = label_domainslot.to(self.device)
        label_domain = label_domain.to(self.device)
        label_dependcy = label_dependcy.to(self.device)
        with torch.no_grad():
            logits_domainslot, logits_domain, logits_dependcy = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask)
        # Binarize the multi-label head. The 0.4 cutoff looks empirical —
        # TODO confirm it matches the threshold used during validation.
        logits_domainslot = torch.sigmoid(logits_domainslot)
        logits_domainslot = (logits_domainslot > 0.4).float()
        logits_domainslot = logits_domainslot.cpu().long().numpy()
        logits_domain = logits_domain.view(
            -1, self.num_labels_domain).cpu().numpy()
        logits_dependcy = logits_dependcy.view(
            -1, self.num_labels_dependcy).cpu().numpy()
        label_domainslot = label_domainslot.to('cpu').numpy()
        label_domain = label_domain.view(-1).to('cpu').numpy()
        label_dependcy = label_dependcy.view(-1).to('cpu').numpy()
        scores_domainslot.append(logits_domainslot)
        scores_domain.append(logits_domain)
        scores_dependcy.append(logits_dependcy)
        gold_labels_domainslot.append(label_domainslot)
        gold_labels_domain.append(label_domain)
        gold_labels_dependcy.append(label_dependcy)

    gold_labels_domainslot = np.concatenate(gold_labels_domainslot, 0)
    gold_labels_domain = np.concatenate(gold_labels_domain, 0)
    gold_labels_dependcy = np.concatenate(gold_labels_dependcy, 0)
    scores_domainslot = np.concatenate(scores_domainslot, 0)
    scores_domain = np.concatenate(scores_domain, 0)
    scores_dependcy = np.concatenate(scores_dependcy, 0)

    # Compute evaluation metrics; all four arrays must stay aligned.
    assert scores_domain.shape[0] == scores_dependcy.shape[0] \
        == gold_labels_domain.shape[0] == gold_labels_dependcy.shape[0]
    eval_accuracy_domain = accuracyF1(
        scores_domain, gold_labels_domain, mode='domain', report=True)
    eval_accuracy_dependcy = accuracyF1(
        scores_dependcy, gold_labels_dependcy, mode='dependcy',
        report=True)
    eval_jointGoal = compute_jointGoal_domainslot(
        dialogueID,
        utterance_text,
        scores_domainslot,
        gold_labels_domainslot,
        scores_domain,
        gold_labels_domain,
        scores_dependcy,
        gold_labels_dependcy)
    print('eval_accuracy_domain', eval_accuracy_domain)
    print('eval_accuracy_dependcy', eval_accuracy_dependcy)
    print('eval_jointGoal', eval_jointGoal)