# Assumes DebertaForSequenceClassification, DebertaV2ForSequenceClassification,
# DebertaTokenizer and DebertaV2Tokenizer are imported from transformers.
def __init__(self, pretrained_model, model_path, model_config_path,
             freeze_deberta=True, max_seq_length=512, batch_size=32, num_labels=2):
    super().__init__()
    # Pick the model *and* tokenizer class that match the checkpoint: v2
    # checkpoints such as xxlarge-v2 use a SentencePiece vocabulary and need
    # the DebertaV2 classes (the original loaded DebertaTokenizer for both).
    if pretrained_model.model_class_name == 'base':
        self.model = DebertaForSequenceClassification.from_pretrained(
            pretrained_model.model_class['pretrain_key'], num_labels=num_labels)
        self.tokenizer = DebertaTokenizer.from_pretrained(
            pretrained_model.model_class['pretrain_key'])
    elif pretrained_model.model_class_name == 'xxlarge-v2':
        self.model = DebertaV2ForSequenceClassification.from_pretrained(
            pretrained_model.model_class['pretrain_key'], num_labels=num_labels)
        self.tokenizer = DebertaV2Tokenizer.from_pretrained(
            pretrained_model.model_class['pretrain_key'])
    else:
        raise ValueError("Currently only supporting 'base' and 'xxlarge-v2' models.")
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset = NLIDataset_DeBERTa(model_path, pretrained_model,
                                      max_seq_length, batch_size)
    if freeze_deberta:
        # Freezing every parameter already covers the base model; the
        # original's second loop over self.model.base_model was redundant.
        for param in self.model.parameters():
            param.requires_grad = False
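# A minimal construction sketch for the wrapper above. Hypothetical: the real
# `pretrained_model` object comes from the surrounding project (only its
# `model_class_name` and `model_class['pretrain_key']` fields are known), and
# the enclosing class name and paths here are placeholders.
from types import SimpleNamespace

_pretrained = SimpleNamespace(
    model_class_name='base',
    model_class={'pretrain_key': 'microsoft/deberta-base'},
)
# wrapper = SomeNLIWrapper(_pretrained, model_path='path/to/model',
#                          model_config_path='path/to/config', num_labels=3)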
# `device` was an undefined global in the original; define it alongside the
# imports this function needs. torch.optim.AdamW is the drop-in replacement
# for the AdamW that older transformers versions exported.
import numpy as np
import torch
from torch.optim import AdamW
from transformers import DebertaForSequenceClassification, get_linear_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def train_model(train_dataloader, validation_dataloader):
    model = DebertaForSequenceClassification.from_pretrained(
        'microsoft/deberta-base', num_labels=13,
        output_attentions=False, output_hidden_states=False)
    model.cuda()
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6)
    epochs = 4
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    average_losses = []
    for epoch in range(epochs):
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_ids = batch[0].to(device)
            batch_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(batch_ids, token_type_ids=None,
                            attention_mask=batch_mask, labels=batch_labels)
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_loss / len(train_dataloader)
        average_losses.append(avg_train_loss)
        print("Average training loss:", avg_train_loss)

        model.eval()
        eval_acc = 0
        eval_steps = 0
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            batch_ids, batch_mask, batch_labels = batch
            with torch.no_grad():
                outputs = model(batch_ids, token_type_ids=None,
                                attention_mask=batch_mask)
            logits = outputs[0].detach().cpu().numpy()
            label_ids = batch_labels.to('cpu').numpy()
            predictions = np.argmax(logits, axis=1).flatten()
            # Compare against this batch's labels; the original compared every
            # batch against a global `labels` argument, which made the reported
            # accuracy wrong (that now-unused parameter has been dropped).
            flat_labels = label_ids.flatten()
            temp_eval_acc = np.sum(predictions == flat_labels) / len(flat_labels)
            eval_acc += temp_eval_acc
            eval_steps += 1
        total_acc = eval_acc / eval_steps
        print(" Accuracy:", total_acc)
    # Saving the whole module pins the file to this exact class definition;
    # torch.save(model.state_dict(), ...) is the more portable convention.
    torch.save(model, "../DataFiles/bert_model")
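# A hypothetical sketch of the dataloaders train_model() expects: batches of
# (input_ids, attention_mask, labels) tensors. The texts, labels, and batch
# size here are placeholders, not part of the original project.
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import DebertaTokenizer

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
texts, label_list = ["an example sentence"], [0]  # placeholder data
enc = tokenizer(texts, padding='max_length', truncation=True,
                max_length=128, return_tensors='pt')
dataset = TensorDataset(enc['input_ids'], enc['attention_mask'],
                        torch.tensor(label_list))
train_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=32)
validation_dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)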
def __init__(self, model_class):
    if model_class == 'base':
        self.params = self._get_base_params()
        self.model = DebertaForSequenceClassification.from_pretrained(
            self.params['pretrain_key']).to(device)
        self.tokenizer = DebertaTokenizer.from_pretrained(self.params['pretrain_key'])
    elif model_class == 'xxlarge-v2':
        self.params = self._get_xxlarge_params()
        self.model = DebertaV2ForSequenceClassification.from_pretrained(
            self.params['pretrain_key']).to(device)
        # v2 checkpoints need the SentencePiece-based DebertaV2Tokenizer.
        self.tokenizer = DebertaV2Tokenizer.from_pretrained(self.params['pretrain_key'])
    else:
        # The original built a NameError without raising it, so execution fell
        # through to an undefined self.params; raise ValueError instead, the
        # idiomatic type for a bad argument.
        raise ValueError("Currently only supporting 'base' and 'xxlarge-v2' models.")
    # Freeze the encoder so only the classification head stays trainable.
    for param in self.model.base_model.parameters():
        param.requires_grad = False
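# A guess at the two param helpers referenced above (hypothetical: the
# originals are not shown; only the 'pretrain_key' entry is known to exist).
# They belong to the same class as the __init__ above.
def _get_base_params(self):
    return {'pretrain_key': 'microsoft/deberta-base'}

def _get_xxlarge_params(self):
    return {'pretrain_key': 'microsoft/deberta-v2-xxlarge'}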
def test_inference_classification_head(self):
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base")
    input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 2))
    self.assertEqual(output.shape, expected_shape)
    expected_tensor = torch.tensor([[0.0884, -0.1047]])
    self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4), f"{output}")
def create_and_check_deberta_for_sequence_classification(
    self, config, input_ids, token_type_ids, input_mask,
    sequence_labels, token_labels, choice_labels
):
    config.num_labels = self.num_labels
    model = DebertaForSequenceClassification(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask,
                   token_type_ids=token_type_ids, labels=sequence_labels)
    self.parent.assertListEqual(list(result.logits.size()),
                                [self.batch_size, self.num_labels])
    self.check_loss_output(result)
def sequence_classify():
    """
    Line 1169 of src/transformers/models/deberta/modeling_deberta.py has a
    problem, so this raises a dimension-mismatch error; BERT does not show
    this error for the same input:

        RuntimeError: Index tensor must have the same number of dimensions as input tensor
        labels = torch.gather(labels, 0, label_index.view(-1))
    """
    from transformers import DebertaTokenizer, DebertaForSequenceClassification
    import torch
    tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
    model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base')
    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    # Assume the label is 1; a single sample, so the batch size is also 1.
    # The unsqueeze makes `labels` 2-D (shape [1, 1]), which is exactly what
    # trips the torch.gather call quoted above on the affected versions.
    labels = torch.tensor([1]).unsqueeze(0)
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    logits = outputs.logits
    print(loss)
    print(logits)
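# A working variant of the call above: a sketch assuming the workaround is
# simply to keep `labels` one-dimensional, one entry per batch element, so
# the torch.gather in modeling_deberta.py sees matching dimensions.
def sequence_classify_fixed():
    from transformers import DebertaTokenizer, DebertaForSequenceClassification
    import torch
    tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
    model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base')
    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    labels = torch.tensor([1])  # shape [batch_size], not [batch_size, 1]
    outputs = model(**inputs, labels=labels)
    print(outputs.loss)
    print(outputs.logits)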
# Imports this script needs; MAX_LENGTH, label_dict, batch_size, openData,
# removeMinVal, evaluate and accuracy_per_class are defined elsewhere in the
# original project.
import sys
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from transformers import DebertaTokenizer, DebertaForSequenceClassification


def main():
    if len(sys.argv) == 3:
        FOLDER_NAME = sys.argv[1]
        EPOCH = sys.argv[2]
    else:
        print('ERROR : Please insert correct arguments! '
              'python evaluate.py <folder_name> <chosen_epoch>')
        return

    # Check GPU availability
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('Using GPU!')
    else:
        device = torch.device('cpu')
        print('Using CPU :(')

    # Import dataset
    print('#### Importing Dataset ####')
    df_val = openData('./snli/dev.jsonl')
    df_test = openData('./snli/test.jsonl')

    # Preprocessing: remove data with label -1
    df_val = removeMinVal(df_val)
    df_test = removeMinVal(df_test)

    BERT_MODEL = 'microsoft/deberta-base'
    # BERT_MODEL = 'roberta-base'
    # BERT_MODEL = 'distilbert-base-uncased'
    # BERT_MODEL = 'bert-base-uncased'
    # BERT_MODEL = 'albert-base-v2'

    tokenizer = DebertaTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)

    print('Encoding validation data')
    # truncation=True keeps premise/hypothesis pairs longer than MAX_LENGTH
    # from raising an error at encode time.
    encode_val = tokenizer(df_val.premise.tolist(), df_val.hypothesis.tolist(),
                           return_tensors='pt', padding='max_length',
                           truncation=True, max_length=MAX_LENGTH)
    labels_val = torch.tensor(df_val.label.values)

    print('Encoding test data')
    encode_test = tokenizer(df_test.premise.tolist(), df_test.hypothesis.tolist(),
                            return_tensors='pt', padding='max_length',
                            truncation=True, max_length=MAX_LENGTH)
    labels_test = torch.tensor(df_test.label.values)

    dataset_val = TensorDataset(encode_val['input_ids'],
                                encode_val['attention_mask'], labels_val)
    dataset_test = TensorDataset(encode_test['input_ids'],
                                 encode_test['attention_mask'], labels_test)

    model = DebertaForSequenceClassification.from_pretrained(
        BERT_MODEL, num_labels=len(label_dict),
        output_attentions=False, output_hidden_states=False)

    PARALLEL_GPU = False
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
        PARALLEL_GPU = True
    model = model.to(device)
    # map_location=device (rather than the original hard-coded 'cuda') lets
    # the checkpoint load on CPU-only machines as well.
    model.load_state_dict(
        torch.load(f'./models/{FOLDER_NAME}/finetuned_model_epoch_{EPOCH}.model',
                   map_location=device))

    print('#### Validation Data Result ####')
    dataloader_validation = DataLoader(dataset_val,
                                       sampler=SequentialSampler(dataset_val),
                                       batch_size=batch_size)
    _, predictions, true_vals = evaluate(model, device, dataloader_validation,
                                         PARALLEL_GPU)
    accuracy_per_class(predictions, true_vals)

    print('#### Test Data Result ####')
    dataloader_test = DataLoader(dataset_test,
                                 sampler=SequentialSampler(dataset_test),
                                 batch_size=batch_size)
    _, predictions, true_vals = evaluate(model, device, dataloader_test,
                                         PARALLEL_GPU)
    accuracy_per_class(predictions, true_vals)
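# A sketch of the evaluate() helper main() relies on (hypothetical: the
# original is defined elsewhere in the project; this version matches the
# (loss, predictions, true_vals) return shape used above).
import numpy as np

def evaluate(model, device, dataloader, parallel_gpu=False):
    model.eval()
    total_loss, all_logits, all_labels = 0.0, [], []
    for input_ids, attention_mask, labels in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # DataParallel gathers one loss per GPU; reduce to a scalar first.
        loss = outputs[0].mean() if parallel_gpu else outputs[0]
        total_loss += loss.item()
        all_logits.append(outputs[1].detach().cpu().numpy())
        all_labels.append(labels.cpu().numpy())
    predictions = np.concatenate(all_logits, axis=0)
    true_vals = np.concatenate(all_labels, axis=0)
    return total_loss / len(dataloader), predictions, true_vals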
# Imports for this script; loadData, get_features and create_dataloader are
# project helpers (sketches of the latter two follow below).
import numpy as np
import torch
import transformers
from transformers import DebertaConfig, DebertaTokenizer, DebertaForSequenceClassification
from tqdm import tqdm
import loadData


def main():
    device_ids = [0]
    init_lr = 1e-5
    max_epochs = 10
    max_length = 512
    batch_size = 1
    gradient_accu = 32 // batch_size
    num_label = 2
    train_mode = False
    prev_acc = 0.
    max_acc = 0.

    config = DebertaConfig.from_pretrained('microsoft/deberta-large')
    tknzr = DebertaTokenizer.from_pretrained('microsoft/deberta-large')
    # Set num_labels on the config *instance* and pass it to from_pretrained;
    # the original assigned to the DebertaConfig class, which the model never saw.
    config.num_labels = num_label

    train_data, test_data = loadData.load_data()
    train_data = train_data + loadData.load_data_aug()
    train_input_ids, train_mask_ids, train_segment_ids, train_label_ids = \
        get_features(train_data, max_length, tknzr)
    test_input_ids, test_mask_ids, test_segment_ids, test_label_ids = \
        get_features(test_data, max_length, tknzr)

    # print(all_input_ids.shape)
    all_input_ids = torch.cat(train_input_ids, dim=0).long()
    all_input_mask_ids = torch.cat(train_mask_ids, dim=0).long()
    all_segment_ids = torch.cat(train_segment_ids, dim=0).long()
    all_label_ids = torch.Tensor(train_label_ids).long()
    train_dataloader = create_dataloader(all_input_ids, all_input_mask_ids,
                                         all_segment_ids, all_label_ids,
                                         batch_size=batch_size, train=True)

    all_input_ids = torch.cat(test_input_ids, dim=0).long()
    all_input_mask_ids = torch.cat(test_mask_ids, dim=0).long()
    all_segment_ids = torch.cat(test_segment_ids, dim=0).long()
    all_label_ids = torch.Tensor(test_label_ids).long()
    test_dataloader = create_dataloader(all_input_ids, all_input_mask_ids,
                                        all_segment_ids, all_label_ids,
                                        batch_size=batch_size, train=False)

    model = DebertaForSequenceClassification.from_pretrained(
        'microsoft/deberta-large', config=config).cuda(device_ids[0])
    model = torch.nn.DataParallel(model, device_ids=device_ids)
    # transformers.AdamW is deprecated (and removed in newer releases);
    # torch.optim.AdamW is the drop-in replacement there.
    optimizer = transformers.AdamW(model.parameters(), lr=init_lr, eps=1e-8)
    optimizer.zero_grad()
    # scheduler = transformers.get_constant_schedule_with_warmup(optimizer, len(train_dataloader) // (batch_size * gradient_accu))
    # scheduler = transformers.get_linear_schedule_with_warmup(optimizer, len(train_dataloader) // (batch_size * gradient_accu), (len(train_dataloader) * max_epochs * 2) // (batch_size * gradient_accu), last_epoch=-1)

    if not train_mode:
        max_epochs = 1
        model.load_state_dict(torch.load("../model/model-deberta-1231.ckpt"))
    # The answer file is written during evaluation in both modes, so open it
    # unconditionally (the original only opened it when train_mode was False,
    # which crashed the eval loop on training runs).
    foutput = open("answer-deberta-large-test.txt", "w")

    global_step = 0
    for epoch in range(max_epochs):
        model.train()
        if train_mode:
            loss_avg = 0.
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                global_step += 1
                batch = [t.cuda() for t in batch]
                input_id, input_mask, segment_id, label_id = batch
                # return_dict=False keeps the (loss, logits) tuple interface
                # this unpacking assumes on recent transformers versions.
                loss, _ = model(input_ids=input_id, token_type_ids=segment_id,
                                attention_mask=input_mask, labels=label_id,
                                return_dict=False)
                # DataParallel returns one loss per device; sum before backward.
                loss = torch.sum(loss)
                loss_avg += loss.item()
                loss = loss / (batch_size * gradient_accu)
                loss.backward()
                if global_step % gradient_accu == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                # if epoch == 0:
                #     scheduler.step()
            print(loss_avg / len(train_dataloader))

        model.eval()
        final_acc = 0.
        num_test_sample = 0
        tot = [0, 0]
        correct = [0, 0]
        countloop = 0
        for input_id, input_mask, segment_id, label_id in test_dataloader:
            countloop += 1
            input_id = input_id.cuda()
            input_mask = input_mask.cuda()
            segment_id = segment_id.cuda()
            label_id = label_id.cuda()
            with torch.no_grad():
                loss, logit = model(input_ids=input_id, token_type_ids=segment_id,
                                    attention_mask=input_mask, labels=label_id,
                                    return_dict=False)
            logit = logit.detach().cpu().numpy()
            print(logit[0][0], logit[0][1], file=foutput)
            # print(logit)
            label_id = label_id.to('cpu').numpy()
            acc = np.sum(np.argmax(logit, axis=1) == label_id)
            pred = np.argmax(logit, axis=1)
            for i in range(label_id.shape[0]):
                tot[label_id[i]] += 1
                if pred[i] == label_id[i]:
                    correct[label_id[i]] += 1
            final_acc += acc
            num_test_sample += input_id.size(0)

        print("epoch:", epoch)
        print("final acc:", final_acc / num_test_sample)
        if train_mode and final_acc / num_test_sample > max_acc:
            max_acc = final_acc / num_test_sample
            print("save...")
            torch.save(model.state_dict(), "../model/model-deberta-1231.ckpt")
            print("finish")
        print("Max acc:", max_acc)
        # Disabled lr-decay heuristic from the original, kept for reference:
        # if final_acc / num_test_sample <= prev_acc:
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = param_group['lr'] * 0.8
        prev_acc = final_acc / num_test_sample

        tp = correct[1]
        tn = correct[0]
        fp = tot[1] - correct[1]
        fn = tot[0] - correct[0]
        rec = tp / (tp + fn + 1e-5)
        pre = tp / (tp + fp + 1e-5)
        print("recall:{0}, precision:{1}".format(rec, pre))
        # Epsilon added, matching the ones above, so tp == 0 cannot divide by zero.
        print("f:", 2 * pre * rec / (pre + rec + 1e-5))
        print("acc:", (tp + tn) / (tp + tn + fp + fn))

    foutput.close()
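# Sketches of the two helpers main() assumes (hypothetical: the originals live
# elsewhere in the project; these only match the shapes and call signatures
# used above, and get_features guesses that each example is a (text, label) pair).
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

def create_dataloader(input_ids, mask_ids, segment_ids, label_ids,
                      batch_size=1, train=True):
    dataset = TensorDataset(input_ids, mask_ids, segment_ids, label_ids)
    sampler = RandomSampler(dataset) if train else SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

def get_features(data, max_length, tokenizer):
    # Returns per-example [1, max_length] tensors so main() can torch.cat
    # them along dim 0.
    input_ids, mask_ids, segment_ids, label_ids = [], [], [], []
    for text, label in data:
        enc = tokenizer(text, padding='max_length', truncation=True,
                        max_length=max_length, return_tensors='pt')
        input_ids.append(enc['input_ids'])
        mask_ids.append(enc['attention_mask'])
        # Fall back to zeros if the tokenizer does not emit token_type_ids.
        segment_ids.append(enc.get('token_type_ids',
                                   torch.zeros_like(enc['input_ids'])))
        label_ids.append(label)
    return input_ids, mask_ids, segment_ids, label_ids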