def __init__(self, bert_dir: Optional[str], pad_token_id: int, cls_token_id: int,
             sep_token_id: int, num_labels: int, max_length: int = 512,
             use_half_precision: bool = False,
             config: Optional[PretrainedConfig] = None):
    """Build a RoBERTa-based sequence classifier.

    Args:
        bert_dir: directory of a pretrained checkpoint, or ``None`` to build
            the model from ``config`` with fresh weights.
        pad_token_id / cls_token_id / sep_token_id: special-token ids used by
            the surrounding class when batching inputs.
        num_labels: number of classification labels.
        max_length: maximum sequence length kept for encoding.
        use_half_precision: cast the model to fp16 (requires apex installed).
        config: model config; required when ``bert_dir`` is ``None``.
    """
    super(BertClassifier, self).__init__()
    if bert_dir is None:
        # Bug fix: the original signature read ``config=Optional[PretrainedConfig]``,
        # so the *typing object* was the default value — this assert could never
        # fire and a missing config only surfaced later as an AttributeError.
        assert config is not None
        assert config.num_labels == num_labels
        bert = RobertaForSequenceClassification(config)
        # bert = BertForSequenceClassification(config)
    else:
        bert = RobertaForSequenceClassification.from_pretrained(
            bert_dir, num_labels=num_labels)
        # bert = BertForSequenceClassification.from_pretrained(bert_dir, num_labels=num_labels)
    if use_half_precision:
        # Import checks that apex is available before switching to fp16 weights.
        import apex  # noqa: F401
        bert = bert.half()
    self.bert = bert
    self.pad_token_id = pad_token_id
    self.cls_token_id = cls_token_id
    self.sep_token_id = sep_token_id
    self.max_length = max_length
def __init__(self, model_dir):
    """Load a fine-tuned RoBERTa classifier and its tokenizer from *model_dir*.

    The model is configured to also return attention weights and all hidden
    states on every forward pass.
    """
    # Bug fix: the bare constructors expect a config/vocab object, not a
    # directory path, and raise at runtime — loading from a saved directory
    # requires the ``from_pretrained`` class methods.
    self.model = RobertaForSequenceClassification.from_pretrained(
        model_dir, output_attentions=True, output_hidden_states=True)
    self.tokenizer = RobertaTokenizer.from_pretrained(
        model_dir,
        add_special_tokens=True,
        merges_file=os.path.join(model_dir, "merges.txt"),
    )
def load_pytorch_model(model_dir):
    """Load fine-tuned classifier weights from ``model_dir``.

    The checkpoint named by the module-level ``MODEL_NAME`` is mapped onto the
    GPU when one is available, otherwise onto the CPU, and its state_dict is
    applied to a freshly constructed model.
    """
    model_path = '{}/{}'.format(model_dir, MODEL_NAME)
    # NOTE(review): RobertaForSequenceClassification normally requires a
    # config argument; this zero-arg call looks like it would raise a
    # TypeError — confirm how the model is meant to be constructed here.
    model = RobertaForSequenceClassification()
    # A single device-aware map_location replaces the original's duplicated
    # CUDA/CPU branches (behavior is the same: GPU checkpoint placement when
    # CUDA is available, CPU otherwise).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(model_path, map_location=device))
    return model
def main(args):
    """Evaluate a phone-level RoBERTa classifier on one or two test sets.

    Loads ``test_x.npy``/``test_y.npy`` from ``args.test_dir`` (and
    optionally ``args.test2_dir``, whose labels are offset so the two label
    spaces do not collide), rebuilds the model from the command-line
    hyper-parameters, restores the fine-tuned weights from ``args.model`` and
    runs evaluation.
    """
    test_x = np.load(os.path.join(args.test_dir, "test_x.npy"), allow_pickle=True)
    test_y = np.load(os.path.join(args.test_dir, "test_y.npy"), allow_pickle=True)
    num_classes1 = len(np.unique(test_y))
    if args.test2_dir is not None:
        test_x2 = np.load(os.path.join(args.test2_dir, "test_x.npy"), allow_pickle=True)
        test_y2 = np.load(os.path.join(args.test2_dir, "test_y.npy"), allow_pickle=True)
        # Shift the second set's labels past the first set's range so the
        # combined data has disjoint class ids.
        test_y2 += num_classes1
        test_x = np.concatenate((test_x, test_x2), axis=0)
        test_y = np.concatenate((test_y, test_y2), axis=0)
    num_classes = len(np.unique(test_y))

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    test_dataset = PhoneRobertaDataset(test_x, test_y, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=args.heads,  # default 12
        num_hidden_layers=args.num_layers,  # default 6
        type_vocab_size=1,
        num_labels=num_classes)
    model = RobertaForSequenceClassification(config)
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    # Bug fix: map the checkpoint onto the selected device so evaluation also
    # works on CPU-only machines (torch.load otherwise tries to restore
    # tensors to the device they were saved from).
    model.load_state_dict(torch.load(args.model, map_location=device))

    preds_all, labels_all = evaluate(model, device, test_loader)
    if args.test2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(preds_all, labels_all, num_classes1)
# compute metrics function for binary classification def compute_metrics(pred): labels = pred.label_ids preds = pred.predictions.argmax(-1) precision, recall, f1, _ = precision_recall_fscore_support( labels, preds, average="binary") acc = accuracy_score(labels, preds) return { "accuracy": acc, "f1": f1, "precision": precision, "recall": recall } # download model from model hub model = RobertaForSequenceClassification(RobertaConfig()) tokenizer = AutoTokenizer.from_pretrained(args.model_name) # define training args training_args = TrainingArguments( output_dir=args.model_dir, num_train_epochs=args.epochs, per_device_train_batch_size=args.train_batch_size, per_device_eval_batch_size=args.eval_batch_size, warmup_steps=args.warmup_steps, logging_dir=f"{args.output_data_dir}/logs", learning_rate=args.learning_rate, fp16=True, dataloader_drop_last=True, disable_tqdm=True, evaluation_strategy="no",
def main():
    """Fine-tune a RoBERTa sequence classifier on a VLSP-2018 task.

    Parses command-line arguments, builds train/dev (and optionally test)
    dataloaders from fastBPE-encoded features, trains with BERTAdam, and
    after every epoch writes per-example predictions plus a tab-separated
    metrics line under ``--output_dir``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        choices=["vlsp_2018_single",
                                 "vlsp_2018_NLI_M", "vlsp_2018_QA_M", "vlsp_2018_NLI_B", "vlsp_2018_QA_B"],
                        help="The name of the task to train.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument('--bpe-codes', default=None, required=True, type=str,
                        help='path to fastBPE BPE')
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint", default=None, type=str, required=True,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--do_save_model", default=False, action='store_true',
                        help="Whether to save checkpoint.")
    parser.add_argument("--eval_test", default=False, action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    args = parser.parse_args()

    # Device / distributed setup: single-process (possibly multi-GPU via
    # DataParallel) vs one-device-per-process DDP when --local_rank is set.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))

    # Per-step batch size shrinks so the effective batch stays constant.
    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # prepare dataloaders
    processors = {
        "vlsp_2018_single":VLSP_2018_single_Processor,
        "vlsp_2018_NLI_M":VLSP_2018_NLI_M_Processor,
        "vlsp_2018_QA_M":VLSP_2018_QA_M_Processor,
        "vlsp_2018_NLI_B":VLSP_2018_NLI_B_Processor,
        "vlsp_2018_QA_B":VLSP_2018_QA_B_Processor,
    }

    processor = processors[args.task_name]()
    label_list = processor.get_labels()

    # Config carries the label maps so saved checkpoints are self-describing.
    bert_config = RobertaConfig.from_pretrained(args.bert_config_file)
    bert_config.num_labels = len(label_list)
    label2id = {}
    id2label = {}
    for (i, label) in enumerate(label_list):
        label2id[label] = i
        id2label[str(i)] = label
    bert_config.label2id = label2id
    bert_config.id2label = id2label

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    bpe = fastBPE(args)
    vocab = Dictionary()
    vocab.add_from_file(args.vocab_file)

    # training set
    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, bpe, vocab)
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    # dev set
    dev_examples = processor.get_dev_examples(args.data_dir)
    dev_features = convert_examples_to_features(
        dev_examples, label_list, args.max_seq_length, bpe, vocab)
    all_dev_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    all_dev_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    all_dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    all_dev_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)
    dev_data = TensorDataset(all_dev_input_ids, all_dev_input_mask, all_dev_segment_ids, all_dev_label_ids)
    dev_dataloader = DataLoader(dev_data, batch_size=args.eval_batch_size, shuffle=False)

    # test set
    if args.eval_test:
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, bpe, vocab)
        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False)

    # model and optimizer
    model = RobertaForSequenceClassification(bert_config)
    if args.init_checkpoint is not None:
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # No weight decay for biases and LayerNorm-style parameters.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    # train
    output_log_file = os.path.join(args.output_dir, "log.txt")
    print("output_log_file=",output_log_file)
    # Write the log header once; per-epoch rows are appended below.
    with open(output_log_file, "w") as writer:
        if args.eval_test:
            writer.write("epoch\tglobal_step\tloss\tdev_loss\tdev_accuracy\ttest_loss\ttest_accuracy\n")
        else:
            writer.write("epoch\tglobal_step\tloss\n")

    global_step = 0
    epoch=0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch+=1
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            # RoBERTa does not use token_type_ids
            loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enough gradients
                model.zero_grad()
                global_step += 1

        if(args.do_save_model):
            # DataParallel wraps the real model in .module; save the inner state_dict.
            if(n_gpu > 1):
                torch.save(model.module.state_dict(), os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))
            else:
                torch.save(model.state_dict(), os.path.join(args.output_dir, 'model_ep' + str(epoch) + '.bin'))

        # dev eval: writes "<pred> <prob_0> <prob_1> ..." per example.
        model.eval()
        dev_loss, dev_accuracy = 0, 0
        nb_dev_steps, nb_dev_examples = 0, 0
        with open(os.path.join(args.output_dir, "dev_ep_"+str(epoch)+".txt"),"w") as f_dev:
            for input_ids, input_mask, segment_ids, label_ids in dev_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    tmp_dev_test_loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
                    logits = F.softmax(logits, dim=-1)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                outputs = np.argmax(logits, axis=1)
                for output_i in range(len(outputs)):
                    f_dev.write(str(outputs[output_i]))
                    for ou in logits[output_i]:
                        f_dev.write(" "+str(ou))
                    f_dev.write("\n")
                tmp_dev_accuracy=np.sum(outputs == label_ids)
                dev_loss += tmp_dev_test_loss.mean().item()
                dev_accuracy += tmp_dev_accuracy
                nb_dev_examples += input_ids.size(0)
                nb_dev_steps += 1
        dev_loss = dev_loss / nb_dev_steps
        dev_accuracy = dev_accuracy / nb_dev_examples

        # eval_test
        if args.eval_test:
            model.eval()
            test_loss, test_accuracy = 0, 0
            nb_test_steps, nb_test_examples = 0, 0
            with open(os.path.join(args.output_dir, "test_ep_"+str(epoch)+".txt"),"w") as f_test:
                for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    with torch.no_grad():
                        tmp_test_loss, logits = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
                        logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    outputs = np.argmax(logits, axis=1)
                    for output_i in range(len(outputs)):
                        f_test.write(str(outputs[output_i]))
                        for ou in logits[output_i]:
                            f_test.write(" "+str(ou))
                        f_test.write("\n")
                    tmp_test_accuracy=np.sum(outputs == label_ids)
                    test_loss += tmp_test_loss.mean().item()
                    test_accuracy += tmp_test_accuracy
                    nb_test_examples += input_ids.size(0)
                    nb_test_steps += 1
            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples

        result = collections.OrderedDict()
        if args.eval_test:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss/nb_tr_steps,
                      'dev_loss': dev_loss,
                      'dev_accuracy': dev_accuracy,
                      'test_loss': test_loss,
                      'test_accuracy': test_accuracy}
        else:
            result = {'epoch': epoch,
                      'global_step': global_step,
                      'loss': tr_loss/nb_tr_steps}

        logger.info("***** Eval results *****")
        with open(output_log_file, "a+") as writer:
            for key in result.keys():
                logger.info(" %s = %s\n", key, str(result[key]))
                writer.write("%s\t" % (str(result[key])))
            writer.write("\n")
data_args = AttributeDict(data_args) training_args = AttributeDict(training_args) train_dataset = ( GlueDataset(data_args, tokenizer=tokenizer) ) eval_dataset = ( GlueDataset(data_args, tokenizer=tokenizer, mode="dev") ) test_dataset = None output_mode = glue_output_modes[data_args.task_name] config = RobertaConfig.from_pretrained('roberta-base') config.num_labels = 2 model = RobertaForSequenceClassification(config) def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]: def compute_metrics_fn(p: EvalPrediction): if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) return glue_compute_metrics(task_name, preds, p.label_ids) return compute_metrics_fn trainer = Trainer( model=model, args=training_args,
def fit_and_train(self, train_df, val_df, val_train_df, require_grad):
    """Train a RoBERTa answer-selection classifier and keep the best model.

    Each epoch trains on ``train_df``, scores every validation candidate in
    ``val_df``, picks the highest-probability candidate per question to
    compute accuracy, and checkpoints the model whenever the accuracy is a
    new best.  Afterwards the best model is fine-tuned for
    ``val_fine_tuned_epo`` epochs on ``val_train_df`` and saved again.

    NOTE(review): depends on module-level globals (``pre_trained_model_name``,
    ``val_batch_size``, ``model_type``, ``model_name``, ``device``,
    ``val_fine_tuned_epo``) — confirm they are defined before this is called.
    ``require_grad`` is currently unused (the code that used it is commented
    out below).
    """
    NUM_LABELS = 2
    max_value = 0
    best_model = None
    tokenizer = RobertaTokenizer.from_pretrained(pre_trained_model_name, do_lower_case=True)
    trainset = DialogueDataset(train_df, "train", tokenizer=tokenizer)
    trainloader = DataLoader(trainset, batch_size=self.batch_size, collate_fn=self.create_mini_batch)
    val_trainset = DialogueDataset(val_train_df, "train", tokenizer=tokenizer)
    val_trainloader = DataLoader(val_trainset, batch_size=self.batch_size, collate_fn=self.create_mini_batch)
    valset = DialogueDataset(val_df, 'test', tokenizer=tokenizer)
    valloader = DataLoader(valset, batch_size=val_batch_size, collate_fn=self.create_mini_batch)

    # NOTE(review): this builds the model from config only (random weights);
    # the commented alternatives below load pretrained weights — confirm
    # which initialisation is intended.
    config = RobertaConfig.from_pretrained(pre_trained_model_name)
    config.num_labels = 2
    config.type_vocab_size = 2
    model = RobertaForSequenceClassification(config)
    # model = CustomRobertatModel()
    # model = BertForSequenceClassification.from_pretrained(pre_trained_model_name, num_labels=NUM_LABELS)
    # model = BertForNextSentencePrediction.from_pretrained(pre_trained_model_name)
    # if require_grad:
    #     for param in model.parameters():
    #         param.requires_grad = True
    model.train()
    if self.gpu:
        model = model.cuda(device)

    for epo in range(self.epoch):
        total = 0
        total_loss = 0
        # NOTE(review): optimizer and scheduler are re-created every epoch,
        # which resets Adam moments and the warmup schedule — confirm this
        # is intentional.
        # optimizer = AdamW(model.parameters(),
        #                   lr = self.lr,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        #                   eps = 1e-8  # args.adam_epsilon - default is 1e-8.
        #                   )
        optimizer = optim.Adam(model.parameters(), lr=self.lr, betas=(0.9, 0.98), weight_decay=0.01, eps=1e-6)
        # Total number of training steps is number of batches * number of epochs.
        total_steps = len(trainloader) * self.epoch
        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    warmup_steps=1000,
                                                    t_total=total_steps)
        for data in trainloader:
            if self.gpu:
                tokens_tensors, segments_tensors, \
                masks_tensors, labels = [x.type(torch.LongTensor).cuda(device) for x in data]
            else:
                tokens_tensors, segments_tensors, \
                masks_tensors, labels = [x for x in data]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels)
            # (tensor(0.6968, grad_fn=<NllLossBackward>), tensor([[-0.0359, -0.0432]], grad_fn=<AddmmBackward>))
            loss = outputs[0]
            # (tensor(0.0086, device='cuda:1', grad_fn=<NllLossBackward>), tensor([[ 2.3423, -2.4149]], device='cuda:1', grad_fn=<AddmmBackward>))
            loss.backward()  # calculate gradient
            # opt = torch.optim.SGD(model.parameters(), lr=self.lr, momentum=0.9)
            # opt = torch.optim.Adam(model.parameters(), lr = self.lr)
            # opt = torch.optim.SGD(model.parameters(), lr=self.lr, momentum=0.9)
            # opt.step() #update parameter
            # opt.zero_grad()
            # Clip the norm of the gradients to 1.0.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
            # Clear out the gradients (by default they accumulate)
            model.zero_grad()
            total += len(tokens_tensors)
            total_loss += loss.item() * len(tokens_tensors)
            # outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors, attention_mask=masks_tensors)
            # loss_f = nn.CrossEntropyLoss()
            # loss = loss_f(outputs[0], labels)
            # loss.backward()  # calculate gradient
            # opt = torch.optim.SGD(model.parameters(), lr=self.lr, momentum=0.9)
            # opt = torch.optim.Adam(model.parameters(), lr = self.lr)
            # opt.step() #update parameter
            # opt.zero_grad()
            # total += len(tokens_tensors)
            # total_loss += loss.item() * len(tokens_tensors)
            del data, tokens_tensors, segments_tensors, \
                masks_tensors, labels
            print(f'Epoch : {epo+1}/{self.epoch} , Training Loss : {loss}', end='\r')

        # Record and persist the epoch's mean training loss.
        self.loss_list.append(total_loss / total)
        print(
            f'Epoch : {epo+1}/{self.epoch} , Training Loss : {self.loss_list[epo]}', end=',')
        with open(f'./train_loss_{model_type}.txt', 'w') as f:
            for i in self.loss_list:
                f.write(str(i) + '\n')

        # Validation: score each candidate answer; column 1 of the logits is
        # used as the "is correct" score.
        model.eval()
        numebr = 0
        ans = []
        with torch.no_grad():
            for data in valloader:
                if self.gpu:
                    tokens_tensors, segments_tensors, masks_tensors, _ = [
                        x.type(torch.LongTensor).cuda(device) if x is not None else None for x in data
                    ]
                else:
                    tokens_tensors, segments_tensors, masks_tensors, _ = [
                        x for x in data
                    ]
                outputs = model(
                    input_ids=tokens_tensors,
                    token_type_ids=segments_tensors,
                    attention_mask=masks_tensors,
                )
                # (tensor([[-0.0359, -0.0432]], grad_fn=<AddmmBackward>))
                values = outputs[0].data[:, 1].tolist()
                ans += values
                print(f'count : {numebr}', end='\r')
                numebr += val_batch_size

        # Per-question accuracy: the candidate with the highest score must be
        # the annotated answer.
        count = 0
        val_len = 0
        val_df['prob'] = ans
        groups = val_df.groupby('question')
        for index, data in groups:
            val_len += 1
            if 'candidate_id' in val_df.columns:
                pred_id = data.loc[data['prob'].idxmax(), 'candidate_id']
                if data.loc[data['prob'].idxmax(), 'ans'] == pred_id:
                    count += 1
        val_accu = count / val_len
        if val_accu >= max_value:
            max_value = val_accu
            self.model = model
            best_model = model
            torch.save(model.state_dict(), f'./model/{model_name}_torch_dict')
        self.val_accu_list.append(val_accu)
        print(
            f'Epoch : {epo+1}/{self.epoch}, Validation Accuracy : {self.val_accu_list[epo]}', end=',')
        with open(f'./val_accu_{model_type}.txt', 'w') as f:
            for i in self.val_accu_list:
                f.write(str(i) + '\n')

    ## Eventually fine tuned with validation data
    # NOTE(review): ``best_model`` aliases ``model`` (same object), which is
    # why clipping/zeroing ``model`` below still affects ``best_model`` —
    # fragile; confirm before refactoring.
    for epo in range(val_fine_tuned_epo):
        total = 0
        total_loss = 0
        optimizer = AdamW(
            best_model.parameters(),
            lr=self.lr,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
            eps=1e-8  # args.adam_epsilon - default is 1e-8.
        )
        total_steps = len(val_trainloader) * 1
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    warmup_steps=10,
                                                    t_total=total_steps)
        for data in val_trainloader:
            if self.gpu:
                tokens_tensors, segments_tensors, \
                masks_tensors, labels = [x.type(torch.LongTensor).cuda(device) for x in data]
            else:
                tokens_tensors, segments_tensors, \
                masks_tensors, labels = [x for x in data]
            outputs = best_model(input_ids=tokens_tensors,
                                 token_type_ids=segments_tensors,
                                 attention_mask=masks_tensors,
                                 labels=labels)
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            total += len(tokens_tensors)
            total_loss += loss.item() * len(tokens_tensors)
            del data, tokens_tensors, segments_tensors, \
                masks_tensors, labels

    # check if fine tune with validation work
    torch.save(best_model.state_dict(), f'./model/{model_name}_torch_dict_tuned_val')
def main(): parser = setup_parser() args = parser.parse_args() # specifies the path where the biobert or clinical bert model is saved if args.bert_model == 'biobert' or args.bert_model == 'clinical_bert': args.bert_model = args.model_loc print(args.bert_model) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "mednli": MedNLIProcessor } num_labels_task = {"cola": 2, "mnli": 3, "mrpc": 2, "mednli": 3} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) print('TRAIN') train = processor.get_train_examples(args.data_dir) print([(train[i].text_a, train[i].text_b, train[i].label) for i in range(3)]) print('DEV') dev = processor.get_dev_examples(args.data_dir) print([(dev[i].text_a, dev[i].text_b, dev[i].label) for i in range(3)]) print('TEST') test = processor.get_test_examples(args.data_dir) print([(test[i].text_a, test[i].text_b, test[i].label) for i in range(3)]) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples( args.data_dir) #NOTE: HERE IS THE DATA PULL-IN num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # 
Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned # config = BertConfig(output_config_file) config = RobertaConfig(output_config_file) # model = BertForSequenceClassification(config, num_labels=num_labels) model = RobertaForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = 
torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = processor.get_test_examples(args.data_dir) 
test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running testing *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() test_loss, test_accuracy = 0, 0 nb_test_steps, nb_test_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( test_dataloader, desc="Testing"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_test_accuracy = accuracy(logits, label_ids) test_loss += tmp_test_loss.mean().item() test_accuracy += tmp_test_accuracy nb_test_examples += input_ids.size(0) nb_test_steps += 1 test_loss = test_loss / nb_test_steps test_accuracy = test_accuracy / nb_test_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'test_loss': test_loss, 'test_accuracy': test_accuracy, 'global_step': global_step, 'loss': loss } output_test_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: logger.info("***** Test results *****") for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(args):
    """Fine-tune a RoBERTa sequence classifier on phone-sequence data.

    Loads train/valid splits from ``args.data_dir`` (optionally merging a
    second dataset from ``args.data2_dir`` with its labels offset so the two
    label spaces do not collide), trains for ``args.epochs`` epochs, keeps the
    checkpoint with the best validation accuracy at ``args.save_model_path``,
    and prints aggregate (and, if applicable, per-dataset) evaluation results.

    Args:
        args: parsed CLI namespace; fields used: data_dir, data2_dir,
            tokenizer, batch_size, lr, epochs, verbose, pretrained, heads,
            num_layers, scheduler, save_model_path, log_acc.
    """
    train_x, train_y, valid_x, valid_y = load_xy(args.data_dir)
    num_classes1 = len(np.unique(train_y))
    if args.data2_dir is not None:
        train_x2, train_y2, valid_x2, valid_y2 = load_xy(args.data2_dir)
        # Shift the second dataset's labels past the first dataset's range so
        # both can be classified by a single head.
        train_y2 += num_classes1
        valid_y2 += num_classes1
        train_x = np.concatenate((train_x, train_x2), axis=0)
        train_y = np.concatenate((train_y, train_y2), axis=0)
        valid_x = np.concatenate((valid_x, valid_x2), axis=0)
        valid_y = np.concatenate((valid_y, valid_y2), axis=0)
    num_classes = len(np.unique(train_y))

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")

    train_dataset = PhoneRobertaDataset(train_x, train_y, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valid_dataset = PhoneRobertaDataset(valid_x, valid_y, tokenizer)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False)

    lr = args.lr
    num_epochs = args.epochs
    verbose = args.verbose
    if num_epochs < 1:
        # Without at least one epoch there is no best model / no predictions
        # to evaluate below; fail early with a clear message instead of a
        # NameError after the loop.
        raise ValueError("args.epochs must be >= 1")

    if args.pretrained:
        model = RobertaForSequenceClassification.from_pretrained(args.pretrained,
                                                                 num_labels=num_classes)
    else:
        config = RobertaConfig(
            vocab_size=tokenizer.vocab_size,
            max_position_embeddings=514,
            num_attention_heads=args.heads,      # default 12
            num_hidden_layers=args.num_layers,   # default 6
            type_vocab_size=1,
            num_labels=num_classes
        )
        model = RobertaForSequenceClassification(config)
    model.to(device)
    print(model)

    optimizer = AdamW(model.parameters(), lr=lr)
    # Halve the LR when validation loss plateaus for 3 epochs (only stepped
    # when args.scheduler is set).
    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=3, verbose=verbose)

    best_acc = 0
    best_preds = None
    acc_logs = []
    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, verbose)
        y_preds, y_true, valid_loss, valid_acc = valid_epoch(model, valid_loader, verbose)
        acc_logs.append(valid_acc)
        if verbose:
            print("Epoch {} finished.".format(epoch))
            print('='*20)
        if args.scheduler:
            scheduler.step(valid_loss)
        if valid_acc > best_acc:
            # Checkpoint only on improvement; best_preds mirrors the saved model.
            torch.save(model.state_dict(), args.save_model_path)
            best_acc = valid_acc
            best_preds = y_preds

    print("Evaluate on aggregate validation using the best model")
    evaluate(best_preds, y_true)
    if args.data2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(best_preds, y_true, num_classes1)
    print("Best validation accuracy: ", best_acc, "%")
    if args.log_acc:
        np.save("roberta/logs/log_acc.npy", np.array(acc_logs))
def convert_roberta_checkpoint_to_pytorch(
    roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.

    Loads a fairseq RoBERTa checkpoint, transplants every weight into a
    transformers model (RobertaForSequenceClassification when
    classification_head is True — using the "mnli" head — otherwise
    RobertaForMaskedLM), verifies both models produce matching outputs on
    SAMPLE_TEXT, and saves the converted model to pytorch_dump_folder_path.
    Raises Exception if outputs differ beyond atol=1e-3.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    roberta_sent_encoder = roberta.model.encoder.sentence_encoder
    # Mirror the fairseq architecture hyperparameters in a transformers config.
    config = RobertaConfig(
        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        # Infer the label count from the fairseq "mnli" head's output projection.
        config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    # Copy each transformer layer; fairseq fuses q/k/v into separate
    # projections that map 1:1 onto BERT's query/key/value modules.
    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        # All three projections must be square (hidden_size x hidden_size)
        # for the direct assignment below to be valid.
        assert (
            roberta_layer.self_attn.k_proj.weight.data.shape
            == roberta_layer.self_attn.q_proj.weight.data.shape
            == roberta_layer.self_attn.v_proj.weight.data.shape
            == torch.Size((config.hidden_size, config.hidden_size))
        )

        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        # intermediate (FFN up-projection)
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # output (FFN down-projection + layer norm)
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        # end of layer

    if classification_head:
        # Transplant the fairseq "mnli" classification head.
        model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
        model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias

    # Let's check that we get the same results: run both the converted model
    # and the original fairseq model on the same encoded sample.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)