def _load_edge_model(self, bert_model_file, bert_config_file):
    """Build a ``BertEdgeScorer`` and load saved weights into its ``bert`` part.

    Args:
        bert_model_file: Path to a checkpoint readable by ``torch.load``; its
            contents are passed to ``model.bert.load_state_dict``.
        bert_config_file: Path to the JSON file handed to
            ``BertConfig.from_json_file``.

    Returns:
        The scorer after ``.cuda()`` and ``.eval()`` have been called on it.
    """
    bert_config = BertConfig.from_json_file(bert_config_file)
    model = BertEdgeScorer(bert_config)

    # Deserialize onto the CPU: the model is still on the CPU at this point,
    # and a checkpoint written from a GPU would otherwise be restored onto
    # that same device (extra GPU memory, or a failure if the device does not
    # exist on this machine). The weights move to the GPU with model.cuda().
    model_states = torch.load(bert_model_file, map_location="cpu")
    print(model_states.keys())
    model.bert.load_state_dict(model_states)

    model.cuda()
    model.eval()
    return model
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Turn a TensorFlow BERT checkpoint into a saved PyTorch state dict.

    Args:
        tf_checkpoint_path: Location of the TensorFlow checkpoint, forwarded
            to ``load_tf_weights_in_bert``.
        bert_config_file: JSON configuration used to build the PyTorch model.
        pytorch_dump_path: Destination given to ``torch.save`` for the
            resulting ``state_dict``.
    """
    # Build an empty PyTorch model from the configuration.
    bert_config = BertConfig.from_json_file(bert_config_file)
    build_message = "Building PyTorch model from configuration: {}".format(str(bert_config))
    print(build_message)
    pretraining_model = BertForPreTraining(bert_config)

    # Fill the model with the TensorFlow weights.
    load_tf_weights_in_bert(pretraining_model, tf_checkpoint_path)

    # Persist only the weights, not the module object.
    save_message = "Save PyTorch model to {}".format(pytorch_dump_path)
    print(save_message)
    weights = pretraining_model.state_dict()
    torch.save(weights, pytorch_dump_path)
def _build_arg_parser():
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="Path to the configuration file for the BERT model.")

    # Other parameters
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--discr", default=False, action='store_true',
                        help="Whether to do discriminative fine-tuning.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    parser.add_argument('--layers', type=int, nargs='+', default=[-2],
                        help="choose the layers that used for downstream tasks, "
                             "-2 means use pooled output, -1 means all layer,"
                             "else means the detail layers. default is -2")
    parser.add_argument('--num_datas', default=None, type=int,
                        help="the number of data examples")
    parser.add_argument('--num_test_datas', default=None, type=int,
                        help="the number of data examples")
    parser.add_argument('--pooling_type', default=None, type=str,
                        choices=[None, 'mean', 'max'])
    return parser


def _evaluate_epoch(model, eval_dataloader, device, predictions_path, summary_writer, eval_step):
    """Run one pass over the evaluation set and aggregate its metrics.

    One predicted class index per example is written to ``predictions_path``
    (one per line). Per-batch loss and number of correct predictions are
    logged to ``summary_writer`` under ``Loss/test`` and ``Accuracy/test``.

    Args:
        model: Callable returning ``(loss, logits)`` when given labels.
        eval_dataloader: Yields ``(input_ids, input_mask, segment_ids,
            label_ids)`` batches.
        device: Device the batch tensors are moved to.
        predictions_path: File that receives the per-example predictions.
        summary_writer: Writer used for the per-batch scalars.
        eval_step: Running counter of evaluation batches seen so far, used as
            the x-axis of the per-batch scalars.

    Returns:
        ``(metrics, eval_step)`` where ``metrics`` maps metric names to the
        epoch-level values and ``eval_step`` is the updated batch counter.
    """
    model.eval()
    eval_loss, correct_total = 0, 0
    pos_prec, pos_recall, pos_f1 = 0, 0, 0
    neg_prec, neg_recall, neg_f1 = 0, 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    with open(predictions_path, "w") as f:
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluate"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids,
                                              attention_mask=input_mask, labels=label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.detach().to('cpu').numpy()
            outputs = np.argmax(logits, axis=1)
            f.writelines(str(output) + "\n" for output in outputs)

            # Number of correct predictions in this batch (a count, not a ratio).
            batch_correct = np.sum(outputs == label_ids)
            batch_loss = tmp_eval_loss.mean().item()

            batch_prec, batch_recall, batch_f1 = get_analytics_neg_sent(outputs, label_ids)
            neg_prec += batch_prec
            neg_recall += batch_recall
            neg_f1 += batch_f1

            batch_prec, batch_recall, batch_f1 = get_analytics_pos_sent(outputs, label_ids)
            pos_prec += batch_prec
            pos_recall += batch_recall
            pos_f1 += batch_f1

            eval_loss += batch_loss
            correct_total += batch_correct

            eval_step += 1
            summary_writer.add_scalar("Loss/test", batch_loss, eval_step)
            summary_writer.add_scalar("Accuracy/test", batch_correct, eval_step)

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

    # Loss and precision/recall/F1 are averaged over batches; accuracy is
    # averaged over examples.
    metrics = {
        'eval_loss': eval_loss / nb_eval_steps,
        'eval_accuracy': correct_total / nb_eval_examples,
        'pos_eval_precision': pos_prec / nb_eval_steps,
        'neg_eval_precision': neg_prec / nb_eval_steps,
        'pos_eval_recall': pos_recall / nb_eval_steps,
        'neg_eval_recall': neg_recall / nb_eval_steps,
        'pos_eval_f1': pos_f1 / nb_eval_steps,
        'neg_eval_f1': neg_f1 / nb_eval_steps,
    }
    return metrics, eval_step


def main():
    """Train a BERT sequence classifier from the command line, evaluating each epoch.

    Parses the command-line flags, builds the model and optimizer, splits the
    processor's examples into a training and an evaluation subset, and, when
    ``--do_train`` is given, runs the requested number of epochs. After every
    epoch the evaluation subset is scored, per-example predictions and
    aggregate metrics are written to ``--output_dir``, and the model weights
    are saved there.

    Raises:
        ValueError: On an invalid accumulation setting, when neither
            ``--do_train`` nor ``--do_eval`` is given, when the output
            directory exists and is not empty, on an unknown task name, or
            when the dataset is too small for the fixed training split.
    """
    args = _build_arg_parser().parse_args()

    processors = {
        "sst": SSTProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))
    if args.gradient_accumulation_steps < 1:
        # The training loop takes `step % gradient_accumulation_steps`, so 0
        # would otherwise surface as a ZeroDivisionError mid-epoch.
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # NOTE(review): the batch size is divided by --accumulate_gradients while
    # the update cadence below follows --gradient_accumulation_steps; the two
    # flags are independent here. Confirm whether they should be unified.
    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    summary_writer = SummaryWriter(args.output_dir)
    try:
        processor = processors[task_name]()
        label_list = processor.get_labels()

        tokenizer = BertTokenizer.from_pretrained("bert-large-cased")

        bert_config = BertConfig.from_json_file(args.bert_config_file)
        # NOTE(review): --init_checkpoint is parsed but never read in this
        # function, so no checkpoint is loaded into the model here. Confirm
        # whether that is intended.
        model = BertForSequenceClassification(bert_config, len(label_list), args.layers,
                                              pooling=args.pooling_type)
        model.to(device)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                              output_device=args.local_rank)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # NOTE(review): the decay value is stored under 'weight_decay_rate';
        # verify that the imported AdamW reads that key (some implementations
        # expect 'weight_decay' and would silently ignore this one).
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(optimizer_parameters, lr=args.learning_rate, correct_bias=False)

        global_step = 0
        global_train_step = 0  # counts evaluation batches across all epochs

        all_examples = processor.get_all_examples(args.data_dir)
        all_features = convert_examples_to_features(
            all_examples, label_list, args.max_seq_length, tokenizer)
        all_input_ids = all_features['input_ids']
        all_input_mask = all_features['attention_mask']
        all_segment_ids = all_features['token_type_ids']
        all_label_ids = all_features['labels']
        all_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        # The first 100000 randomly chosen examples train the model and the
        # remainder is held out. Deriving the held-out size from the dataset
        # keeps the original 100000/12428 split for a 112428-example dataset
        # while no longer requiring exactly that many examples.
        train_size = 100000
        eval_size = len(all_data) - train_size
        if eval_size < 1:
            raise ValueError(
                "Dataset has {} examples; more than {} are required to leave an evaluation split.".format(
                    len(all_data), train_size))
        train_data, eval_data = random_split(all_data, [train_size, eval_size])
        eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False)

        # NOTE(review): evaluation only runs at the end of each training
        # epoch, so passing --do_eval without --do_train does no evaluation.
        if args.do_train:
            logger.info("***** Running training *****")
            logger.info(" Batch size = %d", args.train_batch_size)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size)
            print("TOTAL STEPS: ", (len(train_dataloader)*int(args.num_train_epochs)))

            max_grad_norm = 1.0
            for epoch in trange(1, int(args.num_train_epochs) + 1, desc="Epoch"):
                model.train()
                tr_loss = 0
                nb_tr_steps = 0
                for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, token_type_ids, label_ids = batch
                    loss, _logits = model(input_ids, attention_mask=input_mask,
                                          token_type_ids=token_type_ids, labels=label_ids)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    loss.backward()
                    tr_loss += loss.item()
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        # Clip before the update: clipping after
                        # optimizer.step() cannot influence the step, and the
                        # gradients are zeroed right afterwards.
                        _clip_grad_norm(optimizer_parameters, max_grad_norm)
                        optimizer.step()  # We have accumulated enough gradients
                        # NOTE: no learning-rate scheduler is stepped here, so
                        # --warmup_proportion currently has no effect.
                        summary_writer.add_scalar('Loss/train', loss.item(), global_step)
                        model.zero_grad()
                        global_step += 1

                predictions_path = os.path.join(args.output_dir, "results_ep"+str(epoch)+".txt")
                result, global_train_step = _evaluate_epoch(
                    model, eval_dataloader, device, predictions_path, summary_writer, global_train_step)
                result['global_step'] = global_step
                result['loss'] = tr_loss/nb_tr_steps

                summary_writer.add_scalar("Epoch_loss/train", tr_loss, epoch)
                summary_writer.add_scalar("Epoch_loss/test", result['eval_loss'], epoch)
                summary_writer.add_scalar("Epoch_accuracy/test", result['eval_accuracy'], epoch)
                summary_writer.add_scalar("Epoch_positive_precision/test", result['pos_eval_precision'], epoch)
                summary_writer.add_scalar("Epoch_negative_precision/test", result['neg_eval_precision'], epoch)
                summary_writer.add_scalar("Epoch_positive_recall/test", result['pos_eval_recall'], epoch)
                summary_writer.add_scalar("Epoch_negative_recall/test", result['neg_eval_recall'], epoch)
                summary_writer.add_scalar("Epoch_positive_f1/test", result['pos_eval_f1'], epoch)
                summary_writer.add_scalar("Epoch_negative_f1/test", result['neg_eval_f1'], epoch)

                output_eval_file = os.path.join(args.output_dir, "eval_results_ep"+str(epoch)+".txt")
                print("output_eval_file=",output_eval_file)
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        logger.info(" %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                print("Saving model")
                # Only the DataParallel / DistributedDataParallel wrappers
                # expose `.module`; on a single device the model is saved
                # directly instead of raising AttributeError.
                model_to_save = getattr(model, "module", model)
                torch.save(model_to_save.state_dict(),
                           os.path.join(args.output_dir, "sst2-finetuned-bert-model_"+str(epoch)+".pth"))
    finally:
        # Flush and release the event file even when training fails part-way.
        summary_writer.close()