class Model(nn.Module):
    CLS_POSITION = 0

    def __init__(self, cfg=None, mdo_prob=0., mdo_num=1, num_classes=1, path=None):
        # NOTE: the backbone's own classification head is unnecessary here;
        # only the encoder hidden states are used.
        super().__init__()
        if path is not None:
            self.backbone = BertForSequenceClassification.from_pretrained(path)
            self.backbone.config.output_hidden_states = True
        else:
            assert cfg is not None, 'Config should be provided if no pretrained path was specified.'
            self.backbone = BertForSequenceClassification(cfg)
        self.head = nn.Linear(self.backbone.config.hidden_size, num_classes)
        weights_init = torch.zeros(self.backbone.config.num_hidden_layers).float()
        self.cls_weights = torch.nn.Parameter(weights_init, requires_grad=True)
        self.mdo = None
        if mdo_prob > 0.:
            self.mdo = MultiDropoutHead(mdo_prob, mdo_num)

    def forward(self, x, attention_mask):
        _, hidden_states = self.backbone(x, attention_mask)
        # Take the [CLS] token from every encoder layer (skip the embedding output).
        hidden_states = torch.stack([states[:, self.CLS_POSITION] for states in hidden_states[1:]])
        # Learnable softmax-weighted average over the per-layer [CLS] vectors.
        x = torch.einsum('ijk,i->jk', hidden_states, torch.softmax(self.cls_weights, dim=-1))
        if self.mdo is not None:
            return self.mdo(x, self.head)
        return self.head(x)

    def load_weights(self, path):
        found = []
        with open(path, 'rb') as f:
            weights = torch.load(f)
        for name, param in weights['model'].items():
            if name in self.backbone.state_dict() and 'cls' not in name:
                if param.shape == self.backbone.state_dict()[name].shape:
                    self.backbone.state_dict()[name].copy_(param)
                    logger.info(f'\t Preloading {name}')
                    found.append(name)
        logger.info("\n\t Didn't find layers:")
        for name in self.backbone.state_dict():
            if name not in weights['model']:
                logger.info(f'\t {name}')
        return found
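# MultiDropoutHead is referenced above but not defined in this snippet. A minimal
# sketch of multi-sample dropout, assuming the head is applied to each independent
# dropout sample and the resulting logits are averaged:
class MultiDropoutHead(nn.Module):
    def __init__(self, dropout_prob, num_samples):
        super().__init__()
        self.dropouts = nn.ModuleList([nn.Dropout(dropout_prob) for _ in range(num_samples)])

    def forward(self, x, head):
        # Average the head outputs over several independent dropout masks.
        return torch.stack([head(dropout(x)) for dropout in self.dropouts]).mean(dim=0)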
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForSequenceClassification(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, output_folder):
    # Instantiate model
    logger.info(f'Loading model based on config from {config_path}...')
    config = BertConfig.from_json_file(config_path)
    model = BertForSequenceClassification(config)

    # Load weights from checkpoint
    logger.info(f'Loading weights from checkpoint {tf_checkpoint_path}...')
    load_tf2_weights_in_bert(model, tf_checkpoint_path, config)

    # Create dirs
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)

    # Save pytorch-model
    f_out_model = os.path.join(output_folder, 'pytorch_model.bin')
    logger.info(f'Saving PyTorch model to {f_out_model}...')
    torch.save(model.state_dict(), f_out_model)

    # Save config to output
    f_out_config = os.path.join(output_folder, 'config.json')
    logger.info(f'Saving config to {f_out_config}...')
    config.to_json_file(f_out_config)
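# Example invocation of the two converters above. The checkpoint and config paths
# are placeholders for illustration, not files shipped with this code:
if __name__ == '__main__':
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path='bert_model.ckpt',
        bert_config_file='bert_config.json',
        pytorch_dump_path='pytorch_model.bin')
    convert_tf2_checkpoint_to_pytorch(
        tf_checkpoint_path='tf2_model.h5',
        config_path='bert_config.json',
        output_folder='converted_model/')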
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/', type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default='ubuntu', type=str, required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/lujunyu/model/chatbert/ubuntu_without_pretraining/', type=str, required=False,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_model_name", default='bert-base-uncased', type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--data_augmentation", default=False, action='store_true',
                        help="Whether to use augmentation")
    parser.add_argument("--max_seq_length", default=256, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=True, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", default=True, action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size", default=500, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=500, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=3e-3, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=0.0, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=1e-3, type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps", default=8000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=20,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")

    bert_config = BertConfig.from_pretrained(args.init_model_name, num_labels=2)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.init_model_name, do_lower_case=args.do_lower_case)
    if args.data_augmentation:
        train_dataset = UbuntuDatasetForSP(
            file_path=os.path.join(args.data_dir, "train_augment_3.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    else:
        train_dataset = UbuntuDatasetForSP(
            file_path=os.path.join(args.data_dir, "train.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    eval_dataset = UbuntuDatasetForSP(
        file_path=os.path.join(args.data_dir, "valid.txt"),
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer
    )

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size,
                                                   sampler=RandomSampler(train_dataset), num_workers=4)
    eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=args.eval_batch_size,
                                                  sampler=SequentialSampler(eval_dataset), num_workers=4)

    model = BertForSequenceClassification(config=bert_config)
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # remove pooler, which is not used and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    best_metric = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                if step % args.save_checkpoints_steps == 0:
                    model.eval()
                    f = open(os.path.join(args.output_dir, 'logits_dev.txt'), 'w')
                    eval_loss = 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    logits_all = []
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids,
                                                          attention_mask=input_mask, labels=label_ids)

                        logits = logits.detach().cpu().numpy()
                        logits_all.append(logits)
                        label_ids = label_ids.cpu().numpy()

                        for logit, label in zip(logits, label_ids):
                            logit = '{},{}'.format(logit[0], logit[1])
                            f.write('_\t{}\t{}\n'.format(logit, label))

                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    f.close()
                    logits_all = np.concatenate(logits_all, axis=0)
                    eval_loss = eval_loss / nb_eval_steps

                    result = evaluate(os.path.join(args.output_dir, 'logits_dev.txt'))
                    result.update({'eval_loss': eval_loss})

                    output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    ### Save the best checkpoint
                    if best_metric < result['R10@1'] + result['R10@2']:
                        try:
                            ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict, os.path.join(args.output_dir, "model.pt"))
                        best_metric = result['R10@1'] + result['R10@2']
                        logger.info('Saving the best model in {}'.format(os.path.join(args.output_dir, "model.pt")))

                        ### visualize bad cases of the best model
                        # logger.info('Saving Bad cases...')
                        # visualize_bad_cases(
                        #     logits=logits_all,
                        #     input_file_path=os.path.join(args.data_dir, 'valid.txt'),
                        #     output_file_path=os.path.join(args.output_dir, 'valid_bad_cases.txt')
                        # )
                    model.train()
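# evaluate() above is not defined in this snippet. A minimal sketch, assuming the
# standard Ubuntu-corpus ranking protocol: each consecutive group of 10 lines in
# logits_dev.txt holds the candidates for one context, candidates are scored by the
# positive-class logit, and R10@k is the fraction of groups whose true response
# ranks in the top k. The group size and metric names are assumptions.
def evaluate(logits_file, group_size=10):
    scores, labels = [], []
    with open(logits_file) as f:
        for line in f:
            _, logit, label = line.strip().split('\t')
            scores.append(float(logit.split(',')[1]))  # positive-class logit
            labels.append(int(label))
    result = {'R10@1': 0.0, 'R10@2': 0.0, 'R10@5': 0.0}
    n_groups = len(scores) // group_size
    for i in range(n_groups):
        group = list(zip(scores[i * group_size:(i + 1) * group_size],
                         labels[i * group_size:(i + 1) * group_size]))
        ranked = sorted(group, key=lambda x: x[0], reverse=True)
        for k in (1, 2, 5):
            if any(label == 1 for _, label in ranked[:k]):
                result['R10@{}'.format(k)] += 1.0 / n_groups
    return result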
def train_process(config, train_load, valid_load, test_load, k, train_sampler):
    # load source bert weights
    # model_config = BertConfig.from_pretrained(pretrained_model_name_or_path="../user_data/bert_source/{}/config.json".format(config.model_name))
    model_config = BertConfig()
    model_config.vocab_size = len(pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)

    if os.path.isfile('save_model/{}_best_model_v1.pth.tar'.format(config.model_name)):
        checkpoint = torch.load('save_model/{}_best_model_v1.pth.tar'.format(config.model_name),
                                map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        best_dev_auc = 0
        print('***********load best model weight*************')
    else:
        checkpoint = torch.load('../user_data/save_bert/{}_checkpoint.pth.tar'.format(config.model_name),
                                map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        best_dev_auc = 0
        print('***********load pretrained mlm model weight*************')

    for param in model.parameters():
        param.requires_grad = True

    # 4) Move the model to its GPU before wrapping it for distributed training.
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # t_total = len(train_load) * config.num_train_epochs
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    # )
    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) Wrap the model for distributed data parallelism.
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.local_rank])

    model.train()
    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        is_best = False
        torch.cuda.empty_cache()
        for batch, (input_ids, token_type_ids, attention_mask, label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank, non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank, non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids, labels=label)
            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            if config.fgm:
                fgm.attack()  # add adversarial perturbation to the embeddings
                loss_adv = model(input_ids=input_ids, attention_mask=attention_mask,
                                 token_type_ids=token_type_ids, labels=label).loss
                loss_adv.backward()  # backprop, accumulating adversarial gradients on top of the normal ones
                fgm.restore()  # restore the original embedding parameters

            optimizer.step()
            # scheduler.step()

        dev_auc = model_evaluate(config, model, valid_load)

        # Synchronize the processes and average the dev AUC across them.
        torch.distributed.barrier()
        reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()

        if reduce_dev_auc > best_dev_auc:
            best_dev_auc = reduce_dev_auc
            is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'number {} fold,time:{},epoch:{}/{},reduce_dev_auc:{},best_dev_auc:{}'
        if config.local_rank in [0, -1]:
            print(msg.format(k, now, epoch + 1, config.num_train_epochs, reduce_dev_auc, best_dev_auc))
            checkpoint = {
                "status": model.state_dict(),
                "epoch": epoch + 1,
                'reduce_dev_auc': reduce_dev_auc
            }
            if is_best:
                torch.save(checkpoint,
                           '../user_data/save_model' + os.sep + '{}_best_model.pth.tar'.format(config.model_name))
            torch.save(checkpoint,
                       '../user_data/save_model' + os.sep + '{}_checkpoint.pth.tar'.format(config.model_name))
            del checkpoint
        torch.distributed.barrier()
    return data


if __name__ == '__main__':
    args = create_args()

    # load tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case,
                                           piece=args.piece,
                                           piece_model=args.piece_model)

    # load bert model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForSequenceClassification(config)
    model_state_dict = model.state_dict()
    print('Model parameter: {}'.format(sum(p.numel() for k, p in model_state_dict.items())))

    # keep only pretrained weights whose names exist in the current model
    pre_state_dict = torch.load(args.pretrained_file)
    pre_state_dict = {k: v for k, v in pre_state_dict.items() if k in model_state_dict}
    model_state_dict.update(pre_state_dict)
    model.load_state_dict(model_state_dict)
    if args.cuda:
        model.cuda()

    # load data
    data = BERTCLDCDataReader(args, tokenizer)
        label_list = list(map(json.loads, label))
        text_tensor = torch.tensor(text_list).to(device)
        label_tensor = torch.tensor(label_list).to(device)

        outputs = model(text_tensor, labels=label_tensor)
        loss, logits = outputs[:2]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # step the LR scheduler after the optimizer update

        acc = batch_accuracy(logits, label_tensor)
        print('epoch:{} | acc:{} | loss:{}'.format(epoch, acc, loss))

torch.save(model.state_dict(), 'bert_cla.ckpt')
print('Saved the trained model...')

# Test
print('Loading the trained model...')
model.load_state_dict(torch.load('bert_cla.ckpt'))
print('Starting test...')
model.eval()
test_result = []
for item in test_dataset:
    text_list = list(json.loads(item[1]))
    text_tensor = torch.tensor(text_list).unsqueeze(0).to(device)
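# batch_accuracy used in the training loop above is not defined in this snippet.
# A minimal sketch, assuming logits of shape (batch, num_classes) and integer
# class labels:
def batch_accuracy(logits, labels):
    preds = torch.argmax(logits, dim=-1)
    return (preds == labels).float().mean().item()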