def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                         sequence_labels, token_labels, choice_labels):
    model = BertForMaskedLM(config=config)
    model.eval()
    loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
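# check_loss_output is referenced above but not shown in this snippet. A minimal sketch,
# assuming it only verifies that the returned masked-LM loss is a scalar tensor:
def check_loss_output(self, result):
    self.parent.assertListEqual(list(result["loss"].size()), [])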
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    # note: argparse type=bool treats any non-empty string as True
    parser.add_argument("--wp", type=bool, default=False, help="whether to train on wp")
    parser.add_argument('--from_scratch', action='store_true',
                        help='do not load a pretrained model, only randomly initialize')
    parser.add_argument("--output_step", type=int, default=100000,
                        help="Number of steps between model checkpoints")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    num_data_epochs = args.epochs
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    args.output_mode = "classification"

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    while True:
        try:
            tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
            if getattr(tokenizer, '_noi_token', None) is None:
                tokenizer._noi_token = '[NOI]'
                # Reuse an unused vocabulary slot for the new [NOI] token
                if args.bert_model in ('bert-base-uncased', 'bert-large-uncased'):
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused0]')
                else:
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused1]')
                # else:
                #     raise ValueError("No clear choice for insert NOI for tokenizer type {}".format(args.model_name_or_path))
                tokenizer.ids_to_tokens[1] = '[NOI]'
                logger.info("Adding [NOI] to the vocabulary at index 1")
        except Exception:
            # retry until the tokenizer loads successfully
            continue
        break

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    if args.from_scratch:
        # a config is still required for a randomly initialized model
        model = BertForMaskedLM(config=BertConfig.from_pretrained(args.bert_model))
    else:
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=args.pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            reduce_memory=args.reduce_memory,
                                            args=args)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            outputs = model(input_ids, segment_ids, input_mask, lm_label_ids)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                optimizer.zero_grad()
                global_step += 1

                if global_step % args.output_step == 0 and args.local_rank in [-1, 0]:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

        if args.local_rank in [-1, 0]:
            # Save model checkpoint at the end of each epoch
            output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            # Take care of distributed/parallel training
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
            logger.info("PROGRESS: {}%".format(round(100 * (epoch + 1) / args.epochs, 4)))
            logger.info("EVALERR: {}%".format(tr_loss))

    # Save a trained model
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
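# set_seed(args) is called in main() above but not defined in this snippet. A minimal
# sketch, assuming the usual seeding of Python's random module, NumPy and PyTorch
# (random and np are expected to be imported at module level alongside torch):
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)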
def pretrain(args, data_path):
    print('[pretrain] create config, model')
    if args.model == 'bert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/bert-base-uncased-vocab.txt', do_lower_case=True)
    elif args.model == 'biobert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, do_lower_case=False)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/biobert_pretrain_output_all_notes_150000/vocab.txt',
                do_lower_case=False)
    elif args.model == 'bert-tiny':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/bert-tiny-uncased-vocab.txt', do_lower_case=True)

    if args.model == 'bert':
        config = BertConfig.from_pretrained('./pretrained_weights/bert-base-uncased-config.json')
        if args.Y == 'full':
            config.Y = 8921
        else:
            config.Y = int(args.Y)
        config.gpu = args.gpu
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.last_module = args.last_module
        config.model = args.model
        if args.from_scratch:
            model = BertForMaskedLM(config=config)
        else:
            model = BertForMaskedLM.from_pretrained(
                './pretrained_weights/bert-base-uncased-pytorch_model.bin', config=config)
    elif args.model == 'biobert':
        config = BertConfig.from_pretrained(
            './pretrained_weights/biobert_pretrain_output_all_notes_150000/bert_config.json')
        if args.Y == 'full':
            config.Y = 8921
        else:
            config.Y = int(args.Y)
        config.gpu = args.gpu
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.last_module = args.last_module
        config.model = args.model
        if args.from_scratch:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, do_lower_case=False)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/biobert_pretrain_output_all_notes_150000/vocab.txt',
                do_lower_case=False)
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.model = args.model
        if args.from_scratch:
            model = BertForMaskedLM(config=config)
        else:
            model = BertForMaskedLM.from_pretrained(
                './pretrained_weights/biobert_pretrain_output_all_notes_150000/pytorch_model.bin',
                config=config)
    elif args.model == 'bert-tiny':
        config = BertConfig.from_pretrained('./pretrained_weights/bert-tiny-uncased-config.json')
        if args.Y == 'full':
            config.Y = 8921
        else:
            config.Y = int(args.Y)
        config.gpu = args.gpu
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.last_module = args.last_module
        config.model = args.model
        if args.from_scratch:
            model = BertForMaskedLM(config=config)
        else:
            model = BertForMaskedLM.from_pretrained(
                './pretrained_weights/bert-tiny-uncased-pytorch_model.bin', config=config)

    if args.gpu:
        model.cuda()

    print('[pretrain] prepare optimizer, scheduler')
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    pretrain_optimizer = optim.Adam(optimizer_grouped_parameters,
                                    weight_decay=args.weight_decay,
                                    lr=args.lr)
    length = datasets.data_length(args.data_path, args.version)
    t_total = length // args.pretrain_batch_size * args.pretrain_epochs
    pretrain_scheduler = get_linear_schedule_with_warmup(pretrain_optimizer,
                                                         num_warmup_steps=args.warmup_steps,
                                                         num_training_steps=t_total)
    print_every = 25
    model.train()
    model.zero_grad()

    print('[pretrain] create dataloader')
    train_dataset = datasets.pretrain_data_generator(args,
                                                     data_path,
                                                     args.pretrain_batch_size,
                                                     version=args.version,
                                                     bert_tokenizer=bert_tokenizer)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.pretrain_batch_size)

    print('[pretrain] start epoch')
    for epoch in range(args.pretrain_epochs):
        losses = []
        for batch_idx, data in tqdm(enumerate(train_dataloader)):
            inputs, labels = random_mask_tokens(args, data, bert_tokenizer)
            if args.gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()
            token_type_ids = (inputs > 0).long() * 0  # single-segment input: all token type ids are 0
            attention_mask = (inputs > 0).long()
            position_ids = torch.arange(inputs.size(1)).expand(inputs.size(0), inputs.size(1))
            if args.gpu:
                position_ids = position_ids.cuda()
            position_ids = position_ids * (inputs > 0).long()  # zero out positions of padding tokens
            outputs = model(input_ids=inputs,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask,
                            position_ids=position_ids,
                            masked_lm_labels=labels)
            loss = outputs[0]
            losses.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            pretrain_optimizer.step()
            pretrain_scheduler.step()
            model.zero_grad()
            if batch_idx % print_every == 0:
                # print the average loss of the last 10 batches
                print("Train epoch: {} [batch #{}, batch_size {}, seq length {}]\tLoss: {:.6f}".format(
                    epoch, batch_idx, data.size()[0], data.size()[1], np.mean(losses[-10:])))
        loss = sum(losses) / len(losses)
        print('Epoch %d: %.4f' % (epoch, loss))
        model.save_pretrained(args.pretrain_ckpt_dir)
        print('Save pretrained model --> %s' % (args.pretrain_ckpt_dir))
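# random_mask_tokens(args, data, bert_tokenizer) is used in pretrain() above but not shown.
# A minimal sketch, assuming it follows the standard BERT masking recipe (mask 15% of
# non-special tokens; of those, 80% become [MASK], 10% a random token, 10% stay unchanged);
# args is kept in the signature to match the call site even though this sketch ignores it:
def random_mask_tokens(args, inputs, tokenizer, mlm_probability=0.15):
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # ignored by the loss; older pytorch_transformers versions use -1

    # 80% of the time, replace the masked input token with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, replace with a random token; the remaining 10% keep the original token
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels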
logging.basicConfig(filename=log_path,
                    format="%(asctime)s %(message)s",
                    level=logging.DEBUG)
logging.info(f"script_path: {script_path}")
logging.info(f"soft labels will be saved to {save_path}")

args = parser.parse_args()
config = configparser.ConfigParser()
config.read(args.conf)

vocab_size = int(config["vocab"]["vocab_size"])
hidden_size = int(config["model"]["hidden_size"])
num_hidden_layers = int(config["model"]["num_hidden_layers"])
num_attention_heads = int(config["model"]["num_attention_heads"])
intermediate_size = int(config["model"]["intermediate_size"])
max_position_embeddings = int(config["model"]["max_position_embeddings"])

bertconfig = modeling_bert.BertConfig(vocab_size_or_config_json_file=vocab_size,
                                      hidden_size=hidden_size,
                                      num_hidden_layers=num_hidden_layers,
                                      num_attention_heads=num_attention_heads,
                                      intermediate_size=intermediate_size,
                                      max_position_embeddings=max_position_embeddings)
model = BertForMaskedLM(config=bertconfig)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = torch.load(args.model, map_location=device)
model.load_state_dict(state_dict)
logging.info(f"load model from {args.model}")
model.to(device)

get_label(model, device, script_path, save_path, args.temp)
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument("-conf", type=str)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--gpu", type=str, default=None,
                        help="bitmask of which GPUs to use (for example '10100000' means use device_id 0 and 2)")
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.conf)

    hidden_size = int(config["model"]["hidden_size"])
    num_hidden_layers = int(config["model"]["num_hidden_layers"])
    num_attention_heads = int(config["model"]["num_attention_heads"])
    intermediate_size = int(config["model"]["intermediate_size"])
    max_position_embeddings = int(config["model"]["max_position_embeddings"])

    vocab_size = int(config["vocab"]["vocab_size"])
    mask_id = int(config["vocab"]["mask_id"])

    log_path = config["log"]["log_path"]
    log_dir = os.path.dirname(log_path)
    os.makedirs(log_dir, exist_ok=True)
    log_step = int(config["log"]["log_step"])

    train_size = int(config["data"]["train_size"])

    save_prefix = config["save"]["save_prefix"]
    save_dir = os.path.dirname(save_prefix)
    os.makedirs(save_dir, exist_ok=True)
    save_epoch = int(config["save"]["save_epoch"])

    batch_size = int(config["train"]["batch_size"])
    if args.debug:
        batch_size = 10
    num_epochs = int(config["train"]["num_epochs"])
    learning_rate = float(config["train"]["learning_rate"])
    warmup_proportion = float(config["train"]["warmup_proportion"])
    weight_decay = float(config["train"]["weight_decay"])

    num_to_mask = int(config["mask"]["num_to_mask"])
    max_seq_len = int(config["mask"]["max_seq_len"])

    if args.debug:
        logging.basicConfig(format="%(asctime)s %(message)s", level=logging.DEBUG)
    else:
        logging.basicConfig(filename=log_path,
                            format="%(asctime)s %(message)s",
                            level=logging.DEBUG)

    bertconfig = modeling_bert.BertConfig(
        vocab_size_or_config_json_file=vocab_size,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        max_position_embeddings=max_position_embeddings)
    model = BertForMaskedLM(config=bertconfig)
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    if args.gpu is not None:
        device_ids = []
        for device_id, flag in enumerate(args.gpu):
            if flag == "1":
                device_ids.append(device_id)
        multi_gpu = True
        device = torch.device("cuda:{}".format(device_ids[0]))
    else:
        multi_gpu = False
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"device: {device}")

    if "model_path" in config["train"]:
        model_path = config["train"]["model_path"]
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        logging.info(f"load model from {model_path}")
    model.to(device)
    if multi_gpu:
        logging.info(f"GPU: device_id={device_ids}")
        model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.train()

    # optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    t_total = (train_size // batch_size) * num_epochs
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         weight_decay=weight_decay,
                         t_total=t_total)

    logging.info("start training...")
    for epoch in range(num_epochs):
        if "train_dir" in config["data"]:
            train_dir = config["data"]["train_dir"]
            datpaths = os.listdir(train_dir)
            random.shuffle(datpaths)
            for step_ds, path in enumerate(datpaths):
                path = os.path.join(train_dir, path)
                dataset = LMDataset(path)
                num_steps = (len(dataset) // batch_size) + 1
                logging.info(f"dataset from: {path}")
                loss_ds = train_dataset(dataset=dataset,
                                        model=model,
                                        optimizer=optimizer,
                                        multi_gpu=multi_gpu,
                                        device=device,
                                        epoch=epoch,
                                        batch_size=batch_size,
                                        num_steps=num_steps,
                                        log_step=log_step,
                                        num_to_mask=num_to_mask,
                                        mask_id=mask_id,
                                        max_seq_len=max_seq_len)
                logging.info(f"step {step_ds + 1} / {len(datpaths)}: {(loss_ds / num_steps):.6f}")
        else:
            train_path = config["data"]["train_path"]
            dataset = LMDataset(train_path)
            num_steps = (len(dataset) // batch_size) + 1
            loss_epoch = train_dataset(dataset=dataset,
                                       model=model,
                                       optimizer=optimizer,
                                       multi_gpu=multi_gpu,
                                       device=device,
                                       epoch=epoch,
                                       batch_size=batch_size,
                                       num_steps=num_steps,
                                       log_step=log_step,
                                       num_to_mask=num_to_mask,
                                       mask_id=mask_id,
                                       max_seq_len=max_seq_len)
            logging.info(f"epoch {epoch + 1} / {num_epochs} : {(loss_epoch / num_steps):.6f}")

        if (epoch + 1) % save_epoch == 0:
            save_path = f"{save_prefix}.network.epoch{(epoch + 1):d}"
            optimizer_save_path = f"{save_prefix}.optimizer.epoch{(epoch + 1):d}"
            if multi_gpu:
                torch.save(model.module.state_dict(), save_path)
            else:
                torch.save(model.state_dict(), save_path)
            logging.info(f"model saved: {save_path}")
            torch.save(optimizer.state_dict(), optimizer_save_path)
            logging.info(f"optimizer saved: {optimizer_save_path}")
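# train_dataset(...) is called in train() above but not defined in this snippet. A minimal
# sketch of one pass over a single LMDataset, assuming each batch is a LongTensor of token
# ids that is truncated to max_seq_len, has up to num_to_mask random positions replaced by
# mask_id, and is scored with the masked-LM loss. The parameter names mirror the call site;
# the real implementation may differ, and DataLoader is assumed to come from torch.utils.data:
def train_dataset(dataset, model, optimizer, multi_gpu, device, epoch, batch_size,
                  num_steps, log_step, num_to_mask, mask_id, max_seq_len):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    ignore_index = -100  # use -1 instead with the older pytorch_pretrained_bert masked-LM loss
    total_loss = 0.0
    for step, batch in enumerate(loader):
        input_ids = batch[:, :max_seq_len].to(device)
        labels = input_ids.clone()
        # pick up to num_to_mask positions per sequence and replace them with the mask id
        mask_positions = torch.randint(0, input_ids.size(1), (input_ids.size(0), num_to_mask))
        masked = torch.zeros_like(input_ids, dtype=torch.bool).scatter_(1, mask_positions.to(device), True)
        input_ids = input_ids.masked_fill(masked, mask_id)
        labels = labels.masked_fill(~masked, ignore_index)  # compute the loss only on masked positions
        outputs = model(input_ids, masked_lm_labels=labels)
        loss = outputs[0] if isinstance(outputs, tuple) else outputs
        if multi_gpu:
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        if step % log_step == 0:
            logging.info(f"epoch {epoch} step {step}/{num_steps} loss {loss.item():.6f}")
    return total_loss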