def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # Trim a number of training examples if args.debug: for split in raw_datasets.keys(): raw_datasets[split] = raw_datasets[split].select(range(100)) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. if raw_datasets["train"] is not None: column_names = raw_datasets["train"].column_names else: column_names = raw_datasets["validation"].column_names # When using your own dataset or a different dataset from swag, you will probably need to change this. ending_names = [f"ending{i}" for i in range(4)] context_name = "sent1" question_header_name = "sent2" label_column_name = "label" if "label" in column_names else "labels" # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForMultipleChoice.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForMultipleChoice.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): first_sentences = [[context] * 4 for context in examples[context_name]] question_headers = examples[question_header_name] second_sentences = [[ f"{header} {examples[end][i]}" for end in ending_names ] for i, header in enumerate(question_headers)] labels = examples[label_column_name] # Flatten out first_sentences = list(chain(*first_sentences)) second_sentences = list(chain(*second_sentences)) # Tokenize tokenized_examples = tokenizer( first_sentences, second_sentences, max_length=args.max_length, padding=padding, truncation=True, ) # Un-flatten tokenized_inputs = { k: [v[i:i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items() } tokenized_inputs["labels"] = labels return tokenized_inputs with accelerator.main_process_first(): processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done to max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorForMultipleChoice` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorForMultipleChoice( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Use the device given by the `accelerator` object. device = accelerator.device model.to(device) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Metrics metric = load_metric("accuracy") # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() accelerator.print(f"epoch {epoch}: {eval_metric}") if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
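# The `DataCollatorForMultipleChoice` used in the script above is not defined in this excerpt.
# Below is a minimal sketch of such a collator, modeled on the pattern used in the Transformers
# multiple-choice examples (the exact class in the original script may differ):
from dataclasses import dataclass
from typing import Optional, Union

import torch
from transformers import PreTrainedTokenizerBase
from transformers.tokenization_utils_base import PaddingStrategy


@dataclass
class DataCollatorForMultipleChoice:
    """Flatten the per-example choices, pad them dynamically, then regroup into [batch, choices, seq]."""

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0] else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # Flatten [batch, choices, ...] into a single list so tokenizer.pad can handle it.
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
            for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Un-flatten back to [batch, choices, seq] and re-attach the labels.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch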
# Prepare random data for testing with shape (100, 128): 100 examples, each 128 tokens long all_input_ids = torch.randint(low=0, high=100, size=(100, 128)) # To keep the test simple, set every attention position to 1 all_attention_mask = torch.ones_like(all_input_ids) # Generate labels for the 100 samples all_labels = torch.randint(low=0, high=2, size=(100, )) # Build the DictDataset dataset = DictDataset(all_input_ids, all_attention_mask, all_labels) eval_dataset = DictDataset(all_input_ids, all_attention_mask, all_labels) # Build the dataloader dataloader = DataLoader(dataset, batch_size=32) num_epochs = 10 # The number of training steps is num_epochs * the length of the dataloader num_training_steps = len(dataloader) * num_epochs # Optimizer and learning rate scheduler optimizer = AdamW(student_model.parameters(), lr=1e-4) # Learning rate scheduler scheduler_class = get_linear_schedule_with_warmup # arguments dict except 'optimizer'; the first 10% of steps are used as num_warmup_steps scheduler_args = { 'num_warmup_steps': int(0.1 * num_training_steps), 'num_training_steps': num_training_steps } # Display model parameter statistics; `result` holds the parameter summary print("\nteacher_model's parameters:") result, _ = textbrewer.utils.display_parameters(teacher_model, max_level=3) print(result) print("student_model's parameters:") result, _ = textbrewer.utils.display_parameters(student_model, max_level=3)
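# `DictDataset` is used above but not defined anywhere in this excerpt. Below is a minimal sketch
# of one possible implementation, assuming each sample should be returned as a dict with
# `input_ids`, `attention_mask`, and `labels` keys:
import torch
from torch.utils.data import Dataset


class DictDataset(Dataset):
    """Wraps parallel tensors and returns each sample as a dict of tensors."""

    def __init__(self, all_input_ids, all_attention_mask, all_labels):
        self.all_input_ids = all_input_ids
        self.all_attention_mask = all_attention_mask
        self.all_labels = all_labels

    def __len__(self):
        return self.all_input_ids.size(0)

    def __getitem__(self, index):
        return {
            "input_ids": self.all_input_ids[index],
            "attention_mask": self.all_attention_mask[index],
            "labels": self.all_labels[index],
        }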
def train(self): train_sampler = RandomSampler(self.train_dataset) train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size) if self.args.max_steps > 0: t_total = self.args.max_steps self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': self.args.weight_decay}, {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(self.train_dataset)) logger.info(" Num Epochs = %d", self.args.num_train_epochs) logger.info(" Total train batch size = %d", self.args.train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) logger.info(" Logging steps = %d", self.args.logging_steps) logger.info(" Save steps = %d", self.args.save_steps) witer = SummaryWriter(logdir=self.args.model_dir, comment="BERT_intent") global_step = 0 tr_loss = 0.0 best_f1 = 0.0 eval_results = {} self.model.zero_grad() train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch") set_seed(self.args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): self.model.train() batch = tuple(t.to(self.device) for t in batch) # GPU or CPU inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'label': batch[3]} outputs = self.model(**inputs) # eval_results = self.evaluate1('dev') loss = outputs[0] # eval_results = self.evaluate1('dev') if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % self.args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule self.model.zero_grad() global_step += 1 if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0: _ = self.evaluate1('train') eval_results = self.evaluate1('dev') if self.args.save_steps > 0 and global_step % self.args.save_steps == 0: if eval_results.get("mean", {}).get("mean_precision", 0.0) > best_f1: best_f1 = eval_results.get("mean", {}).get("mean_precision", 0.0) self.save_model() if 0 < self.args.max_steps < global_step: epoch_iterator.close() break witer.add_scalar("Train/loss", tr_loss/global_step, global_step) lr = scheduler.get_lr()[-1] witer.add_scalar("Train/lr", lr, global_step) _ = self.evaluate1('train') eval_results = self.evaluate1('dev') if eval_results.get("mean", {}).get("mean_precision", 0.0) > best_f1: best_f1 = eval_results.get("mean", {}).get("mean_precision", 0.0) self.save_model() if 0 < self.args.max_steps < global_step: train_iterator.close() break return global_step, 
tr_loss / global_step
def train_eval(clf_model, train_dataloader, validation_dataloader, base_dir, weights=None, lr=2e-5, epochs=4, eval_every_num_iters=40, seed_val=42): """train and evaluate a deep learning model :params[in]: clf_model, a classifier :params[in]: train_dataloader, training data :params[in]: validation_dataloader, validation data :params[in]: base_dir, output directory to create the directory to save results :params[in]: lr, the learning rate :params[in]: epochs, the number of training epochs :params[in]: eval_every_num_iters, the number of iterations to evaluate :params[in]: seed_val, set a random seed """ # the 'W' stands for 'Warm up", AdamW is a class from the huggingface library optimizer = AdamW( clf_model.parameters(), lr=lr, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) # Number of training epochs (authors recommend between 2 and 4) epochs = epochs # Total number of training steps is number of batches * number of epochs. total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=1, # Default value in run_glue.py num_training_steps=total_steps) # see if weights is None: if weights != None: weights = torch.FloatTensor(weights) # Set the seed value all over the place to make this reproducible. random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) # Store the average loss after each epoch so we can plot them. loss_values = [] # For each epoch... for epoch_i in range(0, epochs): # ======================================== # Training # ======================================== print("") print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) print('Training...') # Measure how long the training epoch takes. t0 = time.time() # Reset the total loss for this epoch. total_loss = 0 # Put the model into training mode. Don't be mislead--the call to # `train` just changes the *mode*, it doesn't *perform* the training. # `dropout` and `batchnorm` layers behave differently during training # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch) clf_model.train() ## model training mode # For each batch of training data... for step, batch in enumerate(train_dataloader): # Unpack this training batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using the # `to` method. # # `batch` contains three pytorch tensors: # [0]: input ids # [1]: attention masks # [2]: labels b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) # Always clear any previously calculated gradients before performing a # backward pass. PyTorch doesn't do this automatically because # accumulating the gradients is "convenient while training RNNs". # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) clf_model.zero_grad() # Perform a forward pass (evaluate the model on this training batch). # This will return the loss (rather than the model output) because we # have provided the `labels`. 
# The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification outputs = clf_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, weights=weights) #weights=torch.FloatTensor([100/127,100/191,100/34])) # The call to `model` always returns a tuple, so we need to pull the # loss value out of the tuple. loss = outputs[0] # Accumulate the training loss over all of the batches so that we can # calculate the average loss at the end. `loss` is a Tensor containing a # single value; the `.item()` function just returns the Python value # from the tensor. total_loss += loss.item() # Perform a backward pass to calculate the gradients. loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. torch.nn.utils.clip_grad_norm_(clf_model.parameters(), 1.0) # Update parameters and take a step using the computed gradient. # The optimizer dictates the "update rule"--how the parameters are # modified based on their gradients, the learning rate, etc. optimizer.step() # Update the learning rate. scheduler.step() # eveluate the performance after some iterations if step % eval_every_num_iters == 0 and not step == 0: # Calculate elapsed time in minutes. elapsed = format_time(time.time() - t0) # Report progress. print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format( step, len(train_dataloader), elapsed)) tmp_dir = base_dir + '/epoch' + str( epoch_i + 1) + 'iteration' + str(step) ## save pretrained model evaluate_model(clf_model, validation_dataloader, tmp_dir) clf_model.train() ## model training mode # Calculate the average loss over the training data. avg_train_loss = total_loss / len(train_dataloader) # Store the loss value for plotting the learning curve. loss_values.append(avg_train_loss) # save the data after epochs tmp_dir = base_dir + '/epoch' + str(epoch_i + 1) + '_done' ## save pretrained model evaluate_model(clf_model, validation_dataloader, tmp_dir) clf_model.train() ## model training mode
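# `format_time` is called in the progress report above but not shown in this excerpt. A minimal
# sketch of such a helper (the `evaluate_model` helper is omitted because its behavior is not given):
import datetime


def format_time(elapsed):
    """Format a duration in seconds as hh:mm:ss, rounded to the nearest whole second."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))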
def train(train_dataset, model, tokenizer, args): """ Train the model """ tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Train! print("***** Running training *****") print(f"Num examples = {len(train_dataset)}") print(f"Num Epochs = {int(args.num_train_epochs)}") print(f"Instantaneous batch size per GPU = {args.per_gpu_train_batch_size}") print(f"Gradient Accumulation steps = {args.gradient_accumulation_steps}") print(f"Total optimization steps = {int(t_total)}") global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() # Added here for reproducibility set_seed(args) for epoch in range(int(args.num_train_epochs)): print(f"epoch {epoch+1} of {int(args.num_train_epochs)}") epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]: del inputs["token_type_ids"] outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() epoch_iterator.set_description('(loss=%g)' % loss) if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.logging_steps > 0 and global_step % args.logging_steps == 0: tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: break tb_writer.close() return global_step, tr_loss / global_step
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device) # Alternatively, load a specific model from a cache directory: model = BertForSequenceClassification.from_pretrained(BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels) # Parameters: lr = 2e-5 adam_epsilon = 1e-8 # Number of training epochs (authors recommend between 2 and 4) epochs = 3 num_warmup_steps = 0 num_training_steps = len(train_dataloader)*epochs ### In Transformers, the optimizer and schedules are split and instantiated like this: optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler ## Store our loss and accuracy for plotting train_loss_set = [] learning_rate = [] # Gradients get accumulated by default model.zero_grad() # tnrange is a tqdm wrapper around the normal python range for _ in tnrange(1, epochs+1, desc='Epoch'): print("<" + "="*22 + F" Epoch {_} " + "="*22 + ">") # Calculate total loss for this epoch batch_loss = 0
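    # The snippet above is cut off right after `batch_loss = 0`. Below is a minimal sketch of how
    # such a fine-tuning step typically proceeds from there, assuming `train_dataloader` yields
    # (input_ids, attention_mask, labels) tuples and `device` is defined. This is illustrative only,
    # not the author's original loop body:
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the device and clear stale gradients.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        batch_loss += loss.item()
        loss.backward()
        # Clip gradients, then update the parameters and the learning rate schedule.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    # Track the average loss of this epoch for plotting.
    train_loss_set.append(batch_loss / len(train_dataloader))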
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility (even between python 2 and 3) best_result = 0 for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) start_ems = 0 end_ems = 0 em = 0 for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'start_positions': batch[3], 'end_positions': batch[4]} if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] if args.model_type in ['xlnet', 'xlm', 'xlnet_sent', 'xlnet_all_sent']: inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) if args.all_bound or args.use_word_stat: inputs.update({'start_bias': batch[-2], 'end_bias': batch[-1], 'segment_ids': batch[2]}) outputs, ems = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) start_ems += torch.sum(ems[0]).data.cpu().numpy() end_ems += torch.sum(ems[1]).data.cpu().numpy() em += torch.sum(ems[0] * ems[1]).data.cpu().numpy() if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break start_ems = start_ems / len(train_dataset) * 100 end_ems = end_ems / len(train_dataset) * 100 em = em / len(train_dataset) * 100 
logger.info('\n') logger.info('Start EM %.2f / End EM %.2f / EM %.2f' % (start_ems, end_ems, em)) results = evaluate(args, model, tokenizer) if best_result < results['f1']: best_result = results['f1'] logger.info("Saving model checkpoint to %s", args.output_dir) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
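# `set_seed(args)` is used by several of the training functions in this collection but is not
# defined in the excerpt. A minimal sketch in the style of the Transformers examples, assuming
# `args.seed` and `args.n_gpu` exist:
import random

import numpy as np
import torch


def set_seed(args):
    """Seed python, numpy and torch (including all GPUs) for reproducibility."""
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)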
def train(train_dataset, model, tokenizer): # Training """ Train the model """ train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=12) epochs = 2 t_total = len(train_dataloader) * epochs # Prepare optimizer and schedule (linear warmup and decay) optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=t_total) # Train! print("***** Running training *****") print(" Num examples = ", len(train_dataset)) print(" Total optimization steps = ", t_total) global_step = 1 tr_loss = 0.0 model.zero_grad() for epoch in range(epochs): print('Epoch:{}'.format(epoch+1)) epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration", disable=False) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] loss.backward() tr_loss += loss.item() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if global_step % 50 == 0: # Only evaluate when single GPU otherwise metrics may not average well print('Global step = {}, logging_loss = {}'.format(global_step,tr_loss)) return global_step, tr_loss / global_step def evaluate(model, tokenizer): # Evaluate dataset, examples, features = load_and_cache_examples(tokenizer, is_training=False) eval_sampler = SequentialSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=8) # Eval! print("***** Running evaluation *****") print(" Num examples = ", len(dataset)) all_results = [] start_time = timeit.default_timer() for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(device) for t in batch) with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], } example_indices = batch[3] outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) output = [output[i].detach().cpu().tolist() for output in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time print(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) predictions = compute_predictions_logits( examples, features, all_results, n_best_size = 20, max_answer_length = 30, do_lower_case=False, output_prediction_file="predictions.json", output_nbest_file="nbest_predictions.json", output_null_log_odds_file=None, verbose_logging=False, version_2_with_negative=False, null_score_diff_threshold=0.0, ) # Compute the F1 and exact scores. 
results = squad_evaluate(examples, predictions) return results def main(): # Set seed seed = 42 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # Setup CUDA, GPU & distributed training device = torch.device("cuda") config = BertConfig.from_pretrained('bert-base-cased') tokenizer = BertTokenizer.from_pretrained('bert-base-cased',do_lower_case=True,) #the nn.module BertForQuestionAnswering has a single untrained layer qa_output: Linear(hidden_size,2) on top of the trained BERT-base. model = BertForQuestionAnswering.from_pretrained('bert-base-cased',config=config,) model.to(device) max_seq_length=384 train_dataset = load_and_cache_examples(tokenizer, is_training=True)[0] # Training global_step, ave_loss = train(train_dataset, model, tokenizer) print(" global_step = %s, average loss = %s" % (global_step, ave_loss)) # Save the trained model and the tokenizer output_dir = 'output/' # Create output directory if needed if not os.path.exists(output_dir): os.makedirs(output_dir) print("Saving model checkpoint to %s" % output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(output_dir) tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True) model.to(device) # Evaluate results = evaluate(model, tokenizer) print("Results: {}".format(results)) return results def predict(tokenizer, model, q, doc, device, max_seq_length=384): indexed_tokens = tokenizer.encode(q,doc) attention_mask = [1]*len(indexed_tokens) seg_idx = indexed_tokens.index(102)+1 segment_ids = [0]*seg_idx+[1]*(len(indexed_tokens)-seg_idx) #padding indexed_tokens += [0]*(max_seq_length-len(indexed_tokens)) attention_mask += [0]*(max_seq_length-len(attention_mask)) segment_ids += [0]*(max_seq_length-len(segment_ids)) # for debugging # ind2word = {v:k for k,v in tokenizer.vocab.items()} # [ind2word[ind] for ind in indexed_tokens] tokens_tensor = torch.tensor([indexed_tokens]).to(device) segment_tensor = torch.tensor([segment_ids]).to(device) attention_tensor = torch.tensor([attention_mask]).to(device) # Predict the start and end positions logits with torch.no_grad(): start_logits, end_logits = model(tokens_tensor, token_type_ids=segment_tensor, attention_mask=attention_tensor) # get the highest prediction answer = tokenizer.decode(indexed_tokens[torch.argmax(start_logits):torch.argmax(end_logits)+1]) return answer if __name__ == "__main__": main()
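# A short usage sketch for the `predict` helper above. The checkpoint path, question and passage
# are illustrative placeholders, not values from the original script:
def demo_predict():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForQuestionAnswering.from_pretrained('output/').to(device)
    tokenizer = BertTokenizer.from_pretrained('output/', do_lower_case=True)
    question = "Who wrote the report?"
    passage = "The report was written by the audit team in March."
    print(predict(tokenizer, model, question, passage, device))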
def train(): parser = ArgumentParser() parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset.") parser.add_argument("--train_batch_size", type=int, default=64, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation") parser.add_argument("--keyword_module", type=str, default="new", help="add, attention, ") parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-4, help="Learning rate") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=15, help="Number of training epochs") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") parser.add_argument("--gpt2_model_name", type=str, default="gpt2", help="Path, url or short name of the model") parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-label_smoothing', action='store_true') args = parser.parse_args() args.d_word_vec = args.d_model # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer.") tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer # cant use Autotokenizer because checkpoint could be a Path tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name) num_tokens = len(tokenizer.encoder) num_added_tokens = tokenizer.add_special_tokens( ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there model = Transformer( num_tokens + num_added_tokens, num_tokens + num_added_tokens, src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]), trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]), trg_emb_prj_weight_sharing=args.proj_share_weight, emb_src_trg_weight_sharing=args.embs_share_weight, d_k=args.d_k, d_v=args.d_v, d_model=args.d_model, d_word_vec=args.d_word_vec, d_inner=args.d_inner_hid, n_layers=args.n_layers, n_head=args.n_head, dropout=args.dropout, n_position=512, keyword_module=args.keyword_module).to(args.device) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( args, tokenizer, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) source_ids, target_ids, lm_labels, keyword_scores = batch (lm_loss), *_ = model(source_ids, target_ids, labels=lm_labels, keyword=keyword_scores) loss = lm_loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple( input_tensor.to(args.device) for input_tensor in batch) source_ids, target_ids, lm_labels, keyword_scores = batch #logger.info(tokenizer.decode(target_ids[0].tolist())) lm_logits, *_ = model(source_ids, target_ids, keyword=keyword_scores) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view( -1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, ), (lm_labels_flat_shifted, ) 
evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args.gpt2_model_name, args.dataset_path) tb_logger = TensorboardLogger(log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list( metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=4) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" takes care of distributed encapsulation torch.save(args, log_dir + '/model_training_args.bin') #getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) tokenizer.save_pretrained(log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
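# `average_distributed_scalar`, used in the metrics above, is not defined in this excerpt. A minimal
# sketch of such a helper, assuming it should average a python scalar across processes during
# distributed evaluation:
import torch


def average_distributed_scalar(scalar, args):
    """Average a scalar over all processes if training is distributed; otherwise return it unchanged."""
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()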
def train(args, train_dataset, model, tokenizer): """ Train the model """ record_result = [] if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) inputs_list = [] global_step = 0 epochs_trained = 0 collect_step = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to global_step of last saved checkpoint from model path try: global_step = int( args.model_name_or_path.split("-")[-1].split("/")[0]) except ValueError: global_step = 0 epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue if collect_step < 5: batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids inputs_list.append(inputs) collect_step += 1 continue else: print('collect_step', collect_step) print('start pruning') new_mask = GraSP(model, args.tt, inputs_list, args.device, original_mask=None) rate = 0 sum1 = 0 for key in new_mask.keys(): rate += float(torch.sum(new_mask[key] == 0)) sum1 += float(new_mask[key].nelement()) print('zero rate = ', rate / sum1) torch.save(new_mask, 'grasp_mask2/' + args.task_name + '.pt') return 0, 0 model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator)): if 
args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) record_result.append(results) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(model, os.path.join(output_dir, "model.pt")) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() results = evaluate(args, model, tokenizer) record_result.append(results) torch.save(record_result, os.path.join(args.output_dir, "result.pt")) return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() log_writer = open(os.path.join(args.output_dir, "evaluate_logs.txt"), 'w') args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) logger.info(" Logging steps = %d", args.logging_steps) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path) and False: # set global_step to gobal_step of last saved checkpoint from model path global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss, best_avg = 0.0, 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility def logging(): if args.evaluate_during_training: results = evaluate(args, model, tokenizer, single_gpu=True) for task, result in results.items(): for key, value in result.items(): tb_writer.add_scalar("eval_{}_{}".format(task, key), value, global_step) log_writer.write("{0}\t{1}\n".format(global_step, json.dumps(results))) log_writer.flush() tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) return results for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } if args.model_type != "distilbert": inputs["token_type_ids"] = ( batch[2] if args.model_type in ["bert"] else None ) # XLM and DistilBERT don't use segment_ids outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: cur_result = logging() logging_loss = tr_loss if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if 
args.local_rank in [-1, 0] and args.logging_each_epoch: cur_result = logging() logging_loss = tr_loss task_metric = "acc" if args.task_name == "rel": task_metric = "ndcg" if best_avg < cur_result["valid_avg"][task_metric]: best_avg = cur_result["valid_avg"][task_metric] output_dir = os.path.join(args.output_dir, "checkpoint-best") if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() log_writer.close() return global_step, tr_loss / (global_step + 1)
from fairseq.models.roberta import CamembertModel camembert_path='C:/Users/theo.roudil-valentin/.cache/torch/hub/camembert-base/' camembert = CamembertModel.from_pretrained(camembert_path) # camembert.train() #%% #We import CamembertModel: #The bare CamemBERT Model transformer outputting raw hidden-states # without any specific head on top. #That is exactly what we want! from transformers import CamembertModel camembert=CamembertModel.from_pretrained("camembert-base") #In other words, there is no "last layer" trained towards an objective: #It is up to us to create the final layer(s)! #%% from transformers import AdamW optimizer = AdamW(camembert.parameters(), lr=1e-5) from transformers import CamembertTokenizer,CamembertForSequenceClassification tokenizer=CamembertTokenizer.from_pretrained(camembert_path) #%% # from pathlib import Path # text_batch = [] # for src_file in Path(chemin_donnees).glob("**/*.txt"): # print("🔥", src_file) # text_batch += src_file.read_text(encoding="utf-8").splitlines() text_batch=pickle.load(open(chemin_modele+'text_batch_1.pickle','rb')) #%% ml=512 N=10 z=9
def prepare_for_training(args, model, checkpoint_state_dict, amp): no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) if checkpoint_state_dict: optimizer.load_state_dict(checkpoint_state_dict['optimizer']) model.load_state_dict(checkpoint_state_dict['model']) # then remove optimizer state to make amp happy # https://github.com/NVIDIA/apex/issues/480#issuecomment-587154020 if amp: optimizer.state = {} if amp: model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if checkpoint_state_dict: amp.load_state_dict(checkpoint_state_dict['amp']) # Black Tech from https://github.com/NVIDIA/apex/issues/480#issuecomment-587154020 # forward, backward, optimizer step, zero_grad random_input = { 'source_ids': torch.ones(size=(2, 2), device=args.device, dtype=torch.long), 'target_ids': torch.ones(size=(2, 2), device=args.device, dtype=torch.long), 'label_ids': torch.ones(size=(2, 2), device=args.device, dtype=torch.long), 'pseudo_ids': torch.ones(size=(2, 2), device=args.device, dtype=torch.long), 'num_source_tokens': torch.zeros(size=(2, ), device=args.device, dtype=torch.long), 'num_target_tokens': torch.zeros(size=(2, ), device=args.device, dtype=torch.long) } loss = model(**random_input) print("Loss = %f" % loss.cpu().item()) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.step() model.zero_grad() # then load optimizer state_dict again (this time without removing optimizer.state) optimizer.load_state_dict(checkpoint_state_dict['optimizer']) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) return model, optimizer
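# --- Sketch (an assumption, not part of the original code) of how a checkpoint compatible with
# prepare_for_training() above could be written: the function expects a dict with 'model',
# 'optimizer' and, when apex fp16 is used, 'amp' entries.
import torch

def save_checkpoint_sketch(path, model, optimizer, amp=None):
    model_to_save = model.module if hasattr(model, "module") else model  # unwrap DDP/DataParallel
    state = {
        "model": model_to_save.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    if amp is not None:
        state["amp"] = amp.state_dict()  # apex loss-scaler state
    torch.save(state, path)

# Loading is then simply: checkpoint_state_dict = torch.load(path, map_location="cpu")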
def train(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model") parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training") parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history") parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation") parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps") parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate") parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient") parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient") parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences") parser.add_argument( "--eval_before_start", action='store_true', help="If true start with a first evaluation before training") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--fp16", type=str, default="", help= "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat(args)) # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("Prepare tokenizer, pretrained model and optimizer.") tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer # cant use Autotokenizer because checkpoint could be a Path tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) # Add special tokens if they are not already added add_special_tokens_(model, tokenizer) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders( args, tokenizer) # Training function and trainer def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch output_loss = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, labels=lm_labels) loss = (output_loss.loss * args.lm_coef + output_loss.mc_loss * args.mc_coef) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch = tuple( input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) # if we dont send labels to model, it doesnt return losses output_gpt = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, ) lm_logits_flat_shifted = output_gpt.logits[ ..., :-1, :].contiguous().view(-1, output_gpt.logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, output_gpt.mc_logits), (lm_labels_flat_shifted, mc_labels) evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch trainer.add_event_handler(Events.EPOCH_COMPLETED, 
lambda _: evaluator.run(val_loader)) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Linearly decrease the learning rate from lr to zero scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = { "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])) } metrics.update({ "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args), "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args) }) metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler( Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(evaluator.state.metrics))) log_dir = make_logdir(args.model_checkpoint) tb_logger = TensorboardLogger(log_dir) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach( evaluator, log_handler=OutputHandler( tag="validation", metric_names=list(metrics.keys()), global_step_transform=global_step_from_engine(trainer)), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model) }) # "getattr" takes care of distributed encapsulation torch.save(args, log_dir + '/model_training_args.bin') getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) tokenizer.save_pretrained(log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
def train(args): # + the inference step has been added as well. assert sum([args.use_kfold,args.use_simple_fold,args.no_valid])==1 assert (args.concat_exp_p==0 or args.concat_log_p==0) # assert args.eval_steps == args.logging_steps if args.use_kfold==True: assert (args.num_fold_k>=2) seed_everything(args.seed) USE_KFOLD = args.use_kfold # load model and tokenizer model_type_getattr = args.model_type # ELECTRA # BERT model_name_from_pretrained = args.pretrained_model # "monologg/koelectra-small-discriminator", "monologg/koelectra-small-discriminator" tokenizer = AutoTokenizer.from_pretrained(model_name_from_pretrained) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # load dataset # load_data_module = getattr(import_module("load_data"), f'load_data') # dataset = load_tr_val_data("../input/data/train/train.tsv",args) dataset = load_tr_val_data("../input/data/train/final_train_ner.tsv",args) # setting model hyperparameter # config_module = getattr(import_module("transformers"), f'{args.model_type}Config') # model_config = config_module.from_pretrained(model_name_from_pretrained) model_module = getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification') model = model_module.from_pretrained(model_name_from_pretrained, num_labels=42) model.parameters model.to(device) # model_saved_dir = increment_output_dir(args.model_output_dir) model_saved_dir = increment_output_dir(model_name_from_pretrained.replace('/','_')) # f'./results/{output_path}' neptune.append_tag(f"{model_saved_dir.split('/')[-1]}") neptune.append_tag(f"{args.name}") with open('../input/data/label_type.pkl', 'rb') as f: label_type = pickle.load(f) # Simple Train Valid Split # Not KFOLD # => StratifiedShuffleSplit ################################################################################################# ################################################################################################# elif args.use_kfold==True: # KFOLD if not os.path.isdir('./kfold_results'): # parent folder for saving the models os.makedirs('./kfold_results') kfold = StratifiedKFold(n_splits=args.num_fold_k, random_state=args.seed, shuffle=True) label = dataset['label'] # check whether k-fold has already been run and models saved for this pretrained model model_name_from_pretrained_used_for_save = model_name_from_pretrained.replace('/','_') check_upper_dir = f'./kfold_results/{model_name_from_pretrained_used_for_save}' if not os.path.isdir(check_upper_dir+'0'): # if it does not exist yet, use it as-is upper_dir=check_upper_dir+'0' else: # if it exists, find the numbers among the existing ones and use max + 1 all_directories = glob.glob(f'./kfold_results/*') max_num = max(int(re.search(rf"{model_name_from_pretrained_used_for_save}[0-9]+",ad).group().replace(model_name_from_pretrained_used_for_save,'')) for ad in all_directories if re.search(rf"{model_name_from_pretrained_used_for_save}[0-9]+",ad)) upper_dir = check_upper_dir+str(max_num+1) neptune.log_text('Model_Name_Number', f"{upper_dir.split('/')[-1]}") kfold_train_acc_score = [] kfold_val_acc_score = [] k=0 for train_idx, val_idx in kfold.split(dataset, label): # model_module = getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification') # model = model_module.from_pretrained(model_name_from_pretrained, num_labels=42) config_module = getattr(import_module("transformers"), f'{model_type_getattr}Config') model_config = config_module.from_pretrained(model_name_from_pretrained) # model_config = ElectraConfig.from_pretrained(model_name_from_pretrained) model_config.num_labels = 42 model_config.hidden_dropout_prob = args.hidden_dropout_prob model_module =
getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification') model = model_module.from_pretrained(model_name_from_pretrained, config=model_config) model.parameters model.to(device) print('='*50) print('=' * 15 + f'{k}-th Fold Cross Validation Started ({k+1}/{args.num_fold_k})' + '=' * 15) train_dataset = dataset.iloc[train_idx] val_dataset = dataset.iloc[val_idx] # try training with newly added external data if args.concat_external_data==True: train_dataset = concat_external_data(train_dataset,label_type,args) train_label = train_dataset['label'].values val_label = val_dataset['label'].values # tokenizing dataset tokenized_train = tokenized_dataset(train_dataset, tokenizer, args) tokenized_val = tokenized_dataset(val_dataset, tokenizer, args) # make dataset for pytorch. RE_train_dataset = RE_Dataset(tokenized_train, train_label) RE_val_dataset = RE_Dataset(tokenized_val, val_label) print('='*50) print('Train & Valid Loaded Successfully!!') print(f'len(RE_train_dataset) : {len(RE_train_dataset)}, len(RE_val_dataset) : {len(RE_val_dataset)}') model_saved_dir = upper_dir+f'/{k}fold' # f'./kfold_results/{model_name_from_pretrained_used_for_save}'+f'/{k}fold' neptune.log_text(f'{k}-th model_saved_dir',model_saved_dir) neptune.log_text(f'Num_Data : {k}-th len(RE_train_dataset)',str(len(RE_train_dataset))) neptune.log_text(f'Num_Data : {k}-th len(RE_val_dataset)',str(len(RE_val_dataset))) # besides the options used here, various other options are available. # please refer to https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments # https://huggingface.co/transformers/main_classes/trainer.html?highlight=trainingarguments#trainingarguments total_num_epochs = (len(RE_train_dataset)//args.batch_size+1)*args.epochs if args.use_warmup_ratio: warmup_steps = total_num_epochs*args.warmup_ratio else: warmup_steps = 0 wandb_run_name = model_saved_dir.replace('./kfold_results/',f'Total {args.num_fold_k}fold :') training_args = TrainingArguments( report_to = 'wandb', # 'all' run_name = f"{args.name+wandb_run_name.replace('/','_')}", output_dir=model_saved_dir, # output directory # overwrite_output_dir=False, # whether to overwrite when saving the model save_total_limit=args.save_total_limit, # number of total save model. save_steps=args.model_save_steps, # model saving step. num_train_epochs=args.epochs, # total number of training epochs learning_rate=args.lr, # learning_rate per_device_train_batch_size=args.batch_size, # batch size per device during training per_device_eval_batch_size=args.val_batch_size, # batch size for evaluation warmup_steps=warmup_steps, # number of warmup steps for learning rate scheduler weight_decay=args.weight_decay, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=args.logging_steps, # log saving step. evaluation_strategy='steps', # evaluation strategy to adopt during training eval_steps = args.eval_steps, # evaluation step. # max_grad_norm=1, label_smoothing_factor = args.label_smoothing_factor, load_best_model_at_end = args.load_best_model_at_end, # default => False # greater_is_better = True, metric_for_best_model = args.metric_for_best_model, # metric_for_best_model: Optional[str] = None # fp16 = True, # Whether to use 16-bit (mixed) precision training instead of 32-bit training. # dataloader_num_workers = 2, ) # EarlyStopping # Here the criterion is not the global epochs hyperparameter; we look at the total step count instead. # If patience is set to 1, the decision is based on eval_step * 1.
(e.g. if eval_step is set to 25, the patience window is 25 steps) early_stopping = EarlyStoppingCallback( early_stopping_patience = args.early_stopping_patience, early_stopping_threshold = 1e-4) ## Optimizer if args.optimizer_name == "Adam": optimizer = Adam(model.parameters(), lr=args.min_lr) elif args.optimizer_name == "AdamW": optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer_name == "SGD": optimizer = SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) # https://arxiv.org/pdf/1608.03983.pdf ## Scheduler T_0 = int(np.ceil(total_num_epochs*args.first_cycle_ratio)) if args.scheduler_name == "Custom": scheduler = CustomizedCosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=2, eta_max=args.lr, T_up=int(T_0*args.first_warmup_ratio), gamma=args.scheduler_gamma, last_epoch=-1) elif args.scheduler_name == "Original": scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=2, eta_min=args.min_lr) # https://huggingface.co/transformers/main_classes/trainer.html?highlight=trainer#id1 trainer = Trainer( model=model, # Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset eval_dataset=RE_val_dataset, # evaluation dataset compute_metrics=compute_metrics, # define metrics function optimizers= (optimizer,scheduler), # optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]=(None, None)) callbacks= [early_stopping], # callbacks: Optional[List[TrainerCallback]]=None # model_init= ) # train model trainer.train() print(f'Neptune Saving {k}-th Model Logs Plot') # Get Log from Model # Neptune Save Plot (train_eval_loss, learning_rate, eval_accuracy) train_eval_loss_plot, learning_rate_plot, eval_accuracy_plot = k_th_plot_from_logs(trainer.state.log_history) neptune.log_image(f'Logs : {k}-th train_eval_loss_plot',train_eval_loss_plot) neptune.log_image(f'Logs : {k}-th learning_rate_plot',learning_rate_plot) neptune.log_image(f'Logs : {k}-th eval_accuracy_plot',eval_accuracy_plot) print(f'{k}-th train finished!!') state_log_history = trainer.state.log_history eval_log_dict = [log_dict for log_dict in state_log_history if 'eval_loss' in log_dict.keys() ] k_th_val_logs_dict = defaultdict(list) for dict_per_step in eval_log_dict: for key,value in dict_per_step.items(): k_th_val_logs_dict[key].append(value) best_val_acc_score = max(k_th_val_logs_dict['eval_accuracy']) # neptune.log_metric(f'{k}-th train_acc_score',best_train_acc_score) neptune.log_metric(f'{k}-th val_acc_score',best_val_acc_score) kfold_val_acc_score.append(best_val_acc_score) k=int(k) k+=1 # neptune.log_text(f"{args.num_fold_k}-fold train best acc list", f"{kfold_train_acc_score}") neptune.log_text(f"{args.num_fold_k}-fold val best acc list", f"{kfold_val_acc_score}") # neptune.log_metric(f"Result ACC : {args.num_fold_k}-fold train Total Average acc", np.mean(kfold_train_acc_score)) neptune.log_metric(f"Result ACC : {args.num_fold_k}-fold val Total Average acc", np.mean(kfold_val_acc_score))
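# --- Small self-contained illustration (not from the original script) of the StratifiedKFold split
# driving the loop above: each validation fold keeps the label distribution of the full dataset.
import numpy as np
from sklearn.model_selection import StratifiedKFold

toy_labels = np.array([0, 0, 0, 0, 1, 1, 1, 1])           # illustrative labels
toy_data = np.arange(len(toy_labels)).reshape(-1, 1)
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
for fold, (tr_idx, va_idx) in enumerate(skf.split(toy_data, toy_labels)):
    # every validation fold contains the same 0/1 ratio as the whole dataset
    print(fold, np.bincount(toy_labels[va_idx]))          # -> [2 2] for each fold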
def train(model, device, train_data, test_data, args): """ Train the model Args: model: the model device: device information train_data: training dataset class test_data: test dataset class args: training configuration Returns: """ tb_write = SummaryWriter() if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter, it must be >= 1") train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) train_sampler = RandomSampler(train_data) train_data_loader = DataLoader( train_data, sampler=train_sampler, batch_size=train_batch_size, collate_fn=collate_func, ) total_steps = int( len(train_data_loader) * args.num_train_epochs / args.gradient_accumulation_steps ) logger.info("Total number of training steps: {}".format(total_steps)) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] # AdamW optimizer optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int(args.warmup_proportion * total_steps), num_training_steps=total_steps, ) # Clear the CUDA cache torch.cuda.empty_cache() model.train() title_id = train_data.title_id tr_loss, logging_loss = 0.0, 0.0 global_step = 0 # Training for iepoch in trange(0, int(args.num_train_epochs), desc="Epoch", disable=False): iter_bar = tqdm(train_data_loader, desc="Iter (loss=X.XXX)", disable=False) for step, batch in enumerate(iter_bar): input_ids = batch["input_ids"].to(device) token_type_ids = batch["token_type_ids"].to(device) outputs = model.forward( input_ids=input_ids, token_type_ids=token_type_ids, labels=input_ids, title_id=title_id, ) loss = outputs[0] tr_loss += loss.item() iter_bar.set_description("Iter (loss=%5.3f)" % loss.item()) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps # Backward pass loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) # When the step count is divisible by the accumulation steps, update the parameters if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 # Log the learning rate and the training loss if args.logging_steps > 0 and global_step % args.logging_steps == 0: tb_write.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_write.add_scalar( "train_loss", (tr_loss - logging_loss) / (args.logging_steps * args.gradient_accumulation_steps), global_step, ) logging_loss = tr_loss # Evaluate the model and log the test-set loss if args.eval_steps > 0 and global_step % args.eval_steps == 0: eval_loss = evaluate(model, device, test_data, args) tb_write.add_scalar("test_loss", eval_loss, global_step) model.train() # Save the model after every epoch output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) # Clear the CUDA cache torch.cuda.empty_cache()
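# --- Worked example (illustrative numbers, not from the original script) of the step arithmetic used
# above: the dataloader is built with train_batch_size / gradient_accumulation_steps examples per
# micro-batch, so the number of optimizer updates is len(train_data_loader) * epochs / accumulation.
num_examples = 1000
effective_batch_size = 16          # args.train_batch_size
accumulation = 4                   # args.gradient_accumulation_steps
epochs = 3                         # args.num_train_epochs
micro_batch = effective_batch_size // accumulation            # 4 examples per forward pass
batches_per_epoch = num_examples // micro_batch               # 250 micro-batches
total_steps = int(batches_per_epoch * epochs / accumulation)  # 187 optimizer updates
print(micro_batch, batches_per_epoch, total_steps)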
def train_function(gpu, world_size, node_rank, gpus, fold_number, group_name): os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu) import torch.multiprocessing torch.multiprocessing.set_sharing_strategy('file_system') torch.manual_seed(25) np.random.seed(25) rank = node_rank * gpus + gpu dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) batch_size = 64 n_epochs = 20 max_len = 128 weight_decay = 0.01 init_lr = 1e-5 if rank == 0: wandb.init(project='shopee_effnet0', group=group_name, job_type=str(fold_number)) wandb.config.batch_size = batch_size wandb.config.n_epochs = n_epochs wandb.config.max_len = max_len wandb.config.weight_decay = weight_decay wandb.config.init_lr = init_lr df = pd.read_csv('../../dataset/reliable_validation_tm.csv') train_dataset = TextDataset(df, df[df['fold_group'] != fold_number], max_len=max_len) sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True) train_dataloader = DataLoader(train_dataset, batch_size=batch_size // world_size, shuffle=False, num_workers=4, sampler=sampler) valid_dataset = TextDataset(df, df[df['fold_group'] == fold_number], max_len=max_len) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') model = BertForSequenceClassification.from_pretrained( "bert-base-multilingual-cased", num_labels=df['label_group'].nunique()) model.to(device) model = DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True, output_device=gpu) model.train() no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=init_lr) best_loss = float('inf') for epoch in range(n_epochs): train_loss = train_one_epoch(model, train_dataloader, optimizer, device) if rank == 0: wandb.log({'train_loss': train_loss, 'epoch': epoch}) # keep the checkpoint with the lowest training loss if train_loss < best_loss: best_loss = train_loss torch.save(model.module.state_dict(), 'best_bt.pth') if rank == 0: wandb.finish()
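# --- Note (an addition, not part of the original function): DistributedSampler with shuffle=True
# only reshuffles between epochs when set_epoch() is called, because the shuffle seed is derived
# from (seed, epoch). A hedged sketch of how the epoch loop above could incorporate it:
def run_epochs_sketch(sampler, dataloader, n_epochs, train_fn):
    for epoch in range(n_epochs):
        sampler.set_epoch(epoch)   # re-seed the per-epoch shuffle consistently across all ranks
        train_fn(dataloader)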
'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] Learning_rate = 5e-6 # learning rate optimizer = AdamW(optimizer_grouped_parameters, lr=Learning_rate, eps=1e-8) for epoch in range(30): # training mode model.train() if epoch % 5 == 0 and epoch != 0: Learning_rate = Learning_rate * 0.5 adjust_learning_rate(optimizer, Learning_rate) All_train_correct = 0.0 AllTrainLoss = 0.0 count = 0 for batch_index, batch_dict in enumerate(train_dataloader): batch_dict = tuple(t.to(device) for t in batch_dict) # print(batch_dict[0].shape) # print(batch_dict[3].shape)
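# --- adjust_learning_rate() is called above but not defined in this excerpt. A plausible minimal
# implementation (an assumption, not necessarily the author's helper) simply overwrites the
# learning rate on every parameter group of the optimizer:
def adjust_learning_rate(optimizer, new_lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr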
'acc': accuracy_score_prob, 'nDCG@5': nDCGat5_Calculator('../data/Kdd/valid_all_processed_label.h5') } basic_model = BasicModelLabel(cfg['bert_model_name'], cfg=cfg, model_cfg=basic_model_cfg) model = basic_model bert_params = basic_model.bert.parameters() other_params = list(set(basic_model.parameters()) - set(bert_params)) optimizer = AdamW([{ 'params': basic_model.bert.parameters(), 'lr': 3e-7 }, { 'params': other_params, 'lr': 3e-4 }], ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=int((len(ds) // cfg['dataloader_cfg']['batch_size'] + 1) * cfg['epochs'] * 0.2), num_training_steps=(len(ds) // cfg['dataloader_cfg']['batch_size'] + 1) * cfg['epochs']) criterion = nn.BCEWithLogitsLoss() # In[ ]:
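# --- Worked example (illustrative numbers, not from the original notebook) of the warmup arithmetic
# above: steps_per_epoch = len(ds) // batch_size + 1, the total number of steps is that times the
# number of epochs, and 20% of the total is spent warming up before the linear decay.
len_ds = 10000
batch_size = 32          # cfg['dataloader_cfg']['batch_size']
epochs = 3               # cfg['epochs']
steps_per_epoch = len_ds // batch_size + 1          # 313
num_training_steps = steps_per_epoch * epochs       # 939
num_warmup_steps = int(num_training_steps * 0.2)    # 187
print(steps_per_epoch, num_training_steps, num_warmup_steps)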
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # tb_writer = SummaryWriter(log_dir=os.path.join(aakrgs.output_dir, "log", "train")) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples): inputs = [x[0] for x in examples] labels = [x[1] for x in examples] metadata = [x[2] for x in examples] def collate_individual(samples): length_of_first = samples[0].size(0) are_tensors_same_length = all(x.size(0) == length_of_first for x in samples) if are_tensors_same_length: return torch.stack(samples, dim=0) else: if tokenizer.pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({tokenizer.__class__.__name__}) does not have one." ) return pad_sequence(samples, batch_first=True, padding_value=tokenizer.pad_token_id) inputs = collate_individual(inputs) labels = collate_individual(labels) metadata = collate_individual(metadata) return inputs, labels, metadata train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if ( args.model_name_or_path and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0], file=sys.stdout, mininterval=10) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels, _ = batch # torch.set_printoptions(profile="full") # print(f"Inputs : {tokenizer.convert_ids_to_tokens(inputs.tolist())}") # print(f"Labels : {tokenizer.convert_ids_to_tokens(labels.tolist())}") # exit(0) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, labels=labels) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - 
logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None, verbose=True): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) t_total = len(train_dataloader) // args[ "gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"], }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args[ "warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW( optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"], ) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"]) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 if args["evaluate_during_training"]: training_progress_scores = self._create_training_progress_scores() if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args["n_gpu"] > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_( # amp.master_params(optimizer), args["max_grad_norm"] # ) else: loss.backward() # torch.nn.utils.clip_grad_norm_( # model.parameters(), args["max_grad_norm"] # ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args["max_grad_norm"]) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args[ "logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) 
tb_writer.add_scalar( "loss", (tr_loss - logging_loss) / args["logging_steps"], global_step, ) logging_loss = tr_loss if args["wandb_project"]: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, }) if args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, model=model) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) os.makedirs(output_dir_current, exist_ok=True) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False, ) if args["wandb_project"]: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_metric: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) if best_eval_metric and args[ "early_stopping_metric_minimize"]: if results[args[ "early_stopping_metric"]] - best_eval_metric < args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if results[args[ "early_stopping_metric"]] - best_eval_metric > args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args["save_model_every_epoch"] or args[ "evaluate_during_training"]: os.makedirs(output_dir_current, 
exist_ok=True) if args["save_model_every_epoch"]: self._save_model(output_dir_current, model=model) if args["evaluate_during_training"]: results, _, _ = self.eval_model(eval_df, verbose=True) self._save_model(output_dir_current, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False) if not best_eval_metric: best_eval_metric = results[args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) if best_eval_metric and args["early_stopping_metric_minimize"]: if results[args[ "early_stopping_metric"]] - best_eval_metric < args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if results[args[ "early_stopping_metric"]] - best_eval_metric > args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 return global_step, tr_loss / global_step
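# --- Condensed sketch (an illustration, not the library's exact implementation) of the
# early-stopping bookkeeping used repeatedly above: track the best metric seen so far, reset a
# counter whenever it improves, and stop once the counter reaches early_stopping_patience.
def should_stop_sketch(history, delta=1e-3, patience=3, minimize=True):
    best = None
    counter = 0
    for value in history:
        improved = best is None or (best - value > delta if minimize else value - best > delta)
        if improved:
            best, counter = value, 0
        else:
            counter += 1
            if counter >= patience:
                return True   # patience exhausted -> stop training
    return False

# e.g. should_stop_sketch([0.9, 0.8, 0.79, 0.79, 0.79, 0.79]) -> True with patience=3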
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[:{args.validation_split_percentage}%]", ) raw_datasets["train"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[{args.validation_split_percentage}%:]", ) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
) if args.model_name_or_path: model = AutoModelForMaskedLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) if args.line_by_line: # When using line_by_line, we just tokenize each nonempty line. padding = "max_length" if args.pad_to_max_length else False def tokenize_function(examples): # Remove empty lines examples[text_column_name] = [ line for line in examples[text_column_name] if len(line) > 0 and not line.isspace() ] return tokenizer( examples[text_column_name], padding=padding, truncation=True, max_length=max_seq_length, # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it # receives the `special_tokens_mask`. return_special_tokens_mask=True, ) tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on dataset line_by_line", ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more # efficient when it receives the `special_tokens_mask`. def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on every text in dataset", ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] for k, t in concatenated_examples.items() } return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # remainder for each of those groups of 1,000 texts. 
You can adjust that batch_size here but a higher value # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, desc=f"Grouping texts in chunks of {max_seq_length}", ) train_dataset = tokenized_datasets["train"] eval_dataset = tokenized_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # Data collator # This one will take care of randomly masking the tokens. data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability) # DataLoaders creation: train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size ) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader ) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() losses = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) loss = outputs.loss losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size))) losses = torch.cat(losses) losses = losses[: len(eval_dataset)] try: perplexity = math.exp(torch.mean(losses)) except OverflowError: perplexity = float("inf") logger.info(f"epoch {epoch}: perplexity: {perplexity}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
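The evaluation loop above turns the per-batch masked-LM losses gathered across processes into a perplexity score. A minimal, self-contained sketch of that calculation (the loss values below are made up for illustration):

import math
import torch

eval_losses = torch.tensor([2.1, 2.3, 1.9, 2.0])  # stand-ins for the gathered per-batch losses
try:
    perplexity = math.exp(torch.mean(eval_losses))  # perplexity = exp(mean cross-entropy)
except OverflowError:
    perplexity = float("inf")
print(f"perplexity: {perplexity:.2f}")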
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join( args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert", "bart", "longformer" ]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr( model.config, "lang2id"): inputs.update({ "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device) }) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): 
tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
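When resuming from a checkpoint, the function above reconstructs how many epochs and in-epoch steps have already been trained from the global step encoded in the checkpoint directory name. A small sketch of that arithmetic with hypothetical numbers:

steps_per_epoch = 1000   # stands in for len(train_dataloader) // args.gradient_accumulation_steps
global_step = 2500       # stands in for the step parsed from a "checkpoint-2500" directory name
epochs_trained = global_step // steps_per_epoch                  # 2 full epochs already finished
steps_trained_in_current_epoch = global_step % steps_per_epoch   # skip 500 steps in the next epoch
print(epochs_trained, steps_trained_in_current_epoch)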
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer,trans_dict=None) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples): src_id,tgt_id,src_am,tgt_am=list(zip(*examples)) # print (src_id) src_id,tgt_id,src_am,tgt_am=torch.stack(src_id),torch.stack(tgt_id),torch.stack(src_am),torch.stack(tgt_am) # padding_value = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id # input_ids_src = pad_sequence(src_examples, batch_first=True, padding_value=padding_value) # input_ids_tgt = pad_sequence(tgt_examples, batch_first=True, padding_value=padding_value) # max_length = input_ids.shape[1] # attention_mask_src = torch.stack( # [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t # in src_examples]) # attention_mask_tgt = torch.stack( # [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t # in tgt_examples]) return src_id,tgt_id,src_am,tgt_am print ('train dataset', train_dataset) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if ( args.model_name_or_path and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! 
logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs_src,inputs_tgt,attention_mask_src,attention_mask_tgt=batch # print ('inputs src',inputs_src) # print ('inputs_tgt',inputs_tgt) # print ('attention_mask_src',attention_mask_src) # print ('attention_mask_tgt',attention_mask_tgt) # inputs=deepcopy(inputs_orig) # inputs, labels = mask_tokens(inputs, tokenizer, args) if args.mlm else (inputs, inputs) # attention_mask=attention_mask.to(args.device) inputs_src,inputs_tgt = inputs_src.to(args.device),inputs_tgt.to(args.device) attention_mask_src,attention_mask_tgt=attention_mask_src.to(args.device),attention_mask_tgt.to(args.device) # labels = labels.to(args.device) # if args.para_data_file: # posids=create_posids(inputs).to(args.device) # else: # posids=None model.train() # logger.info('sample posids {0}',posids[0].__repr__()) loss = model(input_ids_src=inputs_src, input_ids_tgt=inputs_tgt,attention_mask_src=attention_mask_src,attention_mask_tgt=attention_mask_tgt) # loss = outputs[0] # model outputs are always tuple in transformers (see doc) # if trans_dict: # trans_labels,changed_flag=translate_label(trans_dict,labels,inputs_orig,tokenizer,attention_mask) # trans_labels=trans_labels.to(args.device) # if changed_flag: # outputs = model(inputs, attention_mask=attention_mask,position_ids=posids,masked_lm_labels=trans_labels) if 
args.mlm else model(inputs, attention_mask=attention_mask,position_ids=posids,labels=labels) # loss2=outputs[0] # loss=(loss+loss2)/2 if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
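The `collate` closure defined at the top of this training function simply stacks pre-tensorized (source ids, target ids, source mask, target mask) tuples into batch tensors. A self-contained sketch of the same idea, assuming every example is already a tuple of fixed-length tensors (the random data below is illustrative only):

import torch
from torch.utils.data import DataLoader, TensorDataset

def collate(examples):
    # Each example is a (src_ids, tgt_ids, src_mask, tgt_mask) tuple of equal-length tensors.
    src_id, tgt_id, src_am, tgt_am = zip(*examples)
    return torch.stack(src_id), torch.stack(tgt_id), torch.stack(src_am), torch.stack(tgt_am)

src = torch.randint(0, 100, (8, 16))
tgt = torch.randint(0, 100, (8, 16))
src_mask = torch.ones(8, 16, dtype=torch.long)
tgt_mask = torch.ones(8, 16, dtype=torch.long)
loader = DataLoader(TensorDataset(src, tgt, src_mask, tgt_mask), batch_size=4, collate_fn=collate)
print([t.shape for t in next(iter(loader))])  # four tensors of shape (4, 16)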
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--do_predict", action="store_true", help="Whether to run predictions on the test set.", ) parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents.", ) parser.add_argument( "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents.", ) parser.add_argument( "--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.", ) parser.add_argument( "--train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.", ) parser.add_argument( "--optimizer", default="lamb", type=str, help="Optimizer (AdamW or lamb)", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.", ) parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument( "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.", ) parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( "--local_rank", type=int, default=-1, help="For distributed training: local_rank", ) parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() # New example based on https://colab.research.google.com/drive/1uVP09ynQ1QUmSE2sjEysHjMfKgo4ssb7?usp=sharing device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print('DEVICE: ' + str(device)) tokenizer = BertTokenizer.from_pretrained('bert-base-cased') model = EncoderDecoderModel.from_encoder_decoder_pretrained( 'bert-base-cased', 'bert-base-cased') model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5) model.train() train_loss_set = [] train_loss = 0 save_step = 500 for epoch in range(int(args.num_train_epochs)): batches = tqdm(batch_loader(tokenizer, args.data_dir, step='train', batch_size=args.train_batch_size, start_pad=False), desc='Training') for step, batch in enumerate(batches): batch = tuple(t.to(device) for t in batch) input_ids_encode, attention_mask_encode, input_ids_decode, attention_mask_decode, lm_labels = batch optimizer.zero_grad() model.zero_grad() loss, outputs = model(input_ids=input_ids_encode, decoder_input_ids=input_ids_decode, attention_mask=attention_mask_encode, decoder_attention_mask=attention_mask_decode, lm_labels=lm_labels)[:2] train_loss_set.append(loss.item()) loss.backward() optimizer.step() train_loss += loss.item() print(epoch) clear_output(True) plt.plot(train_loss_set) plt.title(f'Training loss. 
Epoch {epoch}') plt.xlabel(f'Batch {step}') plt.ylabel('Loss') plt.show() print('STARTING EVALUATION') model.eval() test_batches = tqdm(batch_loader(tokenizer, args.data_dir, step='test', batch_size=1, start_pad=True), desc='Evaluating') for step, batch in enumerate(test_batches): batch = tuple(t.to(device) for t in batch) input_ids_encode, attention_mask_encode, input_ids_decode, attention_mask_decode, lm_labels = batch with torch.no_grad(): generated = model.generate( input_ids_encode, attention_mask=attention_mask_encode, decoder_start_token_id=model.config.decoder.pad_token_id, do_sample=True, max_length=10, top_k=200, top_p=0.75, num_return_sequences=10, #num_beams=5, #no_repeat_ngram_size=2, ) for i in range(len(generated)): print( f'Generated {i}: {tokenizer.decode(generated[i], skip_special_tokens=True, clean_up_tokenization_spaces=True)}' ) print( 'Expected: ', ' '.join([ tokenizer.decode(elem, skip_special_tokens=True, clean_up_tokenization_spaces=True) for elem in input_ids_decode ])) print( 'Lm Labels: ', ' '.join([ tokenizer.decode(elem, skip_special_tokens=True, clean_up_tokenization_spaces=True) for elem in lm_labels ])) print( 'Input: ', ' '.join([ tokenizer.decode(elem, skip_special_tokens=True, clean_up_tokenization_spaces=True) for elem in input_ids_encode ])) print()
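The call to `model.generate(..., do_sample=True, top_k=200, top_p=0.75, ...)` above samples from a truncated token distribution at each decoding step. The sketch below illustrates the idea behind top-p (nucleus) filtering on a single logit vector; it is a simplified stand-in, not the library's internal implementation:

import torch

def top_p_filter(logits, top_p=0.75):
    # Keep the smallest set of tokens whose cumulative probability exceeds top_p,
    # and mask everything else to -inf so sampling only draws from that "nucleus".
    sorted_logits, sorted_idx = torch.sort(logits, descending=True)
    cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
    remove = cum_probs > top_p
    remove[1:] = remove[:-1].clone()  # shift right so the first token above the threshold survives
    remove[0] = False
    filtered = logits.clone()
    filtered[sorted_idx[remove]] = float("-inf")
    return filtered

logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
print(torch.softmax(top_p_filter(logits), dim=-1))  # probability mass is redistributed over the kept tokens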
def train(args, train_dataset, model, tokenizer): """ Train the model """ #if args.local_rank in [-1, 0]: #tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_dev_f1 = 0.0 best_steps = 0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None, # XLM don't use segment_ids "maskL": batch[3], "maskR": batch[4], "labels": batch[5], } outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) #for key, value in results.items(): #tb_writer.add_scalar("eval_{}".format(key), value, global_step) if results["eval_f1"] > best_dev_f1: best_dev_f1 = results["eval_f1"] best_steps = global_step if args.do_test: results_test = evaluate(args, model, tokenizer, test=True) #for key, value in results_test.items(): #tb_writer.add_scalar("test_{}".format(key), value, global_step) logger.info( "test f1: %s, loss: %s, global steps: %s", str(results_test["eval_f1"]), str(results_test["eval_loss"]), str(global_step), ) #tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) #tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logger.info( "Average loss: %s at global step: %s", str((tr_loss - logging_loss) / args.logging_steps), str(global_step), ) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) #add basic = model.module if hasattr(model, "module") else model bert_to_save = (basic.bert.module if hasattr( basic.bert, "module") else basic.bert) tmp = os.path.join(output_dir, "bert") if not os.path.exists(tmp): os.makedirs(tmp) bert_to_save.save_pretrained(tmp) 
#add end tokenizer.save_vocabulary(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break #if args.local_rank in [-1, 0]: #tb_writer.close() return global_step, tr_loss / global_step, best_steps
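Both checkpoint branches above unwrap DataParallel/DistributedDataParallel before calling `save_pretrained`, because those wrappers hold the real model under `.module`. A minimal sketch of the pattern (the save path is hypothetical):

import torch

net = torch.nn.Linear(4, 2)
wrapped = torch.nn.DataParallel(net)  # the real model now lives under wrapped.module
model_to_save = wrapped.module if hasattr(wrapped, "module") else wrapped
torch.save(model_to_save.state_dict(), "/tmp/inner_model.pt")  # hypothetical path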
def main(params: dict, output_dir: str): import mlflow print("start params={}".format(params)) model_id = "train_0" logger = get_logger() # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle") df = pd.read_pickle( "../input/riiid-test-answer-prediction/split10/train_0.pickle" ).sort_values(["user_id", "timestamp"]).reset_index(drop=True) if is_debug: df = df.head(30000) df["prior_question_had_explanation"] = df[ "prior_question_had_explanation"].fillna(-1) df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan) column_config = { ("content_id", "content_type_id"): { "type": "category" }, "user_answer": { "type": "leakage_feature" }, "answered_correctly": { "type": "leakage_feature" }, "part": { "type": "category" }, "prior_question_elapsed_time_bin300": { "type": "category" }, "duration_previous_content_bin300": { "type": "category" }, "prior_question_had_explanation": { "type": "category" }, "rating_diff_content_user_id": { "type": "numeric" }, "task_container_id_bin300": { "type": "category" }, "previous_answer_index_question_id": { "type": "category" }, "previous_answer_question_id": { "type": "category" }, "timediff-elapsedtime_bin500": { "type": "category" }, "timedelta_log10": { "type": "category" } } if not load_pickle or is_debug: feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_dict["user_id"][ "UserContentRateEncoder"] = UserContentRateEncoder( rate_func="elo", column="user_id") feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2( groupby="user_id", column="question_id", is_debug=is_debug, model_id=model_id, n=300) feature_factory_dict["user_id"][ "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True) feature_factory_dict["user_id"][ f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator( column="user_id", agg_column="study_time", remove_now=False) feature_factory_dict["user_id"][ "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder( ) feature_factory_dict["post"] = { "DurationFeaturePostProcess": DurationFeaturePostProcess() } feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id=model_id, load_feature=not is_debug, save_feature=not is_debug) print("all_predict") df = feature_factory_manager.all_predict(df) def f(x): x = x // 1000 if x < -100: return -100 if x > 400: return 400 return x df["task_container_id_bin300"] = [ x if x < 300 else 300 for x in df["task_container_id"] ] df["timediff-elapsedtime_bin500"] = [ f(x) for x in df["timediff-elapsedtime"].values ] df["timedelta_log10"] = np.log10( df["duration_previous_content"].values) df["timedelta_log10"] = df["timedelta_log10"].replace( -np.inf, -1).replace(np.inf, -1).fillna(-1).astype("int8") df = df[[ "user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly", "prior_question_elapsed_time_bin300", "duration_previous_content_bin300", "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300", "previous_answer_index_question_id", "previous_answer_question_id", "row_id", "timediff-elapsedtime_bin500", "timedelta_log10" ]] print(df.head(10)) print("data preprocess") ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", 
sequence_length=params["max_seq"], logger=logger) ff_for_transformer.make_dict(df=df) n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")]) if not load_pickle or is_debug: df_val_row = pd.read_feather( "../../riiid_takoi/notebook/fe/validation_row_id.feather").head( len(df)) if is_debug: df_val_row = df_val_row.head(3000) df_val_row["is_val"] = 1 df = pd.merge(df, df_val_row, how="left", on="row_id") df["is_val"] = df["is_val"].fillna(0) print(df["is_val"].value_counts()) w_df = df[df["is_val"] == 0] w_df["group"] = ( w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"] w_df["user_id"] = w_df["user_id"].astype( str) + "_" + w_df["group"].astype(str) group = ff_for_transformer.all_predict(w_df) dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"]) del w_df gc.collect() ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) if not load_pickle or is_debug: group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0]) dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"]) os.makedirs("../input/feature_engineering/model270", exist_ok=True) if not is_debug and not load_pickle: with open(f"../input/feature_engineering/model270/train.pickle", "wb") as f: pickle.dump(dataset_train, f) with open(f"../input/feature_engineering/model270/val.pickle", "wb") as f: pickle.dump(dataset_val, f) if not is_debug and load_pickle: with open(f"../input/feature_engineering/model270/train.pickle", "rb") as f: dataset_train = pickle.load(f) with open(f"../input/feature_engineering/model270/val.pickle", "rb") as f: dataset_val = pickle.load(f) print("loaded!") dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True) dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False) model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout, cont_emb=params["cont_emb"]) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': params["weight_decay"] }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW( optimizer_grouped_parameters, lr=params["lr"], weight_decay=params["weight_decay"], ) num_train_optimization_steps = int(len(dataloader_train) * 20) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=params["num_warmup_steps"], num_training_steps=num_train_optimization_steps) criterion = nn.BCEWithLogitsLoss() model.to(device) criterion.to(device) for epoch in range(epochs): loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, scheduler, epoch, device) print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}". 
format(epoch, loss, auc, auc_val)) preds = [] labels = [] with torch.no_grad(): for item in tqdm(dataloader_val): label = item["label"].to(device).float() output = model(item, device) preds.extend(torch.nn.Sigmoid()( output[:, -1]).view(-1).data.cpu().numpy().tolist()) labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist()) auc_transformer = roc_auc_score(labels, preds) print("single transformer: {:.4f}".format(auc_transformer)) df_oof = pd.DataFrame() # df_oof["row_id"] = df.loc[val_idx].index print(len(dataloader_val)) print(len(preds)) df_oof["predict"] = preds df_oof["target"] = labels df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False) """ df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv") df_oof2.columns = ["row_id", "predict_lgbm", "target"] df_oof2 = pd.merge(df_oof, df_oof2, how="inner") auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values) print("lgbm: {:.4f}".format(auc_lgbm)) print("ensemble") max_auc = 0 max_nn_ratio = 0 for r in np.arange(0, 1.05, 0.05): auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r) print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc)) if max_auc < auc: max_auc = auc max_nn_ratio = r print(len(df_oof2)) """ if not is_debug: mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__)) for key, value in params.items(): mlflow.log_param(key, value) mlflow.log_metric("auc_val", auc_transformer) mlflow.end_run() torch.save(model.state_dict(), f"{output_dir}/transformers.pth") del model torch.cuda.empty_cache() with open(f"{output_dir}/transformer_param.json", "w") as f: json.dump(params, f) if is_make_feature_factory: # feature factory feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id="all", load_feature=not is_debug, save_feature=not is_debug) ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) df = pd.read_pickle( "../input/riiid-test-answer-prediction/train_merged.pickle") if is_debug: df = df.head(10000) df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True) feature_factory_manager.fit(df) df = feature_factory_manager.all_predict(df) for dicts in feature_factory_manager.feature_factory_dict.values(): for factory in dicts.values(): factory.logger = None feature_factory_manager.logger = None with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f: pickle.dump(feature_factory_manager, f) ff_for_transformer.fit(df) ff_for_transformer.logger = None with open( f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f: pickle.dump(ff_for_transformer, f)
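Validation above scores only the last position of each output sequence with a sigmoid and feeds the results to `roc_auc_score`. A tiny self-contained sketch of that evaluation, using fabricated logits and labels:

import torch
from sklearn.metrics import roc_auc_score

logits = torch.tensor([[0.2, 1.5], [-0.3, -1.0], [0.1, 0.8], [0.0, -0.4]])  # (batch, seq_len)
labels = torch.tensor([[1.0, 1.0], [0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
preds = torch.sigmoid(logits[:, -1]).view(-1).tolist()   # probability for the last position only
targets = labels[:, -1].view(-1).tolist()
print(roc_auc_score(targets, preds))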
def start(self): import torch # Load the dataset into a pandas dataframe. with S3() as s3: df = pd.read_csv(smart_open.smart_open( 's3://sentstorage/scrape/labelledtweets.tsv'), delimiter='\t', header=None, names=['label', 'tweet']) #df = pd.read_csv("t.tsv", delimiter='\t', header=None, names=['label','tweet']) self.tweets = df.tweet.values self.labels = df.label.values # Load the BERT tokenizer. self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) self.max_len = 0 # For every sentence... for self.tweet in self.tweets: # Tokenize the text and add `[CLS]` and `[SEP]` tokens. self.input_ids = self.tokenizer.encode(self.tweet, add_special_tokens=True) # Update the maximum sentence length. self.max_len = max(self.max_len, len(self.input_ids)) # Tokenize all of the sentences and map the tokens to thier word IDs. self.input_ids2 = [] self.attention_masks2 = [] # For every sentence... for self.tweet in self.tweets: self.encoded_dict = self.tokenizer.encode_plus( self.tweet, # Sentence to encode. add_special_tokens=True, # Add '[CLS]' and '[SEP]' max_length=128, # Pad & truncate all sentences. pad_to_max_length=True, return_attention_mask=True, # Construct attn. masks. return_tensors='pt', # Return pytorch tensors. ) # Add the encoded sentence to the list. self.input_ids2.append(self.encoded_dict['input_ids']) # And its attention mask (simply differentiates padding from non-padding). self.attention_masks2.append(self.encoded_dict['attention_mask']) # Convert the lists into tensors. self.input_ids2 = torch.cat(self.input_ids2, dim=0) self.attention_masks2 = torch.cat(self.attention_masks2, dim=0) self.labels = torch.tensor(self.labels) # Combine the training inputs into a TensorDataset. self.dataset = TensorDataset(self.input_ids2, self.attention_masks2, self.labels) # Create a 90-10 train-validation split. # Calculate the number of samples to include in each set. self.train_size = int(0.9 * len(self.dataset)) self.val_size = len(self.dataset) - self.train_size # Divide the dataset by randomly selecting samples. self.train_dataset, self.val_dataset = random_split( self.dataset, [self.train_size, self.val_size]) # The DataLoader needs to know our batch size for training, so we specify it # here. For fine-tuning BERT on a specific task, the authors recommend a batch # size of 16 or 32. self.batch_size = 32 # Create the DataLoaders for our training and validation sets. # We'll take training samples in random order. self.train_dataloader = DataLoader( self.train_dataset, # The training samples. sampler=RandomSampler( self.train_dataset), # Select batches randomly batch_size=self.batch_size # Trains with this batch size. ) # For validation the order doesn't matter, so we'll just read them sequentially. self.validation_dataloader = DataLoader( self.val_dataset, # The validation samples. sampler=SequentialSampler( self.val_dataset), # Pull out batches sequentially. batch_size=self.batch_size # Evaluate with this batch size. ) # Load BertForSequenceClassification, the pretrained BERT model with a single # linear classification layer on top. self.model = BertForSequenceClassification.from_pretrained( "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. num_labels=3, output_attentions= False, # Whether the model returns attentions weights. output_hidden_states= False, # Whether the model returns all hidden-states. 
) self.optimizer = AdamW( self.model.parameters(), lr= 5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) # Number of training epochs. The BERT authors recommend between 2 and 4. self.epochs = 2 # Total number of training steps is [number of batches] x [number of epochs]. # (Note that this is not the same as the number of training samples). self.total_steps = len(self.train_dataloader) * self.epochs # Create the learning rate scheduler. """self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps = 0, num_training_steps = self.total_steps)""" # Set the seed value all over the place to make this reproducible. self.seed_val = 42 random.seed(self.seed_val) np.random.seed(self.seed_val) torch.manual_seed(self.seed_val) for self.epoch_i in range(0, self.epochs): # Reset the total loss for this epoch. self.total_train_loss = 0 self.model.train() # For each batch of training data... for self.step, self.batch in enumerate(self.train_dataloader): self.b_input_ids = self.batch[0] self.b_input_mask = self.batch[1] self.b_labels = self.batch[2] self.model.zero_grad() self.loss, self.logits = self.model( self.b_input_ids, token_type_ids=None, attention_mask=self.b_input_mask, labels=self.b_labels) # Accumulate the training loss over all of the batches so that we can # calculate the average loss at the end. `loss` is a Tensor containing a # single value; the `.item()` function just returns the Python value # from the tensor. self.total_train_loss += self.loss.item() # Perform a backward pass to calculate the gradients. self.loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) # Update parameters and take a step using the computed gradient. # The optimizer dictates the "update rule"--how the parameters are # modified based on their gradients, the learning rate, etc. self.optimizer.step() # Update the learning rate. #self.scheduler.step() # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` self.model_to_save = self.model.module if hasattr( self.model, 'module' ) else self.model # Take care of distributed/parallel training with S3(s3root='s3://sentstorage/model') as s3: self.model_to_save.save_pretrained('s3://sentstorage/model') self.tokenizer.save_pretrained('s3://sentstorage/model') self.next(self.end)
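The 90/10 train/validation split above relies on `torch.utils.data.random_split`. A small sketch with a toy `TensorDataset`:

import torch
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(torch.arange(100).unsqueeze(1), torch.zeros(100, dtype=torch.long))
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print(len(train_dataset), len(val_dataset))  # 90 10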
optimizer_grouped_parameters = [{ 'params': [ p for n, p in shared_bert.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [ p for n, p in shared_bert.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=lr) scheduler = get_linear_schedule_with_warmup( optimizer, args.warmup_steps, n_dc_epochs * len(domain_classifier_train_dl)) train_domain_classifier( shared_bert, domain_classifier_train_dl, optimizer, scheduler, domain_classifier_val_evaluator, n_dc_epochs, device, args.log_interval, model_dir=args.model_dir, gradient_accumulation=args.gradient_accumulation, domain_name=domain)
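The fragment above pairs the grouped-parameter AdamW optimizer with a linear warmup/decay schedule sized to the number of domain-classifier updates. A sketch of how such a schedule moves the learning rate, using a throwaway parameter and made-up warmup/total step counts:

import torch
from transformers import get_linear_schedule_with_warmup

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=1e-3)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)
lrs = []
for _ in range(100):
    optimizer.step()
    scheduler.step()
    lrs.append(scheduler.get_last_lr()[0])
print(lrs[0], lrs[9], lrs[-1])  # ramps up during warmup, then decays linearly towards zero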
def initialize_and_train(task_config, trainset, augmentset, validset, testset, hp, run_tag): """The train process. Args: task_config (dictionary): the configuration of the task trainset (SnippextDataset): the training set augmentset (SnippextDataset): the augmented training set validset (SnippextDataset): the validation set testset (SnippextDataset): the test set hp (Namespace): the parsed hyperparameters run_tag (string): the tag of the run (for logging purposes) Returns: None """ padder = SnippextDataset.pad # iterators for dev/test set valid_iter = data.DataLoader(dataset=validset, batch_size=hp.batch_size * 4, shuffle=False, num_workers=0, collate_fn=padder) test_iter = data.DataLoader(dataset=testset, batch_size=hp.batch_size * 4, shuffle=False, num_workers=0, collate_fn=padder) # initialize model device = 'cuda' if torch.cuda.is_available() else 'cpu' if device == 'cpu': model = MultiTaskNet([task_config], device, hp.finetuning, lm=hp.lm, bert_path=hp.bert_path) optimizer = AdamW(model.parameters(), lr=hp.lr) else: model = MultiTaskNet([task_config], device, hp.finetuning, lm=hp.lm, bert_path=hp.bert_path).cuda() optimizer = AdamW(model.parameters(), lr=hp.lr) if hp.fp16: model, optimizer = amp.initialize(model, optimizer, opt_level='O2') # learning rate scheduler num_steps = (len(trainset) // hp.batch_size) * hp.n_epochs scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_steps // 10, num_training_steps=num_steps) # create logging if not os.path.exists(hp.logdir): os.makedirs(hp.logdir) writer = SummaryWriter(log_dir=hp.logdir) # start training best_dev_f1 = best_test_f1 = 0.0 epoch = 1 while epoch <= hp.n_epochs: train(model, trainset, augmentset, optimizer, scheduler=scheduler, fp16=hp.fp16, batch_size=hp.batch_size, alpha_aug=hp.alpha_aug) print(f"=========eval at epoch={epoch}=========") dev_f1, test_f1 = eval_on_task(epoch, model, task_config['name'], valid_iter, validset, test_iter, testset, writer, run_tag) # skip the epochs with zero f1 if dev_f1 > 1e-6: epoch += 1 if hp.save_model: if dev_f1 > best_dev_f1: best_dev_f1 = dev_f1 torch.save(model.state_dict(), run_tag + '_dev.pt') if test_f1 > best_test_f1: best_test_f1 = test_f1 torch.save(model.state_dict(), run_tag + '_test.pt') writer.close()
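`initialize_and_train` keeps separate best-so-far checkpoints for dev and test F1. A compact sketch of that keep-best logic with fabricated scores (the save paths are hypothetical):

import torch

model = torch.nn.Linear(8, 2)
best_dev_f1 = best_test_f1 = 0.0
for dev_f1, test_f1 in [(0.71, 0.69), (0.75, 0.72), (0.74, 0.73)]:  # fabricated per-epoch scores
    if dev_f1 > best_dev_f1:
        best_dev_f1 = dev_f1
        torch.save(model.state_dict(), "/tmp/run_dev.pt")   # hypothetical path
    if test_f1 > best_test_f1:
        best_test_f1 = test_f1
        torch.save(model.state_dict(), "/tmp/run_test.pt")  # hypothetical path
print(best_dev_f1, best_test_f1)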