def test_get_device_gpu():
    """Check that GPU requests resolve to a single CUDA device."""
    # Request expressed as a GPU count.
    by_count_device, by_count_gpus = get_device(num_gpus=1)
    assert isinstance(by_count_device, torch.device)
    assert by_count_device.type == "cuda"
    assert by_count_gpus == 1

    # Request expressed as an explicit list of GPU ids.
    by_id_device, by_id_gpus = get_device(gpu_ids=[0])
    assert by_id_device.type == "cuda"
    assert by_id_gpus == 1
def test_get_device_cpu():
    """Check that zero-GPU requests resolve to the CPU device."""
    # Request expressed as a GPU count of zero.
    by_count_device, by_count_gpus = get_device(num_gpus=0)
    assert isinstance(by_count_device, torch.device)
    assert by_count_device.type == "cpu"
    assert by_count_gpus == 0

    # Request expressed as an empty list of GPU ids.
    by_id_device, by_id_gpus = get_device(gpu_ids=[])
    assert by_id_device.type == "cpu"
    assert by_id_gpus == 0
def predict(self, eval_dataloader, num_gpus=1, verbose=True):
    """
    Score a dataset with the fine-tuned model and return predicted labels.

    Args:
        eval_dataloader (Dataloader): Dataloader for the evaluation data.
        num_gpus (int, optional): Number of GPUs requested; forwarded to
            `get_device`. Defaults to 1.
        verbose (bool, optional): Forwarded to the parent `predict`.
            Defaults to True.

    Returns:
        1darray: index of the largest value along axis 1 of the
        concatenated per-batch outputs.
    """
    device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)

    # A DataParallel wrapper keeps the real network in `.module`.
    network = self.model.module if isinstance(self.model, nn.DataParallel) else self.model
    network.to(device)

    batch_outputs = super().predict(
        eval_dataloader=eval_dataloader,
        get_inputs=Processor.get_inputs,
        device=device,
        verbose=verbose,
    )
    # todo generator & probs
    scores = np.concatenate(list(batch_outputs))
    return np.argmax(scores, axis=1)
def predict(self, eval_dataloader, get_inputs, num_gpus, gpu_ids, verbose=True):
    """
    Yield the first model output of every batch as a numpy array.

    Args:
        eval_dataloader: Iterable of batches to score.
        get_inputs (callable): Called as
            `get_inputs(batch, device, self.model_name, train_mode=False)`;
            its result is unpacked as keyword arguments to the model.
        num_gpus: Forwarded to `get_device` and `parallelize_model`.
        gpu_ids: Forwarded to `get_device` and `parallelize_model`.
        verbose (bool, optional): When False the progress bar is disabled.
            Defaults to True.

    Yields:
        numpy array: `outputs[0]` of each batch, detached and moved to CPU.
    """
    # Resolve the device, then place and (optionally) parallelize the model.
    device, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1)
    self.model = move_model_to_device(model=self.model, device=device)
    self.model = parallelize_model(
        model=self.model,
        device=device,
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        local_rank=-1,
    )

    self.model.eval()
    progress = tqdm(eval_dataloader, desc="Scoring", disable=not verbose)
    for batch in progress:
        with torch.no_grad():
            model_inputs = get_inputs(batch, device, self.model_name, train_mode=False)
            logits = self.model(**model_inputs)[0]
        yield logits.detach().cpu().numpy()
def prepare_model_and_optimizer(
    self,
    num_gpus,
    gpu_ids,
    local_rank,
    weight_decay,
    learning_rate,
    adam_epsilon,
    fp16=False,
    fp16_opt_level="O1",
    checkpoint_state_dict=None,
):
    """
    Initialize the default optimizer and place the model on a device.

    Intended to be called before fine-tuning. Child classes that need a
    custom optimizer should override this method or reproduce the steps
    below in the same order:

        1. Move model to device
        2. Create optimizer
        3. Initialize amp
        4. Parallelize model

    Args:
        num_gpus, gpu_ids, local_rank: Forwarded to `get_device` and
            `parallelize_model`.
        weight_decay, learning_rate, adam_epsilon: Forwarded to
            `Transformer.get_default_optimizer`.
        fp16 (bool, optional): Request mixed precision. Defaults to False.
        fp16_opt_level (str, optional): `opt_level` given to
            `amp.initialize`. Defaults to "O1".
        checkpoint_state_dict (dict, optional): When truthy, its
            "optimizer" and "model" entries (and "amp" when mixed precision
            is active) are loaded. Defaults to None.

    Returns:
        tuple: `(device, num_gpus, amp)`.
    """
    amp = get_amp(fp16)
    # Mixed precision is active only if requested AND amp is available.
    mixed_precision = fp16 and amp

    # Step 1: device resolution and model placement.
    device, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank)
    self.model = move_model_to_device(model=self.model, device=device)

    # Step 2: optimizer.
    self.optimizer = Transformer.get_default_optimizer(
        self.model, weight_decay, learning_rate, adam_epsilon
    )

    # Step 3: amp initialization.
    if mixed_precision:
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, opt_level=fp16_opt_level
        )

    # Restore saved state before the model gets wrapped for parallelism.
    if checkpoint_state_dict:
        self.optimizer.load_state_dict(checkpoint_state_dict["optimizer"])
        self.model.load_state_dict(checkpoint_state_dict["model"])
        if mixed_precision:
            amp.load_state_dict(checkpoint_state_dict["amp"])

    # Step 4: parallelization.
    self.model = parallelize_model(
        model=self.model,
        device=device,
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        local_rank=local_rank,
    )

    return device, num_gpus, amp
def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True):
    """
    Score a dataset using a fine-tuned model and a given dataloader.

    Args:
        test_dataloader (Dataloader): Dataloader for scoring the data.
        num_gpus (int, optional): Number of GPUs requested; forwarded to
            `get_device`. Defaults to 1.
        gpu_ids (list, optional): List of GPU IDs to be used; forwarded to
            `get_device` and to the parent `predict`. Defaults to None.
        verbose (bool, optional): Forwarded to the parent `predict`.
            Defaults to True.

    Returns:
        list: one entry per batch, as yielded by the parent `predict`
        (presumably numpy arrays of sentence scores -- confirm against the
        parent class).
    """
    # Only the resolved GPU count is needed here. gpu_ids is forwarded so the
    # count is resolved from the same request the parent predict receives.
    _, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1)

    return list(
        super().predict(
            eval_dataloader=test_dataloader,
            get_inputs=ExtSumProcessor.get_inputs,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            verbose=verbose,
        )
    )
def get_hidden_states(self, text, batch_size=32):
    """Extract the hidden states from the pretrained model.

    Args:
        text: List of documents to extract features from.
        batch_size: Batch size, defaults to 32.

    Returns:
        pd.DataFrame with columns text_index (int), token (str),
        layer_index (int), values (list[float]). One row is produced per
        (example, token, requested layer).
    """
    # NOTE(review): the previous condition was
    # `self.num_gpus == 0 or self.cuda`, which picked the CPU exactly when
    # CUDA was enabled. This assumes `self.cuda` is a "CUDA is usable" flag --
    # confirm against the class constructor.
    device = get_device("cpu" if self.num_gpus == 0 or not self.cuda else "gpu")
    self.model = move_to_device(self.model, device, self.num_gpus)
    self.model.eval()

    tokens = self.tokenizer.tokenize(text)
    tokens, input_ids, input_mask, _ = self.tokenizer.preprocess_encoder_tokens(
        tokens, max_len=self.max_len
    )

    input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
    input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)
    # The third dataset column carries each example's position so that batch
    # rows can be mapped back to `tokens`; it is not passed to the model.
    example_indices = torch.arange(input_ids.size(0), dtype=torch.long, device=device)

    eval_data = TensorDataset(input_ids, input_mask, example_indices)
    eval_dataloader = DataLoader(
        eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size
    )

    hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
    for input_ids_tensor, input_mask_tensor, example_indices_tensor in eval_dataloader:
        with torch.no_grad():
            all_encoder_layers, _ = self.model(
                input_ids_tensor, token_type_ids=None, attention_mask=input_mask_tensor
            )
        self.embedding_dim = all_encoder_layers[0].size()[-1]

        # Convert every requested layer to numpy once per batch instead of
        # once per token.
        layer_arrays = [
            (layer_index, all_encoder_layers[int(layer_index)].detach().cpu().numpy())
            for layer_index in self.layer_index
        ]

        for b, example_index in enumerate(example_indices_tensor):
            text_index = example_index.item()
            for i, token in enumerate(tokens[text_index]):
                for layer_index, layer_array in layer_arrays:
                    hidden_states["text_index"].append(text_index)
                    hidden_states["token"].append(token)
                    hidden_states["layer_index"].append(layer_index)
                    hidden_states["values"].append(
                        [round(x.item(), 6) for x in layer_array[b][i]]
                    )

        # Drop the batch tensors before asking CUDA to release cached memory.
        del input_ids_tensor, input_mask_tensor, example_indices_tensor
        torch.cuda.empty_cache()

    # Drop the full input tensors as well.
    del input_ids, input_mask, example_indices
    torch.cuda.empty_cache()

    return pd.DataFrame.from_dict(hidden_states)
def fit(
    self,
    train_dataloader,
    num_epochs=1,
    num_gpus=None,
    local_rank=-1,
    weight_decay=0.0,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,
    verbose=True,
    seed=None,
):
    """
    Fit the TokenClassifier model using the given training dataloader.

    Args:
        train_dataloader (DataLoader): DataLoader instance for training.
        num_epochs (int, optional): Number of training epochs.
            Defaults to 1.
        num_gpus (int, optional): Number of GPUs requested; forwarded to
            `get_device`. Defaults to None.
        local_rank (int, optional): Forwarded to `get_device`; -1 means no
            distributed training. Defaults to -1.
        weight_decay (float, optional): Weight decay rate. Defaults to 0.0.
        learning_rate (float, optional): The learning rate.
            Defaults to 5e-5.
        adam_epsilon (float, optional): The 'eps' parameter for the
            optimizer. Defaults to 1e-8.
        warmup_steps (int, optional): Number of warmup steps.
            Defaults to 0.
        verbose (bool, optional): Verbose mode. Defaults to True.
        seed (int, optional): Random seed. Defaults to None.
    """
    device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

    # A DataParallel wrapper keeps the real network in `.module`.
    network = self.model.module if isinstance(self.model, nn.DataParallel) else self.model
    network.to(device)

    fine_tune_args = dict(
        train_dataloader=train_dataloader,
        get_inputs=TokenClassificationProcessor.get_inputs,
        device=device,
        n_gpu=num_gpus,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        adam_epsilon=adam_epsilon,
        warmup_steps=warmup_steps,
        verbose=verbose,
        seed=seed,
    )
    super().fine_tune(**fine_tune_args)
def fit(
    self,
    train_dataloader,
    num_epochs=1,
    num_gpus=None,
    local_rank=-1,
    weight_decay=0.0,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,
    verbose=True,
    seed=None,
):
    """
    Fine-tune a pre-trained sequence classification model.

    Args:
        train_dataloader (Dataloader): Dataloader for the training data.
        num_epochs (int, optional): Number of training epochs.
            Defaults to 1.
        num_gpus (int, optional): Number of GPUs requested; forwarded to
            `get_device`. Defaults to None.
        local_rank (int, optional): Local rank for distributed training;
            -1 means non-distributed. Defaults to -1.
        weight_decay (float, optional): Weight decay. Defaults to 0.0.
        learning_rate (float, optional): Optimizer learning rate.
            Defaults to 5e-5.
        adam_epsilon (float, optional): Optimizer epsilon.
            Defaults to 1e-8.
        warmup_steps (int, optional): Number of learning-rate warmup steps.
            Defaults to 0.
        verbose (bool, optional): Whether to print the training log.
            Defaults to True.
        seed (int, optional): Random seed. Defaults to None.
    """
    device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

    # Move the underlying network, unwrapping DataParallel if present.
    wrapped = isinstance(self.model, nn.DataParallel)
    target = self.model.module if wrapped else self.model
    target.to(device)

    training_options = {
        "train_dataloader": train_dataloader,
        "get_inputs": Processor.get_inputs,
        "device": device,
        "n_gpu": num_gpus,
        "num_train_epochs": num_epochs,
        "weight_decay": weight_decay,
        "learning_rate": learning_rate,
        "adam_epsilon": adam_epsilon,
        "warmup_steps": warmup_steps,
        "verbose": verbose,
        "seed": seed,
    }
    super().fine_tune(**training_options)
def predict(self, eval_dataloader, num_gpus=1, verbose=True):
    """
    Return the predicted label index for every evaluated example.

    Args:
        eval_dataloader: Dataloader for the evaluation data.
        num_gpus (int, optional): Number of GPUs requested; forwarded to
            `get_device`. Defaults to 1.
        verbose (bool, optional): Forwarded to the parent `predict`.
            Defaults to True.

    Returns:
        numpy array: argmax along axis 1 of the concatenated batch outputs.
    """
    device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)

    # Move the underlying network, unwrapping DataParallel if present.
    target = self.model.module if isinstance(self.model, nn.DataParallel) else self.model
    target.to(device)

    per_batch = list(
        super().predict(
            eval_dataloader=eval_dataloader,
            get_inputs=Processor.get_inputs,
            device=device,
            verbose=verbose,
        )
    )
    # todo generator & probs
    stacked = np.concatenate(per_batch)
    return np.argmax(stacked, axis=1)
def predict(self, eval_dataloader, num_gpus=None, verbose=True):
    """
    Run the model on an evaluation dataloader and return raw predictions.

    Args:
        eval_dataloader: Dataloader for the evaluation data.
        num_gpus (int, optional): Number of GPUs requested; forwarded to
            `get_device`. Defaults to None.
        verbose (bool, optional): Verbose mode, forwarded to the parent
            `predict`. Defaults to True.

    Returns:
        ndarray: concatenation of the per-batch outputs of the parent
        `predict`. Per the original documentation the shape is
        [number_of_examples, sequence_length, number_of_labels] and the
        values are not normalized, so post-processing is needed to obtain
        class probabilities.
    """
    device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)

    # A DataParallel wrapper keeps the real network in `.module`.
    network = self.model.module if isinstance(self.model, nn.DataParallel) else self.model
    network.to(device)

    batch_outputs = super().predict(
        eval_dataloader=eval_dataloader,
        get_inputs=TokenClassificationProcessor.get_inputs,
        device=device,
        verbose=verbose,
    )
    return np.concatenate(list(batch_outputs))
def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True):
    """
    Yield the first model output of every batch as a numpy array.

    Args:
        eval_dataloader: Iterable of batches; every element of a batch
            must support `.to(device)`.
        get_inputs (callable): Called as
            `get_inputs(batch, self.model_name, train_mode=False)`; its
            result is unpacked as keyword arguments to the model.
        n_gpu (int, optional): Number of GPUs requested; forwarded to
            `get_device`. Defaults to 1.
        verbose (bool, optional): When False the progress bar is disabled.
            Defaults to True.

    Yields:
        numpy array: `outputs[0]` of each batch, detached and moved to CPU.
    """
    device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)

    # Start from the bare network, then re-wrap only if several GPUs are used.
    network = self.model
    if isinstance(network, torch.nn.DataParallel):
        network = network.module
    if num_gpus > 1:
        network = torch.nn.DataParallel(network, device_ids=list(range(num_gpus)))
    self.model = network

    self.model.to(device)
    self.model.eval()

    progress = tqdm(eval_dataloader, desc="Evaluating", disable=not verbose)
    for raw_batch in progress:
        on_device = tuple(t.to(device) for t in raw_batch)
        with torch.no_grad():
            model_inputs = get_inputs(on_device, self.model_name, train_mode=False)
            logits = self.model(**model_inputs)[0]
        yield logits.detach().cpu().numpy()
def fit(
    self,
    train_dataloader,
    num_epochs=1,
    num_gpus=None,
    local_rank=-1,
    weight_decay=0.0,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,
    verbose=True,
    seed=None,
):
    """
    Fine-tune a pre-trained sequence classification model.

    Resolves the device with `get_device`, moves the model onto it and
    delegates to the parent class's `fine_tune` with `Processor.get_inputs`.
    All optimisation arguments are passed through unchanged.
    """
    device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

    # A DataParallel wrapper keeps the real network in `.module`.
    if isinstance(self.model, nn.DataParallel):
        network = self.model.module
    else:
        network = self.model
    network.to(device)

    passthrough = dict(
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        adam_epsilon=adam_epsilon,
        warmup_steps=warmup_steps,
        verbose=verbose,
        seed=seed,
    )
    super().fine_tune(
        train_dataloader=train_dataloader,
        get_inputs=Processor.get_inputs,
        device=device,
        n_gpu=num_gpus,
        num_train_epochs=num_epochs,
        **passthrough,
    )
def fit(
    self,
    train_dataset,
    num_gpus=None,
    gpu_ids=None,
    batch_size=4,
    local_rank=-1,
    max_steps=5e4,
    warmup_steps_bert=20000,
    warmup_steps_dec=10000,
    learning_rate_bert=0.002,
    learning_rate_dec=0.2,
    optimization_method="adam",
    max_grad_norm=0,
    beta1=0.9,
    beta2=0.999,
    decay_method="noam",
    gradient_accumulation_steps=1,
    report_every=10,
    save_every=1000,
    verbose=True,
    seed=None,
    fp16=False,
    fp16_opt_level="O2",
    world_size=1,
    rank=0,
    validation_function=None,
    checkpoint=None,
    **kwargs,
):
    """
    Fine-tune a pre-trained transformer model for summarization.

    Args:
        train_dataset (SummarizationDataset): Training dataset.
        num_gpus (int, optional): The number of GPUs to use; forwarded to
            `get_device`. Defaults to None.
        gpu_ids (list, optional): List of GPU IDs to be used.
            Defaults to None.
        batch_size (int, optional): Batch size given to the training
            DataLoader. Defaults to 4.
        local_rank (int, optional): Local rank for distributed training.
            -1 means non-distributed training (a RandomSampler is used
            instead of a DistributedSampler). Defaults to -1.
        max_steps (int, optional): Maximum number of training steps.
            Defaults to 5e4.
        warmup_steps_bert (int, optional): Warmup steps for the BERT
            encoder optimizer. Defaults to 20000.
        warmup_steps_dec (int, optional): Warmup steps for the decoder
            optimizer. Defaults to 10000.
        learning_rate_bert (float, optional): Learning rate of the encoder
            optimizer. Defaults to 0.002.
        learning_rate_dec (float, optional): Learning rate of the decoder
            optimizer. Defaults to 0.2.
        optimization_method (string, optional): Optimization method.
            Defaults to "adam".
        max_grad_norm (float, optional): Maximum gradient norm for
            gradient clipping. Defaults to 0.
        beta1 (float, optional): Exponential decay rate for the first
            moment estimates. Defaults to 0.9.
        beta2 (float, optional): Exponential decay rate for the
            second-moment estimates. Defaults to 0.999.
        decay_method (string, optional): Learning rate decrease method.
            Defaults to "noam". Not referenced in this method's body.
        gradient_accumulation_steps (int, optional): Number of batches to
            accumulate gradients on between parameter updates.
            Defaults to 1.
        report_every (int, optional): Logging interval in steps.
            Defaults to 10.
        save_every (int, optional): Model saving interval in steps.
            Defaults to 1000.
        verbose (bool, optional): Whether to print the training log.
            Defaults to True.
        seed (int, optional): Random seed. Defaults to None.
        fp16 (bool, optional): Whether to use mixed precision training.
            Defaults to False.
        fp16_opt_level (str, optional): Apex optimization level, one of
            "O0", "O1", "O2", "O3". Defaults to "O2".
        world_size (int, optional): Total number of GPUs used in
            distributed training. Defaults to 1.
        rank (int, optional): Global rank of the current GPU in
            distributed training. Defaults to 0.
        validation_function (function, optional): Function used during
            fitting to validate performance. Defaults to None.
        checkpoint (str, optional): File path of a checkpoint from which
            training continues. Defaults to None.
        **kwargs: Accepted for compatibility; not referenced in this
            method's body.
    """
    device, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank)
    print("device is {}".format(device))

    if checkpoint:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        # Tensors are mapped to CPU first; the model is moved to the target
        # device right below.
        checkpoint = torch.load(checkpoint, map_location="cpu")
        self.model.load_checkpoint(checkpoint["model"])
    self.model = move_model_to_device(model=self.model, device=device)

    # One optimizer for the encoder, one for the decoder.
    self.optim_bert = model_builder.build_optim_bert(
        self.model,
        optim=optimization_method,
        lr_bert=learning_rate_bert,
        warmup_steps_bert=warmup_steps_bert,
        max_grad_norm=max_grad_norm,
        beta1=beta1,
        beta2=beta2,
    )
    self.optim_dec = model_builder.build_optim_dec(
        self.model,
        optim=optimization_method,
        lr_dec=learning_rate_dec,
        warmup_steps_dec=warmup_steps_dec,
        max_grad_norm=max_grad_norm,
        beta1=beta1,
        beta2=beta2,
    )
    optimizers = [self.optim_bert, self.optim_dec]

    self.amp = get_amp(fp16)
    if self.amp:
        # Keep using the optimizers amp hands back, as its documentation
        # requires, instead of discarding them.
        self.model, optimizers = self.amp.initialize(
            self.model, optimizers, opt_level=fp16_opt_level
        )
        self.optim_bert, self.optim_dec = optimizers

    global_step = 0
    if checkpoint:
        if checkpoint["optimizers"]:
            for i, optimizer in enumerate(optimizers):
                model_builder.load_optimizer_checkpoint(
                    optimizer, checkpoint["optimizers"][i]
                )
        if self.amp and checkpoint.get("amp"):
            self.amp.load_state_dict(checkpoint["amp"])
        if checkpoint.get("global_step"):
            # Step counters must stay integral; true division produced a
            # float step number.
            global_step = int(checkpoint["global_step"] // world_size)
            print("global_step is {}".format(global_step))

    self.model = parallelize_model(
        model=self.model,
        device=device,
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        local_rank=local_rank,
        apex=self.amp,
    )

    if local_rank == -1:
        sampler = RandomSampler(train_dataset)
    else:
        sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)

    def collate_fn(data):
        return self.processor.collate(data, block_size=self.max_pos_length, device=device)

    train_dataloader = DataLoader(
        train_dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate_fn
    )

    # compute the max number of training steps
    max_steps = compute_training_steps(
        train_dataloader,
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
    )

    super().fine_tune(
        train_dataloader=train_dataloader,
        get_inputs=BertSumAbsProcessor.get_inputs,
        device=device,
        num_gpus=num_gpus,
        max_steps=max_steps,
        global_step=global_step,
        max_grad_norm=max_grad_norm,
        gradient_accumulation_steps=gradient_accumulation_steps,
        verbose=verbose,
        seed=seed,
        report_every=report_every,
        save_every=save_every,
        clip_grad_norm=False,
        optimizer=optimizers,
        scheduler=None,
        fp16=fp16,
        amp=self.amp,
        validation_function=validation_function,
    )

    # release GPU memories
    self.model.cpu()
    torch.cuda.empty_cache()

    self.save_model(max_steps)
def predict(self, token_ids, input_mask, labels=None, batch_size=32, num_gpus=None,
            probabilities=False):
    """
    Predict token labels on the testing data.

    Args:
        token_ids (list): List of lists. Each sublist contains numerical
            token ids corresponding to the tokens in the input text data.
        input_mask (list): List of lists. Each sublist contains the
            attention mask of the input token list, 1 for input tokens and
            0 for padded tokens.
        labels (list, optional): List of lists of numerical token labels.
            If provided (non-empty), it is used to compute and print the
            evaluation loss. Defaults to None.
        batch_size (int, optional): Testing batch size. Defaults to 32.
        num_gpus (int, optional): The number of GPUs to use; forwarded to
            `get_device`. Defaults to None.
        probabilities (bool, optional): If True, also return the softmax
            probability of the predicted class of every token.
            Defaults to False.

    Returns:
        list or namedtuple(list, ndarray): List of lists of predicted
        token labels, or `Predictions(classes, probabilities)` when
        `probabilities` is True. When the dataloader yields no batch the
        class list is empty and the probability array has shape (0, 0).
    """
    Predictions = namedtuple("Predictions", "classes probabilities")

    test_dataloader = create_data_loader(
        input_ids=token_ids,
        input_mask=input_mask,
        label_ids=labels,
        batch_size=batch_size,
        sample_method="sequential",
    )
    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)
    self.model.eval()

    # Decided once up front so it is defined even when there are no batches.
    true_label_available = labels is not None and len(labels) > 0
    loss_fct = nn.CrossEntropyLoss()

    eval_loss = 0
    nb_eval_steps = 0
    logits_batches = []
    for batch in tqdm(test_dataloader, desc="Iteration", mininterval=10):
        batch = tuple(t.to(device) for t in batch)
        if true_label_available:
            b_input_ids, b_input_mask, b_labels = batch
        else:
            b_input_ids, b_input_mask = batch

        with torch.no_grad():
            logits = self.model(b_input_ids, attention_mask=b_input_mask)
            if true_label_available:
                # Only positions with mask == 1 contribute to the loss.
                active = b_input_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active]
                active_labels = b_labels.view(-1)[active]
                eval_loss += loss_fct(active_logits, active_labels).mean().item()

        # Collect and concatenate once at the end; growing an array with
        # np.append copies everything on every batch.
        logits_batches.append(logits.detach().cpu().numpy())
        nb_eval_steps += 1

    if not logits_batches:
        # Nothing was scored: return empty results instead of failing on an
        # unbound accumulator.
        if probabilities:
            return Predictions([], np.empty((0, 0)))
        return []

    logits_all = np.concatenate(logits_batches, axis=0)
    predictions = [list(p) for p in np.argmax(logits_all, axis=2)]

    if true_label_available:
        validation_loss = eval_loss / nb_eval_steps
        print("Evaluation loss: {}".format(validation_loss))

    if probabilities:
        softmax = nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy()
        return Predictions(predictions, np.max(softmax, 2))
    return predictions
def fit(
    self,
    token_ids,
    input_mask,
    labels,
    num_gpus=None,
    num_epochs=1,
    batch_size=32,
    learning_rate=2e-5,
    warmup_proportion=None,
):
    """
    Fine-tune the BERT classifier on the given training data.

    Args:
        token_ids (list): List of lists of numerical token ids, one
            sublist per input text.
        input_mask (list): List of lists holding the attention mask of
            each token id list: 1 for input tokens, 0 for padding.
        labels (list): List of lists of numerical token labels, one
            sublist per input sentence/paragraph.
        num_gpus (int, optional): The number of GPUs to use; forwarded to
            `get_device`. Defaults to None.
        num_epochs (int, optional): Number of training epochs.
            Defaults to 1.
        batch_size (int, optional): Training batch size. Defaults to 32.
        learning_rate (float, optional): Learning rate of the optimizer.
            Defaults to 2e-5.
        warmup_proportion (float, optional): Proportion of training used
            for linear learning rate warmup, e.g. 0.1 = 10% of training.
            Defaults to None.
    """
    train_dataloader = create_data_loader(
        input_ids=token_ids,
        input_mask=input_mask,
        label_ids=labels,
        sample_method="random",
        batch_size=batch_size,
    )

    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    # Never count more GPUs than the machine exposes.
    visible_gpus = torch.cuda.device_count()
    num_gpus_used = visible_gpus if num_gpus is None else min(num_gpus, visible_gpus)
    average_over_gpus = num_gpus_used > 1

    steps_per_epoch = int(len(token_ids) / batch_size)
    num_train_optimization_steps = max(steps_per_epoch * num_epochs, 1)
    optimizer = self._get_optimizer(
        learning_rate=learning_rate,
        num_train_optimization_steps=num_train_optimization_steps,
        warmup_proportion=warmup_proportion,
    )

    self.model.train()
    for _ in trange(int(num_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0
        for raw_batch in tqdm(train_dataloader, desc="Iteration", mininterval=30):
            b_token_ids, b_input_mask, b_label_ids = tuple(t.to(device) for t in raw_batch)

            loss = self.model(
                input_ids=b_token_ids, attention_mask=b_input_mask, labels=b_label_ids
            )
            if average_over_gpus:
                # mean() to average on multi-gpu.
                loss = loss.mean()

            # Accumulate gradients, record the loss, then update and reset.
            loss.backward()
            tr_loss += loss.item()
            nb_tr_steps += 1
            optimizer.step()
            optimizer.zero_grad()

        print("Train loss: {}".format(tr_loss / nb_tr_steps))

    torch.cuda.empty_cache()
def fit(
    self,
    token_ids,
    input_mask,
    labels,
    val_token_ids,
    val_input_mask,
    val_labels,
    token_type_ids=None,
    val_token_type_ids=None,
    verbose=True,
    logging_steps=0,
    save_steps=0,
    val_steps=0,
):
    """Fine-tunes the XLNet classifier using the given training data.

    Args:
        token_ids (list): List of training token id lists.
        input_mask (list): List of input mask lists.
        labels (list): List of training labels.
        val_token_ids (list): List of validation token id lists.
        val_input_mask (list): List of validation input mask lists.
        val_labels (list): List of validation labels.
        token_type_ids (list, optional): List of lists. Each sublist
            contains segment ids indicating if the token belongs to
            the first sentence(0) or second sentence(1). Only needed
            for two-sentence tasks.
        val_token_type_ids (list, optional): Segment ids of the validation
            data. Must be provided whenever `token_type_ids` is.
        verbose (bool, optional): If True, shows the training progress and
            loss values. Defaults to True.
        logging_steps (int, optional): If > 0, log learning rate and
            training loss to mlflow every this many steps. Defaults to 0.
        save_steps (int, optional): If > 0, save and log a checkpoint
            every this many steps. Defaults to 0.
        val_steps (int, optional): If > 0, run validation every this many
            steps. Defaults to 0.
    """
    device, _ = get_device(self.num_gpus)
    self.model = move_to_device(self.model, device, self.num_gpus)

    def as_long(values):
        # All model inputs and labels are integer tensors.
        return torch.tensor(values, dtype=torch.long)

    def save_checkpoint(tag):
        # Write the model weights to ./checkpoints/<tag>.pth and log them.
        checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
        os.makedirs(checkpoint_dir, exist_ok=True)
        checkpoint_path = os.path.join(checkpoint_dir, "{}.pth".format(tag))
        torch.save(self.model.state_dict(), checkpoint_path)
        mlflow.log_artifact(checkpoint_path)

    has_token_types = bool(token_type_ids)
    if has_token_types:
        train_dataset = TensorDataset(
            as_long(token_ids), as_long(input_mask), as_long(token_type_ids), as_long(labels)
        )
        val_dataset = TensorDataset(
            as_long(val_token_ids),
            as_long(val_input_mask),
            as_long(val_token_type_ids),
            as_long(val_labels),
        )
    else:
        train_dataset = TensorDataset(
            as_long(token_ids), as_long(input_mask), as_long(labels)
        )
        val_dataset = TensorDataset(
            as_long(val_token_ids), as_long(val_input_mask), as_long(val_labels)
        )

    # define optimizer and model parameters
    param_optimizer = list(self.model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": self.weight_decay,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    val_sampler = RandomSampler(val_dataset)
    val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=self.batch_size)

    num_examples = len(token_ids)
    num_batches = int(np.ceil(num_examples / self.batch_size))
    num_train_optimization_steps = num_batches * self.num_epochs

    optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr, eps=self.adam_eps)
    scheduler = WarmupLinearSchedule(
        optimizer, warmup_steps=self.warmup_steps, t_total=num_train_optimization_steps
    )

    # Pre-bind the batch names so the cleanup at the end never hits an
    # unbound name when a loop body did not run.
    x_batch = y_batch = mask_batch = token_type_ids_batch = None
    val_x_batch = val_y_batch = val_mask_batch = val_token_type_ids_batch = None

    global_step = 0
    self.model.train()
    optimizer.zero_grad()
    for epoch in range(self.num_epochs):
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=self.batch_size
        )

        tr_loss = 0.0
        logging_loss = 0.0
        val_loss = 0.0
        val_batches = 0
        for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            if has_token_types:
                x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                    t.to(device) for t in batch
                )
            else:
                token_type_ids_batch = None
                x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)

            outputs = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=y_batch,
            )
            # model outputs are always tuple in pytorch-transformers
            batch_loss = outputs[0].sum()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            tr_loss += batch_loss.item()

            optimizer.step()
            # Update learning rate schedule
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

            # logging of learning rate and loss
            if logging_steps > 0 and global_step % logging_steps == 0:
                mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step)
                mlflow.log_metric(
                    "training loss",
                    (tr_loss - logging_loss) / (logging_steps * self.batch_size),
                    step=global_step,
                )
                logging_loss = tr_loss

            # model checkpointing
            if save_steps > 0 and global_step % save_steps == 0:
                save_checkpoint(global_step)

            # model validation
            if val_steps > 0 and global_step % val_steps == 0:
                self.model.eval()
                val_loss = 0.0
                val_batches = 0
                # No gradients are needed while scoring the validation set.
                with torch.no_grad():
                    for val_batch in val_dataloader:
                        if has_token_types:
                            (
                                val_x_batch,
                                val_mask_batch,
                                val_token_type_ids_batch,
                                val_y_batch,
                            ) = tuple(t.to(device) for t in val_batch)
                        else:
                            # Previously this reset the *training* variable,
                            # leaving the validation one undefined.
                            val_token_type_ids_batch = None
                            val_x_batch, val_mask_batch, val_y_batch = tuple(
                                t.to(device) for t in val_batch
                            )
                        val_outputs = self.model(
                            input_ids=val_x_batch,
                            token_type_ids=val_token_type_ids_batch,
                            attention_mask=val_mask_batch,
                            labels=val_y_batch,
                        )
                        val_loss += val_outputs[0].sum().item()
                        val_batches += 1
                mlflow.log_metric(
                    "validation loss", val_loss / len(val_dataset), step=global_step
                )
                self.model.train()

            if verbose and i % ((num_batches // 10) + 1) == 0:
                batch_range_end = min(i + 1 + num_batches // 10, num_batches)
                if val_loss > 0:
                    print(
                        "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f}; "
                        "average val loss:{:.6f}".format(
                            epoch + 1,
                            self.num_epochs,
                            i + 1,
                            batch_range_end,
                            num_batches,
                            tr_loss / (i + 1),
                            val_loss / val_batches,
                        )
                    )
                else:
                    print(
                        "epoch:{}/{}; batch:{}->{}/{}; average train loss:{:.6f}".format(
                            epoch + 1,
                            self.num_epochs,
                            i + 1,
                            batch_range_end,
                            num_batches,
                            tr_loss / (i + 1),
                        )
                    )

    save_checkpoint("final")

    # Drop references to the last batches before releasing cached GPU memory.
    x_batch = y_batch = mask_batch = token_type_ids_batch = None
    val_x_batch = val_y_batch = val_mask_batch = val_token_type_ids_batch = None
    torch.cuda.empty_cache()
def predict(
    self,
    token_ids,
    input_mask,
    token_type_ids=None,
    num_gpus=None,
    batch_size=8,
    probabilities=False,
):
    """Scores the given dataset and returns the predicted classes.

    Args:
        token_ids (list): List of token id lists to score.
        input_mask (list): List of input mask lists.
        token_type_ids (list, optional): List of lists. Each sublist
            contains segment ids indicating if the token belongs to
            the first sentence(0) or second sentence(1). Only needed
            for two-sentence tasks. When omitted, the model is called
            with `token_type_ids=None`.
        num_gpus (int, optional): The number of gpus to use; forwarded to
            `get_device`. Defaults to None.
        batch_size (int, optional): Scoring batch size. Defaults to 8.
        probabilities (bool, optional):
            If True, the predicted probability distribution
            is also returned. Defaults to False.

    Returns:
        1darray, namedtuple(1darray, ndarray): Predicted classes or
            (classes, probabilities) if probabilities is True.

    Raises:
        ValueError: from `np.concatenate` when `token_ids` is empty.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    self.model.eval()
    preds = []

    with tqdm(total=len(token_ids)) as pbar:
        for start in range(0, len(token_ids), batch_size):
            end = start + batch_size
            x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)
            mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)
            # token_type_ids is optional; slicing None used to raise.
            if token_type_ids:
                token_type_ids_batch = torch.tensor(
                    token_type_ids[start:end], dtype=torch.long, device=device
                )
            else:
                token_type_ids_batch = None

            with torch.no_grad():
                pred_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
            preds.append(pred_batch[0].detach().cpu().numpy())
            # Advance by the real batch size so the last, possibly shorter,
            # batch does not push the bar past its total.
            pbar.update(x_batch.size(0))

    preds = np.concatenate(preds)

    if probabilities:
        return namedtuple("Predictions", "classes probabilities")(
            preds.argmax(axis=1), nn.Softmax(dim=1)(torch.Tensor(preds)).numpy()
        )
    return preds.argmax(axis=1)
def test_get_device_local_rank():
    """Requesting local rank 0 should yield CUDA device 0 and a GPU count of 1."""
    selected_device, gpu_count = get_device(local_rank=0)

    assert isinstance(selected_device, torch.device)
    assert selected_device.type == "cuda"
    assert selected_device.index == 0
    assert gpu_count == 1
def predict(
    self,
    token_ids,
    input_mask,
    token_type_ids=None,
    num_gpus=None,
    batch_size=32,
    probabilities=False,
):
    """Score a tokenized dataset and return the predicted class indices.

    Args:
        token_ids (list): List of token id lists to score.
        input_mask (list): List of input mask lists.
        token_type_ids (list, optional): List of segment id lists
            (0 for the first sentence, 1 for the second). Only needed
            for two-sentence tasks. Defaults to None.
        num_gpus (int, optional): The number of gpus to use.
            If None is specified, all available GPUs will be used.
            Defaults to None.
        batch_size (int, optional): Scoring batch size. Defaults to 32.
        probabilities (bool, optional): If True, the predicted probability
            distribution is also returned. Defaults to False.

    Returns:
        1darray, namedtuple(1darray, ndarray): Predicted classes or
            (classes, probabilities) if probabilities is True.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_model_to_device(self.model, device, num_gpus)
    self.model.eval()

    # Assemble the dataset columns; the segment-id column is optional.
    has_segments = bool(token_type_ids)
    columns = [
        torch.tensor(token_ids, dtype=torch.long),
        torch.tensor(input_mask, dtype=torch.long),
    ]
    if has_segments:
        columns.append(torch.tensor(token_type_ids, dtype=torch.long))
    test_dataset = TensorDataset(*columns)

    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size,
    )

    batch_outputs = []
    for batch in tqdm(test_dataloader, desc="Iteration"):
        on_device = tuple(t.to(device) for t in batch)
        if has_segments:
            x_batch, mask_batch, token_type_ids_batch = on_device
        else:
            x_batch, mask_batch = on_device
            token_type_ids_batch = None

        with torch.no_grad():
            p_batch = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )
        batch_outputs.append(p_batch.cpu())

    preds = np.concatenate(batch_outputs)
    classes = preds.argmax(axis=1)
    if not probabilities:
        return classes

    softmax = nn.Softmax(dim=1)
    return namedtuple("Predictions", "classes probabilities")(
        classes, softmax(torch.Tensor(preds)).numpy()
    )
def predict(
    self,
    test_dataset,
    per_gpu_batch_size=4,
    max_tgt_length=64,
    beam_size=1,
    need_score_traces=False,
    length_penalty=0,
    forbid_duplicate_ngrams=True,
    forbid_ignore_word=".",
    s2s_config=S2SConfig(),
    num_gpus=None,
    gpu_ids=None,
    local_rank=-1,
    fp16=False,
    verbose=True,
):
    """
    Method for predicting, i.e. generating summaries.

    Args:
        test_dataset (S2SAbsSumDataset): Testing dataset.
        per_gpu_batch_size (int, optional): Number of testing samples in each
            batch per GPU. Defaults to 4.
        max_tgt_length (int, optional): Maximum number of tokens in output
            sequence. Defaults to 64.
        beam_size (int, optional): Beam size of beam search. Defaults to 1.
        need_score_traces (bool, optional): Whether to return score traces of
            beam search. Defaults to False.
        length_penalty (float, optional): Length penalty for beam search.
            Defaults to 0.
        forbid_duplicate_ngrams (bool, optional): Whether to forbid duplicate
            n-grams when generating output. Size of the n-gram is determined by
            `S2SConfig.ngram_size` which defaults to 3. Defaults to True.
        forbid_ignore_word (str, optional): Words to ignore when forbidding
            duplicate ngrams. Multiple words should be separated by "|", for
            example, ".|[X_SEP]". Defaults to ".".
        s2s_config (S2SConfig, optional): Some default decoding settings that
            the users usually don't need to change. Defaults to S2SConfig().
            NOTE(review): the default instance is created once at definition
            time and shared by all calls; this method only reads from it.
        num_gpus (int, optional): Number of GPUs to use. Ignored if `gpu_ids` is
            provided. Defaults to None and all available GPUs are used.
        gpu_ids (list, optional): List of GPU IDs to use. Defaults to None and GPUs
            used are determined by num_gpus.
        local_rank (int, optional): Rank of the device in distributed training.
            Defaults to -1 which means non-distributed training.
        fp16 (bool, optional): Whether to use 16-bit mixed precision through
            Apex. Defaults to False.
        verbose(bool, optional): Whether to output predicting log. Defaults to True.

    Returns:
        List or tuple of lists: List of generated summaries. If `need_score_traces`
            is True, also returns the score traces of beam search.

    Raises:
        ValueError: If score traces are requested with `beam_size` <= 1, or if
            `max_tgt_length` >= `self.max_seq_length` - 2.
    """
    if need_score_traces and beam_size <= 1:
        raise ValueError(
            "Score trace is only available for beam search with beam size > 1."
        )
    if max_tgt_length >= self.max_seq_length - 2:
        raise ValueError("Maximum tgt length exceeds max seq length - 2.")

    # preprocessing pipeline
    if self._model_type == "roberta":
        is_roberta = True
        no_segment_embedding = True
        vocab = self.tokenizer.encoder
    else:
        is_roberta = False
        no_segment_embedding = False
        vocab = self.tokenizer.vocab

    # A name starting with "unilm1.2" can never match either prefix below,
    # so a single tuple-prefix test covers the original nested condition.
    new_segment_ids = self._model_name.startswith(("unilm-", "unilm1-"))

    cls_token = "<s>" if is_roberta else "[CLS]"
    sep_token = "</s>" if is_roberta else "[SEP]"
    pad_token = "<pad>" if is_roberta else "[PAD]"
    mask_token = "<mask>" if is_roberta else "[MASK]"

    max_src_length = self.max_seq_length - 2 - max_tgt_length
    bi_uni_pipeline = [
        seq2seq_loader.Preprocess4Seq2seqDecoder(
            list(vocab.keys()),
            self.tokenizer.convert_tokens_to_ids,
            self.max_seq_length,
            max_tgt_length=max_tgt_length,
            new_segment_ids=new_segment_ids,
            mode=s2s_config.mode,
            num_qkv=s2s_config.num_qkv,
            s2s_special_token=s2s_config.s2s_special_token,
            s2s_add_segment=s2s_config.s2s_add_segment,
            s2s_share_segment=s2s_config.s2s_share_segment,
            pos_shift=s2s_config.pos_shift,
            cls_token=cls_token,
            sep_token=sep_token,
            pad_token=pad_token,
        )
    ]

    def collate_fn(input_batch):
        # Each item is indexed as (x[0], x[1]): x[0] is later used as the
        # output position, x[1] is truncated to the source length budget.
        buf_id = [x[0] for x in input_batch]
        buf = [x[1][:max_src_length] for x in input_batch]
        max_a_len = max(len(x) for x in buf)
        instances = []
        for instance in [(x, max_a_len) for x in buf]:
            for proc in bi_uni_pipeline:
                instance = proc(instance)
            instances.append(instance)
        batch = seq2seq_loader.batch_list_to_batch_tensors(instances)
        return (batch, buf_id)

    # prepare decoder
    pair_num_relation = 0
    cls_num_labels = 2
    type_vocab_size = (
        6 + (1 if s2s_config.s2s_add_segment else 0) if new_segment_ids else 2
    )
    (
        mask_word_id,
        eos_word_ids,
        sos_word_id,
    ) = self.tokenizer.convert_tokens_to_ids([mask_token, sep_token, sep_token])

    forbid_ignore_set = None
    if forbid_ignore_word:
        w_list = []
        for w in forbid_ignore_word.split("|"):
            # Bracketed words such as "[x_sep]" are upper-cased before lookup.
            if w.startswith("[") and w.endswith("]"):
                w_list.append(w.upper())
            else:
                w_list.append(w)
        forbid_ignore_set = set(self.tokenizer.convert_tokens_to_ids(w_list))

    # Unwrap a parallel wrapper (anything exposing `.module`) to get the weights.
    if hasattr(self.model, "module"):
        state_dict = self.model.module.state_dict()
    else:
        state_dict = self.model.state_dict()

    model = BertForSeq2SeqDecoder.from_pretrained(
        self._bert_model_name,
        state_dict=state_dict,
        num_labels=cls_num_labels,
        num_rel=pair_num_relation,
        type_vocab_size=type_vocab_size,
        task_idx=3,
        mask_word_id=mask_word_id,
        search_beam_size=beam_size,
        length_penalty=length_penalty,
        eos_id=eos_word_ids,
        sos_id=sos_word_id,
        forbid_duplicate_ngrams=forbid_duplicate_ngrams,
        forbid_ignore_set=forbid_ignore_set,
        ngram_size=s2s_config.forbid_ngram_size,
        min_len=s2s_config.min_len,
        mode=s2s_config.mode,
        max_position_embeddings=self.max_seq_length,
        ffn_type=s2s_config.ffn_type,
        num_qkv=s2s_config.num_qkv,
        seg_emb=s2s_config.seg_emb,
        pos_shift=s2s_config.pos_shift,
        is_roberta=is_roberta,
        no_segment_embedding=no_segment_embedding,
    )

    del state_dict

    if fp16:
        model.half()

    # get device
    device, num_gpus = get_device(
        num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank
    )

    # move model
    model = move_model_to_device(model=model, device=device)

    batch_size = per_gpu_batch_size * max(1, num_gpus)

    model = parallelize_model(
        model=model,
        device=device,
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        local_rank=local_rank,
    )

    model.eval()
    first_batch = True
    batch_count = 0

    output_lines = [""] * len(test_dataset)
    score_trace_list = [None] * len(test_dataset)

    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )

    # Bind `batch` up front so the cleanup `del batch` below cannot raise
    # UnboundLocalError when the dataloader yields nothing (empty dataset).
    batch = None
    for batch, buf_id in tqdm(test_dataloader, desc="Evaluating", disable=not verbose):
        batch_count += 1
        with torch.no_grad():
            batch = [t.to(device) if t is not None else None for t in batch]
            (
                input_ids,
                token_type_ids,
                position_ids,
                input_mask,
                mask_qkv,
                task_idx,
            ) = batch
            traces = model(
                input_ids,
                token_type_ids,
                position_ids,
                input_mask,
                task_idx=task_idx,
                mask_qkv=mask_qkv,
            )
            if beam_size > 1:
                traces = {k: v.tolist() for k, v in traces.items()}
                output_ids = traces["pred_seq"]
            else:
                output_ids = traces.tolist()

            for i in range(len(batch[0])):
                w_ids = output_ids[i]
                output_buf = self.tokenizer.convert_ids_to_tokens(w_ids)
                # Keep tokens up to (not including) the first separator/pad.
                output_tokens = []
                for t in output_buf:
                    if t in (sep_token, pad_token):
                        break
                    output_tokens.append(t)
                if is_roberta:
                    output_sequence = self.tokenizer.convert_tokens_to_string(
                        output_tokens
                    )
                else:
                    output_sequence = " ".join(detokenize(output_tokens))
                if "\n" in output_sequence:
                    output_sequence = " [X_SEP] ".join(output_sequence.split("\n"))
                output_lines[buf_id[i]] = output_sequence
                if first_batch or batch_count % 50 == 0:
                    logger.info("{} = {}".format(buf_id[i], output_sequence))
                if need_score_traces:
                    score_trace_list[buf_id[i]] = {
                        "scores": traces["scores"][i],
                        "wids": traces["wids"][i],
                        "ptrs": traces["ptrs"][i],
                    }

        first_batch = False

    del model
    del batch
    torch.cuda.empty_cache()

    if need_score_traces:
        return output_lines, score_trace_list
    return output_lines
def fit(
    self,
    train_dataset,
    num_gpus=None,
    gpu_ids=None,
    batch_size=3000,
    local_rank=-1,
    max_steps=5e5,
    warmup_steps=1e5,
    learning_rate=2e-3,
    optimization_method="adam",
    max_grad_norm=0,
    beta1=0.9,
    beta2=0.999,
    decay_method="noam",
    gradient_accumulation_steps=1,
    report_every=50,
    verbose=True,
    seed=None,
    save_every=-1,
    world_size=1,
    rank=0,
    use_preprocessed_data=False,
    **kwargs,
):
    """
    Fine-tune pre-trained transformer models for extractive summarization.

    Args:
        train_dataset (ExtSumProcessedIterableDataset): Training dataset.
        num_gpus (int, optional): The number of GPUs to use. If None, all
            available GPUs will be used. If set to 0 or GPUs are not available,
            CPU device will be used. Defaults to None.
        gpu_ids (list): List of GPU IDs to be used.
            If set to None, the first num_gpus GPUs will be used.
            Defaults to None.
        batch_size (int, optional): Maximum number of tokens in each batch.
            Defaults to 3000.
        local_rank (int, optional): Local_rank for distributed training on GPUs.
            Defaults to -1, which means non-distributed training.
        max_steps (int, optional): Maximum number of training steps. Defaults to 5e5.
        warmup_steps (int, optional): Number of steps taken to increase learning
            rate from 0 to `learning_rate`. Defaults to 1e5.
        learning_rate (float, optional): Learning rate passed to the optimizer
            builder. Defaults to 2e-3.
        optimization_method (string, optional): Optimization method used in fine
            tuning. Defaults to "adam".
        max_grad_norm (float, optional): Maximum gradient norm for gradient
            clipping. Defaults to 0.
        beta1 (float, optional): The exponential decay rate for the first moment
            estimates. Defaults to 0.9.
        beta2 (float, optional): The exponential decay rate for the second-moment
            estimates. This value should be set close to 1.0 on problems with
            a sparse gradient. Defaults to 0.999.
        decay_method (string, optional): Learning rate decrease method.
            Defaults to 'noam'.
        gradient_accumulation_steps (int, optional): Number of batches to accumulate
            gradients on between each model parameter update. Defaults to 1.
        report_every (int, optional): The interval by steps to print out the
            training log. Defaults to 50.
        verbose (bool, optional): Whether to print out the training log.
            Defaults to True.
        seed (int, optional): Random seed used to improve reproducibility.
            Defaults to None.
        save_every (int, optional): Forwarded unchanged to the parent
            `fine_tune`. Defaults to -1.
        world_size (int, optional): Passed as the number of replicas to the
            distributed sampler / preprocessed dataloader. Defaults to 1.
        rank (int, optional): Global rank of the current GPU in distributed
            training. It's calculated with the rank of the current node in the
            cluster/world and the `local_rank` of the device in the current node.
            See an example in :file: `examples/text_summarization/
            extractive_summarization_cnndm_distributed_train.py`.
            Defaults to 0.
        use_preprocessed_data (bool, optional): If True, batches are read from
            `train_dataset.get_stream()` via `get_dataloader`; otherwise a
            torch `DataLoader` with the processor's collate function is used.
            Defaults to False.
        **kwargs: Accepted but not used by this method.
    """
    # Resolve the device, place the model, then build the optimizer before
    # the model is wrapped for (distributed) data parallelism.
    device, num_gpus = get_device(
        num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank
    )
    self.model = move_model_to_device(model=self.model, device=device)

    optimizer = model_builder.build_optim(
        self.model,
        optimization_method,
        learning_rate,
        max_grad_norm,
        beta1,
        beta2,
        decay_method,
        warmup_steps,
    )

    self.model = parallelize_model(
        model=self.model,
        device=device,
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        local_rank=local_rank,
    )

    def collate_fn(data):
        return self.processor.collate(
            data, block_size=self.max_pos_length, device=device
        )

    # batch_size is the number of tokens in a batch
    if not use_preprocessed_data:
        sampler = (
            RandomSampler(train_dataset)
            if local_rank == -1
            else DistributedSampler(
                train_dataset, num_replicas=world_size, rank=rank
            )
        )
        train_dataloader = DataLoader(
            train_dataset,
            sampler=sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
    else:
        train_dataloader = get_dataloader(
            train_dataset.get_stream(),
            is_labeled=True,
            batch_size=batch_size,
            world_size=world_size,
            rank=rank,
            local_rank=local_rank,
        )

    # compute the max number of training steps
    max_steps = compute_training_steps(
        train_dataloader,
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
    )

    super().fine_tune(
        train_dataloader=train_dataloader,
        get_inputs=ExtSumProcessor.get_inputs,
        device=device,
        num_gpus=num_gpus,
        max_steps=max_steps,
        max_grad_norm=max_grad_norm,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optimizer=optimizer,
        scheduler=None,
        verbose=verbose,
        seed=seed,
        report_every=report_every,
        clip_grad_norm=False,
        save_every=save_every,
    )
def predict(
    self,
    test_dataset,
    num_gpus=None,
    gpu_ids=None,
    batch_size=16,
    sentence_separator="<q>",
    top_n=3,
    block_trigram=True,
    cal_lead=False,
    verbose=True,
    local_rank=-1,
):
    """
    Predict the summarization for the input data iterator.

    Args:
        test_dataset (Dataset): Dataset for which the summary is to be predicted.
        num_gpus (int, optional): The number of GPUs used in prediction.
            Defaults to None.
        gpu_ids (list): List of GPU IDs to be used.
            If set to None, the first num_gpus GPUs will be used.
            Defaults to None.
        batch_size (int, optional): The number of test examples in each batch.
            Defaults to 16.
        sentence_separator (str, optional): String to be inserted between
            sentences in the prediction. Defaults to '<q>'.
        top_n (int, optional): The number of sentences that should be selected
            from the paragraph as summary. Defaults to 3.
        block_trigram (bool, optional): Boolean value which specifies whether
            the summary should include any sentence that has the same trigram
            as the already selected sentences. Defaults to True.
        cal_lead (bool, optional): Boolean value which specifies whether the
            prediction uses the first few sentences as summary. Defaults to False.
        verbose (bool, optional): Accepted for interface compatibility; not
            read by this method. Defaults to True.
        local_rank (int, optional): Forwarded to `get_device`. Defaults to -1.

    Returns:
        List of strings which are the summaries, or None when `test_dataset`
        is empty.
    """
    device, num_gpus = get_device(
        num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank
    )

    def collate_processed_data(dict_list):
        # Nothing to batch.
        if dict_list is None or len(dict_list) <= 0:
            return None
        # Labels are never used in prediction, hence is_labeled=False.
        rows = [list(d.values()) for d in dict_list]
        return Batch(rows, is_labeled=False)

    def collate(data):
        return self.processor.collate(
            data, block_size=self.max_pos_length, train_mode=False, device=device
        )

    if len(test_dataset) == 0:
        return None

    # Items that already carry a "segs" key take the processed-data collate path.
    collate_fn = collate_processed_data if "segs" in test_dataset[0] else collate

    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size,
        collate_fn=collate_fn,
    )

    per_batch_scores = list(
        self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids)
    )
    # Flatten the per-batch score sequences into one per-example list.
    scores_list = [score for chunk in per_batch_scores for score in chunk]

    prediction = []
    for idx in range(len(test_dataset)):
        prediction.extend(
            get_pred(
                test_dataset[idx],
                scores_list[idx],
                cal_lead=cal_lead,
                sentence_separator=sentence_separator,
                block_trigram=block_trigram,
                top_n=top_n,
            )
        )

    # release GPU memories
    self.model.cpu()
    torch.cuda.empty_cache()

    return prediction
def fit(
    self,
    train_loader,
    epoch,
    bert_optimizer=None,
    num_epochs=1,
    num_gpus=None,
    lr=2e-5,
    warmup_proportion=None,
    fp16_allreduce=False,
    num_train_optimization_steps=10,
):
    """
    Method to fine-tune the bert classifier using the given training data.

    Runs a single pass over `train_loader`; `epoch` and `num_epochs` are only
    used in the progress message.

    Args:
        train_loader(torch.DataLoader): Torch Dataloader created from Torch Dataset.
            Each batch is read as a mapping with "token_ids", "labels",
            "input_mask" and, optionally, "token_type_ids".
        epoch(int): Current epoch number of training.
        bert_optimizer(optimizer): optimizer can be BERTAdam for local and
            Distributed if Horovod. When None, one is built with
            `self.create_optimizer`.
        num_epochs(int): the number of epochs to run.
        num_gpus(int): the number of gpus. If None is specified, all
            available GPUs will be used.
        lr (float): learning rate of the adam optimizer. defaults to 2e-5.
        warmup_proportion (float, optional): proportion of training to
            perform linear learning rate warmup for. e.g., 0.1 = 10% of
            training. defaults to none.
        fp16_allreduce(bool): if true, use fp16 compression during allreduce.
        num_train_optimization_steps: number of steps the optimizer should take.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    if bert_optimizer is None:
        bert_optimizer = self.create_optimizer(
            num_train_optimization_steps=num_train_optimization_steps,
            lr=lr,
            warmup_proportion=warmup_proportion,
            fp16_allreduce=fp16_allreduce,
        )

    if self.use_distributed:
        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

    loss_func = nn.CrossEntropyLoss().to(device)

    # train
    self.model.train()  # training mode

    # Pre-bind the batch names so the cleanup below is safe for an empty loader.
    x_batch = y_batch = mask_batch = token_type_ids_batch = None
    num_print = 1000
    num_batches = len(train_loader)

    for batch_idx, data in enumerate(train_loader):
        # Move inputs to the same device as the model instead of a
        # hard-coded `.cuda()`, which fails when `device` is the CPU.
        x_batch = data["token_ids"].to(device)
        y_batch = data["labels"].to(device)
        mask_batch = data["input_mask"].to(device)

        # Reset per batch so segment ids from a previous batch are never reused.
        token_type_ids_batch = None
        if "token_type_ids" in data and data["token_type_ids"] is not None:
            token_type_ids_batch = data["token_type_ids"].to(device)

        bert_optimizer.zero_grad()

        y_h = self.model(
            input_ids=x_batch,
            token_type_ids=token_type_ids_batch,
            attention_mask=mask_batch,
            labels=None,
        )

        loss = loss_func(y_h, y_batch).mean()
        loss.backward()

        # NOTE(review): `synchronize()` is called unconditionally, so the
        # optimizer must provide it (presumably a Horovod-wrapped optimizer).
        bert_optimizer.synchronize()
        bert_optimizer.step()

        if batch_idx % num_print == 0:
            print(
                "Train Epoch: {}/{} ({:.0f}%) \t Batch:{} \tLoss: {:.6f}".format(
                    epoch,
                    num_epochs,
                    100.0 * batch_idx / num_batches,
                    batch_idx + 1,
                    loss.item(),
                )
            )

    # Drop references to the last batch before emptying the CUDA cache.
    del [x_batch, y_batch, mask_batch, token_type_ids_batch]
    torch.cuda.empty_cache()
def fine_tune(
    self,
    train_dataloader,
    get_inputs,
    num_gpus=None,
    gpu_ids=None,
    max_steps=-1,
    max_grad_norm=1.0,
    gradient_accumulation_steps=1,
    optimizer=None,
    scheduler=None,
    fp16=False,
    fp16_opt_level="O1",
    local_rank=-1,
    verbose=True,
    seed=None,
    report_every=10,
    clip_grad_norm=True,
):
    """Run the training loop until `max_steps` optimizer steps are done.

    Args:
        train_dataloader: Iterable of training batches; it is re-iterated
            until `max_steps` is reached.
        get_inputs (callable): Called as
            `get_inputs(batch, device, self.model_name)`; its result is
            unpacked as keyword arguments to the model.
        num_gpus (int, optional): Forwarded to `get_device`. Defaults to None.
        gpu_ids (list, optional): Forwarded to `move_model_to_device`.
            Defaults to None.
        max_steps (int, optional): Number of optimizer steps to run. With a
            value <= 0 no training happens. Defaults to -1.
        max_grad_norm (float, optional): Gradient clipping threshold, used
            when `clip_grad_norm` is True. Defaults to 1.0.
        gradient_accumulation_steps (int, optional): Number of batches whose
            gradients are accumulated per optimizer step. Defaults to 1.
        optimizer: Optimizer to step. Must be provided by the caller; it is
            not created here.
        scheduler (optional): Learning rate scheduler stepped after each
            optimizer step when given. Defaults to None.
        fp16 (bool, optional): Use Apex mixed precision. Defaults to False.
        fp16_opt_level (str, optional): Apex optimization level.
            Defaults to "O1".
        local_rank (int, optional): Local rank for distributed training;
            -1 means non-distributed. Defaults to -1.
        verbose (bool, optional): Whether to show progress and print the
            training log. Defaults to True.
        seed (int, optional): Random seed. Defaults to None.
        report_every (int, optional): Log interval in optimizer steps.
            Defaults to 10.
        clip_grad_norm (bool, optional): Whether to clip gradients.
            Defaults to True.

    Returns:
        tuple: (number of optimizer steps taken, average of the accumulated
        loss per step). The average is 0.0 when no step was taken.

    Raises:
        ImportError: If `fp16` is True and Apex is not installed.
    """
    # get device
    device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

    if seed is not None:
        Transformer.set_seed(seed, num_gpus > 0)

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex")
        self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level)

    # move model
    self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank)

    # init training
    global_step = 0
    tr_loss = 0.0
    accum_loss = 0
    self.model.train()
    self.model.zero_grad()

    # train
    start = time.time()
    while global_step < max_steps:
        epoch_iterator = tqdm(
            train_dataloader,
            desc="Iteration",
            disable=local_rank not in [-1, 0] or not verbose,
        )
        for step, batch in enumerate(epoch_iterator):
            inputs = get_inputs(batch, device, self.model_name)
            outputs = self.model(**inputs)
            loss = outputs[0]

            if num_gpus > 1:
                # Reduce the per-replica losses to a scalar.
                loss = loss.mean()
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            accum_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                global_step += 1

                if clip_grad_norm:
                    if fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm
                        )
                    else:
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), max_grad_norm
                        )

                if global_step % report_every == 0 and verbose:
                    end = time.time()
                    print(
                        "loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format(
                            accum_loss / report_every,
                            end - start,
                            len(batch),
                            global_step,
                            max_steps,
                        )
                    )
                    accum_loss = 0
                    start = end

                optimizer.step()
                if scheduler:
                    scheduler.step()
                self.model.zero_grad()

            # Stop as soon as the budget is reached. The previous `>` test
            # let one extra optimizer step run beyond `max_steps`.
            if global_step >= max_steps:
                epoch_iterator.close()
                break

    # Avoid ZeroDivisionError when no optimizer step ran (e.g. max_steps <= 0).
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss
def predict(
    self,
    test_dataset,
    num_gpus=None,
    gpu_ids=None,
    local_rank=-1,
    batch_size=16,
    alpha=0.6,
    beam_size=5,
    min_length=15,
    max_length=150,
    fp16=False,
    verbose=True,
):
    """
    Predict the summarization for the input data iterator.

    Args:
        test_dataset (SummarizationDataset): Dataset for which the summary
            to be predicted.
        num_gpus (int, optional): The number of GPUs used in prediction.
            Defaults to None.
        gpu_ids (list): List of GPU IDs to be used.
            If set to None, the first num_gpus GPUs will be used.
            Defaults to None.
        local_rank (int, optional): Local rank of the device in distributed
            inferencing. Defaults to -1, which means non-distributed inferencing.
        batch_size (int, optional): The number of test examples in each batch.
            Defaults to 16.
        alpha (float, optional): Length penalty. Defaults to 0.6.
        beam_size (int, optional): Beam size of beam search. Defaults to 5.
        min_length (int, optional): Minimum number of tokens in the output sequence.
            Defaults to 15.
        max_length (int, optional): Maximum number of tokens in output
            sequence. Defaults to 150.
        fp16 (bool, optional): Whether to use half-precision model for
            prediction. Defaults to False.
        verbose (bool, optional): Whether to show the generation progress bar.
            Defaults to True.

    Returns:
        List of strings which are the summaries
    """
    device, num_gpus = get_device(
        num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank
    )

    # move model to devices
    def this_model_move_callback(model, device):
        model = move_model_to_device(model, device)
        return parallelize_model(
            model, device, num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank
        )

    if fp16:
        # Previously a no-op (`self.model = self.model`), so the fp16 flag
        # had no effect; convert the weights to half precision as documented.
        self.model = self.model.half()

    self.model = move_model_to_device(self.model, device)
    self.model.eval()

    predictor = build_predictor(
        self.processor.tokenizer,
        self.processor.symbols,
        self.model,
        alpha=alpha,
        beam_size=beam_size,
        min_length=min_length,
        max_length=max_length,
    )
    predictor = this_model_move_callback(predictor, device)
    self.model = parallelize_model(
        self.model,
        device,
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        local_rank=local_rank,
    )

    test_sampler = SequentialSampler(test_dataset)

    def collate_fn(data):
        return self.processor.collate(
            data, self.max_pos_length, device, train_mode=False
        )

    test_dataloader = DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )
    print("dataset length is {}".format(len(test_dataset)))

    def format_summary(translation):
        """ Transforms the output of the `from_batch` function
        into nicely formatted summaries.
        """
        raw_summary = translation
        # NOTE(review): `str.replace` matches the literal two characters " +",
        # not a regular expression; kept as-is so the output stays unchanged.
        summary = (
            raw_summary.replace("[unused0]", "")
            .replace("[unused3]", "")
            .replace("[CLS]", "")
            .replace("[SEP]", "")
            .replace("[PAD]", "")
            .replace("[unused1]", "")
            .replace(r" +", " ")
            .replace(" [unused2] ", ".")
            .replace("[unused2]", "")
            .strip()
        )
        return summary

    def generate_summary_from_tokenid(preds, pred_score):
        # `pred_score` is accepted but not used.
        num_rows = preds.size()[0]
        translations = []
        for b in range(num_rows):
            if len(preds[b]) < 1:
                pred_sents = ""
            else:
                # Token id 0 is dropped before converting ids back to tokens.
                pred_sents = self.processor.tokenizer.convert_ids_to_tokens(
                    [int(n) for n in preds[b] if int(n) != 0]
                )
                pred_sents = " ".join(pred_sents).replace(" ##", "")
            translations.append(pred_sents)
        return translations

    generated_summaries = []

    for batch in tqdm(test_dataloader, desc="Generating summary", disable=not verbose):
        # Named `inputs` to avoid shadowing the `input` builtin.
        inputs = self.processor.get_inputs(batch, device, "bert", train_mode=False)
        translations, scores = predictor(**inputs)

        translations_text = generate_summary_from_tokenid(translations, scores)
        summaries = [format_summary(t) for t in translations_text]
        generated_summaries.extend(summaries)

    return generated_summaries
def fit(
    self,
    token_ids,
    input_mask,
    labels,
    token_type_ids=None,
    num_gpus=None,
    num_epochs=1,
    batch_size=32,
    lr=2e-5,
    warmup_proportion=None,
    verbose=True,
):
    """Fine-tune the BERT classifier on the given training data.

    Args:
        token_ids (list): List of training token id lists.
        input_mask (list): List of input mask lists.
        labels (list): List of training labels.
        token_type_ids (list, optional): List of segment id lists
            (0 for the first sentence, 1 for the second). Only needed
            for two-sentence tasks. Defaults to None.
        num_gpus (int, optional): The number of gpus to use.
            If None is specified, all available GPUs will be used.
            Defaults to None.
        num_epochs (int, optional): Number of training epochs. Defaults to 1.
        batch_size (int, optional): Training batch size. Defaults to 32.
        lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
        warmup_proportion (float, optional): Proportion of training to
            perform linear learning rate warmup for. E.g., 0.1 = 10% of
            training. Defaults to None.
        verbose (bool, optional): If True, shows the training progress and
            loss values. Defaults to True.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_model_to_device(self.model, device, num_gpus)

    # Dataset columns: ids, mask, [segment ids], labels.
    has_segments = bool(token_type_ids)
    ids_tensor = torch.tensor(token_ids, dtype=torch.long)
    mask_tensor = torch.tensor(input_mask, dtype=torch.long)
    labels_tensor = torch.tensor(labels, dtype=torch.long)
    if has_segments:
        segments_tensor = torch.tensor(token_type_ids, dtype=torch.long)
        train_dataset = TensorDataset(
            ids_tensor, mask_tensor, segments_tensor, labels_tensor
        )
    else:
        train_dataset = TensorDataset(ids_tensor, mask_tensor, labels_tensor)

    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
    )

    # define optimizer and model parameters:
    # parameters whose name contains a `no_decay` marker get no weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    with_decay, without_decay = [], []
    for name, param in list(self.model.named_parameters()):
        if any(marker in name for marker in no_decay):
            without_decay.append(param)
        else:
            with_decay.append(param)
    optimizer_grouped_parameters = [
        {"params": with_decay, "weight_decay": 0.01},
        {"params": without_decay, "weight_decay": 0.0},
    ]

    num_batches = len(train_dataloader)
    num_train_optimization_steps = num_batches * num_epochs

    optimizer_kwargs = {"lr": lr}
    if warmup_proportion is not None:
        optimizer_kwargs["t_total"] = num_train_optimization_steps
        optimizer_kwargs["warmup"] = warmup_proportion
    opt = BertAdam(optimizer_grouped_parameters, **optimizer_kwargs)

    # define loss function
    loss_func = nn.CrossEntropyLoss().to(device)

    # train
    self.model.train()  # training mode
    report_interval = (num_batches // 10) + 1

    for epoch in range(num_epochs):
        training_loss = 0
        for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            on_device = tuple(t.to(device) for t in batch)
            if has_segments:
                x_batch, mask_batch, token_type_ids_batch, y_batch = on_device
            else:
                x_batch, mask_batch, y_batch = on_device
                token_type_ids_batch = None

            opt.zero_grad()

            y_h = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )
            loss = loss_func(y_h, y_batch).mean()

            training_loss += loss.item()

            loss.backward()
            opt.step()

            if verbose and i % report_interval == 0:
                print(
                    "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f}".format(
                        epoch + 1,
                        num_epochs,
                        i + 1,
                        min(i + 1 + num_batches // 10, num_batches),
                        num_batches,
                        training_loss / (i + 1),
                    )
                )

    # empty cache
    del [x_batch, y_batch, mask_batch, token_type_ids_batch]
    torch.cuda.empty_cache()
def test_get_device_all_gpus():
    """With no arguments, get_device should report every visible CUDA device."""
    selected_device, gpu_count = get_device()

    assert isinstance(selected_device, torch.device)
    assert selected_device.type == "cuda"
    assert gpu_count == torch.cuda.device_count()
def predict(self, test_loader, num_gpus=None, probabilities=False):
    """
    Method to predict the results on the test loader. Only evaluates for
    non distributed workload on the head node in a distributed setup.

    Args:
        test_loader(torch Dataloader): Torch Dataloader created from
            Torch Dataset. Each batch is read as a mapping with
            "token_ids", "input_mask", "labels" and, optionally,
            "token_type_ids".
        num_gpus (int, optional): The number of gpus to use.
            If None is specified, all available GPUs will be used.
            Defaults to None.
        probabilities (bool, optional):
            If True, the predicted probability distribution
            is also returned. Defaults to False.

    Returns:
        1darray, dict(1darray, 1darray, ndarray): Predicted classes and
            target labels, or a dictionary with keys "Predictions",
            "Target" and "classes probabilities" if probabilities is True.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    # score
    self.model.eval()

    preds = []
    test_labels = []
    for data in tqdm(test_loader, desc="Iteration"):
        # Place inputs on the device the model was moved to; a hard-coded
        # `.cuda()` fails when `get_device` selects the CPU.
        x_batch = data["token_ids"].to(device)
        mask_batch = data["input_mask"].to(device)
        y_batch = data["labels"]  # labels are only collected, never moved

        token_type_ids_batch = None
        if "token_type_ids" in data and data["token_type_ids"] is not None:
            token_type_ids_batch = data["token_type_ids"].to(device)

        with torch.no_grad():
            p_batch = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )
        preds.append(p_batch.cpu())
        test_labels.append(y_batch)

    preds = np.concatenate(preds)
    test_labels = np.concatenate(test_labels)

    if probabilities:
        return {
            "Predictions": preds.argmax(axis=1),
            "Target": test_labels,
            "classes probabilities": nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
        }
    return preds.argmax(axis=1), test_labels
def fine_tune(
    self,
    train_dataloader,
    get_inputs,
    max_steps=-1,
    num_train_epochs=1,
    max_grad_norm=1.0,
    gradient_accumulation_steps=1,
    n_gpu=1,
    optimizer=None,
    scheduler=None,
    weight_decay=0.0,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,
    fp16=False,
    fp16_opt_level="O1",
    local_rank=-1,
    verbose=True,
    seed=None,
):
    """
    Fine-tunes self.model on the batches produced by train_dataloader.

    Args:
        train_dataloader (Dataloader): Sized iterable of training batches.
            Every element of a batch is moved to the training device with
            ``.to(device)``.
        get_inputs (callable): Called as
            ``get_inputs(batch, self.model_name)``; its result is unpacked
            as keyword arguments for the model.
        max_steps (int, optional): If > 0, used as the total number of
            scheduler steps and to derive the number of epochs, overriding
            num_train_epochs. Defaults to -1.
        num_train_epochs (int, optional): Number of epochs, used when
            max_steps <= 0. Defaults to 1.
        max_grad_norm (float, optional): Maximum norm for gradient
            clipping. Defaults to 1.0.
        gradient_accumulation_steps (int, optional): Number of batches
            whose gradients are accumulated before each optimizer step.
            Defaults to 1.
        n_gpu (int, optional): Number of GPUs requested from get_device.
            Defaults to 1.
        optimizer (optional): Optimizer to use. If None, AdamW is created
            from learning_rate, adam_epsilon and weight_decay.
            Defaults to None.
        scheduler (optional): Learning-rate scheduler. If None, a linear
            schedule with warmup is created. Defaults to None.
        weight_decay (float, optional): Weight decay applied by the default
            optimizer to every parameter whose name contains neither "bias"
            nor "LayerNorm.weight". Defaults to 0.0.
        learning_rate (float, optional): Learning rate of the default
            optimizer. Defaults to 5e-5.
        adam_epsilon (float, optional): Epsilon of the default optimizer.
            Defaults to 1e-8.
        warmup_steps (int, optional): Warmup steps of the default
            scheduler. Defaults to 0.
        fp16 (bool, optional): If True, the model and optimizer are
            initialized through apex amp. Defaults to False.
        fp16_opt_level (str, optional): opt_level passed to
            amp.initialize. Defaults to "O1".
        local_rank (int, optional): If not -1, the model is wrapped in
            DistributedDataParallel on that device, and progress bars are
            only shown for ranks -1 and 0. Defaults to -1.
        verbose (bool, optional): Whether to show progress bars and print
            the loss every 10 batches. Defaults to True.
        seed (int, optional): If not None, passed to Transformer.set_seed.
            Defaults to None.

    Returns:
        tuple: (global_step, average_loss), where global_step is the number
        of optimizer steps taken and average_loss is the accumulated loss
        divided by global_step. If no optimizer step was taken, the
        accumulated loss is returned undivided instead of raising
        ZeroDivisionError.

    Raises:
        ImportError: If fp16 is True and apex is not installed.
    """
    # NOTE(review): local_rank is hard-coded to -1 here rather than
    # forwarding the local_rank argument; kept as-is, confirm whether that
    # is intended for distributed runs.
    device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)

    if seed is not None:
        Transformer.set_seed(seed, num_gpus > 0)

    if max_steps > 0:
        t_total = max_steps
        # Clamp to 1 so a dataloader with fewer batches than
        # gradient_accumulation_steps does not divide by zero.
        steps_per_epoch = max(
            1, len(train_dataloader) // gradient_accumulation_steps
        )
        num_train_epochs = max_steps // steps_per_epoch + 1
    else:
        t_total = (
            len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
        )

    if optimizer is None:
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p
                    for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon
        )

    if scheduler is None:
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        )

    if fp16:
        try:
            from apex import amp
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex"
            ) from err
        self.model, optimizer = amp.initialize(
            self.model, optimizer, opt_level=fp16_opt_level
        )

    if local_rank != -1:
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True,
        )
    else:
        # Unwrap a model left in DataParallel by a previous call before
        # deciding whether to wrap it again for this run.
        if isinstance(self.model, torch.nn.DataParallel):
            self.model = self.model.module

        if num_gpus > 1:
            self.model = torch.nn.DataParallel(
                self.model, device_ids=list(range(num_gpus))
            )

    self.model.to(device)
    self.model.train()

    global_step = 0
    tr_loss = 0.0
    self.model.zero_grad()

    hide_progress = local_rank not in [-1, 0] or not verbose
    train_iterator = trange(
        int(num_train_epochs), desc="Epoch", disable=hide_progress
    )

    # Bound up front so the cleanup below is valid even when the loop body
    # never runs (empty dataloader or zero epochs).
    batch = None
    for _ in train_iterator:
        epoch_iterator = tqdm(
            train_dataloader, desc="Iteration", disable=hide_progress
        )
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch)
            inputs = get_inputs(batch, self.model_name)
            outputs = self.model(**inputs)
            loss = outputs[0]

            if num_gpus > 1:
                # Reduce the loss to a scalar when running on several GPUs.
                loss = loss.mean()
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            if step % 10 == 0 and verbose:
                tqdm.write("Loss:{:.6f}".format(loss))

            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), max_grad_norm
                )
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), max_grad_norm
                )

            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                self.model.zero_grad()
                global_step += 1

            # NOTE(review): `>` lets training take max_steps + 1 optimizer
            # steps before stopping; kept to preserve existing behavior.
            if max_steps > 0 and global_step > max_steps:
                epoch_iterator.close()
                break
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break

    # empty cache
    del batch
    torch.cuda.empty_cache()

    # Avoid ZeroDivisionError when no optimizer step was taken.
    return global_step, tr_loss / max(global_step, 1)