def test_move_to_device_gpu(model):
    # test when device.type="cuda"
    model_cuda = move_to_device(model, torch.device("cuda"))
    num_cuda_devices = torch.cuda.device_count()

    if num_cuda_devices > 1:
        assert isinstance(model_cuda, DataParallel)
    else:
        assert isinstance(model_cuda, Sequential)

    model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1)
    assert isinstance(model_cuda_1_gpu, Sequential)

    model_cuda_1_more_gpu = move_to_device(
        model, torch.device("cuda"), num_gpus=num_cuda_devices + 1
    )
    if num_cuda_devices > 1:
        assert isinstance(model_cuda_1_more_gpu, DataParallel)
    else:
        assert isinstance(model_cuda_1_more_gpu, Sequential)

    model_cuda_same_gpu = move_to_device(
        model, torch.device("cuda"), num_gpus=num_cuda_devices
    )
    if num_cuda_devices > 1:
        assert isinstance(model_cuda_same_gpu, DataParallel)
    else:
        assert isinstance(model_cuda_same_gpu, Sequential)
def test_move_to_device_cpu_parallelized(model):
    # test when the input model is already parallelized
    model_parallelized = nn.DataParallel(model)
    model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu"))
    assert isinstance(model_parallelized_output, nn.modules.container.Sequential)
def get_hidden_states(self, text, batch_size=32):
    """Extract the hidden states from the pretrained model.

    Args:
        text: List of documents to extract features from.
        batch_size: Batch size, defaults to 32.

    Returns:
        pd.DataFrame with columns text_index (int), token (str),
            layer_index (int), values (list[float]).
    """
    device = get_device("cpu" if self.num_gpus == 0 or not self.cuda else "gpu")
    self.model = move_to_device(self.model, device, self.num_gpus)

    self.model.eval()

    tokens = self.tokenizer.tokenize(text)

    tokens, input_ids, input_mask, input_type_ids = self.tokenizer.preprocess_encoder_tokens(
        tokens, max_len=self.max_len
    )

    input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
    input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)
    # indices of the examples, used to map each batch row back to its tokens
    # (renamed from input_type_ids, which shadowed the tokenizer output above)
    example_indices = torch.arange(input_ids.size(0), dtype=torch.long, device=device)

    eval_data = TensorDataset(input_ids, input_mask, example_indices)
    eval_dataloader = DataLoader(
        eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size
    )

    hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
    for input_ids_tensor, input_mask_tensor, example_indices_tensor in eval_dataloader:
        with torch.no_grad():
            all_encoder_layers, _ = self.model(
                input_ids_tensor, token_type_ids=None, attention_mask=input_mask_tensor
            )
            self.embedding_dim = all_encoder_layers[0].size()[-1]

        for b, example_index in enumerate(example_indices_tensor):
            for i, token in enumerate(tokens[example_index.item()]):
                for j, layer_index in enumerate(self.layer_index):
                    layer_output = (
                        all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                    )
                    layer_output = layer_output[b]
                    hidden_states["text_index"].append(example_index.item())
                    hidden_states["token"].append(token)
                    hidden_states["layer_index"].append(layer_index)
                    hidden_states["values"].append(
                        [round(x.item(), 6) for x in layer_output[i]]
                    )

        # empty cache
        del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
        torch.cuda.empty_cache()

    # empty cache
    del [input_ids, input_mask, example_indices]
    torch.cuda.empty_cache()

    return pd.DataFrame.from_dict(hidden_states)
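# For context, a minimal usage sketch of get_hidden_states. The class name
# BERTSentenceEncoder and its constructor arguments are assumptions for
# illustration; only the attributes used above (tokenizer, layer_index,
# max_len, num_gpus, cuda) are implied by the method itself.
encoder = BERTSentenceEncoder(
    language="bert-base-uncased",  # assumed constructor parameter
    num_gpus=0,                    # run on CPU
    layer_index=[-2],              # extract the second-to-last encoder layer
    max_len=128,
)
df = encoder.get_hidden_states(["hello world", "another document"], batch_size=16)
# one row per (document, token, layer) triple
print(df.columns.tolist())  # ['text_index', 'token', 'layer_index', 'values']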
def predict(self, test_loader, num_gpus=None, probabilities=False):
    """
    Method to predict the results on the test loader. Evaluation is
    non-distributed and, in a distributed setup, runs only on the head node.

    Args:
        test_loader (torch DataLoader): Torch DataLoader created from a Torch Dataset.
        num_gpus (int, optional): The number of GPUs to use.
            If None is specified, all available GPUs will be used.
            Defaults to None.
        probabilities (bool, optional): If True, the predicted probability
            distribution is also returned. Defaults to False.

    Returns:
        1darray, dict(1darray, 1darray, ndarray): Predicted classes and target
            labels, or a dictionary with classes, target labels, and
            probabilities if probabilities is True.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    # score
    self.model.eval()
    preds = []
    test_labels = []
    for i, data in enumerate(tqdm(test_loader, desc="Iteration")):
        # use .to(device) rather than .cuda() so CPU scoring also works
        x_batch = data["token_ids"].to(device)
        mask_batch = data["input_mask"].to(device)
        y_batch = data["labels"]

        token_type_ids_batch = None
        if "token_type_ids" in data and data["token_type_ids"] is not None:
            token_type_ids_batch = data["token_type_ids"].to(device)

        with torch.no_grad():
            p_batch = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )
        preds.append(p_batch.cpu())
        test_labels.append(y_batch)

    preds = np.concatenate(preds)
    test_labels = np.concatenate(test_labels)

    if probabilities:
        return {
            "Predictions": preds.argmax(axis=1),
            "Target": test_labels,
            "classes probabilities": nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
        }
    else:
        return preds.argmax(axis=1), test_labels
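# A minimal scoring sketch, assuming `classifier` is an instance of the class
# defining predict() above and `test_loader` yields dict batches with
# "token_ids", "input_mask", and "labels" keys (both names are hypothetical).
preds, target = classifier.predict(test_loader, num_gpus=1)
print("accuracy: {:.4f}".format((preds == target).mean()))

# with the probability distribution included
result = classifier.predict(test_loader, num_gpus=1, probabilities=True)
probs = result["classes probabilities"]  # shape: (num_examples, num_classes)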
def fit(
    self,
    train_loader,
    epoch,
    bert_optimizer=None,
    num_epochs=1,
    num_gpus=None,
    lr=2e-5,
    warmup_proportion=None,
    fp16_allreduce=False,
    num_train_optimization_steps=10,
):
    """
    Method to fine-tune the BERT classifier using the given training data.

    Args:
        train_loader (torch.DataLoader): Torch DataLoader created from a Torch Dataset.
        epoch (int): Current epoch number of training.
        bert_optimizer (optimizer): Optimizer; can be BertAdam for local mode
            or a Horovod distributed optimizer in distributed mode.
        num_epochs (int): The number of epochs to run.
        num_gpus (int): The number of GPUs to use. If None is specified,
            all available GPUs will be used.
        lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
        warmup_proportion (float, optional): Proportion of training to perform
            linear learning rate warmup for. E.g., 0.1 = 10% of training.
            Defaults to None.
        fp16_allreduce (bool): If True, use fp16 compression during allreduce.
        num_train_optimization_steps (int): Number of steps the optimizer
            should take.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    if bert_optimizer is None:
        bert_optimizer = self.create_optimizer(
            num_train_optimization_steps=num_train_optimization_steps,
            lr=lr,
            warmup_proportion=warmup_proportion,
            fp16_allreduce=fp16_allreduce,
        )

    if self.use_distributed:
        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

    loss_func = nn.CrossEntropyLoss().to(device)

    # train
    self.model.train()  # training mode

    num_print = 1000
    for batch_idx, data in enumerate(train_loader):
        # reset per batch so a stale value is never reused
        token_type_ids_batch = None

        # use .to(device) rather than .cuda() so CPU training also works
        x_batch = data["token_ids"].to(device)
        y_batch = data["labels"].to(device)
        mask_batch = data["input_mask"].to(device)

        if "token_type_ids" in data and data["token_type_ids"] is not None:
            token_type_ids_batch = data["token_type_ids"].to(device)

        bert_optimizer.zero_grad()

        y_h = self.model(
            input_ids=x_batch,
            token_type_ids=token_type_ids_batch,
            attention_mask=mask_batch,
            labels=None,
        )

        loss = loss_func(y_h, y_batch).mean()
        loss.backward()

        if self.use_distributed:
            # synchronize() is a Horovod DistributedOptimizer method;
            # guard it so local (non-distributed) training does not break
            bert_optimizer.synchronize()
        bert_optimizer.step()

        if batch_idx % num_print == 0:
            print(
                "Train Epoch: {}/{} ({:.0f}%) \t Batch:{} \tLoss: {:.6f}".format(
                    epoch,
                    num_epochs,
                    100.0 * batch_idx / len(train_loader),
                    batch_idx + 1,
                    loss.item(),
                )
            )

    del [x_batch, y_batch, mask_batch, token_type_ids_batch]
    torch.cuda.empty_cache()
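# For the distributed path, a minimal Horovod launch sketch (run with,
# e.g., `horovodrun -np 4 python train.py`). The classifier construction and
# data sharding are assumptions; only hvd.broadcast_parameters and
# optimizer.synchronize() above imply Horovod.
import horovod.torch as hvd

hvd.init()
torch.cuda.set_device(hvd.local_rank())  # bind one GPU per worker process

# `classifier` and `train_loader` are assumed to exist; each worker should
# see its own shard of the data (e.g., via torch's DistributedSampler).
for epoch in range(1, num_epochs + 1):
    classifier.fit(train_loader, epoch, num_epochs=num_epochs, num_gpus=1)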
def predict(
    self,
    token_ids,
    input_mask,
    labels=None,
    batch_size=32,
    num_gpus=None,
    probabilities=False,
):
    """
    Predict token labels on the testing data.

    Args:
        token_ids (list): List of lists. Each sublist contains
            numerical token ids corresponding to the tokens in the input
            text data.
        input_mask (list): List of lists. Each sublist contains the attention
            mask of the input token list, 1 for input tokens and 0 for
            padded tokens, so that padded tokens are not attended to.
        labels (list, optional): List of lists. Each sublist contains
            numerical token labels of an input sentence/paragraph.
            If provided, it's used to compute the evaluation loss.
            Default value is None.
        batch_size (int, optional): Testing batch size. Defaults to 32.
        num_gpus (int, optional): The number of GPUs to use.
            If None, all available GPUs will be used. Defaults to None.
        probabilities (bool, optional): If True, the softmax probability of
            each predicted token label is also returned. Defaults to False.

    Returns:
        list or namedtuple(list, ndarray): List of lists of predicted
            token labels, or ([token labels], probabilities) if
            probabilities is True. The probabilities output is an n x m
            array, where n is the size of the testing data and m is the
            number of tokens in each input sublist. The probability
            values are the softmax probability of the predicted class.
    """
    test_dataloader = create_data_loader(
        input_ids=token_ids,
        input_mask=input_mask,
        label_ids=labels,
        batch_size=batch_size,
        sample_method="sequential",
    )

    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    self.model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    for step, batch in enumerate(tqdm(test_dataloader, desc="Iteration", mininterval=10)):
        batch = tuple(t.to(device) for t in batch)
        true_label_available = False
        if labels:
            b_input_ids, b_input_mask, b_labels = batch
            true_label_available = True
        else:
            b_input_ids, b_input_mask = batch

        with torch.no_grad():
            logits = self.model(b_input_ids, attention_mask=b_input_mask)

        if true_label_available:
            active_loss = b_input_mask.view(-1) == 1
            active_logits = logits.view(-1, self.num_labels)[active_loss]
            active_labels = b_labels.view(-1)[active_loss]
            loss_fct = nn.CrossEntropyLoss()
            tmp_eval_loss = loss_fct(active_logits, active_labels)
            eval_loss += tmp_eval_loss.mean().item()

        logits = logits.detach().cpu()
        if step == 0:
            logits_all = logits.numpy()
        else:
            logits_all = np.append(logits_all, logits, axis=0)

        nb_eval_steps += 1

    predictions = [list(p) for p in np.argmax(logits_all, axis=2)]

    if true_label_available:
        validation_loss = eval_loss / nb_eval_steps
        print("Evaluation loss: {}".format(validation_loss))

    if probabilities:
        return namedtuple("Predictions", "classes probabilities")(
            predictions,
            np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2),
        )
    else:
        return predictions
def fit(
    self,
    token_ids,
    input_mask,
    labels,
    num_gpus=None,
    num_epochs=1,
    batch_size=32,
    learning_rate=2e-5,
    warmup_proportion=None,
):
    """
    Fine-tunes the BERT classifier using the given training data.

    Args:
        token_ids (list): List of lists. Each sublist contains
            numerical token ids corresponding to the tokens in the input
            text data.
        input_mask (list): List of lists. Each sublist contains the attention
            mask of the input token id list. 1 for input tokens and 0 for
            padded tokens, so that padded tokens are not attended to.
        labels (list): List of lists. Each sublist contains numerical token
            labels of an input sentence/paragraph.
        num_gpus (int, optional): The number of GPUs to use.
            If None, all available GPUs will be used. Defaults to None.
        num_epochs (int, optional): Number of training epochs. Defaults to 1.
        batch_size (int, optional): Training batch size. Defaults to 32.
        learning_rate (float, optional): Learning rate of the BertAdam
            optimizer. Defaults to 2e-5.
        warmup_proportion (float, optional): Proportion of training to
            perform linear learning rate warmup for. E.g., 0.1 = 10% of
            training. Defaults to None.
    """
    train_dataloader = create_data_loader(
        input_ids=token_ids,
        input_mask=input_mask,
        label_ids=labels,
        sample_method="random",
        batch_size=batch_size,
    )

    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    if num_gpus is None:
        num_gpus_used = torch.cuda.device_count()
    else:
        num_gpus_used = min(num_gpus, torch.cuda.device_count())

    num_train_optimization_steps = max(
        int(len(token_ids) / batch_size) * num_epochs, 1
    )
    optimizer = self._get_optimizer(
        learning_rate=learning_rate,
        num_train_optimization_steps=num_train_optimization_steps,
        warmup_proportion=warmup_proportion,
    )

    self.model.train()
    for _ in trange(int(num_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0
        for step, batch in enumerate(
            tqdm(train_dataloader, desc="Iteration", mininterval=30)
        ):
            batch = tuple(t.to(device) for t in batch)
            b_token_ids, b_input_mask, b_label_ids = batch

            loss = self.model(
                input_ids=b_token_ids,
                attention_mask=b_input_mask,
                labels=b_label_ids,
            )

            if num_gpus_used > 1:
                # mean() to average on multi-gpu
                loss = loss.mean()

            # Accumulate parameter gradients
            loss.backward()

            tr_loss += loss.item()
            nb_tr_steps += 1

            # Update parameters based on current gradients
            optimizer.step()
            # Reset parameter gradients to zero
            optimizer.zero_grad()

        train_loss = tr_loss / nb_tr_steps
        print("Train loss: {}".format(train_loss))

    torch.cuda.empty_cache()
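# A minimal end-to-end sketch exercising the token classifier's fit() and
# predict() above. BertTokenClassifier is an assumed class name, and the
# preprocessed inputs (train_token_ids, etc.) are hypothetical.
classifier = BertTokenClassifier(num_labels=9)  # e.g., a CoNLL-2003 NER tag set

classifier.fit(
    token_ids=train_token_ids,
    input_mask=train_input_mask,
    labels=train_labels,
    num_epochs=1,
    batch_size=32,
)

pred = classifier.predict(
    token_ids=test_token_ids,
    input_mask=test_input_mask,
    labels=test_labels,  # optional; enables evaluation-loss reporting
    probabilities=True,
)
print(pred.classes[0])        # predicted label ids for the first example
print(pred.probabilities[0])  # softmax probability of each predicted label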
def fit(
    self,
    token_ids,
    input_mask,
    labels,
    val_token_ids,
    val_input_mask,
    val_labels,
    token_type_ids=None,
    val_token_type_ids=None,
    verbose=True,
    logging_steps=0,
    save_steps=0,
    val_steps=0,
):
    """Fine-tunes the XLNet classifier using the given training data.

    Args:
        token_ids (list): List of training token id lists.
        input_mask (list): List of input mask lists.
        labels (list): List of training labels.
        val_token_ids (list): List of validation token id lists.
        val_input_mask (list): List of validation input mask lists.
        val_labels (list): List of validation labels.
        token_type_ids (list, optional): List of lists. Each sublist
            contains segment ids indicating if the token belongs to
            the first sentence(0) or second sentence(1). Only needed
            for two-sentence tasks.
        val_token_type_ids (list, optional): Same as token_type_ids,
            for the validation set.
        verbose (bool, optional): If True, shows the training progress
            and loss values. Defaults to True.
        logging_steps (int, optional): Log the learning rate and training
            loss every this many steps; 0 disables logging. Defaults to 0.
        save_steps (int, optional): Save a model checkpoint every this
            many steps; 0 disables checkpointing. Defaults to 0.
        val_steps (int, optional): Evaluate on the validation set every
            this many steps; 0 disables validation. Defaults to 0.
    """
    device, num_gpus = get_device(self.num_gpus)
    self.model = move_to_device(self.model, device, self.num_gpus)

    token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
    input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    val_token_ids_tensor = torch.tensor(val_token_ids, dtype=torch.long)
    val_input_mask_tensor = torch.tensor(val_input_mask, dtype=torch.long)
    val_labels_tensor = torch.tensor(val_labels, dtype=torch.long)

    if token_type_ids:
        token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
        val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)

        train_dataset = TensorDataset(
            token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
        )
        val_dataset = TensorDataset(
            val_token_ids_tensor,
            val_input_mask_tensor,
            val_token_type_ids_tensor,
            val_labels_tensor,
        )
    else:
        train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)
        val_dataset = TensorDataset(
            val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor
        )

    # define optimizer and model parameters
    param_optimizer = list(self.model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": self.weight_decay,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    val_sampler = RandomSampler(val_dataset)
    val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=self.batch_size)

    num_examples = len(token_ids)
    num_batches = int(np.ceil(num_examples / self.batch_size))
    num_train_optimization_steps = num_batches * self.num_epochs

    optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr, eps=self.adam_eps)
    scheduler = WarmupLinearSchedule(
        optimizer, warmup_steps=self.warmup_steps, t_total=num_train_optimization_steps
    )

    global_step = 0
    self.model.train()
    optimizer.zero_grad()
    for epoch in range(self.num_epochs):
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=self.batch_size
        )

        tr_loss = 0.0
        logging_loss = 0.0
        val_loss = 0.0

        for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            if token_type_ids:
                x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                    t.to(device) for t in batch
                )
            else:
                token_type_ids_batch = None
                x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)

            outputs = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=y_batch,
            )

            loss = outputs[0]  # model outputs are always tuples in pytorch-transformers

            loss.sum().backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

            tr_loss += loss.sum().item()
            optimizer.step()
            # Update learning rate schedule
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

            # logging of learning rate and loss
            if logging_steps > 0 and global_step % logging_steps == 0:
                mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step)
                mlflow.log_metric(
                    "training loss",
                    (tr_loss - logging_loss) / (logging_steps * self.batch_size),
                    step=global_step,
                )
                logging_loss = tr_loss

            # model checkpointing
            if save_steps > 0 and global_step % save_steps == 0:
                checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
                if not os.path.isdir(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                checkpoint_path = os.path.join(checkpoint_dir, str(global_step) + ".pth")
                torch.save(self.model.state_dict(), checkpoint_path)
                mlflow.log_artifact(checkpoint_path)

            # model validation
            if val_steps > 0 and global_step % val_steps == 0:
                # run model on validation set
                self.model.eval()
                val_loss = 0.0
                for j, val_batch in enumerate(val_dataloader):
                    if token_type_ids:
                        (
                            val_x_batch,
                            val_mask_batch,
                            val_token_type_ids_batch,
                            val_y_batch,
                        ) = tuple(t.to(device) for t in val_batch)
                    else:
                        val_token_type_ids_batch = None
                        val_x_batch, val_mask_batch, val_y_batch = tuple(
                            t.to(device) for t in val_batch
                        )
                    val_outputs = self.model(
                        input_ids=val_x_batch,
                        token_type_ids=val_token_type_ids_batch,
                        attention_mask=val_mask_batch,
                        labels=val_y_batch,
                    )
                    vloss = val_outputs[0]
                    val_loss += vloss.sum().item()
                mlflow.log_metric(
                    "validation loss", val_loss / len(val_dataset), step=global_step
                )
                self.model.train()

            if verbose:
                if i % ((num_batches // 10) + 1) == 0:
                    if val_loss > 0:
                        print(
                            "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f}; "
                            "average val loss:{:.6f}".format(
                                epoch + 1,
                                self.num_epochs,
                                i + 1,
                                min(i + 1 + num_batches // 10, num_batches),
                                num_batches,
                                tr_loss / (i + 1),
                                val_loss / (j + 1),
                            )
                        )
                    else:
                        print(
                            "epoch:{}/{}; batch:{}->{}/{}; average train loss:{:.6f}".format(
                                epoch + 1,
                                self.num_epochs,
                                i + 1,
                                min(i + 1 + num_batches // 10, num_batches),
                                num_batches,
                                tr_loss / (i + 1),
                            )
                        )

    checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir, "final.pth")
    torch.save(self.model.state_dict(), checkpoint_path)
    mlflow.log_artifact(checkpoint_path)

    # empty cache
    del [x_batch, y_batch, mask_batch, token_type_ids_batch]
    if val_steps > 0:
        del [val_x_batch, val_y_batch, val_mask_batch, val_token_type_ids_batch]
    torch.cuda.empty_cache()
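# A minimal fine-tuning sketch for the XLNet classifier above. The
# XLNetSequenceClassifier name and constructor arguments are assumptions
# about how self.lr, self.batch_size, self.num_epochs, etc. get set.
classifier = XLNetSequenceClassifier(
    language="xlnet-base-cased",
    num_labels=2,
    num_epochs=1,
    batch_size=8,
)
classifier.fit(
    token_ids=train_token_ids,
    input_mask=train_input_mask,
    labels=train_labels,
    val_token_ids=val_token_ids,
    val_input_mask=val_input_mask,
    val_labels=val_labels,
    logging_steps=50,  # log learning rate and loss to mlflow every 50 steps
    val_steps=200,     # evaluate on the validation set every 200 steps
)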
def predict(
    self,
    token_ids,
    input_mask,
    token_type_ids=None,
    num_gpus=None,
    batch_size=8,
    probabilities=False,
):
    """Scores the given dataset and returns the predicted classes.

    Args:
        token_ids (list): List of training token lists.
        input_mask (list): List of input mask lists.
        token_type_ids (list, optional): List of lists. Each sublist
            contains segment ids indicating if the token belongs to
            the first sentence(0) or second sentence(1). Only needed
            for two-sentence tasks.
        num_gpus (int, optional): The number of gpus to use.
            If None is specified, all available GPUs will be used.
            Defaults to None.
        batch_size (int, optional): Scoring batch size. Defaults to 8.
        probabilities (bool, optional): If True, the predicted probability
            distribution is also returned. Defaults to False.

    Returns:
        1darray, namedtuple(1darray, ndarray): Predicted classes or
            (classes, probabilities) if probabilities is True.
    """
    device, num_gpus = get_device(num_gpus)
    self.model = move_to_device(self.model, device, num_gpus)

    self.model.eval()
    preds = []

    with tqdm(total=len(token_ids)) as pbar:
        for i in range(0, len(token_ids), batch_size):
            start = i
            end = start + batch_size
            x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)
            mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)

            # token_type_ids is optional, so only build the batch when present
            token_type_ids_batch = None
            if token_type_ids is not None:
                token_type_ids_batch = torch.tensor(
                    token_type_ids[start:end], dtype=torch.long, device=device
                )

            with torch.no_grad():
                pred_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
                preds.append(pred_batch[0].cpu())

            pbar.update(batch_size)

    preds = np.concatenate(preds)

    if probabilities:
        return namedtuple("Predictions", "classes probabilities")(
            preds.argmax(axis=1), nn.Softmax(dim=1)(torch.Tensor(preds)).numpy()
        )
    else:
        return preds.argmax(axis=1)
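# A matching scoring call for the classifier fine-tuned above; the inputs
# are the same hypothetical preprocessed lists.
pred = classifier.predict(
    token_ids=test_token_ids,
    input_mask=test_input_mask,
    batch_size=8,
    probabilities=True,
)
print(pred.classes[:10])        # predicted class ids
print(pred.probabilities[:10])  # per-class softmax probabilities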
def fit(
    self,
    token_ids,
    input_mask,
    labels,
    token_type_ids=None,
    num_gpus=None,
    num_epochs=1,
    batch_size=32,
    lr=2e-5,
    warmup_proportion=None,
    verbose=True,
):
    """Fine-tunes the BERT classifier using the given training data.

    Args:
        token_ids (list): List of training token id lists.
        input_mask (list): List of input mask lists.
        labels (list): List of training labels.
        token_type_ids (list, optional): List of lists. Each sublist
            contains segment ids indicating if the token belongs to
            the first sentence(0) or second sentence(1). Only needed
            for two-sentence tasks.
        num_gpus (int, optional): The number of gpus to use.
            If None is specified, all available GPUs will be used.
            Defaults to None.
        num_epochs (int, optional): Number of training epochs.
            Defaults to 1.
        batch_size (int, optional): Training batch size. Defaults to 32.
        lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
        warmup_proportion (float, optional): Proportion of training to
            perform linear learning rate warmup for. E.g., 0.1 = 10% of
            training. Defaults to None.
        verbose (bool, optional): If True, shows the training progress and
            loss values. Defaults to True.
    """
    device = get_device("cpu" if num_gpus == 0 or not self.cuda else "gpu")
    self.model = move_to_device(self.model, device, num_gpus)

    token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
    input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    if token_type_ids:
        token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
        train_dataset = TensorDataset(
            token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
        )
    else:
        train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

    # define optimizer and model parameters
    param_optimizer = list(self.model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_batches = len(train_dataloader)
    num_train_optimization_steps = num_batches * num_epochs

    if warmup_proportion is None:
        opt = BertAdam(optimizer_grouped_parameters, lr=lr)
    else:
        opt = BertAdam(
            optimizer_grouped_parameters,
            lr=lr,
            t_total=num_train_optimization_steps,
            warmup=warmup_proportion,
        )

    # define loss function
    loss_func = nn.CrossEntropyLoss().to(device)

    # train
    self.model.train()  # training mode
    for epoch in range(num_epochs):
        training_loss = 0
        for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            if token_type_ids:
                x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                    t.to(device) for t in batch
                )
            else:
                token_type_ids_batch = None
                x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)

            opt.zero_grad()

            y_h = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )

            loss = loss_func(y_h, y_batch).mean()
            training_loss += loss.item()

            loss.backward()
            opt.step()

            if verbose:
                if i % ((num_batches // 10) + 1) == 0:
                    print(
                        "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f}".format(
                            epoch + 1,
                            num_epochs,
                            i + 1,
                            min(i + 1 + num_batches // 10, num_batches),
                            num_batches,
                            training_loss / (i + 1),
                        )
                    )

    # empty cache
    del [x_batch, y_batch, mask_batch, token_type_ids_batch]
    torch.cuda.empty_cache()
def predict(
    self,
    token_ids,
    input_mask,
    token_type_ids=None,
    num_gpus=None,
    batch_size=32,
    probabilities=False,
):
    """Scores the given dataset and returns the predicted classes.

    Args:
        token_ids (list): List of training token lists.
        input_mask (list): List of input mask lists.
        token_type_ids (list, optional): List of lists. Each sublist
            contains segment ids indicating if the token belongs to
            the first sentence(0) or second sentence(1). Only needed
            for two-sentence tasks.
        num_gpus (int, optional): The number of gpus to use.
            If None is specified, all available GPUs will be used.
            Defaults to None.
        batch_size (int, optional): Scoring batch size. Defaults to 32.
        probabilities (bool, optional): If True, the predicted probability
            distribution is also returned. Defaults to False.

    Returns:
        1darray, namedtuple(1darray, ndarray): Predicted classes or
            (classes, probabilities) if probabilities is True.
    """
    device = get_device("cpu" if num_gpus == 0 or not self.cuda else "gpu")
    self.model = move_to_device(self.model, device, num_gpus)

    # score
    self.model.eval()

    token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
    input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)

    if token_type_ids:
        token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
        test_dataset = TensorDataset(
            token_ids_tensor, input_mask_tensor, token_type_ids_tensor
        )
    else:
        test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor)

    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

    preds = []
    for i, batch in enumerate(tqdm(test_dataloader, desc="Iteration")):
        if token_type_ids:
            x_batch, mask_batch, token_type_ids_batch = tuple(
                t.to(device) for t in batch
            )
        else:
            token_type_ids_batch = None
            x_batch, mask_batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            p_batch = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )
        preds.append(p_batch.cpu())

    preds = np.concatenate(preds)

    if probabilities:
        return namedtuple("Predictions", "classes probabilities")(
            preds.argmax(axis=1),
            nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
        )
    else:
        return preds.argmax(axis=1)
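# A minimal fine-tune-and-score sketch for the BERT sequence classifier
# above. BERTSequenceClassifier is an assumed class name; the preprocessed
# inputs are hypothetical.
classifier = BERTSequenceClassifier(language="bert-base-uncased", num_labels=2)

classifier.fit(
    token_ids=train_token_ids,
    input_mask=train_input_mask,
    labels=train_labels,
    num_epochs=1,
    batch_size=32,
    lr=2e-5,
)

pred = classifier.predict(
    token_ids=test_token_ids,
    input_mask=test_input_mask,
    probabilities=True,
)
print(pred.classes[:5])                    # predicted class ids
print(pred.probabilities[:5].max(axis=1))  # confidence of each prediction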
def test_move_to_device_exception_cuda_zero_gpus(model):
    # test when device.type is cuda, but num_gpus is 0
    with pytest.raises(ValueError):
        move_to_device(model, torch.device("cuda"), num_gpus=0)
def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
    # test when the model is moved to a gpu but it is a cpu machine
    with pytest.raises(Exception):
        move_to_device(model, torch.device("cuda"))
def test_move_to_device_exception_wrong_type(model):
    # test when device.type is not "cuda" or "cpu"
    with pytest.raises(Exception):
        move_to_device(model, torch.device("opengl"))
def test_move_to_device_exception_not_torch_device(model):
    # test when device is not a torch.device
    with pytest.raises(ValueError):
        move_to_device(model, "abc")
def test_move_to_device_cpu(model):
    # test when device.type="cpu"
    model_cpu = move_to_device(model, torch.device("cpu"))
    assert isinstance(model_cpu, nn.modules.container.Sequential)
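# Taken together, the tests above pin down the contract of move_to_device.
# The following is a sketch inferred from those assertions, not the
# repository's actual implementation.
import torch
from torch import nn


def move_to_device(model, device, num_gpus=None):
    if not isinstance(device, torch.device):
        raise ValueError("device must be a torch.device")

    if device.type == "cuda":
        if not torch.cuda.is_available():
            raise Exception("CUDA device requested on a CPU-only machine")
        if num_gpus == 0:
            raise ValueError("num_gpus must be non-zero for a cuda device")
        model.to(device)
        num_cuda_devices = torch.cuda.device_count()
        # cap the request at the available devices; None means "use all"
        num_gpus = (
            num_cuda_devices if num_gpus is None else min(num_gpus, num_cuda_devices)
        )
        if num_gpus > 1:
            return nn.DataParallel(model, device_ids=list(range(num_gpus)))
        return model
    elif device.type == "cpu":
        # unwrap DataParallel so callers get a plain module back
        if isinstance(model, nn.DataParallel):
            model = model.module
        return model.to(device)
    else:
        raise Exception("device must be 'cpu' or 'cuda'")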