def test_move_to_device_gpu(model):
    # test when device.type="cuda"
    model_cuda = move_to_device(model, torch.device("cuda"))
    num_cuda_devices = torch.cuda.device_count()

    if num_cuda_devices > 1:
        assert isinstance(model_cuda, DataParallel)
    else:
        assert isinstance(model_cuda, Sequential)

    model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1)
    assert isinstance(model_cuda_1_gpu, Sequential)

    model_cuda_1_more_gpu = move_to_device(model,
                                           torch.device("cuda"),
                                           num_gpus=num_cuda_devices + 1)
    if num_cuda_devices > 1:
        assert isinstance(model_cuda_1_more_gpu, DataParallel)
    else:
        assert isinstance(model_cuda_1_more_gpu, Sequential)

    model_cuda_same_gpu = move_to_device(model,
                                         torch.device("cuda"),
                                         num_gpus=num_cuda_devices)
    if num_cuda_devices > 1:
        assert isinstance(model_cuda_same_gpu, DataParallel)
    else:
        assert isinstance(model_cuda_same_gpu, Sequential)
def test_move_to_device_cpu_parallelized(model):
    # test when input model is parallelized
    model_parallelized = nn.DataParallel(model)
    model_parallelized_output = move_to_device(model_parallelized,
                                               torch.device("cpu"))
    assert isinstance(model_parallelized_output,
                      nn.modules.container.Sequential)
Example #3
    def get_hidden_states(self, text, batch_size=32):
        """Extract the hidden states from the pretrained model
        
        Args:
            text: List of documents to extract features from.
            batch_size: Batch size, defaults to 32.
        
        Returns:
            pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]). 
        """
        device = get_device("cpu" if self.num_gpus == 0 or not self.cuda else "gpu")
        self.model = move_to_device(self.model, device, self.num_gpus)

        self.model.eval()

        tokens = self.tokenizer.tokenize(text)

        tokens, input_ids, input_mask, input_type_ids = self.tokenizer.preprocess_encoder_tokens(
            tokens, max_len=self.max_len
        )

        input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
        input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)
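        # Note: input_type_ids is repurposed here to hold example indices (0..N-1),
        # so each batch can be mapped back to its source document in the loop below.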
        input_type_ids = torch.arange(input_ids.size(0), dtype=torch.long, device=device)

        eval_data = TensorDataset(input_ids, input_mask, input_type_ids)
        eval_dataloader = DataLoader(
            eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size
        )

        hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
        for (input_ids_tensor, input_mask_tensor, example_indices_tensor) in eval_dataloader:
            with torch.no_grad():
                all_encoder_layers, _ = self.model(
                    input_ids_tensor, token_type_ids=None, attention_mask=input_mask_tensor
                )
                self.embedding_dim = all_encoder_layers[0].size()[-1]

            for b, example_index in enumerate(example_indices_tensor):
                for (i, token) in enumerate(tokens[example_index.item()]):
                    for (j, layer_index) in enumerate(self.layer_index):
                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                        layer_output = layer_output[b]
                        hidden_states["text_index"].append(example_index.item())
                        hidden_states["token"].append(token)
                        hidden_states["layer_index"].append(layer_index)
                        hidden_states["values"].append(
                            [round(x.item(), 6) for x in layer_output[i]]
                        )

            # empty cache
            del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
            torch.cuda.empty_cache()

        # empty cache
        del [input_ids, input_mask, input_type_ids]
        torch.cuda.empty_cache()

        return pd.DataFrame.from_dict(hidden_states)
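# A small, self-contained sketch of how the DataFrame returned by
# get_hidden_states can be pooled into one embedding per document (mean over the
# tokens of a chosen layer). The toy frame below only mimics the output schema.
import numpy as np
import pandas as pd

toy_hidden_states = pd.DataFrame({
    "text_index": [0, 0, 1],
    "token": ["hello", "world", "hi"],
    "layer_index": [-1, -1, -1],
    "values": [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]],
})

pooled = (
    toy_hidden_states[toy_hidden_states["layer_index"] == -1]
    .groupby("text_index")["values"]
    .apply(lambda rows: np.mean(np.stack(rows.tolist()), axis=0))
)
print(pooled.loc[0])  # mean-pooled embedding for document 0 -> approximately [0.2, 0.3]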
    def predict(self, test_loader, num_gpus=None, probabilities=False):
        """

        Method to predict the results on the test loader. Only evaluates for non distributed
        workload on the head node in a distributed setup.

        Args:
            test_loader(torch Dataloader): Torch Dataloader created from Torch Dataset
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.

        Returns:
            1darray, dict(1darray, 1darray, ndarray): Predicted classes and target labels or
                a dictionary with classes, target labels, probabilities) if probabilities is True.
        """
        device, num_gpus = get_device(num_gpus)
        self.model = move_to_device(self.model, device, num_gpus)

        # score
        self.model.eval()

        preds = []
        test_labels = []
        for i, data in enumerate(tqdm(test_loader, desc="Iteration")):
            x_batch = data["token_ids"]
            x_batch = x_batch.cuda()

            mask_batch = data["input_mask"]
            mask_batch = mask_batch.cuda()

            y_batch = data["labels"]

            token_type_ids_batch = None
            if "token_type_ids" in data and data["token_type_ids"] is not None:
                token_type_ids_batch = data["token_type_ids"]
                token_type_ids_batch = token_type_ids_batch.cuda()

            with torch.no_grad():
                p_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
            preds.append(p_batch.cpu())
            test_labels.append(y_batch)

        preds = np.concatenate(preds)
        test_labels = np.concatenate(test_labels)

        if probabilities:
            return {
                "Predictions": preds.argmax(axis=1),
                "Target": test_labels,
                "classes probabilities": nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
            }
        else:
            return preds.argmax(axis=1), test_labels
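# Self-contained illustration of the post-processing predict applies to the raw
# model logits: argmax for the predicted classes and, when probabilities=True,
# a row-wise softmax over the same logits.
import numpy as np
import torch
import torch.nn as nn

toy_logits = np.array([[2.0, 0.5], [0.1, 1.2]])
predicted_classes = toy_logits.argmax(axis=1)                         # -> array([0, 1])
class_probabilities = nn.Softmax(dim=1)(torch.Tensor(toy_logits)).numpy()
print(predicted_classes, class_probabilities.round(3))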
    def fit(
        self,
        train_loader,
        epoch,
        bert_optimizer=None,
        num_epochs=1,
        num_gpus=None,
        lr=2e-5,
        warmup_proportion=None,
        fp16_allreduce=False,
        num_train_optimization_steps=10,
    ):
        """
        Method to fine-tune the bert classifier using the given training data

        Args:
            train_loader(torch.DataLoader): Torch Dataloader created from Torch Dataset
            epoch(int): Current epoch number of training.
            bert_optimizer(optimizer): optimizer can be BERTAdam for local and Dsitributed if Horovod
            num_epochs(int): the number of epochs to run
            num_gpus(int): the number of gpus. If None is specified, all available GPUs will be used.
            lr (float): learning rate of the adam optimizer. defaults to 2e-5.
            warmup_proportion (float, optional): proportion of training to
                perform linear learning rate warmup for. e.g., 0.1 = 10% of
                training. defaults to none.
            fp16_allreduce(bool): if true, use fp16 compression during allreduce
            num_train_optimization_steps: number of steps the optimizer should take.
        """

        device, num_gpus = get_device(num_gpus)

        self.model = move_to_device(self.model, device, num_gpus)

        if bert_optimizer is None:
            bert_optimizer = self.create_optimizer(
                num_train_optimization_steps=num_train_optimization_steps,
                lr=lr,
                warmup_proportion=warmup_proportion,
                fp16_allreduce=fp16_allreduce,
            )

        if self.use_distributed:
            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

        loss_func = nn.CrossEntropyLoss().to(device)

        # train
        self.model.train()  # training mode

        token_type_ids_batch = None

        num_print = 1000
        for batch_idx, data in enumerate(train_loader):

            x_batch = data["token_ids"]
            x_batch = x_batch.cuda()

            y_batch = data["labels"]
            y_batch = y_batch.cuda()

            mask_batch = data["input_mask"]
            mask_batch = mask_batch.cuda()

            if "token_type_ids" in data and data["token_type_ids"] is not None:
                token_type_ids_batch = data["token_type_ids"]
                token_type_ids_batch = token_type_ids_batch.cuda()

            bert_optimizer.zero_grad()

            y_h = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )

            loss = loss_func(y_h, y_batch).mean()
            loss.backward()

            bert_optimizer.synchronize()
            bert_optimizer.step()

            if batch_idx % num_print == 0:
                print(
                    "Train Epoch: {}/{} ({:.0f}%) \t Batch:{} \tLoss: {:.6f}".
                    format(
                        epoch,
                        num_epochs,
                        100.0 * batch_idx / len(train_loader),
                        batch_idx + 1,
                        loss.item(),
                    ))

        del [x_batch, y_batch, mask_batch, token_type_ids_batch]
        torch.cuda.empty_cache()
Example #6
    def predict(self,
                token_ids,
                input_mask,
                labels=None,
                batch_size=32,
                num_gpus=None,
                probabilities=False):
        """
        Predict token labels on the testing data.

        Args:
            token_ids (list): List of lists. Each sublist contains
                numerical token ids corresponding to the tokens in the input
                text data.
            input_mask (list): List of lists. Each sublist contains
                the attention mask of the input token list, 1 for input
                tokens and 0 for padded tokens, so that padded tokens are
                not attended to.
            labels (list, optional): List of lists. Each sublist contains
                numerical token labels of an input sentence/paragraph.
                If provided, it's used to compute the evaluation loss.
                Default value is None.
            batch_size (int, optional): Testing batch size. Defaults to 32.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. Defaults to None.

        Returns:
            list or namedtuple(list, ndarray): List of lists of predicted
                token labels or ([token labels], probabilities) if
                probabilities is True. The probabilities output is an n x m
                array, where n is the size of the testing data and m is the
                number of tokens in each input sublist. The probability
                values are the softmax probability of the predicted class.
        """
        test_dataloader = create_data_loader(
            input_ids=token_ids,
            input_mask=input_mask,
            label_ids=labels,
            batch_size=batch_size,
            sample_method="sequential",
        )
        device, num_gpus = get_device(num_gpus)

        self.model = move_to_device(self.model, device, num_gpus)

        self.model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        for step, batch in enumerate(
                tqdm(test_dataloader, desc="Iteration", mininterval=10)):
            batch = tuple(t.to(device) for t in batch)
            true_label_available = False
            if labels:
                b_input_ids, b_input_mask, b_labels = batch
                true_label_available = True
            else:
                b_input_ids, b_input_mask = batch

            with torch.no_grad():
                logits = self.model(b_input_ids, attention_mask=b_input_mask)
                if true_label_available:
                    active_loss = b_input_mask.view(-1) == 1
                    active_logits = logits.view(-1,
                                                self.num_labels)[active_loss]
                    active_labels = b_labels.view(-1)[active_loss]
                    loss_fct = nn.CrossEntropyLoss()
                    tmp_eval_loss = loss_fct(active_logits, active_labels)

                    eval_loss += tmp_eval_loss.mean().item()

            logits = logits.detach().cpu()

            if step == 0:
                logits_all = logits.numpy()
            else:
                logits_all = np.append(logits_all, logits, axis=0)

            nb_eval_steps += 1

        predictions = [list(p) for p in np.argmax(logits_all, axis=2)]

        if true_label_available:
            validation_loss = eval_loss / nb_eval_steps
            print("Evaluation loss: {}".format(validation_loss))

        if probabilities:
            return namedtuple("Predictions", "classes probabilities")(
                predictions,
                np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2))
        else:
            return predictions
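# A hedged sketch of mapping the token-level predictions back to label strings
# and trimming padded positions with input_mask. The label_map here is a made-up
# example; the real mapping depends on how the labels were encoded upstream.
label_map = {0: "O", 1: "B-PER", 2: "I-PER"}
toy_predictions = [[1, 2, 0, 0]]   # one sentence, padded to length 4
toy_input_mask = [[1, 1, 1, 0]]    # final position is padding

decoded = [
    [label_map[p] for p, m in zip(sent_pred, sent_mask) if m == 1]
    for sent_pred, sent_mask in zip(toy_predictions, toy_input_mask)
]
print(decoded)  # [['B-PER', 'I-PER', 'O']]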
Example #7
    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        num_gpus=None,
        num_epochs=1,
        batch_size=32,
        learning_rate=2e-5,
        warmup_proportion=None,
    ):
        """
        Fine-tunes the BERT classifier using the given training data.

        Args:
            token_ids (list): List of lists. Each sublist contains
                numerical token ids corresponding to the tokens in the input
                text data.
            input_mask (list): List of lists. Each sublist contains
                the attention mask of the input token id list. 1 for input
                tokens and 0 for padded tokens, so that padded tokens are
                not attended to.
            labels (list): List of lists, each sublist contains numerical
                token labels of an input sentence/paragraph.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 32.
            learning_rate (float, optional): learning rate of the BertAdam
                optimizer. Defaults to 2e-5.
            warmup_proportion (float, optional): Proportion of training to
                perform linear learning rate warmup for. E.g., 0.1 = 10% of
                training. Defaults to None.
        """

        train_dataloader = create_data_loader(
            input_ids=token_ids,
            input_mask=input_mask,
            label_ids=labels,
            sample_method="random",
            batch_size=batch_size,
        )

        device, num_gpus = get_device(num_gpus)

        self.model = move_to_device(self.model, device, num_gpus)

        if num_gpus is None:
            num_gpus_used = torch.cuda.device_count()
        else:
            num_gpus_used = min(num_gpus, torch.cuda.device_count())

        num_train_optimization_steps = max(
            (int(len(token_ids) / batch_size) * num_epochs), 1)
        optimizer = self._get_optimizer(
            learning_rate=learning_rate,
            num_train_optimization_steps=num_train_optimization_steps,
            warmup_proportion=warmup_proportion,
        )

        self.model.train()
        for _ in trange(int(num_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", mininterval=30)):
                batch = tuple(t.to(device) for t in batch)
                b_token_ids, b_input_mask, b_label_ids = batch

                loss = self.model(input_ids=b_token_ids,
                                  attention_mask=b_input_mask,
                                  labels=b_label_ids)

                if num_gpus_used > 1:
                    # mean() to average on multi-gpu.
                    loss = loss.mean()
                # Accumulate parameter gradients
                loss.backward()

                tr_loss += loss.item()
                nb_tr_steps += 1

                # Update parameters based on current gradients
                optimizer.step()
                # Reset parameter gradients to zero
                optimizer.zero_grad()

            train_loss = tr_loss / nb_tr_steps
            print("Train loss: {}".format(train_loss))

            torch.cuda.empty_cache()
Example #8
    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        val_token_ids,
        val_input_mask,
        val_labels,
        token_type_ids=None,
        val_token_type_ids=None,
        verbose=True,
        logging_steps=0,
        save_steps=0,
        val_steps=0,
    ):
        """Fine-tunes the XLNet classifier using the given training data.

        Args:
            token_ids (list): List of training token id lists.
            input_mask (list): List of input mask lists.
            labels (list): List of training labels.
            val_token_ids (list): List of validation token id lists.
            val_input_mask (list): List of validation input mask lists.
            val_labels (list): List of validation labels.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            val_token_type_ids (list, optional): Same as token_type_ids, but
                for the validation data.
            verbose (bool, optional): If True, shows the training progress and
                loss values. Defaults to True.
            logging_steps (int, optional): Log the learning rate and training
                loss to MLflow every this many global steps; 0 disables
                logging. Defaults to 0.
            save_steps (int, optional): Save a model checkpoint every this many
                global steps; 0 disables checkpointing. Defaults to 0.
            val_steps (int, optional): Evaluate on the validation set every this
                many global steps; 0 disables in-training validation.
                Defaults to 0.
        """

        device, num_gpus = get_device(self.num_gpus)
        self.model = move_to_device(self.model, device, self.num_gpus)

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        val_token_ids_tensor = torch.tensor(val_token_ids, dtype=torch.long)
        val_input_mask_tensor = torch.tensor(val_input_mask, dtype=torch.long)
        val_labels_tensor = torch.tensor(val_labels, dtype=torch.long)

        if token_type_ids:
            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
            val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)

            train_dataset = TensorDataset(
                token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
            )

            val_dataset = TensorDataset(
                val_token_ids_tensor,
                val_input_mask_tensor,
                val_token_type_ids_tensor,
                val_labels_tensor,
            )

        else:

            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)

            val_dataset = TensorDataset(
                val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor
            )

        # define optimizer and model parameters
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        val_sampler = RandomSampler(val_dataset)

        val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=self.batch_size)

        num_examples = len(token_ids)
        num_batches = int(np.ceil(num_examples / self.batch_size))
        num_train_optimization_steps = num_batches * self.num_epochs

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr, eps=self.adam_eps)
        scheduler = WarmupLinearSchedule(
            optimizer, warmup_steps=self.warmup_steps, t_total=num_train_optimization_steps
        )

        global_step = 0
        self.model.train()
        optimizer.zero_grad()
        for epoch in range(self.num_epochs):

            train_sampler = RandomSampler(train_dataset)

            train_dataloader = DataLoader(
                train_dataset, sampler=train_sampler, batch_size=self.batch_size
            )

            tr_loss = 0.0
            logging_loss = 0.0
            val_loss = 0.0

            for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if token_type_ids:
                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                        t.to(device) for t in batch
                    )
                else:
                    token_type_ids_batch = None
                    x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)

                outputs = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=y_batch,
                )

                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers

                loss.sum().backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                tr_loss += loss.sum().item()
                optimizer.step()
                # Update learning rate schedule
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                # logging of learning rate and loss
                if logging_steps > 0 and global_step % logging_steps == 0:
                    mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step)
                    mlflow.log_metric(
                        "training loss",
                        (tr_loss - logging_loss) / (logging_steps * self.batch_size),
                        step=global_step,
                    )
                    logging_loss = tr_loss
                # model checkpointing
                if save_steps > 0 and global_step % save_steps == 0:
                    checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
                    if not os.path.isdir(checkpoint_dir):
                        os.makedirs(checkpoint_dir)
                    checkpoint_path = checkpoint_dir + "/" + str(global_step) + ".pth"
                    torch.save(self.model.state_dict(), checkpoint_path)
                    mlflow.log_artifact(checkpoint_path)
                # model validation
                if val_steps > 0 and global_step % val_steps == 0:
                    # run model on validation set
                    self.model.eval()
                    val_loss = 0.0
                    for j, val_batch in enumerate(val_dataloader):
                        if token_type_ids:
                            val_x_batch, val_mask_batch, val_token_type_ids_batch, val_y_batch = tuple(
                                t.to(device) for t in val_batch
                            )
                        else:
                            val_token_type_ids_batch = None
                            val_x_batch, val_mask_batch, val_y_batch = tuple(
                                t.to(device) for t in val_batch
                            )
                        val_outputs = self.model(
                            input_ids=val_x_batch,
                            token_type_ids=val_token_type_ids_batch,
                            attention_mask=val_mask_batch,
                            labels=val_y_batch,
                        )
                        vloss = val_outputs[0]
                        val_loss += vloss.sum().item()
                    mlflow.log_metric(
                        "validation loss", val_loss / len(val_dataset), step=global_step
                    )
                    self.model.train()

                if verbose:
                    if i % ((num_batches // 10) + 1) == 0:
                        if val_loss > 0:
                            print(
                                "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f};\
                                 average val loss:{:.6f}".format(
                                    epoch + 1,
                                    self.num_epochs,
                                    i + 1,
                                    min(i + 1 + num_batches // 10, num_batches),
                                    num_batches,
                                    tr_loss / (i + 1),
                                    val_loss / (j + 1),
                                )
                            )
                        else:
                            print(
                                "epoch:{}/{}; batch:{}->{}/{}; average train loss:{:.6f}".format(
                                    epoch + 1,
                                    self.num_epochs,
                                    i + 1,
                                    min(i + 1 + num_batches // 10, num_batches),
                                    num_batches,
                                    tr_loss / (i + 1),
                                )
                            )
        checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_path = checkpoint_dir + "/" + "final" + ".pth"
        torch.save(self.model.state_dict(), checkpoint_path)
        mlflow.log_artifact(checkpoint_path)
        # empty cache
        del [x_batch, y_batch, mask_batch, token_type_ids_batch]
        if val_steps > 0:
            del [val_x_batch, val_y_batch, val_mask_batch, val_token_type_ids_batch]
        torch.cuda.empty_cache()
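# Since fit writes model state_dicts to ./checkpoints/<global_step>.pth and to
# ./checkpoints/final.pth, a matching (assumed) reload step could look like this;
# it presumes a model with the same architecture has already been constructed.
import os
import torch

checkpoint_path = os.path.join(os.getcwd(), "checkpoints", "final.pth")
if os.path.isfile(checkpoint_path):
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    # model.load_state_dict(state_dict)  # `model` is assumed to exist in the caller's scope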
Example #9
    def predict(
        self,
        token_ids,
        input_mask,
        token_type_ids=None,
        num_gpus=None,
        batch_size=8,
        probabilities=False,
    ):
        """Scores the given dataset and returns the predicted classes.

        Args:
            token_ids (list): List of training token lists.
            input_mask (list): List of input mask lists.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            batch_size (int, optional): Scoring batch size. Defaults to 8.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.
        Returns:
            1darray, namedtuple(1darray, ndarray): Predicted classes or
                (classes, probabilities) if probabilities is True.
        """

        device, num_gpus = get_device(num_gpus)
        self.model = move_to_device(self.model, device, num_gpus)

        self.model.eval()
        preds = []

        with tqdm(total=len(token_ids)) as pbar:
            for i in range(0, len(token_ids), batch_size):
                start = i
                end = start + batch_size
                x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)
                mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)

                token_type_ids_batch = None
                if token_type_ids is not None:
                    token_type_ids_batch = torch.tensor(
                        token_type_ids[start:end], dtype=torch.long, device=device
                    )

                with torch.no_grad():
                    pred_batch = self.model(
                        input_ids=x_batch,
                        token_type_ids=token_type_ids_batch,
                        attention_mask=mask_batch,
                        labels=None,
                    )
                    preds.append(pred_batch[0].cpu())
                    pbar.update(x_batch.size(0))

            preds = np.concatenate(preds)

            if probabilities:
                return namedtuple("Predictions", "classes probabilities")(
                    preds.argmax(axis=1), nn.Softmax(dim=1)(torch.Tensor(preds)).numpy()
                )
            else:
                return preds.argmax(axis=1)
Example #10
    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        token_type_ids=None,
        num_gpus=None,
        num_epochs=1,
        batch_size=32,
        lr=2e-5,
        warmup_proportion=None,
        verbose=True,
    ):
        """Fine-tunes the BERT classifier using the given training data.

        Args:
            token_ids (list): List of training token id lists.
            input_mask (list): List of input mask lists.
            labels (list): List of training labels.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 32.
            lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
            warmup_proportion (float, optional): Proportion of training to
                perform linear learning rate warmup for. E.g., 0.1 = 10% of
                training. Defaults to None.
            verbose (bool, optional): If True, shows the training progress and
                loss values. Defaults to True.
        """

        device = get_device(
            "cpu" if num_gpus == 0 or not self.cuda else "gpu"
        )
        self.model = move_to_device(self.model, device, num_gpus)

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        if token_type_ids:
            token_type_ids_tensor = torch.tensor(
                token_type_ids, dtype=torch.long
            )
            train_dataset = TensorDataset(
                token_ids_tensor,
                input_mask_tensor,
                token_type_ids_tensor,
                labels_tensor,
            )
        else:
            train_dataset = TensorDataset(
                token_ids_tensor, input_mask_tensor, labels_tensor
            )
        train_sampler = RandomSampler(train_dataset)

        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=batch_size
        )
        # define optimizer and model parameters
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p
                    for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p
                    for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        num_batches = len(train_dataloader)
        num_train_optimization_steps = num_batches * num_epochs

        if warmup_proportion is None:
            opt = BertAdam(optimizer_grouped_parameters, lr=lr)
        else:
            opt = BertAdam(
                optimizer_grouped_parameters,
                lr=lr,
                t_total=num_train_optimization_steps,
                warmup=warmup_proportion,
            )

        # define loss function
        loss_func = nn.CrossEntropyLoss().to(device)

        # train
        self.model.train()  # training mode

        for epoch in range(num_epochs):
            training_loss = 0
            for i, batch in enumerate(
                tqdm(train_dataloader, desc="Iteration")
            ):
                if token_type_ids:
                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                        t.to(device) for t in batch
                    )
                else:
                    token_type_ids_batch = None
                    x_batch, mask_batch, y_batch = tuple(
                        t.to(device) for t in batch
                    )

                opt.zero_grad()

                y_h = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
                loss = loss_func(y_h, y_batch).mean()

                training_loss += loss.item()

                loss.backward()
                opt.step()
                if verbose:
                    if i % ((num_batches // 10) + 1) == 0:
                        print(
                            "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f}".format(
                                epoch + 1,
                                num_epochs,
                                i + 1,
                                min(i + 1 + num_batches // 10, num_batches),
                                num_batches,
                                training_loss / (i + 1),
                            )
                        )
        # empty cache
        del [x_batch, y_batch, mask_batch, token_type_ids_batch]
        torch.cuda.empty_cache()
Example #11
    def predict(
        self,
        token_ids,
        input_mask,
        token_type_ids=None,
        num_gpus=None,
        batch_size=32,
        probabilities=False,
    ):
        """Scores the given dataset and returns the predicted classes.

        Args:
            token_ids (list): List of training token lists.
            input_mask (list): List of input mask lists.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            batch_size (int, optional): Scoring batch size. Defaults to 32.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.
        Returns:
            1darray, namedtuple(1darray, ndarray): Predicted classes or
                (classes, probabilities) if probabilities is True.
        """
        device = get_device(
            "cpu" if num_gpus == 0 or not self.cuda else "gpu"
        )
        self.model = move_to_device(self.model, device, num_gpus)

        # score
        self.model.eval()

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)

        if token_type_ids:
            token_type_ids_tensor = torch.tensor(
                token_type_ids, dtype=torch.long
            )
            test_dataset = TensorDataset(
                token_ids_tensor, input_mask_tensor, token_type_ids_tensor
            )
        else:
            test_dataset = TensorDataset(token_ids_tensor, input_mask_tensor)

        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(
            test_dataset, sampler=test_sampler, batch_size=batch_size
        )

        preds = []
        for i, batch in enumerate(tqdm(test_dataloader, desc="Iteration")):
            if token_type_ids:
                x_batch, mask_batch, token_type_ids_batch = tuple(
                    t.to(device) for t in batch
                )
            else:
                token_type_ids_batch = None
                x_batch, mask_batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                p_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
            preds.append(p_batch.cpu())

        preds = np.concatenate(preds)

        if probabilities:
            return namedtuple("Predictions", "classes probabilities")(
                preds.argmax(axis=1),
                nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
            )
        else:
            return preds.argmax(axis=1)
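# Illustration of consuming the namedtuple returned when probabilities=True:
# .classes holds the argmax label ids and .probabilities the softmax scores,
# so the confidence of each prediction is the row-wise maximum.
from collections import namedtuple
import numpy as np

Predictions = namedtuple("Predictions", "classes probabilities")
toy_result = Predictions(np.array([0, 1]), np.array([[0.8, 0.2], [0.3, 0.7]]))
prediction_confidence = toy_result.probabilities.max(axis=1)
print(toy_result.classes, prediction_confidence)  # [0 1] [0.8 0.7]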
def test_move_to_device_exception_cuda_zero_gpus(model):
    # test when device.type is cuda, but num_gpus is 0
    with pytest.raises(ValueError):
        move_to_device(model, torch.device("cuda"), num_gpus=0)
def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
    # test when the model is moved to a gpu but it is a cpu machine
    with pytest.raises(Exception):
        move_to_device(model, torch.device("cuda"))
def test_move_to_device_exception_wrong_type(model):
    # test when device.type is not "cuda" or "cpu"
    with pytest.raises(Exception):
        move_to_device(model, torch.device("opengl"))
def test_move_to_device_exception_not_torch_device(model):
    # test when device is not torch.device
    with pytest.raises(ValueError):
        move_to_device(model, "abc")
def test_move_to_device_cpu(model):
    # test when device.type="cpu"
    model_cpu = move_to_device(model, torch.device("cpu"))
    assert isinstance(model_cpu, nn.modules.container.Sequential)