Example #1
    def _get_predictions(self,
                         data,
                         break_ties="random",
                         return_probs=False,
                         **kwargs):
        """Computes predictions in batch, given a labeled dataset

        Args:
            data: a PyTorch DataLoader, Dataset, or tuple with Tensors (X,Y):
                X: The input for the predict method
                Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels
                    in {1,...,k}
            break_ties: How to break ties when making predictions
            return_probs: Return the predicted probabilities as well

        Returns:
            Y_p: An np.ndarray of predictions
            Y: An np.ndarray of labels
            [Optionally: Y_s: An [n, k] np.ndarray of predicted probabilities]
        """
        data_loader = self._create_data_loader(data)
        Y_p = []
        Y = []
        Y_s = []

        # Do batch evaluation by default, getting the predictions and labels
        for batch_num, data in enumerate(data_loader):
            Xb, Yb = data
            Y.append(self._to_numpy(Yb))

            # Optionally move to device
            if self.config["device"] != "cpu":
                Xb = place_on_gpu(Xb)

            # Get predictions and probabilities for this batch and append them
            Y_pb, Y_sb = self.predict(Xb,
                                      break_ties=break_ties,
                                      return_probs=True,
                                      **kwargs)
            Y_p.append(self._to_numpy(Y_pb))
            Y_s.append(self._to_numpy(Y_sb))
        Y_p, Y, Y_s = map(self._stack_batches, [Y_p, Y, Y_s])
        if return_probs:
            return Y_p, Y, Y_s
        else:
            return Y_p, Y
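For orientation, below is a minimal standalone sketch of the same batch-prediction pattern written against plain PyTorch and numpy. The get_predictions name, the softmax head, and the 1-indexed label convention are illustrative assumptions; the class above instead routes this work through its own helpers (predict, _to_numpy, _stack_batches).

import numpy as np
import torch

def get_predictions(model, data_loader, device="cpu"):
    """Collect predictions, labels, and probabilities over a DataLoader."""
    model.eval()
    Y_p, Y, Y_s = [], [], []
    with torch.no_grad():
        for Xb, Yb in data_loader:
            Y.append(Yb.numpy())                      # keep labels on CPU
            Xb = Xb.to(device)                        # optionally move to device
            probs = torch.softmax(model(Xb), dim=1)   # [batch, k] probabilities
            preds = probs.argmax(dim=1) + 1           # labels in {1,...,k}
            Y_p.append(preds.cpu().numpy())
            Y_s.append(probs.cpu().numpy())
    # Stack the per-batch arrays once at the end, in the spirit of _stack_batches
    return np.concatenate(Y_p), np.concatenate(Y), np.vstack(Y_s)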
Example #2
    def _train_model(self,
                     train_data,
                     loss_fn,
                     valid_data=None,
                     log_writer=None,
                     restore_state=None):
        """The internal training routine called by train_model() after setup

        Args:
            train_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
                X (data) and Y (labels) for the train split
            loss_fn: the loss function to minimize (maps *data -> loss)
            valid_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
                X (data) and Y (labels) for the dev split
            log_writer: a metal.utils.LogWriter object for logging
            restore_state: a dictionary containing model weights (optimizer,
                main network) and training information

        If valid_data is not provided, then no checkpointing or
        evaluation on the dev set will occur.
        """
        # Set model to train mode
        self.train()
        train_config = self.config["train_config"]

        # Convert data to DataLoaders
        train_loader = self._create_data_loader(train_data)
        valid_loader = self._create_data_loader(valid_data)
        epoch_size = len(train_loader.dataset)

        # Move model to GPU
        if self.config["verbose"] and self.config["device"] != "cpu":
            print("Using GPU...")
        self.to(self.config["device"])

        # Set training components
        self._set_writer(train_config)
        self._set_logger(train_config, epoch_size)
        self._set_checkpointer(train_config)
        self._set_optimizer(train_config)
        self._set_scheduler(train_config)

        # Restore model if necessary
        if restore_state:
            start_iteration = self._restore_training_state(restore_state)
        else:
            start_iteration = 0

        # Train the model
        metrics_hist = {}  # The most recently seen value for all metrics
        for epoch in range(start_iteration, train_config["n_epochs"]):
            progress_bar = (train_config["progress_bar"]
                            and self.config["verbose"]
                            and self.logger.log_unit == "epochs")

            t = tqdm(
                enumerate(train_loader),
                total=len(train_loader),
                disable=(not progress_bar),
            )

            self.running_loss = 0.0
            self.running_examples = 0
            for batch_num, data in t:
                # NOTE: actual batch_size may not equal config's target batch_size
                batch_size = len(data[0])

                # Moving data to device
                if self.config["device"] != "cpu":
                    data = place_on_gpu(data)

                # Zero the parameter gradients
                self.optimizer.zero_grad()

                # Forward pass to calculate the average loss per example
                loss = loss_fn(*data)
                if torch.isnan(loss):
                    msg = "Loss is NaN. Consider reducing learning rate."
                    raise Exception(msg)

                # Backward pass to calculate gradients
                # Loss is an average loss per example
                loss.backward()

                # Perform optimizer step
                self.optimizer.step()

                # Calculate metrics, log, and checkpoint as necessary
                metrics_dict = self._execute_logging(train_loader,
                                                     valid_loader, loss,
                                                     batch_size)
                metrics_hist.update(metrics_dict)

                # tqdm output
                t.set_postfix(loss=metrics_dict["train/loss"])

            # Apply learning rate scheduler
            self._update_scheduler(epoch, metrics_hist)

        self.eval()

        # Restore best model if applicable
        if self.checkpointer and self.checkpointer.checkpoint_best:
            self.checkpointer.load_best_model(model=self)

        # Write log if applicable
        if self.writer:
            if self.writer.include_config:
                self.writer.add_config(self.config)
            self.writer.close()

        # Print confusion matrix if applicable
        if self.config["verbose"]:
            print("Finished Training")
            if valid_loader is not None:
                self.score(
                    valid_loader,
                    metric=train_config["validation_metric"],
                    verbose=True,
                    print_confusion_matrix=True,
                )
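The inner loop above is the standard PyTorch optimization step: zero the gradients, run the forward pass, guard against a NaN loss, backpropagate, and step the optimizer. Here is a minimal self-contained sketch of just that step; model, loss_fn, optimizer, and train_loader are placeholder objects, the running-loss bookkeeping stands in for the _execute_logging call, and (unlike the class's loss_fn, which maps the raw batch directly to a loss) the sketch applies a criterion to model outputs.

import torch

def train_one_epoch(model, loss_fn, optimizer, train_loader, device="cpu"):
    """Run one epoch of the basic zero_grad/forward/backward/step loop."""
    model.train()
    running_loss, running_examples = 0.0, 0
    for Xb, Yb in train_loader:
        Xb, Yb = Xb.to(device), Yb.to(device)   # optionally move data to device
        optimizer.zero_grad()                   # zero the parameter gradients
        loss = loss_fn(model(Xb), Yb)           # average loss per example
        if torch.isnan(loss):
            raise Exception("Loss is NaN. Consider reducing learning rate.")
        loss.backward()                         # backward pass to get gradients
        optimizer.step()                        # update parameters
        running_loss += loss.item() * len(Xb)
        running_examples += len(Xb)
    return running_loss / running_examples      # mean training loss for the epoch

A typical call would pass an nn.Module, a criterion such as torch.nn.CrossEntropyLoss(), a torch.optim optimizer built from model.parameters(), and a torch.utils.data.DataLoader.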
Example #3
    def _train_model(self, train_data, loss_fn, dev_data=None, log_writer=None):
        """The internal training routine called by train_model() after setup

        Args:
            train_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
                X (data) and Y (labels) for the train split
            loss_fn: the loss function to minimize (maps *data -> loss)
            dev_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
                X (data) and Y (labels) for the dev split
            log_writer: a metal.utils.LogWriter object for logging

        If dev_data is not provided, then no checkpointing or
        evaluation on the dev set will occur.
        """
        train_config = self.config["train_config"]
        evaluate_dev = dev_data is not None

        # Add config to log_writer if provided
        if log_writer is not None:
            log_writer.add_config(self.config)

        # Convert data to DataLoaders
        train_loader = self._create_data_loader(train_data)
        dev_loader = self._create_data_loader(dev_data)

        # Set the optimizer
        optimizer = self._set_optimizer(train_config)

        # Set the lr scheduler
        scheduler_config = train_config["scheduler_config"]
        lr_scheduler = self._set_scheduler(scheduler_config, optimizer)

        # Create the checkpointer if applicable
        if evaluate_dev and train_config["checkpoint"]:
            checkpointer = self._create_checkpointer(
                train_config["checkpoint_config"]
            )

        # Moving model to GPU
        if self.config["use_cuda"]:
            if self.config["verbose"]:
                print("Using GPU...")
            self.cuda()

        # Train the model
        for epoch in range(train_config["n_epochs"]):
            epoch_loss = 0.0
            t = tqdm(
                enumerate(train_loader),
                total=len(train_loader),
                disable=(
                    train_config["disable_prog_bar"]
                    or not self.config["verbose"]
                ),
            )

            for batch_num, data in t:

                # Moving data to GPU
                if self.config["use_cuda"]:
                    data = place_on_gpu(data)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass to calculate outputs
                loss = loss_fn(*data)
                if torch.isnan(loss):
                    msg = "Loss is NaN. Consider reducing learning rate."
                    raise Exception(msg)

                # Backward pass to calculate gradients
                loss.backward()

                # TODO: restore this once it has unit tests
                # Clip gradients
                # if grad_clip:
                #     torch.nn.utils.clip_grad_norm(
                #        self.net.parameters(), grad_clip)

                # Perform optimizer step
                optimizer.step()

                # Keep running sum of losses
                epoch_loss += loss.detach()

                # tqdm output
                running_loss = epoch_loss / (len(data[0]) * (batch_num + 1))
                t.set_postfix(avg_loss=float(running_loss))

            # Calculate average loss per training example
            # Saving division until this stage protects against the potential
            # mistake of averaging batch losses when the last batch is an orphan
            train_loss = epoch_loss / len(train_loader.dataset)

            # Checkpoint performance on dev
            if evaluate_dev and (epoch % train_config["validation_freq"] == 0):
                val_metric = train_config["validation_metric"]
                dev_score = self.score(
                    dev_loader,
                    metric=val_metric,
                    verbose=False,
                    print_confusion_matrix=False,
                )

                if train_config["checkpoint"]:
                    checkpointer.checkpoint(self, epoch, dev_score)

            # Apply learning rate scheduler
            if (
                lr_scheduler is not None
                and epoch + 1 >= scheduler_config["lr_freeze"]
            ):
                if scheduler_config["scheduler"] == "reduce_on_plateau":
                    if evaluate_dev:
                        lr_scheduler.step(dev_score)
                else:
                    lr_scheduler.step()

            # Report progress
            if self.config["verbose"] and (
                epoch % train_config["print_every"] == 0
                or epoch == train_config["n_epochs"] - 1
            ):
                msg = f"[E:{epoch}]\tTrain Loss: {train_loss:.3f}"
                if evaluate_dev:
                    msg += f"\tDev score: {dev_score:.3f}"
                print(msg)

            # Also write train loss (+ dev score) to log_writer if available
            if log_writer is not None and (
                epoch % train_config["print_every"] == 0
                or epoch == train_config["n_epochs"] - 1
            ):
                tls = float(train_loss.cpu().numpy())
                log_writer.add_scalar("train-loss", tls, epoch)
                if evaluate_dev:
                    log_writer.add_scalar("dev-score", dev_score, epoch)
                log_writer.write()

        # Restore best model if applicable
        if evaluate_dev and train_config["checkpoint"]:
            checkpointer.restore(model=self)

            if log_writer is not None:
                log_writer.log["checkpoint_iter"] = checkpointer.best_iteration

        # Print confusion matrix if applicable
        if self.config["verbose"]:
            print("Finished Training")
            if evaluate_dev:
                self.score(
                    dev_loader,
                    metric=["accuracy"],
                    verbose=True,
                    print_confusion_matrix=True,
                )

        # Close log_writer if available
        if log_writer is not None:
            log_writer.close()
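The learning-rate scheduling branch above separates ReduceLROnPlateau, which needs a validation metric, from schedulers that step unconditionally, and it delays stepping until lr_freeze epochs have passed. The sketch below shows that logic with the standard torch.optim.lr_scheduler API; the dummy optimizer, the scheduler_config values, and the fixed dev_score are placeholders for the objects used in the example.

import torch

# Dummy one-parameter optimizer; in practice this is the model's optimizer.
optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
scheduler_config = {"scheduler": "reduce_on_plateau", "lr_freeze": 0}

if scheduler_config["scheduler"] == "reduce_on_plateau":
    # Reduce the lr when the monitored dev score stops improving
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max")
else:
    # Any metric-free scheduler works here, e.g. exponential decay
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

for epoch in range(5):
    dev_score = 0.5  # placeholder for self.score(dev_loader, ...)
    if epoch + 1 >= scheduler_config["lr_freeze"]:
        if scheduler_config["scheduler"] == "reduce_on_plateau":
            lr_scheduler.step(dev_score)   # plateau scheduler consumes a metric
        else:
            lr_scheduler.step()            # metric-free schedulers step each epoch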