Example #1
    def marginals(self, X):
        """
        Compute the marginals for the given candidates X.
        Note: split into batches to avoid OOM errors.

        :param X: The input data which is a (list of Candidate objects, a sparse
            matrix of corresponding features) pair or a list of
            (Candidate, features) pairs.
        :type X: pair or list
        """

        # Put the model in eval mode; nn.Module.train is called directly
        # because this class overrides train()
        nn.Module.train(self, False)

        if self._check_input(X):
            X = self._preprocess_data(X)

        dataloader = DataLoader(
            MultiModalDataset(X),
            batch_size=self.settings["batch_size"],
            collate_fn=self._collate_fn(),
            shuffle=False,
        )

        marginals = torch.Tensor([])

        for X_batch in dataloader:
            marginal = self._non_cuda(self._calc_logits(X_batch))
            marginals = torch.cat((marginals, marginal), 0)

        return F.softmax(marginals, dim=-1).detach().numpy()
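A minimal usage sketch for the method above. The classifier instance `clf` and the objects `test_cands` / `F_test` are hypothetical placeholders; only the (candidates, features) pair format and the softmax output come from the code itself:

import numpy as np

# Hypothetical: a trained classifier plus test candidates and their sparse
# feature matrix, in the (candidates, features) pair format the docstring
# describes.
probs = clf.marginals((test_cands, F_test))

# marginals() returns an (n_candidates, cardinality) numpy array of softmax
# probabilities, so each row sums to 1.
preds = np.argmax(probs, axis=1)  # predicted class index per candidate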
Example #2
    def train(
        self,
        X_train,
        Y_train,
        n_epochs=25,
        lr=0.01,
        batch_size=256,
        shuffle=True,
        X_dev=None,
        Y_dev=None,
        print_freq=5,
        dev_ckpt=True,
        dev_ckpt_delay=0.75,
        b=0.5,
        pos_label=1,
        seed=1234,
        host_device="CPU",
    ):
        """
        Generic training procedure for a PyTorch model.

        :param X_train: The training data which is a (list of Candidate objects,
            a sparse matrix of corresponding features) pair.
        :type X_train: pair
        :param Y_train: Array of marginal probabilities for each Candidate.
        :type Y_train: list or numpy.array
        :param n_epochs: Number of training epochs.
        :type n_epochs: int
        :param lr: Learning rate.
        :type lr: float
        :param batch_size: Batch size for learning model.
        :type batch_size: int
        :param shuffle: If True, shuffle training data every epoch.
        :type shuffle: bool
        :param X_dev: Candidates for evaluation, same format as X_train.
        :param Y_dev: Labels for evaluation, same format as Y_train.
        :param print_freq: Interval (in epochs) at which to print training
            status and, if provided, evaluate on the dev set (X_dev, Y_dev).
        :type print_freq: int
        :param dev_ckpt: If True, save a checkpoint whenever the highest score
            on (X_dev, Y_dev) is reached. Note: the dev set is currently only
            evaluated every print_freq epochs.
        :type dev_ckpt: bool
        :param dev_ckpt_delay: Start dev checkpointing after this portion
            of n_epochs.
        :type dev_ckpt_delay: float
        :param b: Decision boundary *for binary setting only*.
        :type b: float
        :param pos_label: Positive class index *for binary setting only*. Default: 1
        :type pos_label: int
        :param seed: Random seed
        :type seed: int
        :param host_device: Host device
        :type host_device: str
        """

        # Update training parameters
        self.settings.update({
            "n_epochs": n_epochs,
            "lr": lr,
            "batch_size": batch_size,
            "shuffle": shuffle,
            "seed": 1234,
            "host_device": host_device,
        })

        # Set random seed
        self._set_random_seed(self.settings["seed"])

        self._check_input(X_train)
        verbose = print_freq > 0

        # Accept either a list or a numpy array of marginals (per the docstring)
        Y_train = np.asarray(Y_train)

        # Update cardinality of the model with training marginals
        self.cardinality = Y_train.shape[1]

        # Make sure marginals are in [0, 1] (vs. e.g. [-1, 1])
        if not np.all(np.abs(Y_train.sum(axis=1) - 1) < 1e-10):
            raise ValueError("Y_train must be row-stochastic (rows sum to 1).")
        if not np.all(Y_train >= 0):
            raise ValueError("Y_train must have values in [0,1].")

        # Remove unlabeled examples
        diffs = Y_train.max(axis=1) - Y_train.min(axis=1)
        train_idxs = np.where(diffs > 1e-6)[0]

        self._update_settings(X_train)

        _X_train, _Y_train = self._preprocess_data(X_train,
                                                   Y_train,
                                                   idxs=train_idxs,
                                                   train=True)

        train_dataloader = DataLoader(
            MultiModalDataset(_X_train, _Y_train),
            batch_size=self.settings["batch_size"],
            collate_fn=self._collate_fn(),
            shuffle=self.settings["shuffle"],
        )

        if X_dev is not None:
            _X_dev, _Y_dev = self._preprocess_data(X_dev, Y_dev)

        if self.settings["host_device"] in self._gpu:
            if not torch.cuda.is_available():
                self.settings["host_device"] = "CPU"
                self.logger.info("GPU is not available, switching to CPU...")
            else:
                self.logger.info("Using GPU...")

        self.logger.info(f"Settings: {self.settings}")

        # Build network
        self._build_model()
        self._setup_model_loss(self.settings["lr"])

        # Set up GPU if necessary
        if self.settings["host_device"] in self._gpu:
            nn.Module.cuda(self)

        # Run mini-batch SGD
        n = len(_X_train)
        if self.settings["batch_size"] > n:
            self.logger.info(f"Switching batch size to {n} for training.")
        batch_size = min(self.settings["batch_size"], n)

        if verbose:
            st = time()
            self.logger.info(f"[{self.name}] Training model")
            self.logger.info(f"[{self.name}] "
                             f"n_train={n} "
                             f"#epochs={self.settings['n_epochs']} "
                             f"batch size={batch_size}")

        dev_score_opt = 0.0

        for epoch in range(self.settings["n_epochs"]):
            iteration_losses = []

            # Put the model back in training mode for this epoch
            nn.Module.train(self, True)

            for X_batch, Y_batch in train_dataloader:
                # zero gradients for each batch
                self.optimizer.zero_grad()

                output = self._calc_logits(X_batch)

                loss = self.loss(output, Y_batch)

                # Compute gradient
                loss.backward()

                # Update the parameters
                self.optimizer.step()

                iteration_losses.append(self._non_cuda(loss))

            # Print training stats and optionally checkpoint model
            if ((verbose and (epoch + 1) % print_freq == 0)
                    or epoch + 1 == self.settings["n_epochs"]):
                # Log the last batch's training loss to TensorBoard
                self.tensorboard_logger.add_scalar("loss", loss.item(),
                                                   epoch + 1)

                msg = (
                    f"[{self.name}] "
                    f"Epoch {epoch + 1} ({time() - st:.2f}s)\t"
                    f"Average loss={torch.stack(iteration_losses).mean():.6f}")
                if X_dev is not None:
                    scores = self.score(_X_dev,
                                        _Y_dev,
                                        b=b,
                                        pos_label=pos_label)

                    score = (scores["accuracy"]
                             if self.cardinality > 2 else scores["f1"])
                    score_label = "Acc." if self.cardinality > 2 else "F1"
                    msg += f"\tDev {score_label}={100.0 * score:.2f}"

                    # Log the evaluation scores on the dev set to TensorBoard
                    for metric in scores.keys():
                        self.tensorboard_logger.add_scalar(
                            metric, scores[metric], epoch + 1)

                self.logger.info(msg)

                # Save checkpoint
                model_file = f"checkpoint_epoch_{epoch + 1}.pt"
                self.save(model_file=model_file,
                          save_dir=self.settings["log_dir"])

                # If best score on dev set so far and dev checkpointing is
                # active, save best checkpoint
                if (X_dev is not None and dev_ckpt
                        and epoch > dev_ckpt_delay * self.settings["n_epochs"]
                        and score > dev_score_opt):
                    dev_score_opt = score
                    self.logger.info(
                        f"Saving best checkpoint "
                        f'{self.settings["log_dir"]}/{model_file}.')
                    copyfile(
                        f'{self.settings["log_dir"]}/{model_file}',
                        f'{self.settings["log_dir"]}/best_model.pt',
                    )

                if ((X_dev is None or dev_ckpt is False)
                        and epoch + 1 == self.settings["n_epochs"]):
                    self.logger.info(
                        f"Saving final model as best checkpoint "
                        f'{self.settings["log_dir"]}/{model_file}.')
                    copyfile(
                        f'{self.settings["log_dir"]}/{model_file}',
                        f'{self.settings["log_dir"]}/best_model.pt',
                    )

        # Conclude training
        if verbose:
            self.logger.info(
                f"[{self.name}] Training done ({time() - st:.2f}s)")

        # Load the best checkpoint (i.e. best on dev set)
        self.logger.info("Loading best checkpoint")
        self.load(model_file="best_model.pt",
                  save_dir=self.settings["log_dir"])
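A minimal end-to-end sketch of calling train() as documented above. The classifier instance `clf` and the candidate/feature objects are hypothetical placeholders; the Y_train format follows the row-stochastic check in the code, and rows with identical values across classes (e.g. [0.5, 0.5]) are filtered out as unlabeled:

import numpy as np

# Hypothetical training/dev data in the (candidates, features) pair format.
# Each row of Y_train is a probability distribution over the classes.
Y_train = np.array([
    [0.9, 0.1],
    [0.2, 0.8],
    [0.5, 0.5],  # uniform marginals -> treated as unlabeled and dropped
])

clf.train(
    (train_cands, F_train),
    Y_train,
    n_epochs=25,
    lr=0.01,
    batch_size=256,
    X_dev=(dev_cands, F_dev),
    Y_dev=Y_dev,
    print_freq=5,
    host_device="GPU",  # falls back to CPU if CUDA is unavailable
)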