def marginals(self, X):
    """
    Compute the marginals for the given candidates X.
    Note: split into batches to avoid OOM errors.

    :param X: The input data which is a (list of Candidate objects, a sparse
        matrix of corresponding features) pair or a list of
        (Candidate, features) pairs.
    :type X: pair or list
    """

    # Put the underlying nn.Module into evaluation mode
    nn.Module.train(self, False)

    if self._check_input(X):
        X = self._preprocess_data(X)

    dataloader = DataLoader(
        MultiModalDataset(X),
        batch_size=self.settings["batch_size"],
        collate_fn=self._collate_fn(),
        shuffle=False,
    )

    marginals = torch.Tensor([])

    # Accumulate logits batch by batch to avoid OOM errors
    for X_batch in dataloader:
        marginal = self._non_cuda(self._calc_logits(X_batch))
        marginals = torch.cat((marginals, marginal), 0)

    return F.softmax(marginals, dim=-1).detach().numpy()
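# A minimal usage sketch for marginals(), assuming a trained subclass instance
# `model` and a featurized candidate split `(test_cands, F_test)`; the variable
# names are illustrative, not part of this module:
#
#     test_marginals = model.marginals((test_cands, F_test))
#     # One row of class probabilities per candidate (rows sum to 1 after softmax)
#     predictions = test_marginals.argmax(axis=1)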
def train(
    self,
    X_train,
    Y_train,
    n_epochs=25,
    lr=0.01,
    batch_size=256,
    shuffle=True,
    X_dev=None,
    Y_dev=None,
    print_freq=5,
    dev_ckpt=True,
    dev_ckpt_delay=0.75,
    b=0.5,
    pos_label=1,
    seed=1234,
    host_device="CPU",
):
    """
    Generic training procedure for a PyTorch model.

    :param X_train: The training data which is a (list of Candidate objects,
        a sparse matrix of corresponding features) pair.
    :type X_train: pair
    :param Y_train: Array of marginal probabilities for each Candidate.
    :type Y_train: list or numpy.array
    :param n_epochs: Number of training epochs.
    :type n_epochs: int
    :param lr: Learning rate.
    :type lr: float
    :param batch_size: Batch size for learning model.
    :type batch_size: int
    :param shuffle: If True, shuffle training data every epoch.
    :type shuffle: bool
    :param X_dev: Candidates for evaluation, same format as X_train.
    :param Y_dev: Labels for evaluation, same format as Y_train.
    :param print_freq: Number of epochs at which to print status and, if
        present, evaluate the dev set (X_dev, Y_dev).
    :type print_freq: int
    :param dev_ckpt: If True, save a checkpoint whenever the highest score
        on (X_dev, Y_dev) is reached. Note: currently only evaluates every
        print_freq epochs.
    :param dev_ckpt_delay: Start dev checkpointing after this portion of
        n_epochs.
    :type dev_ckpt_delay: float
    :param b: Decision boundary *for binary setting only*.
    :type b: float
    :param pos_label: Positive class index *for binary setting only*. Default: 1
    :type pos_label: int
    :param seed: Random seed
    :type seed: int
    :param host_device: Host device
    :type host_device: str
    """

    # Update training parameters
    self.settings.update(
        {
            "n_epochs": n_epochs,
            "lr": lr,
            "batch_size": batch_size,
            "shuffle": shuffle,
            "seed": seed,
            "host_device": host_device,
        }
    )

    # Set random seed
    self._set_random_seed(self.settings["seed"])

    self._check_input(X_train)
    verbose = print_freq > 0

    # Update cardinality of the model with training marginals
    self.cardinality = Y_train.shape[1]

    # Make sure marginals are in [0, 1] (vs. e.g. [-1, 1])
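    # For example, with cardinality 2 a valid Y_train row could be [0.7, 0.3],
    # while [0.7, -0.3] or [0.7, 0.7] would be rejected by the checks below
    # (illustrative values only).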
    if not np.all(np.abs(Y_train.sum(axis=1) - 1) < 1e-10):
        raise ValueError("Y_train must be row-stochastic (rows sum to 1).")
    if not np.all(Y_train >= 0):
        raise ValueError("Y_train must have values in [0, 1].")

    # Remove unlabeled examples (i.e. candidates with uniform marginals)
    diffs = Y_train.max(axis=1) - Y_train.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    self._update_settings(X_train)

    _X_train, _Y_train = self._preprocess_data(
        X_train, Y_train, idxs=train_idxs, train=True
    )

    train_dataloader = DataLoader(
        MultiModalDataset(_X_train, _Y_train),
        batch_size=self.settings["batch_size"],
        collate_fn=self._collate_fn(),
        shuffle=self.settings["shuffle"],
    )

    if X_dev is not None:
        _X_dev, _Y_dev = self._preprocess_data(X_dev, Y_dev)

    if self.settings["host_device"] in self._gpu:
        if not torch.cuda.is_available():
            self.settings["host_device"] = "CPU"
            self.logger.info("GPU is not available, switching to CPU...")
        else:
            self.logger.info("Using GPU...")

    self.logger.info(f"Settings: {self.settings}")

    # Build network
    self._build_model()
    self._setup_model_loss(self.settings["lr"])

    # Set up GPU if necessary
    if self.settings["host_device"] in self._gpu:
        nn.Module.cuda(self)

    # Run mini-batch SGD
    n = len(_X_train)
    if self.settings["batch_size"] > n:
        self.logger.info(f"Switching batch size to {n} for training.")
    batch_size = min(self.settings["batch_size"], n)

    if verbose:
        st = time()
        self.logger.info(f"[{self.name}] Training model")
        self.logger.info(
            f"[{self.name}] "
            f"n_train={n} "
            f"#epochs={self.settings['n_epochs']} "
            f"batch size={batch_size}"
        )

    dev_score_opt = 0.0

    for epoch in range(self.settings["n_epochs"]):
        iteration_losses = []

        # Put the underlying nn.Module back into training mode
        nn.Module.train(self, True)

        for X_batch, Y_batch in train_dataloader:
            # Zero gradients for each batch
            self.optimizer.zero_grad()

            output = self._calc_logits(X_batch)
            loss = self.loss(output, Y_batch)

            # Compute gradient
            loss.backward()

            # Update the parameters
            self.optimizer.step()

            iteration_losses.append(self._non_cuda(loss))

        # Print training stats and optionally checkpoint model
        if (
            verbose and (epoch + 1) % print_freq == 0
        ) or epoch + 1 == self.settings["n_epochs"]:
            # Log the training loss into tensorboard
            self.tensorboard_logger.add_scalar("loss", loss.item(), epoch + 1)

            msg = (
                f"[{self.name}] "
                f"Epoch {epoch + 1} ({time() - st:.2f}s)\t"
                f"Average loss={torch.stack(iteration_losses).mean():.6f}"
            )
            if X_dev is not None:
                scores = self.score(_X_dev, _Y_dev, b=b, pos_label=pos_label)

                score = scores["accuracy"] if self.cardinality > 2 else scores["f1"]
                score_label = "Acc." if self.cardinality > 2 else "F1"
                msg += f"\tDev {score_label}={100.0 * score:.2f}"

                # Log the evaluation scores on the dev set into tensorboard
                for metric in scores.keys():
                    self.tensorboard_logger.add_scalar(
                        metric, scores[metric], epoch + 1
                    )

            self.logger.info(msg)

            # Save checkpoint
            model_file = f"checkpoint_epoch_{epoch + 1}.pt"
            self.save(model_file=model_file, save_dir=self.settings["log_dir"])

            # If best score on dev set so far and dev checkpointing is
            # active, save best checkpoint
            if (
                X_dev is not None
                and dev_ckpt
                and epoch > dev_ckpt_delay * self.settings["n_epochs"]
                and score > dev_score_opt
            ):
                dev_score_opt = score
                self.logger.info(
                    f"Saving best checkpoint "
                    f'{self.settings["log_dir"]}/{model_file}.'
                )
                copyfile(
                    f'{self.settings["log_dir"]}/{model_file}',
                    f'{self.settings["log_dir"]}/best_model.pt',
                )

            if (
                X_dev is None or dev_ckpt is False
            ) and epoch + 1 == self.settings["n_epochs"]:
                self.logger.info(
                    f"Saving final model as best checkpoint "
                    f'{self.settings["log_dir"]}/{model_file}.'
                )
                copyfile(
                    f'{self.settings["log_dir"]}/{model_file}',
                    f'{self.settings["log_dir"]}/best_model.pt',
                )

    # Conclude training
    if verbose:
        self.logger.info(f"[{self.name}] Training done ({time() - st:.2f}s)")

    # Load the best checkpoint (i.e. best on dev set)
    self.logger.info("Loading best checkpoint")
    self.load(model_file="best_model.pt", save_dir=self.settings["log_dir"])
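# A minimal end-to-end training sketch, assuming a concrete subclass (called
# `LogisticRegression` here purely for illustration), featurized candidates
# `(train_cands, F_train)` / `(dev_cands, F_dev)`, and a row-stochastic
# marginals array `train_marginals` of shape (n_candidates, cardinality);
# all names below are illustrative assumptions, not part of this module:
#
#     model = LogisticRegression()
#     model.train(
#         (train_cands, F_train),
#         train_marginals,
#         n_epochs=50,
#         lr=0.001,
#         X_dev=(dev_cands, F_dev),
#         Y_dev=Y_dev,
#     )
#     test_marginals = model.marginals((test_cands, F_test))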