def _get_predictions(self, data, break_ties="random", return_probs=False, **kwargs):
    """Computes predictions in batch, given a labeled dataset

    Args:
        data: a Pytorch DataLoader, Dataset, or tuple with Tensors (X,Y):
            X: The input for the predict method
            Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels
                in {1,...,k}
        break_ties: How to break ties when making predictions
        return_probs: Return the predicted probabilities as well

    Returns:
        Y_p: A Tensor of predictions
        Y: A Tensor of labels
        [Optionally: Y_s: An [n, k] np.ndarray of predicted probabilities]
    """
    data_loader = self._create_data_loader(data)
    Y_p = []
    Y = []
    Y_s = []

    # Do batch evaluation by default, getting the predictions and labels
    for batch_num, data in enumerate(data_loader):
        Xb, Yb = data
        Y.append(self._to_numpy(Yb))

        # Optionally move to device
        if self.config["device"] != "cpu":
            Xb = place_on_gpu(Xb)

        # Append predictions and labels from DataLoader
        Y_pb, Y_sb = self.predict(
            Xb, break_ties=break_ties, return_probs=True, **kwargs
        )
        Y_p.append(self._to_numpy(Y_pb))
        Y_s.append(self._to_numpy(Y_sb))

    Y_p, Y, Y_s = map(self._stack_batches, [Y_p, Y, Y_s])
    if return_probs:
        return Y_p, Y, Y_s
    else:
        return Y_p, Y
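# A minimal usage sketch (an assumption, not part of the original class):
# suppose `model` is a trained classifier built on this base class, `X` is an
# [n, d] FloatTensor of features, and `Y` an [n] LongTensor of labels in
# {1,...,k}. All names here are hypothetical and only illustrate the (X, Y)
# tuple input accepted by _get_predictions.
import torch

X = torch.randn(100, 10)
Y = torch.randint(1, 3, (100,))  # binary labels in {1, 2}

# Predictions and labels come back stacked across batches (via _to_numpy and
# _stack_batches); with return_probs=True we also get the [n, k] matrix of
# predicted probabilities.
Y_p, Y_true, Y_s = model._get_predictions((X, Y), return_probs=True)
assert Y_p.shape[0] == Y_true.shape[0] == Y_s.shape[0] == 100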
def _train_model(
    self, train_data, loss_fn, valid_data=None, log_writer=None, restore_state={}
):
    """The internal training routine called by train_model() after setup

    Args:
        train_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
            X (data) and Y (labels) for the train split
        loss_fn: the loss function to minimize (maps *data -> loss)
        valid_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
            X (data) and Y (labels) for the dev split
        restore_state: a dictionary containing model weights (optimizer, main
            network) and training information

    If valid_data is not provided, then no checkpointing or evaluation on the
    dev set will occur.
    """
    # Set model to train mode
    self.train()
    train_config = self.config["train_config"]

    # Convert data to DataLoaders
    train_loader = self._create_data_loader(train_data)
    valid_loader = self._create_data_loader(valid_data)
    epoch_size = len(train_loader.dataset)

    # Move model to GPU
    if self.config["verbose"] and self.config["device"] != "cpu":
        print("Using GPU...")
    self.to(self.config["device"])

    # Set training components
    self._set_writer(train_config)
    self._set_logger(train_config, epoch_size)
    self._set_checkpointer(train_config)
    self._set_optimizer(train_config)
    self._set_scheduler(train_config)

    # Restore model if necessary
    if restore_state:
        start_iteration = self._restore_training_state(restore_state)
    else:
        start_iteration = 0

    # Train the model
    metrics_hist = {}  # The most recently seen value for all metrics
    for epoch in range(start_iteration, train_config["n_epochs"]):
        progress_bar = (
            train_config["progress_bar"]
            and self.config["verbose"]
            and self.logger.log_unit == "epochs"
        )
        t = tqdm(
            enumerate(train_loader),
            total=len(train_loader),
            disable=(not progress_bar),
        )
        self.running_loss = 0.0
        self.running_examples = 0
        for batch_num, data in t:
            # NOTE: actual batch_size may not equal config's target batch_size
            batch_size = len(data[0])

            # Moving data to device
            if self.config["device"] != "cpu":
                data = place_on_gpu(data)

            # Zero the parameter gradients
            self.optimizer.zero_grad()

            # Forward pass to calculate the average loss per example
            loss = loss_fn(*data)
            if torch.isnan(loss):
                msg = "Loss is NaN. Consider reducing learning rate."
                raise Exception(msg)

            # Backward pass to calculate gradients
            # Loss is an average loss per example
            loss.backward()

            # Perform optimizer step
            self.optimizer.step()

            # Calculate metrics, log, and checkpoint as necessary
            metrics_dict = self._execute_logging(
                train_loader, valid_loader, loss, batch_size
            )
            metrics_hist.update(metrics_dict)

            # tqdm output
            t.set_postfix(loss=metrics_dict["train/loss"])

        # Apply learning rate scheduler
        self._update_scheduler(epoch, metrics_hist)

    self.eval()

    # Restore best model if applicable
    if self.checkpointer and self.checkpointer.checkpoint_best:
        self.checkpointer.load_best_model(model=self)

    # Write log if applicable
    if self.writer:
        if self.writer.include_config:
            self.writer.add_config(self.config)
        self.writer.close()

    # Print confusion matrix if applicable
    if self.config["verbose"]:
        print("Finished Training")
        if valid_loader is not None:
            self.score(
                valid_loader,
                metric=train_config["validation_metric"],
                verbose=True,
                print_confusion_matrix=True,
            )
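# A minimal sketch of the loss_fn contract assumed by _train_model: the loop
# calls loss_fn(*data) on each batch, so for (X, Y) batches the function takes
# (Xb, Yb) and returns a scalar average loss per example. The names `model`
# and `make_loss_fn` are hypothetical; labels are shifted from {1,...,k} to
# 0-indexed targets for PyTorch's cross entropy.
import torch.nn.functional as F

def make_loss_fn(model):
    def loss_fn(Xb, Yb):
        logits = model(Xb)                      # forward pass: [batch, k] scores
        return F.cross_entropy(logits, Yb - 1)  # mean loss over the batch
    return loss_fn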
def _train_model(self, train_data, loss_fn, dev_data=None, log_writer=None):
    """The internal training routine called by train_model() after setup

    Args:
        train_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
            X (data) and Y (labels) for the train split
        loss_fn: the loss function to minimize (maps *data -> loss)
        dev_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
            X (data) and Y (labels) for the dev split
        log_writer: a metal.utils.LogWriter object for logging

    If dev_data is not provided, then no checkpointing or evaluation on the
    dev set will occur.
    """
    train_config = self.config["train_config"]
    evaluate_dev = dev_data is not None

    # Add config to log_writer if provided
    if log_writer is not None:
        log_writer.add_config(self.config)

    # Convert data to DataLoaders
    train_loader = self._create_data_loader(train_data)
    dev_loader = self._create_data_loader(dev_data)

    # Set the optimizer
    optimizer = self._set_optimizer(train_config)

    # Set the lr scheduler
    scheduler_config = train_config["scheduler_config"]
    lr_scheduler = self._set_scheduler(scheduler_config, optimizer)

    # Create the checkpointer if applicable
    if evaluate_dev and train_config["checkpoint"]:
        checkpointer = self._create_checkpointer(
            train_config["checkpoint_config"]
        )

    # Moving model to GPU
    if self.config["use_cuda"]:
        if self.config["verbose"]:
            print("Using GPU...")
        self.cuda()

    # Train the model
    for epoch in range(train_config["n_epochs"]):
        epoch_loss = 0.0
        t = tqdm(
            enumerate(train_loader),
            total=len(train_loader),
            disable=(
                train_config["disable_prog_bar"] or not self.config["verbose"]
            ),
        )
        for batch_num, data in t:
            # Moving data to GPU
            if self.config["use_cuda"]:
                data = place_on_gpu(data)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass to calculate outputs
            loss = loss_fn(*data)
            if torch.isnan(loss):
                msg = "Loss is NaN. Consider reducing learning rate."
                raise Exception(msg)

            # Backward pass to calculate gradients
            loss.backward()

            # TODO: restore this once it has unit tests
            # Clip gradients
            # if grad_clip:
            #     torch.nn.utils.clip_grad_norm(
            #         self.net.parameters(), grad_clip)

            # Perform optimizer step
            optimizer.step()

            # Keep running sum of losses
            epoch_loss += loss.detach()

            # tqdm output
            running_loss = epoch_loss / (len(data[0]) * (batch_num + 1))
            t.set_postfix(avg_loss=float(running_loss))

        # Calculate average loss per training example
        # Saving division until this stage protects against the potential
        # mistake of averaging batch losses when the last batch is an orphan
        train_loss = epoch_loss / len(train_loader.dataset)

        # Checkpoint performance on dev
        if evaluate_dev and (epoch % train_config["validation_freq"] == 0):
            val_metric = train_config["validation_metric"]
            dev_score = self.score(
                dev_loader,
                metric=val_metric,
                verbose=False,
                print_confusion_matrix=False,
            )
            if train_config["checkpoint"]:
                checkpointer.checkpoint(self, epoch, dev_score)

        # Apply learning rate scheduler
        if (
            lr_scheduler is not None
            and epoch + 1 >= scheduler_config["lr_freeze"]
        ):
            if scheduler_config["scheduler"] == "reduce_on_plateau":
                if evaluate_dev:
                    lr_scheduler.step(dev_score)
            else:
                lr_scheduler.step()

        # Report progress
        if self.config["verbose"] and (
            epoch % train_config["print_every"] == 0
            or epoch == train_config["n_epochs"] - 1
        ):
            msg = f"[E:{epoch}]\tTrain Loss: {train_loss:.3f}"
            if evaluate_dev:
                msg += f"\tDev score: {dev_score:.3f}"
            print(msg)

        # Also write train loss (+ dev score) to log_writer if available
        if log_writer is not None and (
            epoch % train_config["print_every"] == 0
            or epoch == train_config["n_epochs"] - 1
        ):
            tls = float(train_loss.cpu().numpy())
            log_writer.add_scalar("train-loss", tls, epoch)
            if evaluate_dev:
                log_writer.add_scalar("dev-score", dev_score, epoch)
            log_writer.write()

    # Restore best model if applicable
    if evaluate_dev and train_config["checkpoint"]:
        checkpointer.restore(model=self)
        if log_writer is not None:
            log_writer.log["checkpoint_iter"] = checkpointer.best_iteration

    # Print confusion matrix if applicable
    if self.config["verbose"]:
        print("Finished Training")
        if evaluate_dev:
            self.score(
                dev_loader,
                metric=["accuracy"],
                verbose=True,
                print_confusion_matrix=True,
            )

    # Close log_writer if available
    if log_writer is not None:
        log_writer.close()
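# Both training loops above rely on a place_on_gpu helper to move batches onto
# the GPU. Below is a minimal sketch of such a utility (an assumption about
# its behavior, not the library's actual implementation): it recursively moves
# tensors inside lists/tuples to CUDA and returns other objects unchanged.
import torch

def place_on_gpu(data):
    if isinstance(data, (list, tuple)):
        # Preserve the container type while moving each element
        return type(data)(place_on_gpu(item) for item in data)
    elif isinstance(data, torch.Tensor):
        return data.cuda()
    else:
        return data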