def evaluate_by_epochs(self, dataloader):
    """Evaluate a dataset using the averaged models.

    In each epoch, every process loads the saved models and averages them.
    The averaged model is then used to evaluate the train / validation
    dataset.

    Args:
        dataloader (:obj:`torch.utils.data.DataLoader`): The dataset to be evaluated.

    Returns:
        list: list of stats of models in each epoch.
    """
    stats_list = []
    for epoch in range(self.epochs):
        # Same model for all workers.
        model = self._load_model(epoch)
        model.eval()

        stats = {"epoch": epoch, "count": 0, "total_loss": 0}
        for metric in self.metrics:
            stats["total_" + metric.name] = 0

        data_iter = iterate_dataloader(
            dataloader, self.dtype, self.max_batch_per_epoch, self.use_cuda
        )

        with torch.no_grad():
            for i, (data, target) in enumerate(data_iter):
                output = model(data)

                # Compute loss and metrics, weighted by the batch size.
                count = len(target)
                stats["count"] += count
                stats["total_loss"] += self.loss_function(output, target) * count
                for metric in self.metrics:
                    stats["total_" + metric.name] += metric(output, target) * count

                logger.info(
                    "E{:4}B{:4}: avg loss={:10.3e}".format(
                        epoch, i, stats["total_loss"] / stats["count"]
                    )
                )

        # Keep globally averaged loss / metrics; drop the raw accumulators.
        stats["loss"] = global_average(stats["total_loss"], stats["count"]).item()
        for metric in self.metrics:
            stats[metric.name] = global_average(
                stats["total_" + metric.name], stats["count"]
            ).item()
            del stats["total_" + metric.name]
        del stats["count"], stats["total_loss"]

        stats_list.append(stats)
    return stats_list
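# `global_average` is used above but not defined in this file. Below is a
# minimal sketch of what it is assumed to do, based on its call sites: sum a
# (total, count) pair across all workers and return the global mean as a
# 0-dim tensor so that callers can `.item()` it. The repo's real helper may
# differ; `global_average_sketch` is a hypothetical name.
import torch
import torch.distributed as dist


def global_average_sketch(total, count):
    """All-reduce (total, count) across workers and return total / count."""
    pair = torch.tensor([float(total), float(count)])
    if dist.is_available() and dist.is_initialized():
        # Sum both entries over all workers before dividing.
        dist.all_reduce(pair, op=dist.ReduceOp.SUM)
    return pair[0] / pair[1]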
def _validate(
    dataloader,
    model,
    loss_function,
    metrics,
    dtype,
    transform_target_type=None,
    use_cuda=False,
    max_batch_per_epoch=None,
):
    """Evaluate the model on the validation dataset.

    Args:
        dataloader (:obj:`torch.utils.data.DataLoader`): The validation set
        model (:obj:`torch.nn.Module`): The model to evaluate
        loss_function (:obj:`torch.nn.Module`): The loss function
        metrics (list): List of metrics to track
        dtype (str): The datatype to use, one of `fp32` or `fp64`
        transform_target_type (str): Datatype to convert the target to, default: `None`
        use_cuda (bool): Whether to use GPU, default: `False`
        max_batch_per_epoch (int): Maximum number of batches to evaluate per epoch,
            default: `None` (all batches)

    Returns:
        tuple: (dict of globally averaged metric values, globally averaged loss)
    """
    # Initialize the accumulators for loss and metrics
    losses = AverageMeter()
    for metric in metrics:
        metric.reset()

    # Each worker computes its own losses and metrics
    with torch.no_grad():
        data_iter = iterate_dataloader(
            dataloader, dtype, max_batch_per_epoch, use_cuda, transform_target_type
        )

        for data, target in data_iter:
            # Inference
            output = model(data)

            # Compute loss
            loss = loss_function(output, target)

            # Update loss
            losses.update(loss.item(), data.size(0))

            # Update metrics
            for metric in metrics:
                metric_value = metric(output, target)
                metric.update(metric_value, data.size(0))

    # Aggregate metrics and loss for all workers
    metrics_averages = {metric.name: metric.average().item() for metric in metrics}
    loss_average = global_average(losses.sum, losses.count).item()

    return metrics_averages, loss_average
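# A hypothetical, self-contained call of `_validate` on synthetic data. The
# model, loader, and the empty metrics list are placeholders; in a multi-worker
# run, `global_average` additionally requires an initialized process group.
import torch
from torch.utils.data import DataLoader, TensorDataset


def _validate_usage_sketch():
    model = torch.nn.Linear(4, 2)
    model.eval()
    dataset = TensorDataset(torch.randn(8, 4), torch.randint(0, 2, (8,)))
    loader = DataLoader(dataset, batch_size=4)
    # No metrics tracked here; only the globally averaged loss is returned.
    return _validate(loader, model, torch.nn.CrossEntropyLoss(), [], "fp32")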
def train_epoch(self, dataloader):
    """Train the model for one epoch of data.

    Args:
        dataloader (:obj:`torch.utils.data.DataLoader`): The train set
    """
    self.tracker.epoch_stats = {
        k: AverageMeter() for k in ["loss"] + [m.name for m in self.metrics]
    }

    # Switch to train mode
    self.model.train()

    data_iter = iterate_dataloader(
        dataloader,
        self.dtype,
        self.max_batch_per_epoch,
        self.use_cuda,
        self.transform_target_type,
    )

    for batch_idx, (data, target) in enumerate(data_iter):
        self.tracker.batch_stats = [("start", time.time())]

        # Adjust the learning rate before this batch when scheduling per batch.
        if self.schedule_per == "batch":
            self.scheduler.step()

        # Clear gradients in the optimizer.
        self.optimizer.zero_grad()
        self.tracker.batch_stats.append(("init", time.time()))

        # Compute the output
        output = self.model(data)
        self.tracker.batch_stats.append(("fwd_pass", time.time()))

        # Compute the loss
        loss = self.loss_function(output, target)
        self.tracker.batch_stats.append(("comp_loss", time.time()))

        # Backprop
        loss.backward()
        self.tracker.batch_stats.append(("backprop", time.time()))

        # Aggregate gradients/parameters from all workers and apply updates to model
        self.optimizer.step()
        self.tracker.batch_stats.append(("opt_step", time.time()))

        self.record_train_batch_stats(batch_idx, loss.item(), output, target)
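# `AverageMeter` accumulates the weighted batch statistics kept in
# `epoch_stats`. A minimal sketch, assuming the usual sum/count running-average
# semantics implied by the `losses.sum` / `losses.count` / `update(value, n)`
# usage elsewhere in this file; the repo's own class may expose more fields.
class AverageMeterSketch:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # Weight the batch-level value by the batch size n.
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        return self.sum / self.count if self.count else 0.0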
def validate(self, dataloader):
    r"""Validate the quality of the model in terms of loss and metrics.

    Args:
        dataloader (:obj:`torch.utils.data.DataLoader`): The validation set

    Returns:
        tuple: (dict of globally averaged metric values, globally averaged loss)
    """
    # Turn on evaluation mode for the model
    self.model.eval()

    # Initialize the accumulators for loss and metrics
    losses = AverageMeter()
    for metric in self.metrics:
        metric.reset()

    # Each worker computes its own losses and metrics
    with torch.no_grad():
        data_iter = iterate_dataloader(
            dataloader,
            self.dtype,
            self.max_batch_per_epoch,
            self.use_cuda,
            self.transform_target_type,
        )

        for data, target in data_iter:
            # Inference
            output = self.model(data)

            # Compute loss
            loss = self.loss_function(output, target)

            # Update loss
            losses.update(loss.item(), data.size(0))

            # Update metrics
            for metric in self.metrics:
                metric_value = metric(output, target)
                metric.update(metric_value, data.size(0))

    # Aggregate metrics and loss for all workers
    metrics_averages = {
        metric.name: metric.average().item() for metric in self.metrics
    }
    loss_average = global_average(losses.sum, losses.count).item()

    return metrics_averages, loss_average
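# `validate` only relies on a small metric interface: a `name`, `reset()`,
# a per-batch `__call__(output, target)`, `update(value, n)`, and an
# `average()` that returns a tensor. A hypothetical top-1 accuracy satisfying
# that interface, for illustration only; the real metrics (and their
# cross-worker averaging) live elsewhere in the codebase.
import torch


class Top1AccuracySketch:
    name = "Prec@1"

    def __init__(self):
        self.reset()

    def reset(self):
        self.total = 0.0
        self.count = 0

    def __call__(self, output, target):
        # Fraction of correct top-1 predictions in this batch, as a percentage.
        pred = output.argmax(dim=1)
        return (pred == target).float().mean() * 100.0

    def update(self, value, n):
        self.total += value.item() * n
        self.count += n

    def average(self):
        # The real implementation averages across workers (e.g. via
        # `global_average`); this sketch averages locally only.
        return torch.tensor(self.total / max(self.count, 1))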
def test_iterate_dataloader(mocker):
    dataloader = [
        (torch.IntTensor([0]), torch.IntTensor([1])),
        (torch.IntTensor([2]), torch.IntTensor([3])),
    ]

    it = iterate_dataloader(
        dataloader, "fp32", max_batch_per_epoch=2, transform_target_type=True
    )

    first = next(it)
    assert first[0].dtype == torch.float32
    assert first[1].dtype == torch.float32
    assert first[0].item() == 0.0
    assert first[1].item() == 1.0

    second = next(it)
    assert second[0].dtype == torch.float32
    assert second[1].dtype == torch.float32
    assert second[0].item() == 2.0
    assert second[1].item() == 3.0
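# A minimal sketch of `iterate_dataloader` consistent with the test above and
# with its call sites: cast inputs (and, when `transform_target_type` is
# truthy, targets) to the requested dtype, optionally move them to the GPU,
# and stop after `max_batch_per_epoch` batches. This is an assumption about
# the helper's behavior, not its actual implementation.
import torch


def iterate_dataloader_sketch(
    dataloader,
    dtype,
    max_batch_per_epoch=None,
    use_cuda=False,
    transform_target_type=None,
):
    torch_dtype = torch.float32 if dtype == "fp32" else torch.float64
    for i, (data, target) in enumerate(dataloader):
        if max_batch_per_epoch is not None and i >= max_batch_per_epoch:
            break
        data = data.to(torch_dtype)
        if transform_target_type:
            target = target.to(torch_dtype)
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        yield data, target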
def train_round(
    dataloader,
    model,
    optimizer,
    loss_function,
    metrics,
    scheduler,
    dtype,
    schedule_per="epoch",
    transform_target_type=None,
    use_cuda=False,
    max_batch_per_epoch=None,
    tracker=None,
):
    """Train for up to `max_batch_per_epoch` batches (the full train set if not specified).

    Args:
        dataloader (:obj:`torch.utils.data.DataLoader`): The train set
        model (:obj:`torch.nn.Module`): The model to train
        optimizer (:obj:`torch.optim.Optimizer`): The optimizer
        loss_function (:obj:`torch.nn.Module`): The loss function
        metrics (list): List of metrics to track
        scheduler (:obj:`torch.optim.lr_scheduler`): Learning rate scheduler
        dtype (str): The datatype to use, one of `fp32` or `fp64`
        schedule_per (str): Learning rate scheduler mode, one of `batch` or `epoch`
        transform_target_type (str): Datatype to convert the target to, default: `None`
        use_cuda (bool): Whether to use GPU for training, default: `False`
        max_batch_per_epoch (int): Maximum number of batches to train per epoch,
            default: `None` (all batches)
        tracker (:obj:`mlbench_core.utils.Tracker`): Tracker object to use.
    """
    model.train()

    if tracker:
        tracker.train()

    data_iter = iterate_dataloader(
        dataloader, dtype, max_batch_per_epoch, use_cuda, transform_target_type
    )

    num_batches_per_device_train = len(dataloader)

    for batch_idx, (data, target) in enumerate(data_iter):
        if tracker:
            tracker.batch_start()

        # Clear gradients in the optimizer.
        optimizer.zero_grad()
        if tracker:
            tracker.record_batch_step("init")

        # Compute the output
        output = model(data)
        if tracker:
            tracker.record_batch_step("fwd_pass")

        # Compute the loss
        loss = loss_function(output, target)
        if tracker:
            tracker.record_batch_step("comp_loss")

        # Backprop
        loss.backward()
        if tracker:
            tracker.record_batch_step("backprop")

        # Aggregate gradients/parameters from all workers and apply updates to model
        optimizer.step()
        if tracker:
            tracker.record_batch_step("opt_step")

        if schedule_per == "batch":
            scheduler.step()

        if tracker:
            tracker.batch_end()

        _record_train_batch_stats(
            batch_idx,
            loss.item(),
            output,
            target,
            metrics,
            tracker,
            num_batches_per_device_train,
        )

    if schedule_per == "epoch":
        scheduler.step()
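# A hypothetical outer loop wiring `train_round` and `_validate` together;
# `train_loader`, `val_loader`, and `metrics` are placeholders supplied by the
# caller, and `_record_train_batch_stats` is assumed to tolerate `tracker=None`.
import torch


def fit_sketch(model, train_loader, val_loader, metrics, epochs=2):
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

    for epoch in range(epochs):
        # One pass over the train set; the scheduler steps once per epoch.
        train_round(
            train_loader, model, optimizer, loss_function, metrics,
            scheduler, dtype="fp32", schedule_per="epoch",
        )

        # Evaluate with gradients disabled and the model in eval mode.
        model.eval()
        metrics_avg, loss_avg = _validate(
            val_loader, model, loss_function, metrics, "fp32"
        )
        print("epoch {}: validation loss {:.4f}".format(epoch, loss_avg))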