def validate(self, trainer: 'CallbackTrainer'):
        # If the trainer has MovingAverage objects, use their weights for validation.
        for moving_average in self.moving_averages:
            moving_average.assign_average_value()

        with torch.no_grad():
            # We have a validation set, so compute all the metrics on it.
            logger.info("Validating")

            trainer.model.eval()

            num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access

            raw_val_generator = self.iterator(self.instances,
                                              num_epochs=1,
                                              shuffle=False)
            val_generator = lazy_groups_of(raw_val_generator, num_gpus)
            num_validation_batches = math.ceil(
                self.iterator.get_num_batches(self.instances) / num_gpus)
            val_generator_tqdm = Tqdm.tqdm(val_generator,
                                           total=num_validation_batches)

            batches_this_epoch = 0
            val_loss = 0
            for batch_group in val_generator_tqdm:

                loss = trainer.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()
                if self.loss_tracker is not None and loss is not None:
                    # Update the regular / irregular validation loss statistics.
                    if trainer.model._effective_encoder is trainer.model._sketch_encoder:
                        self.loss_tracker.cumulated_regular_loss += loss.detach().cpu().numpy()
                        self.loss_tracker.regular_batch_count += 1.
                    else:
                        self.loss_tracker.cumulated_irregular_loss += loss.detach().cpu().numpy()
                        self.loss_tracker.irregular_batch_count += 1.
                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    trainer.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

            trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                            val_loss,
                                                            batches_this_epoch,
                                                            reset=True)

        # If the trainer has a moving average, restore
        for moving_average in self.moving_averages:
            moving_average.restore()
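The snippet above groups the raw batch iterator into lists of `num_gpus` batches with `lazy_groups_of` before wrapping it in tqdm, which is why the batch count is divided by `num_gpus`. AllenNLP ships its own implementation; the version below is only a minimal stand-in to illustrate the grouping behaviour.

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def lazy_groups_of(iterable: Iterable[T], group_size: int) -> Iterator[List[T]]:
    # Yield successive lists of up to `group_size` items without materialising
    # the whole iterable; the final group may be smaller.
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group

# Grouping 5 "batches" for 2 GPUs gives ceil(5 / 2) = 3 batch groups.
print(list(lazy_groups_of(range(5), 2)))  # [[0, 1], [2, 3], [4]]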
Example #2
    def train_one_batch_group(self, batch_group: List[TensorDict]) -> str:
        """
        Handles the training for a single batch group.
        Fires off the events BATCH_START, FORWARD, BACKWARD, and BATCH_END.
        """
        self.handler.fire_event(Events.BATCH_START)
        self.optimizer.zero_grad()

        self.batches_this_epoch += 1
        self.batch_num_total += 1

        self.handler.fire_event(Events.FORWARD)
        loss = self.batch_loss(batch_group, for_training=True)

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()
        self.train_loss += loss.item()

        self.handler.fire_event(Events.BACKWARD)

        self.optimizer.step()

        # Update the description with the latest metrics
        self.train_metrics = training_util.get_metrics(self.model,
                                                       self.train_loss,
                                                       self.batches_this_epoch)

        self.handler.fire_event(Events.BATCH_END)

        return training_util.description_from_metrics(self.train_metrics)
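The BATCH_START / FORWARD / BACKWARD / BATCH_END calls assume a callback handler that dispatches each event to registered callbacks. A minimal sketch of that dispatch pattern, using hypothetical `Events` and `Handler` names rather than the trainer's own classes, might look like this.

from collections import defaultdict
from enum import Enum, auto
from typing import Callable, Dict, List

class Events(Enum):
    BATCH_START = auto()
    FORWARD = auto()
    BACKWARD = auto()
    BATCH_END = auto()

class Handler:
    def __init__(self) -> None:
        # Event -> list of callbacks, fired in registration order.
        self._callbacks: Dict[Events, List[Callable[[], None]]] = defaultdict(list)

    def register(self, event: Events, callback: Callable[[], None]) -> None:
        self._callbacks[event].append(callback)

    def fire_event(self, event: Events) -> None:
        for callback in self._callbacks[event]:
            callback()

handler = Handler()
handler.register(Events.BATCH_END, lambda: print("batch finished"))
handler.fire_event(Events.BATCH_START)  # nothing registered, no-op
handler.fire_event(Events.BATCH_END)    # prints "batch finished"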
Example #3
    def get_desc_from_metrics(self, metrics, epoch=None):
        description = training_util.description_from_metrics(metrics)
        if epoch is None:
            description = f'epoch: -- rank: {dist.get_rank()} || {description}'
        else:
            description = f'epoch: {epoch} rank: {dist.get_rank()} || {description}'
        return description
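`description_from_metrics` turns a metrics dict into the short string shown on the tqdm bar. The helper below is not the AllenNLP implementation, only an illustration of the kind of formatting it performs.

from typing import Any, Dict

def description_from_metrics(metrics: Dict[str, Any]) -> str:
    # Render each metric as "name: value" at fixed precision and join them.
    return ", ".join(f"{name}: {value:.4f}" for name, value in metrics.items()) + " ||"

print(description_from_metrics({"loss": 0.4213, "accuracy": 0.8731}))
# loss: 0.4213, accuracy: 0.8731 ||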
Example #4
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        print("val gene called")
        batches_this_epoch = 0
        val_loss = 0
        try:
            few_shot = val_generator.__next__()
        except:
            print("Error could not do few shot validation")
            return batches_this_epoch, val_loss
        self.reptile_inner_update(few_shot)
        self.model.eval()

        with torch.no_grad():
            for batch_group in val_generator_tqdm:
                loss = self.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    self.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
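Several of these snippets wrap validation in `assign_average_value()` / `restore()` so metrics are computed with the exponential-moving-average weights and the live weights come back afterwards. Below is a self-contained sketch of that swap with a toy EMA class, not AllenNLP's `MovingAverage`.

import torch

class ToyMovingAverage:
    """Keeps an EMA shadow copy of a model's parameters."""

    def __init__(self, model: torch.nn.Module, decay: float = 0.99) -> None:
        self._model = model
        self._decay = decay
        self._shadow = {n: p.detach().clone() for n, p in model.named_parameters()}
        self._backup = {}

    def apply(self) -> None:
        # Update the shadow weights, typically after each optimizer step.
        with torch.no_grad():
            for name, param in self._model.named_parameters():
                self._shadow[name].mul_(self._decay).add_(param, alpha=1 - self._decay)

    def assign_average_value(self) -> None:
        # Swap the EMA weights into the model (e.g. before validation).
        self._backup = {n: p.detach().clone() for n, p in self._model.named_parameters()}
        with torch.no_grad():
            for name, param in self._model.named_parameters():
                param.copy_(self._shadow[name])

    def restore(self) -> None:
        # Put the live training weights back after validation.
        with torch.no_grad():
            for name, param in self._model.named_parameters():
                param.copy_(self._backup[name])

model = torch.nn.Linear(4, 2)
ema = ToyMovingAverage(model)
ema.apply()
ema.assign_average_value()   # validate with the averaged weights here
ema.restore()                # training continues with the live weights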
Example #5
def evaluate(model: Model, instances: Iterable[Instance], task_name: str,
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular tasks (usually after training).
    
    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evalute the model.
    task_name : ``str``, required
        The name of the tasks on which evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that go through the dataset.
    cuda_device : ``int``
        Cuda device to use.
        
    Returns
    -------
    metrics :  ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for tensor_batch in generator_tqdm:
            nb_batches += 1

            train_stages = ["stm", "sd", "valid"]
            task_index = TASKS_NAME.index(task_name)
            tensor_batch['task_index'] = torch.tensor(task_index)
            tensor_batch["reverse"] = torch.tensor(False)
            tensor_batch['for_training'] = torch.tensor(False)
            train_stage = train_stages.index("stm")
            tensor_batch['train_stage'] = torch.tensor(train_stage)
            tensor_batch = move_to_device(tensor_batch, cuda_device)

            eval_output_dict = model.forward(**tensor_batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["stm_loss"] = float(eval_loss / nb_batches)

            description = training_util.description_from_metrics(metrics)
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True)
        metrics["stm_loss"] = float(eval_loss / nb_batches)
        return metrics
Example #6
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._pytorch_model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        val_generator = val_iterator(self._validation_data,
                                     num_epochs=1,
                                     shuffle=False)
        num_validation_batches = val_iterator.get_num_batches(
            self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch in val_generator_tqdm:

            loss = self.batch_loss(batch, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(
                self.model,
                val_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
Example #7
    def __call__(self, trainer: GradientDescentTrainer,
                 metrics: Dict[str, Any], epoch: int, **kwargs):

        trainer.model.get_metrics(True)
        if epoch < 0:
            return
        # for moving_average in self.moving_averages:
        #     moving_average.assign_average_value()

        with torch.no_grad():
            logger.info("Testing")
            trainer.model.eval()
            batches_this_epoch = 0
            val_loss = 0
            bar = tqdm(self.test_iterator, desc="testing")
            for batch_group in bar:
                outs = trainer.batch_outputs(batch_group, for_training=False)
                loss = outs["loss"]
                if self.writer is not None:
                    self.writer.write(outs)

                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = get_metrics(trainer.model, val_loss, val_loss,
                                          batches_this_epoch)
                description = description_from_metrics(val_metrics)
                if self.name is not None:
                    description = "epoch: %d, dataset: %s, %s" % (
                        epoch, self.name, description)
                bar.set_description(description, refresh=False)

            trainer.val_metrics = get_metrics(trainer.model,
                                              val_loss,
                                              val_loss,
                                              batches_this_epoch,
                                              reset=False)
            if self.wandb_logger is not None:
                self.wandb_logger(trainer.val_metrics, epoch, prefix=self.name)
        # If the trainer has a moving average, restore
        # for moving_average in self.moving_averages:
        #     moving_average.restore()
        if self.writer is not None:
            self.writer.set_epoch(epoch + 1)
            self.writer.reset()
        trainer.model.get_metrics(True)
Example #8
    def train_one_batch_group(self, batch_group):
        # Each batch_group should have only one batch
        batch, = batch_group
        array = batch["array"]

        # We should not have mixed batches:
        if len(set(batch["stage"])) != 1:
            raise ValueError("mixed batch")

        stage = batch["stage"][0]
        self.optimizer.stage = stage
        self.optimizer.zero_grad()

        if stage == "discriminator_real":
            # Generate real data and expect the discriminator to predict 1.
            output = self.model.discriminator(array, torch.ones(1))
            loss = output["loss"]
            self.discriminator_real_loss += loss.sum().item()
        elif stage == "discriminator_fake":
            # Generate fake data and expect the discriminator to predict 0.
            fake_data = self.model.generator(array)
            output = self.model.discriminator(fake_data["output"],
                                              torch.zeros(1))
            loss = output["loss"]
            self.discriminator_fake_loss += loss.sum().item()
        elif stage == "generator":
            # Generate fake data and try to fool the discriminator.
            generated = self.model.generator(array, self.model.discriminator)
            fake_data = generated["output"]
            loss = generated["loss"]
            self.generator_loss += loss.sum().item()

            self.fake_mean += fake_data.mean()
            self.fake_stdev += fake_data.std()
            self.count += 1

        self.train_loss += loss.sum().item()
        loss.backward()

        count = max(self.count, 1)
        self.train_metrics = {
            "gl": self.generator_loss / count,
            "dfl": self.discriminator_fake_loss / count,
            "drl": self.discriminator_real_loss / count,
            "mean": self.fake_mean / count,
            "stdev": self.fake_stdev / count
        }

        self.optimizer.step()

        return training_util.description_from_metrics(self.train_metrics)
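This GAN trainer expects every batch to carry a single "stage" label that selects which sub-network and optimizer get updated. How those batches are produced is not shown here; a hypothetical driver could simply cycle the three stages while building batches, for example:

import torch

def make_stage_batches(data: torch.Tensor, batch_size: int):
    # Cycle discriminator-real, discriminator-fake and generator updates,
    # tagging every batch with exactly one stage so batches are never mixed.
    stages = ["discriminator_real", "discriminator_fake", "generator"]
    for start in range(0, len(data), batch_size):
        chunk = data[start:start + batch_size]
        stage = stages[(start // batch_size) % len(stages)]
        yield [{"array": chunk, "stage": [stage] * len(chunk)}]

for batch_group in make_stage_batches(torch.randn(12, 8), batch_size=4):
    print(batch_group[0]["stage"][0], batch_group[0]["array"].shape)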
Example #9
    def validate(self, trainer: "CallbackTrainer"):
        # If the trainer has MovingAverage objects, use their weights for validation.
        for moving_average in self.moving_averages:
            moving_average.assign_average_value()

        with torch.no_grad():
            # We have a validation set, so compute all the metrics on it.
            logger.info("Validating")

            trainer.model.eval()

            val_generator = self.iterator(self.instances,
                                          num_epochs=1,
                                          shuffle=False)
            num_validation_batches = self.iterator.get_num_batches(
                self.instances)
            val_generator_tqdm = Tqdm.tqdm(val_generator,
                                           total=num_validation_batches)

            batches_this_epoch = 0
            val_loss = 0
            for batch in val_generator_tqdm:

                loss = trainer.batch_loss(batch, for_training=False)
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    trainer.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

            trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                            val_loss,
                                                            batches_this_epoch,
                                                            reset=True)

        # If the trainer has a moving average, restore
        for moving_average in self.moving_averages:
            moving_average.restore()
Example #10
    def _validation_loss(self) -> Tuple[float, int]:
        logger.info("Validating")

        self.model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        if getattr(self, "val_dataset", None) is None:
            self.val_dataset = DMDataSet(data=self._validation_data[0],
                                         batch_size=self.batch_size,
                                         num_gpus=num_gpus,
                                         shuffle=False)
        num_validation_batches = math.ceil(
            len(self.val_dataset) / self.batch_size / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(self.val_dataset,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:

            loss = self.batch_loss(batch_group, for_training=False)
            if loss is not None:
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(self.model, val_loss,
                                                    batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
Example #11
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self.model.eval()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:

            loss = self.batch_loss(batch_group, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(self.model, val_loss,
                                                    batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
Example #12
    def validate(self, trainer: 'CallbackTrainer'):
        with torch.no_grad():
            # We have a validation set, so compute all the metrics on it.
            logger.info("Validating")

            trainer.model.eval()

            num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access

            raw_val_generator = self.iterator(self.instances,
                                              num_epochs=1,
                                              shuffle=False)
            val_generator = lazy_groups_of(raw_val_generator, num_gpus)
            num_validation_batches = math.ceil(
                self.iterator.get_num_batches(self.instances) / num_gpus)
            val_generator_tqdm = Tqdm.tqdm(val_generator,
                                           total=num_validation_batches)

            batches_this_epoch = 0
            val_loss = 0
            for batch_group in val_generator_tqdm:

                loss = trainer.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    trainer.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

            trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                            val_loss,
                                                            batches_this_epoch,
                                                            reset=True)
Example #13
    def train_one_batch_group(self, batch_group: List[TensorDict]) -> Optional[str]:
        """
        Handles the training for a single batch group.
        Fires off the events BATCH_START, FORWARD, BACKWARD, and BATCH_END.
        """
        self.handler.fire_event(Events.BATCH_START)
        self.optimizer.zero_grad()

        self.batches_this_epoch += 1
        self.batch_num_total += 1

        self.handler.fire_event(Events.FORWARD)
        loss = self.batch_loss(batch_group, for_training=True)

        if loss is None:
            return

        if torch.isnan(loss):
            logger.warning("NaN loss encountered.")
            return

        if self._use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.train_loss += loss.item()

        self.handler.fire_event(Events.BACKWARD)

        self.optimizer.step()

        # Update the description with the latest metrics
        self.train_metrics = training_util.get_metrics(self.model,
                                                       self.train_loss,
                                                       self.batches_this_epoch)

        self.handler.fire_event(Events.BATCH_END)

        return training_util.description_from_metrics(self.train_metrics)
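The `amp.scale_loss` context above comes from NVIDIA apex. If apex is not available, the same loss-scaling idea can be expressed with PyTorch's built-in `torch.cuda.amp`; the sketch below is a generic substitute with a toy model, not a drop-in replacement for this trainer.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(16, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# GradScaler is a pass-through when disabled, so this also runs on CPU.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

inputs = torch.randn(8, 16, device=device)
targets = torch.randn(8, 1, device=device)

optimizer.zero_grad()
with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
scaler.scale(loss).backward()   # scaled backward, analogous to amp.scale_loss(...)
scaler.step(optimizer)          # unscales gradients, skips the step on inf/nan
scaler.update()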
Example #14
    def predict(self, dataset: Iterable[Instance]) -> Dict[str, Any]:
        self.init_confusion_matrices()
        pred_generator = self.iterator(dataset, num_epochs=1, shuffle=False)
        pred_generator_tqdm = tqdm(
            pred_generator, total=self.iterator.get_num_batches(dataset))

        self.model.eval()
        with torch.no_grad():
            batches_this_epoch = 0
            pred_loss = 0
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                output_dict = self.model(**batch)

                predictions = output_dict["forward_logits"]
                gold_labels = batch.get("forward_output_tokens").get("tokens")
                self.update_confusion_matrices(predictions, gold_labels)

                loss = output_dict["loss"]
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    pred_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                pred_metrics = training_util.get_metrics(
                    self.model, pred_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    pred_metrics)
                pred_generator_tqdm.set_description(description, refresh=False)

        return pred_metrics
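`update_confusion_matrices` is specific to this class and not shown. For sequence predictions like `forward_logits` against gold token ids, a confusion-matrix update typically amounts to an argmax followed by indexed accumulation; a minimal sketch (ignoring the padding mask that real code would need) is:

import torch

def update_confusion_matrix(confusion: torch.Tensor,
                            logits: torch.Tensor,
                            gold_labels: torch.Tensor) -> None:
    # confusion is (num_classes, num_classes); rows = gold, columns = predicted.
    predictions = logits.argmax(dim=-1).reshape(-1)
    gold = gold_labels.reshape(-1)
    for g, p in zip(gold.tolist(), predictions.tolist()):
        confusion[g, p] += 1

num_classes = 5
confusion = torch.zeros(num_classes, num_classes, dtype=torch.long)
logits = torch.randn(2, 7, num_classes)            # (batch, sequence, classes)
gold_labels = torch.randint(0, num_classes, (2, 7))
update_confusion_matrix(confusion, logits, gold_labels)
print(confusion.sum().item())  # 14 token-level predictions counted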
Example #15
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        cpu_memory_usage = []
        for worker, memory in common_util.peak_cpu_memory().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}")
        gpu_memory_usage = []
        for gpu, memory in common_util.peak_gpu_memory().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}")

        regularization_penalty = self.model.get_regularization_penalty()

        train_loss = 0.0
        batch_loss = 0.0
        train_reg_loss = None if regularization_penalty is None else 0.0
        batch_reg_loss = None if regularization_penalty is None else 0.0

        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps
        )

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps
            )
        except TypeError:
            num_training_batches = float("inf")

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's
        # progress is shown
        if self._primary:
            batch_group_generator_tqdm = Tqdm.tqdm(
                batch_group_generator, total=num_training_batches
            )
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if done_early:
                break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            # Zero gradients.
            # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
            # because it avoids a read op when the gradients are first updated below.
            for param_group in self.optimizer.param_groups:
                for p in param_group["params"]:
                    p.grad = None

            batch_loss = 0.0
            batch_group_outputs = []
            for batch in batch_group:
                with amp.autocast(self._use_amp):
                    batch_outputs = self.batch_outputs(batch, for_training=True)
                    batch_group_outputs.append(batch_outputs)
                    loss = batch_outputs["loss"]
                    reg_loss = batch_outputs.get("reg_loss")
                    if torch.isnan(loss):
                        raise ValueError("nan loss encountered")
                    loss = loss / len(batch_group)

                    batch_loss += loss.item()
                    if reg_loss is not None:
                        reg_loss = reg_loss / len(batch_group)
                        batch_reg_loss = reg_loss.item()
                        train_reg_loss += batch_reg_loss  # type: ignore

                if self._scaler is not None:
                    self._scaler.scale(loss).backward()
                else:
                    loss.backward()
            if len(batch_group_outputs) <= 0:
                continue

            train_loss += batch_loss

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._scaler is not None:
                self._scaler.step(self.optimizer)
                self._scaler.update()
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batch_loss,
                batch_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )

            if batch_num_total % self.val_loss_steps == 0:
                logger.info("%s: %.4f" % ('train_loss', train_loss / batches_this_epoch))
                if self._validation_data_loader is not None:
                    with torch.no_grad():
                        # We have a validation set, so compute all the metrics on it.
                        val_loss, val_reg_loss, num_batches = self._validation_loss_n_step(batch_num_total)

                    val_metrics = training_util.get_metrics(
                        self.model,
                        val_loss,
                        val_reg_loss,
                        num_batches=num_batches,
                        batch_loss=None,
                        batch_reg_loss=None,
                        reset=True,
                        world_size=self._world_size,
                        cuda_device=self.cuda_device,
                    )
                    # description = training_util.description_from_metrics(val_metrics)
                    logger.info("%s: %.4f" % ('val_loss', val_loss / num_batches))
                    # batch_group_generator_tqdm.set_description(description, refresh=False)

                    # Validation put the model in eval mode, so switch back to train mode.
                    self._pytorch_model.train()

            if self._primary:
                # Updating tqdm only for the primary as the trainers wouldn't have one
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description, refresh=False)

                if self._checkpointer is not None:
                    self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

            for callback in self._callbacks:
                callback.on_batch(
                    self,
                    batch_group,
                    batch_group_outputs,
                    metrics,
                    epoch,
                    batches_this_epoch,
                    is_training=True,
                    is_primary=self._primary,
                    batch_grad_norm=batch_grad_norm,
                )

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss=None,
            batch_reg_loss=None,
            num_batches=batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024)
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024)
        return metrics
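The inner `for batch in batch_group` loop above implements gradient accumulation: each micro-batch loss is divided by the group size and backpropagated, and the optimizer steps once per group. Stripped of the trainer machinery, the pattern looks roughly like this (toy model and data, for illustration only):

import torch

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
accumulation_steps = 4

batches = [(torch.randn(8, 10), torch.randn(8, 1)) for _ in range(8)]
batch_groups = [batches[i:i + accumulation_steps]
                for i in range(0, len(batches), accumulation_steps)]

for batch_group in batch_groups:
    optimizer.zero_grad()
    for inputs, targets in batch_group:
        # Scale each micro-batch loss so the summed gradients match one big batch.
        loss = torch.nn.functional.mse_loss(model(inputs), targets) / len(batch_group)
        loss.backward()   # gradients accumulate in .grad across micro-batches
    optimizer.step()      # one update per batch group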
Example #16
    def _validation_loss(self, epoch: int) -> Dict[str, Any]:
        """
        Runs validation for each component optimizer and returns the merged metrics.
        """
        logger.info("Validating")

        self.model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_data_loader is not None:
            validation_data_loader = self._validation_data_loader
        else:
            raise ConfigurationError(
                "Validation results cannot be calculated without a validation_data_loader"
            )

        val_generator_tqdm = Tqdm.tqdm(validation_data_loader)

        for component_optimizer in self.component_optimizers.values():
            component_optimizer.reset_loss('validation')

        batches_this_epoch = 0
        done_early = False

        for batch in val_generator_tqdm:
            batches_this_epoch += 1

            batch_metrics = []
            batch_group = [batch]
            meta_batch = deepcopy(batch_group)

            # Train the Sub Models first
            for name, sub_model in self._pytorch_model.component_models.items():
                component_optimizer = self.component_optimizers[name]
                batch_group_outputs, metrics = component_optimizer.process_batch_group(
                    batch_group,
                    for_training=False,
                    batches_this_epoch=batches_this_epoch)
                batch_metrics.append(metrics)

                for i, batch_outputs in enumerate(batch_group_outputs):
                    meta_batch[i][name] = batch_outputs["output"]

            meta_optimizer = self.component_optimizers["meta"]
            meta_batch_outputs, meta_metrics = meta_optimizer.process_batch_group(
                meta_batch,
                for_training=False,
                batches_this_epoch=batches_this_epoch)
            batch_metrics.append(meta_metrics)

            all_metrics = ChainMap(*batch_metrics)
            description = training_util.description_from_metrics(all_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return all_metrics
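`ChainMap(*batch_metrics)` merges the per-component metric dicts into one view for the progress description; on duplicate keys the first mapping wins. For example:

from collections import ChainMap

component_metrics = [{"loss": 0.5, "accuracy": 0.8}, {"loss": 0.3, "f1": 0.7}]
merged = ChainMap(*component_metrics)
print(merged["loss"])   # 0.5 -- the first mapping wins on duplicate keys
print(sorted(merged))   # ['accuracy', 'f1', 'loss'] -- all keys remain visible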
Example #17
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
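The histogram-logging branch above measures how much the optimizer actually moved each parameter by snapshotting the weights, stepping, and comparing. That measurement can be reproduced outside the trainer with a toy model; `add_train_scalar` is replaced by a plain print here.

import torch

model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

loss = model(torch.randn(4, 10)).sum()
loss.backward()

# Snapshot parameters on CPU before the update so large models don't OOM the GPU.
param_snapshots = {name: param.detach().cpu().clone()
                   for name, param in model.named_parameters()}
optimizer.step()

for name, param in model.named_parameters():
    update = param.detach().cpu() - param_snapshots[name]
    update_norm = torch.norm(update.reshape(-1))
    param_norm = torch.norm(param.detach().cpu().reshape(-1))
    # Relative update magnitude, as logged to "gradient_update/<name>" above.
    print(name, (update_norm / (param_norm + 1e-7)).item())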
Example #18
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        train_loss_lang1 = 0.0
        train_loss_lang2 = 0.0
        train_loss_cm = 0.0

        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0

        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()
            self.optimizer_lang1.zero_grad()
            self.optimizer_lang2.zero_grad()
            self.optimizer_cm.zero_grad()

            loss, loss_cm, loss_lang1, loss_lang2 = self.batch_loss(
                batch_group, for_training=True)

            if torch.isnan(loss):
                # If any one of the component losses is nan, the total loss will be nan.
                raise ValueError("nan loss encountered")

            #######
            # lang1
            #######
            loss_lang1.backward()
            train_loss_lang1 += loss_lang1.item()
            self.rescale_gradients()

            if self._learning_rate_scheduler_lang1:
                self._learning_rate_scheduler_lang1.step_batch(batch_num_total)
            if self._momentum_scheduler_lang1:
                self._momentum_scheduler_lang1.step_batch(batch_num_total)

            self.optimizer_lang1.step()
            self.optimizer_lang1.zero_grad()

            #######
            # lang2
            #######
            loss_lang2.backward()
            train_loss_lang2 += loss_lang2.item()
            batch_grad_norm = self.rescale_gradients()

            if self._learning_rate_scheduler_lang2:
                self._learning_rate_scheduler_lang2.step_batch(batch_num_total)
            if self._momentum_scheduler_lang2:
                self._momentum_scheduler_lang2.step_batch(batch_num_total)

            self.optimizer_lang2.step()
            self.optimizer_lang2.zero_grad()

            #######
            # cm
            #######
            loss_cm.backward()
            train_loss_cm += loss_cm.item()
            self.rescale_gradients()

            if self._learning_rate_scheduler_cm:
                self._learning_rate_scheduler_cm.step_batch(batch_num_total)
            if self._momentum_scheduler_cm:
                self._momentum_scheduler_cm.step_batch(batch_num_total)

            self.optimizer_cm.step()
            self.optimizer_cm.zero_grad()

            train_loss += loss.item()

            # Update the description with the latest metrics
            # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            metrics = self.model.get_metrics(False)
            metrics["loss"] = float(
                train_loss /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["cm_loss"] = float(
                train_loss_cm /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["lang1_loss"] = float(
                train_loss_lang1 /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["lang2_loss"] = float(
                train_loss_lang2 /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_lang1)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_lang2)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_cm)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.add_train_scalar("loss/cm_loss_train",
                                                   metrics["cm_loss"])
                self._tensorboard.add_train_scalar("loss/lang1_loss_train",
                                                   metrics["lang1_loss"])
                self._tensorboard.add_train_scalar("loss/lang2_loss_train",
                                                   metrics["lang2_loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics = self.model.get_metrics(reset=True)
        metrics["loss"] = float(
            train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cm_loss"] = float(
            train_loss_cm /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang1_loss"] = float(
            train_loss_lang1 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang2_loss"] = float(
            train_loss_lang2 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
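This trainer keeps a separate optimizer per loss term and steps them one after another within a single batch. The essential pattern, reduced to two toy heads with independent losses and optimizers, is sketched below; the real trainer additionally rescales gradients and steps schedulers between updates.

import torch

head_a = torch.nn.Linear(10, 1)
head_b = torch.nn.Linear(10, 1)
optimizer_a = torch.optim.SGD(head_a.parameters(), lr=0.01)
optimizer_b = torch.optim.SGD(head_b.parameters(), lr=0.01)

inputs = torch.randn(8, 10)
targets = torch.randn(8, 1)

# Each loss only touches its own head, so the backward passes are independent
# and each optimizer can be stepped immediately after its own loss.
optimizer_a.zero_grad()
optimizer_b.zero_grad()

loss_a = torch.nn.functional.mse_loss(head_a(inputs), targets)
loss_a.backward()
optimizer_a.step()
optimizer_a.zero_grad()

loss_b = torch.nn.functional.mse_loss(head_b(inputs), targets)
loss_b.backward()
optimizer_b.step()
optimizer_b.zero_grad()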
Example #19
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
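
The distributed bookkeeping in this example, checking a shared counter before each batch group and signalling once a worker runs out of data, boils down to two small all_reduce calls. A sketch assuming an already-initialised process group; the helper names are illustrative:

import torch
import torch.distributed as dist

def any_worker_done(device) -> bool:
    """True if any worker has already signalled the end of its epoch.

    An int tensor is used because NCCL process groups do not support
    all_reduce on bool tensors.
    """
    done = torch.tensor(0, device=device)
    dist.all_reduce(done, dist.ReduceOp.SUM)
    return done.item() > 0

def signal_done(device) -> None:
    """Tell the other workers that this worker has exhausted its data."""
    done = torch.tensor(1, device=device)
    dist.all_reduce(done, dist.ReduceOp.SUM)
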
Example #20
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        cpu_memory_usage = []
        for worker, memory in common_util.peak_cpu_memory().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}")
        gpu_memory_usage = []
        for gpu, memory in common_util.peak_gpu_memory().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}")

        regularization_penalty = self.model.get_regularization_penalty()

        train_loss = 0.0
        batch_loss = 0.0
        train_reg_loss = None if regularization_penalty is None else 0.0
        batch_reg_loss = None if regularization_penalty is None else 0.0

        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps
        )

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps
            )
        except TypeError:
            num_training_batches = float("inf")

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's
        # progress is shown
        if self._primary:
            batch_group_generator_tqdm = Tqdm.tqdm(
                batch_group_generator, total=num_training_batches
            )
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if done_early:
                break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            # Zero gradients.
            # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
            # because it avoids a read op when the gradients are first updated below.
            for param_group in self.optimizer.param_groups:
                for p in param_group["params"]:
                    p.grad = None

            batch_loss = 0.0
            batch_group_outputs = []
            for batch in batch_group:
                if self._distributed:
                    # Check whether the other workers have stopped already (due to differing amounts of
                    # data in each). If so, we can't proceed because we would hang when we hit the
                    # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                    # here because NCCL process groups apparently don't support BoolTensor.
                    done = torch.tensor(0, device=self.cuda_device)
                    torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
                    if done.item() > 0:
                        done_early = True
                        logger.warning(
                            f"Worker {torch.distributed.get_rank()} finishing training early! "
                            "This implies that there is an imbalance in your training "
                            "data across the workers and that some amount of it will be "
                            "ignored. A small amount of this is fine, but a major imbalance "
                            "should be avoided. Note: This warning will appear unless your "
                            "data is perfectly balanced."
                        )
                        break

                with amp.autocast(self._use_amp):
                    batch_outputs = self.batch_outputs(batch, for_training=True)
                    batch_group_outputs.append(batch_outputs)
                    loss = batch_outputs["loss"]
                    reg_loss = batch_outputs.get("reg_loss")
                    if torch.isnan(loss):
                        raise ValueError("nan loss encountered")
                    loss = loss / len(batch_group)

                    batch_loss += loss.item()
                    if reg_loss is not None:
                        reg_loss = reg_loss / len(batch_group)
                        batch_reg_loss = reg_loss.item()
                        train_reg_loss += batch_reg_loss  # type: ignore

                if self._scaler is not None:
                    self._scaler.scale(loss).backward()
                else:
                    loss.backward()
            if len(batch_group_outputs) <= 0:
                continue

            train_loss += batch_loss

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._scaler is not None:
                self._scaler.step(self.optimizer)
                self._scaler.update()
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batch_loss,
                batch_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )

            if self._primary:
                # Updating tqdm only for the primary as the trainers wouldn't have one
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description, refresh=False)

                if self._checkpointer is not None:
                    self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

            for callback in self._callbacks:
                callback.on_batch(
                    self,
                    batch_group,
                    batch_group_outputs,
                    metrics,
                    epoch,
                    batches_this_epoch,
                    is_training=True,
                    is_primary=self._primary,
                    batch_grad_norm=batch_grad_norm,
                )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss=None,
            batch_reg_loss=None,
            num_batches=batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024)
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024)
        return metrics
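
The mixed-precision handling in this version wraps the forward pass in autocast, routes backward and the optimizer step through a GradScaler, and zeroes gradients by setting `p.grad = None`. A condensed sketch of that step, assuming the model's forward returns a dict with a "loss" key (the helper name is illustrative):

import torch
from torch.cuda import amp

def amp_train_step(model, batch, optimizer, scaler=None, use_amp=True):
    """One training step using the autocast/GradScaler pattern from the trainer above."""
    # Setting grads to None (instead of zeroing) avoids a read when they are next written.
    for group in optimizer.param_groups:
        for p in group["params"]:
            p.grad = None

    with amp.autocast(enabled=use_amp):
        loss = model(batch)["loss"]

    if scaler is not None:
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
    return loss.item()
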
Example #21
    def _validation_loss(self, epoch: int) -> Tuple[float, float, int]:
        """
        Computes the validation and regularization losses. Returns them and the number of batches.
        """
        logger.info("Validating")

        self._pytorch_model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_data_loader is not None:
            validation_data_loader = self._validation_data_loader
        else:
            raise ConfigurationError(
                "Validation results cannot be calculated without a validation_data_loader"
            )

        val_generator_tqdm = Tqdm.tqdm(validation_data_loader)
        batches_this_epoch = 0
        val_loss = 0
        val_reg_loss = 0
        done_early = False
        for batch in val_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing validation early! "
                        "This implies that there is an imbalance in your validation "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batch_outputs = self.batch_outputs(batch, for_training=False)
            loss = batch_outputs.get("loss")
            reg_loss = batch_outputs.get("reg_loss")
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()
                if reg_loss is not None:
                    val_reg_loss += reg_loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(
                self.model,
                val_loss,
                val_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

            if self._master:
                for callback in self._batch_callbacks:
                    callback(
                        self,
                        [batch],
                        [batch_outputs],
                        epoch,
                        batches_this_epoch,
                        is_training=False,
                    )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)."
            )
            # Indicate that we're done so that any workers that have remaining data stop validation early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, val_reg_loss, batches_this_epoch
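
Before validating, this trainer swaps the model's parameters for their moving-average ("shadow") values and swaps the originals back afterwards. A minimal sketch of an exponential moving average following the same `assign_average_value()` / `restore()` protocol (this is not the AllenNLP implementation; the trainer's `apply()` additionally receives the batch count, which is omitted here):

import torch

class SimpleMovingAverage:
    """Keeps a shadow copy of each parameter and can swap it in for evaluation."""

    def __init__(self, model: torch.nn.Module, decay: float = 0.999):
        self._params = dict(model.named_parameters())
        self._shadow = {name: p.detach().clone() for name, p in self._params.items()}
        self._backup = {}
        self._decay = decay

    def apply(self):
        # Called after each optimizer step: shadow <- decay * shadow + (1 - decay) * param
        with torch.no_grad():
            for name, param in self._params.items():
                self._shadow[name].mul_(self._decay).add_(param, alpha=1 - self._decay)

    def assign_average_value(self):
        # Swap the averaged weights in before validation.
        with torch.no_grad():
            for name, param in self._params.items():
                self._backup[name] = param.detach().clone()
                param.copy_(self._shadow[name])

    def restore(self):
        # Put the original training weights back after validation.
        with torch.no_grad():
            for name, param in self._params.items():
                param.copy_(self._backup[name])
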
Example #22
    def _validation(self, n_epoch: int) -> Tuple[Dict[str, Dict[str, float]], float]:
        ### Begin validation of the model ###
        logger.info("Validation - Begin")
        all_val_metrics = {}

        self._model.eval()  # Set the model into evaluation mode

        avg_accuracy = 0.0

        for task_idx, task in enumerate(self._task_list):
            logger.info("Validation - Task {}/{}: {}", task_idx + 1, self._n_tasks, task._name)

            val_loss = 0.0
            n_batches_val_this_epoch_this_task = 0
            n_val_batches = self._task_infos[task._name]["n_val_batches"]
            scheduler = self._schedulers[task._name]

            # Create tqdm generator for the current task's validation
            data_iterator = task._data_iterator
            val_generator = data_iterator(task._validation_data, num_epochs=1, shuffle=False)
            val_generator_tqdm = tqdm.tqdm(val_generator, total=n_val_batches)

            # Iterate over each validation batch for this task
            for batch in val_generator_tqdm:
                n_batches_val_this_epoch_this_task += 1

                # Get the loss
                val_output_dict = self._forward(batch, task=task, for_training=False)
                loss = val_output_dict["stm_loss"]
                val_loss += loss.item()
                del loss

                # Get metrics for all progress so far, update tqdm, display description
                task_metrics = self._get_metrics(task=task)
                task_metrics["loss"] = float(val_loss / n_batches_val_this_epoch_this_task)
                description = training_util.description_from_metrics(task_metrics)
                val_generator_tqdm.set_description(description)

            # Get this task's validation metrics and store them in all_val_metrics
            task_metrics = self._get_metrics(task=task, reset=True)
            if task._name not in all_val_metrics:
                all_val_metrics[task._name] = {}
            for name, value in task_metrics.items():
                all_val_metrics[task._name][name] = value
            all_val_metrics[task._name]["loss"] = float(val_loss / n_batches_val_this_epoch_this_task)

            avg_accuracy += task_metrics["sentiment_acc"]

            # Tensorboard - Validation metrics for this epoch
            for metric_name, value in all_val_metrics[task._name].items():
                self._tensorboard.add_validation_scalar(
                    name="task_" + task._name + "/" + metric_name, value=value
                )

            ### Perform a patience check and update the validation metric history for this task ###
            this_epoch_val_metric = all_val_metrics[task._name][task._val_metric]
            metric_history = self._metric_infos[task._name]["hist"]

            metric_history.append(this_epoch_val_metric)
            is_best_so_far, out_of_patience = self._check_history(
                metric_history=metric_history,
                cur_score=this_epoch_val_metric,
                should_decrease=task._val_metric_decreases,
            )

            if is_best_so_far:
                logger.info("Best model found for {}.", task._name)
                self._metric_infos[task._name]["best"] = (n_epoch, all_val_metrics)
            if out_of_patience and not self._metric_infos[task._name]["is_out_of_patience"]:
                self._metric_infos[task._name]["is_out_of_patience"] = True
                logger.info("Task {} is out of patience and vote to stop the training.", task._name)

            # The LRScheduler API is agnostic to whether your schedule requires a validation metric -
            # if it doesn't, the validation metric passed here is ignored.
            scheduler.step(this_epoch_val_metric, n_epoch)

        logger.info("Validation - End")
        return all_val_metrics, avg_accuracy
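
The `_check_history` call above (its implementation is not shown in this example) decides whether the latest validation metric is the best so far and whether the task has run out of patience. A sketch of one plausible version of that logic, assuming a fixed `patience` window; the real code may differ:

from typing import List, Tuple

def check_history(metric_history: List[float], patience: int,
                  should_decrease: bool = False) -> Tuple[bool, bool]:
    """Return (is_best_so_far, out_of_patience) for the most recent metric."""
    if not metric_history:
        return True, False
    current = metric_history[-1]
    best = min(metric_history) if should_decrease else max(metric_history)
    is_best_so_far = current == best

    out_of_patience = False
    if len(metric_history) > patience:
        older, recent = metric_history[:-patience], metric_history[-patience:]
        if should_decrease:
            out_of_patience = min(recent) >= min(older)
        else:
            out_of_patience = max(recent) <= max(older)
    return is_best_so_far, out_of_patience

# Example: accuracy history with patience 2 -> the last two epochs did not improve.
assert check_history([0.70, 0.75, 0.74, 0.73], patience=2) == (False, True)
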
Example #23
    def _train_epoch(
        self, total_n_tr_batches: int, sampling_prob: List, reverse=False, train_D=False
    ) -> Dict[str, float]:
        self._model.train()  # Set the model to "train" mode.

        if reverse:
            logger.info("Training Generator- Begin")
        elif not train_D:
            logger.info("Training Init Generator- Begin")

        if train_D:
            logger.info("Training Discriminator- Begin")
        logger.info("reverse is {}, train_D is {}", reverse, train_D)

        ### Reset training and trained batches counter before new training epoch ###
        for _, task_info in self._task_infos.items():
            task_info["tr_loss_cum"] = 0.0
            task_info['stm_loss'] = 0.0
            task_info['p_d_loss'] = 0.0
            task_info['s_d_loss'] = 0.0
            task_info['valid_loss'] = 0.0
            task_info["n_batches_trained_this_epoch"] = 0
        all_tr_metrics = {}  # Collects each task's final training metrics for this epoch.

        ### Start training epoch ###
        epoch_tqdm = tqdm.tqdm(range(total_n_tr_batches), total=total_n_tr_batches)
        histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())
        # batch_grad_norm is only set on optimizer steps; initialize it so the
        # tensorboard logging below cannot hit a NameError before the first step.
        batch_grad_norm = None

        for step, _ in enumerate(epoch_tqdm):
            task_idx = np.argmax(np.random.multinomial(1, sampling_prob))
            task = self._task_list[task_idx]
            task_info = self._task_infos[task._name]

            ### One forward + backward pass ###
            # Call next batch to train
            batch = next(self._tr_generators[task._name])
            self._batch_num_total += 1
            task_info["n_batches_trained_this_epoch"] += 1

            # Load optimizer
            if not train_D:
                optimizer = self._optimizers[task._name]["all_params"]
            else:
                optimizer = self._optimizers[task._name]["exclude_share_encoder"]

            # Get the loss for this batch
            output_dict = self._forward(tensor_batch=batch, task=task, for_training=True, reverse=reverse)
            # if reverse or train_D:
            #     output_dict_fake = self._forward(tensor_batch=batch, task=task, for_training=True, reverse=True)
            # loss = output_dict["stm_loss"]
            # if train_D:
            #     loss = (output_dict["stm_loss"] + output_dict["s_d_loss"] + output_dict_fake["stm_loss"] +
            #             output_dict_fake["s_d_loss"]) / 2.0
            # if reverse:
            #     # loss = (output_dict["stm_loss"] + output_dict["p_d_loss"] + 0.005 * output_dict["s_d_loss"] +
            #     #         output_dict_fake["stm_loss"] + output_dict_fake["p_d_loss"] + 0.005 * output_dict_fake[
            #     #             "s_d_loss"]) / 2.0
            #     loss = (output_dict['loss'] + output_dict_fake['loss']) / 2.0
            loss = output_dict['loss']
            if self._gradient_accumulation_steps > 1:
                loss /= self._gradient_accumulation_steps
            loss.backward()
            task_info["tr_loss_cum"] += loss.item()
            task_info['stm_loss'] += output_dict['stm_loss'].item()
            task_info['p_d_loss'] += output_dict['p_d_loss'].item()
            task_info['s_d_loss'] += output_dict['s_d_loss'].item()
            task_info['valid_loss'] += output_dict['valid_loss'].item()
            # if reverse or train_D:
            #     task_info['stm_loss'] += output_dict_fake['stm_loss'].item()
            #     task_info['stm_loss'] /= 2.0
            #     task_info['p_d_loss'] += output_dict_fake['p_d_loss'].item()
            #     task_info['p_d_loss'] /= 2.0
            #     task_info['s_d_loss'] += output_dict_fake['s_d_loss'].item()
            #     task_info['s_d_loss'] /= 2.0
            #     task_info['valid_loss'] += output_dict_fake['valid_loss'].item()
            #     task_info['valid_loss'] /= 2.0
            del loss

            if (step + 1) % self._gradient_accumulation_steps == 0:
                batch_grad_norm = self._rescale_gradients()
                if self._tensorboard.should_log_histograms_this_batch():
                    param_updates = {name: param.detach().cpu().clone()
                                     for name, param in self._model.named_parameters()}
                    optimizer.step()
                    for name, param in self._model.named_parameters():
                        param_updates[name].sub_(param.detach().cpu())
                        update_norm = torch.norm(param_updates[name].view(-1, ))
                        param_norm = torch.norm(param.view(-1, )).cpu()
                        self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                           update_norm / (param_norm + 1e-7))
                else:
                    optimizer.step()
                optimizer.zero_grad()

            ### Get metrics for all progress so far, update tqdm, display description ###
            task_metrics = self._get_metrics(task=task)
            task_metrics["loss"] = float(
                task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"] + 0.000_001)
            )
            task_metrics["stm_loss"] = float(
                task_info["stm_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_001)
            )
            task_metrics["p_d_loss"] = float(
                task_info["p_d_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_001)
            )
            task_metrics["s_d_loss"] = float(
                task_info["s_d_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_001)
            )
            task_metrics["valid_loss"] = float(
                task_info["valid_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_001)
            )
            description = training_util.description_from_metrics(task_metrics)
            epoch_tqdm.set_description(task._name + ", " + description)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(self._model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self._model, optimizer)

                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + task._name + "/" + k: v for k, v in task_metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self._model, histogram_parameters)
            self._global_step += 1

        ### Bookkeeping all the training metrics for all the tasks on the training epoch that just finished ###
        for task in self._task_list:
            task_info = self._task_infos[task._name]

            task_info["total_n_batches_trained"] += task_info["n_batches_trained_this_epoch"]
            task_info["last_log"] = time.time()

            task_metrics = self._get_metrics(task=task, reset=True)
            if task._name not in all_tr_metrics:
                all_tr_metrics[task._name] = {}
            for name, value in task_metrics.items():
                all_tr_metrics[task._name][name] = value
            all_tr_metrics[task._name]["loss"] = float(
                task_info["tr_loss_cum"] / (task_info["n_batches_trained_this_epoch"] + 0.000_000_01)
            )
            all_tr_metrics[task._name]["stm_loss"] = float(
                task_info["stm_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_000_01)
            )
            all_tr_metrics[task._name]["p_d_loss"] = float(
                task_info["p_d_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_000_01)
            )
            all_tr_metrics[task._name]["s_d_loss"] = float(
                task_info["s_d_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_000_01)
            )
            all_tr_metrics[task._name]["valid_loss"] = float(
                task_info["valid_loss"] / (task_info["n_batches_trained_this_epoch"] + 0.000_000_01)
            )

            # Tensorboard - Training metrics for this epoch
            for metric_name, value in all_tr_metrics[task._name].items():
                self._tensorboard.add_train_scalar(
                    name="task_" + task._name + "/" + metric_name, value=value
                )

        logger.info("Train - End")
        return all_tr_metrics
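
Each step of this multi-task trainer draws the next task from a multinomial over `sampling_prob`, exactly as in the `np.argmax(np.random.multinomial(1, sampling_prob))` call above. A small self-contained sketch with hypothetical, size-proportional probabilities:

import numpy as np

def sample_task_index(sampling_prob, rng=None) -> int:
    """Draw one task index according to `sampling_prob` (probabilities summing to 1)."""
    rng = rng or np.random.default_rng()
    return int(np.argmax(rng.multinomial(1, sampling_prob)))

# Hypothetical: sample tasks proportionally to their dataset sizes.
sizes = np.array([12000.0, 3000.0, 5000.0])
sampling_prob = sizes / sizes.sum()
counts = np.zeros(len(sizes), dtype=int)
for _ in range(1000):
    counts[sample_task_index(sampling_prob)] += 1
# `counts` will roughly follow the 60/15/25 split implied by `sizes`.
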
Example #24
    def semi_train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.trainer.model.train()

        num_gpus = len(self.trainer._cuda_devices)

        self.trainer._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self.trainer._batch_num_total is None:
            self.trainer._batch_num_total = 0

        histogram_parameters = set(
            self.trainer.model.
            get_parameters_for_histogram_tensorboard_logging())
        #Pdb().set_trace()
        mixed_generator, num_training_batches = get_mixer(
            self.trainer.iterator, self.trainer.train_data,
            self.trainer.iterator, self.unlabelled_dataset, num_gpus,
            self.labelled_id, self.which_mixer, self.min_pct_of_unlabelled)
        #mixed_generator, num_training_batches = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset,num_gpus, self.labelled_id, self.which_mixer)

        #generator for lambda update
        mixed_generator_for_lambda, _ = get_mixer(self.trainer.iterator,
                                                  self.trainer.train_data,
                                                  self.trainer.iterator,
                                                  self.unlabelled_dataset,
                                                  num_gpus, self.labelled_id,
                                                  'cm', 1.0)
        #mixed_generator_for_lambda, _ = get_mixer(self.trainer._validation_iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm')

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(mixed_generator,
                                         total=num_training_batches)
        #train_generator_tqdm = Tqdm.tqdm(zip(train_generator,unlabelled_train_generator),
        #                                 total=num_training_batches)
        cumulative_batch_size = 0
        unlabelled_loss = 0
        unlabelled_batches_this_epoch = 0

        batches_since_last_step = 0
        agg_loss = 0.0
        flag = False
        batch_grad_norm = None
        for batch_group, group_id in train_generator_tqdm:
            #print(batch_group[0]['sentence']['tokens'].shape)
            if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id:
                continue
            output_dict = self.batch_loss(
                batch_group,
                for_training=True,
                eval_metric=(group_id == self.labelled_id))
            penalties = defaultdict(float)

            if self.constraints_model is not None:
                penalties = self.constraints_model(
                    output_dict['task1_tag_logits'],
                    output_dict['task2_tag_logits'], output_dict['mask'])

            loss = 0.0
            if 'loss' in output_dict:
                loss = output_dict['loss']
                train_loss += loss.item()
            loss += output_dict.get('regularization_penalty', 0.0)

            loss += self.constraints_wt * penalties['loss']

            unlabelled_loss += penalties['loss'].item() if torch.is_tensor(
                penalties['loss']) else penalties['loss']

            agg_loss += loss
            batches_since_last_step += 1

            if batches_since_last_step == self.backprop_after_xbatches:
                #print("STEP THROUGH! : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))
                batch_grad_norm = self.step(agg_loss)
                batches_since_last_step = 0
                agg_loss = 0.0
                flag = False
            else:
                flag = True
                #print("skipp : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))

            if (group_id != self.labelled_id):
                unlabelled_batches_this_epoch += 1
                #self.trainer.optimizer.zero_grad()
                #loss.backward()
                #batch_grad_norm = self.trainer.rescale_gradients()
                #self.trainer.optimizer.step()
            else:
                self.total_supervised_iters += 1.0
                batches_this_epoch += 1
                self.trainer._batch_num_total += 1
                batch_num_total = self.trainer._batch_num_total

                #self.trainer.optimizer.zero_grad()
                #loss.backward()
                #batch_grad_norm = self.trainer.rescale_gradients()

                # This does nothing if batch_num_total is None or you are using an
                # LRScheduler which doesn't update per batch.
                if self.trainer._learning_rate_scheduler:
                    self.trainer._learning_rate_scheduler.step_batch(
                        batch_num_total)

                if self.trainer._tensorboard.should_log_histograms_this_batch(
                ):
                    # get the magnitude of parameter updates for logging
                    # We need a copy of current parameters to compute magnitude of updates,
                    # and copy them to CPU so large models won't go OOM on the GPU.
                    param_updates = {
                        name: param.detach().cpu().clone()
                        for name, param in
                        self.trainer.model.named_parameters()
                    }
                    #self.trainer.optimizer.step()
                    for name, param in self.trainer.model.named_parameters():
                        param_updates[name].sub_(param.detach().cpu())
                        update_norm = torch.norm(param_updates[name].view(
                            -1, ))
                        param_norm = torch.norm(param.view(-1, )).cpu()
                        self.trainer._tensorboard.add_train_scalar(
                            "gradient_update/" + name,
                            update_norm / (param_norm + 1e-7))
                else:
                    pass
                    #self.trainer.optimizer.step()

                # Update moving averages
                if self.trainer._moving_average is not None:
                    self.trainer._moving_average.apply(batch_num_total)
            #
                metrics = training_util.get_metrics(self.trainer.model,
                                                    train_loss,
                                                    batches_this_epoch)
                metrics["uloss"] = float(
                    unlabelled_loss /
                    (batches_this_epoch + unlabelled_batches_this_epoch))
                # Update the description with the latest metrics
                description = training_util.description_from_metrics(metrics)
                train_generator_tqdm.set_description(description,
                                                     refresh=False)

                # Log parameter values to Tensorboard
                if self.trainer._tensorboard.should_log_this_batch(
                ) and batch_grad_norm is not None:
                    self.trainer._tensorboard.log_parameter_and_gradient_statistics(
                        self.trainer.model, batch_grad_norm)
                    self.trainer._tensorboard.log_learning_rates(
                        self.trainer.model, self.trainer.optimizer)

                    self.trainer._tensorboard.add_train_scalar(
                        "loss/loss_train", metrics["loss"])
                    self.trainer._tensorboard.log_metrics(
                        {"epoch_metrics/" + k: v
                         for k, v in metrics.items()})

                if self.trainer._tensorboard.should_log_histograms_this_batch(
                ):
                    self.trainer._tensorboard.log_histograms(
                        self.trainer.model, histogram_parameters)

                if self.trainer._log_batch_size_period:
                    cur_batch = sum([
                        training_util.get_batch_size(batch)
                        for batch in batch_group
                    ])
                    cumulative_batch_size += cur_batch
                    if (batches_this_epoch -
                            1) % self.trainer._log_batch_size_period == 0:
                        average = cumulative_batch_size / batches_this_epoch
                        logger.info(
                            f"current batch size: {cur_batch} mean batch size: {average}"
                        )
                        self.trainer._tensorboard.add_train_scalar(
                            "current_batch_size", cur_batch)
                        self.trainer._tensorboard.add_train_scalar(
                            "mean_batch_size", average)

                # Save model if needed.
                if self.trainer._model_save_interval is not None and (
                        time.time() - last_save_time >
                        self.trainer._model_save_interval):
                    last_save_time = time.time()
                    self.trainer._save_checkpoint('{0}.{1}'.format(
                        epoch, training_util.time_to_str(int(last_save_time))))

            #lambda update
            #if  (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters) and (batches_this_epoch % self.dd_update_freq == 0):
            if (self.constraints_model
                    is not None) and (self.dd_optimizer is not None) and (
                        self.total_supervised_iters >= self.dd_warmup_iters
                    ) and (self.total_supervised_iters -
                           self.last_lambda_update >= self.dd_update_freq):
                for batch_group, group_id in mixed_generator_for_lambda:
                    self.lambda_update(batch_group)
                    self.last_lambda_update = self.total_supervised_iters
                    break

                self.count_lambda_updates += 1
                if (self.dd_increase_freq_after
                        is not None) and (self.count_lambda_updates %
                                          self.dd_increase_freq_after == 0):
                    self.dd_update_freq += self.dd_increase_freq_by
        if flag:
            batch_grad_norm = self.step(agg_loss)
            batches_since_last_step = 0
            agg_loss = 0.0
            flag = False

        #lambda update
        #if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters):
        if (self.constraints_model
                is not None) and (self.dd_optimizer is not None) and (
                    self.total_supervised_iters >= self.dd_warmup_iters) and (
                        self.total_supervised_iters - self.last_lambda_update
                        >= self.dd_update_freq):
            for batch_group, group_id in mixed_generator_for_lambda:
                self.lambda_update(batch_group)
                self.last_lambda_update = self.total_supervised_iters
                break

            self.count_lambda_updates += 1
            if (self.dd_increase_freq_after
                    is not None) and (self.count_lambda_updates %
                                      self.dd_increase_freq_after == 0):
                self.dd_update_freq += self.dd_increase_freq_by

        metrics = training_util.get_metrics(self.trainer.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        metrics['lb'] = batches_this_epoch
        metrics['ub'] = unlabelled_batches_this_epoch
        metrics["uloss"] = float(
            unlabelled_loss /
            (batches_this_epoch + unlabelled_batches_this_epoch))
        if self.constraints_model is not None:
            lambda_stats_dict = self.constraints_model.lambda_stats()
            metrics.update(lambda_stats_dict)
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
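
The `backprop_after_xbatches` mechanism above sums the loss tensors of several batches (keeping their graphs alive) and only backpropagates and steps once the target count is reached, with a final flush after the loop. A sketch of that pattern with illustrative names; the real trainer routes the step through `self.step(agg_loss)`:

import torch

def train_with_delayed_steps(model, optimizer, batches, step_every: int):
    """Backpropagate an aggregated loss only every `step_every` batches.

    `model(batch)` is assumed to return a scalar loss tensor.
    """
    agg_loss, since_step = 0.0, 0
    for batch in batches:
        agg_loss = agg_loss + model(batch)   # keeps each batch's graph until the step
        since_step += 1
        if since_step == step_every:
            optimizer.zero_grad()
            agg_loss.backward()
            optimizer.step()
            agg_loss, since_step = 0.0, 0
    if since_step > 0:                        # flush leftover batches, as the trainer does
        optimizer.zero_grad()
        agg_loss.backward()
        optimizer.step()
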
Example #25
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self.model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:
            images = []
            text = []
            segment_ids = []
            labels = []
            num_negative_samples = self.num_negative_samples * 10
            for i in range(len(batch_group[0]['images'])):
                positive_index = random.randint(0, num_negative_samples)
                labels.append(positive_index)
                if self.retrieve_text:
                    instance_text = []
                    instance_segment_ids = []
                    for j in range(num_negative_samples + 1):
                        if j == positive_index:
                            instance_text.append(batch_group[0]['token_ids']
                                                 ['tokens'][i].tolist())
                            instance_segment_ids.append(
                                batch_group[0]['segment_ids'][i].tolist())
                        else:
                            negative_sample_index = random.choice(
                                self.val_indices)
                            text_field = TextField(
                                self.val_text_db[negative_sample_index],
                                self.val_token_indexers)
                            text_field.index(self.model.vocab)
                            padding_lengths = text_field.get_padding_lengths()
                            instance_text.append(
                                text_field.as_tensor(
                                    padding_lengths=padding_lengths)
                                ['tokens'].tolist())
                            instance_segment_ids.append(
                                self.val_segment_ids_db[negative_sample_index].
                                tolist())
                    text += instance_text
                    segment_ids += instance_segment_ids
                else:
                    instance_images = [
                        None for _ in range(num_negative_samples + 1)
                    ]
                    for j in range(num_negative_samples + 1):
                        if j == positive_index:
                            instance_images[j] = np.expand_dims(
                                batch_group[0]['images'][i].numpy(), 0)
                        else:
                            instance_images[j] = np.expand_dims(
                                random.choice(self.val_image_db), 0)
                    images += instance_images
            matching_label_field_name = "labels"
            if self.retrieve_text:
                max_text_len = max([len(sequence) for sequence in text])
                text = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in text
                ]
                batch_group[0]['token_ids'] = {
                    'tokens': torch.LongTensor(text)
                }
                segment_ids = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in segment_ids
                ]
                batch_group[0]['segment_ids'] = torch.from_numpy(
                    np.array(segment_ids, dtype=np.int64))
            else:
                batch_group[0]['images'] = torch.from_numpy(np.vstack(images))
            batch_group[0][matching_label_field_name] = torch.from_numpy(
                np.array(labels, dtype=np.int64))

            loss = self.batch_loss(batch_group, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(self.model, val_loss,
                                                    batches_this_epoch)
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
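
The validation loop above builds, for every instance, a candidate list that hides the true item at a random position among sampled negatives and uses that position as the label. A toy sketch of that construction (the real code assembles image or token-id tensors instead of strings):

import random

def build_candidates(positive, negative_pool, num_negatives, rng=random):
    """Return (candidates, label) where `candidates[label]` is the positive item."""
    label = rng.randint(0, num_negatives)          # position of the positive, inclusive bounds
    candidates = [
        positive if j == label else rng.choice(negative_pool)
        for j in range(num_negatives + 1)
    ]
    return candidates, label

candidates, label = build_candidates("true caption", ["neg a", "neg b", "neg c"], num_negatives=4)
assert candidates[label] == "true caption"
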
Example #26
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
        batch_group_generator = lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps
        )
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps
        )

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(
                batch_group_generator, total=num_training_batches
            )
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        logger.info("Training")

        cumulative_batch_group_size = 0
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                self._writer.log({"step_loss": loss.item()}, step=self._batch_num_total)
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description, refresh=False)

            self._writer.log({"lr": self.optimizer.param_groups[0]['lr']},
                             step=self._batch_num_total)

            # Save model if needed.
            if (
                self._model_save_interval is not None
                and (time.time() - last_save_time > self._model_save_interval)
                and self._master
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                    "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time)))
                )

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
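
Besides the per-step loss, this trainer reports the current learning rate read from `self.optimizer.param_groups[0]['lr']`. A small sketch of reading the learning rate(s) directly from any torch optimizer:

import torch

def current_learning_rates(optimizer: torch.optim.Optimizer):
    """Return the learning rate of every parameter group, as logged by the trainer above."""
    return [group["lr"] for group in optimizer.param_groups]

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
assert current_learning_rates(optimizer) == [0.1]
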
Example #27
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        train_reg_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches = math.ceil(
            len(self.data_loader) / self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            batch_group_outputs = []
            for batch in batch_group:
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs["loss"]
                reg_loss = batch_outputs["reg_loss"]
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                reg_loss = reg_loss / len(batch_group)
                if self._opt_level is not None:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                train_loss += loss.item()
                train_reg_loss += reg_loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            param_updates = None
            if self._tensorboard.should_log_histograms_this_batch() and self._master:
                # Get the magnitude of parameter updates for logging.  We need to do some
                # computation before and after the optimizer step, and it's expensive because of
                # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
                # we don't do this every batch, only when it's requested.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Update tqdm only for the master, since the other workers do not have a progress bar
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)
                self._tensorboard.log_batch(self.model, self.optimizer,
                                            batch_grad_norm, metrics,
                                            batch_group, param_updates)

            if self._master:
                self._checkpointer.maybe_save_checkpoint(
                    self, epoch, batches_this_epoch)
                for callback in self._batch_callbacks:
                    callback(
                        self,
                        batch_group,
                        batch_group_outputs,
                        epoch,
                        batches_this_epoch,
                        is_training=True,
                    )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
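
Note how the loop above divides each micro-batch loss by len(batch_group) before calling backward(), so the gradients accumulated over a group match those of a single large batch. A minimal sketch of that pattern, assuming placeholder names (model, optimizer, and a batch_group of (inputs, targets) pairs) rather than the trainer's real attributes:

# Minimal sketch of gradient accumulation as done in the loop above; the names
# below (model, optimizer, batch_group) and the MSE loss are placeholders.
from typing import List, Tuple

import torch


def accumulate_and_step(model: torch.nn.Module,
                        optimizer: torch.optim.Optimizer,
                        batch_group: List[Tuple[torch.Tensor, torch.Tensor]]) -> float:
    optimizer.zero_grad()
    group_loss = 0.0
    for inputs, targets in batch_group:
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        # Scale so the summed gradients equal the average over the group.
        (loss / len(batch_group)).backward()
        group_loss += loss.item() / len(batch_group)
    optimizer.step()
    return group_loss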
Example #28
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            images = []
            text = []
            segment_ids = []
            labels = []
            for i in range(len(batch_group[0]['images'])):
                positive_index = random.randint(0, self.num_negative_samples)
                labels.append(positive_index)
                if self.retrieve_text:
                    instance_text = []
                    instance_segment_ids = []
                    for j in range(self.num_negative_samples + 1):
                        if j == positive_index:
                            instance_text.append(batch_group[0]['token_ids']
                                                 ['tokens'][i, :].tolist())
                            instance_segment_ids.append(
                                batch_group[0]['segment_ids'][i].tolist())
                        else:
                            negative_sample_index = random.choice(
                                self.train_indices)
                            text_field = TextField(
                                self.train_text_db[negative_sample_index],
                                self.train_token_indexers)
                            text_field.index(self.model.vocab)
                            padding_lengths = text_field.get_padding_lengths()
                            instance_text.append(
                                text_field.as_tensor(
                                    padding_lengths=padding_lengths)
                                ['tokens'].tolist())
                            instance_segment_ids.append(
                                self.train_segment_ids_db[
                                    negative_sample_index].tolist())
                    text += instance_text
                    segment_ids += instance_segment_ids
                else:
                    instance_images = [
                        None for _ in range(self.num_negative_samples + 1)
                    ]
                    for j in range(self.num_negative_samples + 1):
                        if j == positive_index:
                            instance_images[j] = np.expand_dims(
                                batch_group[0]['images'][i].numpy(), 0)
                        else:
                            instance_images[j] = np.expand_dims(
                                random.choice(self.train_image_db), 0)
                    images += instance_images
            matching_label_field_name = "labels"
            if self.retrieve_text:
                max_text_len = max([len(sequence) for sequence in text])
                text = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in text
                ]
                batch_group[0]['token_ids'] = {
                    'tokens': torch.LongTensor(text)
                }
                segment_ids = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in segment_ids
                ]
                batch_group[0]['segment_ids'] = torch.from_numpy(
                    np.array(segment_ids, dtype=np.int64))
            else:
                batch_group[0]['images'] = torch.from_numpy(np.vstack(images))
            batch_group[0][matching_label_field_name] = torch.from_numpy(
                np.array(labels, dtype=np.int64))
            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
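
The histogram branch above measures the magnitude of each parameter update by snapshotting the parameters on the CPU before optimizer.step() and subtracting afterwards. A minimal sketch of that measurement, with log_scalar standing in for whatever logger is attached (TensorBoard in the code above):

# Minimal sketch of the update-magnitude logging above.  `log_scalar` is a
# placeholder callback, not the trainer's TensorboardWriter API.
import torch


def step_and_log_update_magnitudes(model, optimizer, log_scalar):
    # Copy current parameters to CPU so large models don't go OOM on the GPU.
    before = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
    optimizer.step()
    for name, param in model.named_parameters():
        update = param.detach().cpu() - before[name]
        update_norm = torch.norm(update.view(-1))
        param_norm = torch.norm(param.detach().cpu().view(-1))
        # Relative step size; the 1e-7 guards against division by zero.
        log_scalar("gradient_update/" + name, (update_norm / (param_norm + 1e-7)).item())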
Example #29
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            self.model.train()
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self.model.named_parameters()}
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model, self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
                self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model, histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size/batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size", average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time)))
                )
            if self._early_stopping_by_batch and self._batch_num_total % 10 == 0:
                if self._validation_data is not None:
                    with torch.no_grad():
                        # We have a validation set, so compute all the metrics on it.
                        val_loss, num_batches = self._validation_loss()
                        val_metrics = training_util.get_metrics(self.model, val_loss, num_batches, reset=True)

                        # Check validation metric for early stopping
                        this_epoch_val_metric = val_metrics[self._validation_metric]
                        self._metric_tracker.add_metric(this_epoch_val_metric)

                        if self._metric_tracker.is_best_so_far():
                            metrics['best_batch'] = self._batch_num_total
                            for key, value in val_metrics.items():
                                metrics["best_validation_" + key] = value
                            self._metric_tracker.best_epoch_metrics = val_metrics

                        self._save_checkpoint(self._batch_num_total)

                        if self.callbacks is not None:
                            for callback in self.callbacks:
                                callback.on_batch_end(self._batch_num_total)

        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory
        return metrics
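
The batch-level early stopping above depends on a metric tracker that remembers the best validation metric seen so far and how long it has been since the last improvement. A minimal sketch of such a tracker; the class name SimpleMetricTracker and its fields are illustrative, not AllenNLP's MetricTracker API:

# Minimal sketch of patience-style metric tracking; names are illustrative only.
class SimpleMetricTracker:
    def __init__(self, patience: int, should_decrease: bool = False):
        self.patience = patience
        self.should_decrease = should_decrease  # True for losses, False for accuracy-like metrics
        self.best = None
        self.checks_without_improvement = 0

    def add_metric(self, value: float) -> None:
        improved = (
            self.best is None
            or (value < self.best if self.should_decrease else value > self.best)
        )
        if improved:
            self.best = value
            self.checks_without_improvement = 0
        else:
            self.checks_without_improvement += 1

    def is_best_so_far(self) -> bool:
        return self.checks_without_improvement == 0

    def should_stop_early(self) -> bool:
        return self.checks_without_improvement >= self.patience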
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        cpu_memory_usage = []
        for worker, memory in common_util.peak_memory_mb().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage MB: {memory}")
        gpu_memory_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        regularization_penalty = self.model.get_regularization_penalty()

        train_loss = 0.0
        batch_loss = 0.0

        if regularization_penalty is not None:
            train_reg_loss = 0.0
            batch_reg_loss = 0.0
        else:
            train_reg_loss = None
            batch_reg_loss = None
        # Set the model to "train" mode.
        self.model_engine.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps)
        except TypeError:
            num_training_batches = float("inf")

        # Having multiple tqdm bars in the distributed case would be a mess, so only
        # the master's progress bar is shown.
        batch_group_generator_tqdm = batch_group_generator
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            batch_group_outputs = []
            for batch in batch_group:
                with amp.autocast(self._use_amp):
                    batch_outputs = self.batch_outputs(batch,
                                                       for_training=True)
                    batch_group_outputs.append(batch_outputs)
                    loss = batch_outputs.get("loss")
                    reg_loss = batch_outputs.get("reg_loss")
                    if torch.isnan(loss):
                        raise ValueError("nan loss encountered")
                    loss = loss / len(batch_group)

                    batch_loss = loss.item()
                    train_loss += batch_loss
                    if reg_loss is not None:
                        reg_loss = reg_loss / len(batch_group)
                        batch_reg_loss = reg_loss.item()
                        train_reg_loss += batch_reg_loss

                self.model_engine.backward(loss)
                self.model_engine.step()

            param_updates = None
            if self._tensorboard.should_log_histograms_this_batch() and self._master:
                # Get the magnitude of parameter updates for logging.  We need to do some
                # computation before and after the optimizer step, and it's expensive because of
                # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
                # we don't do this every batch, only when it's requested.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }

                if self._scaler is not None:
                    self._scaler.step(self.optimizer)
                    self._scaler.update()
                else:
                    self.optimizer.step()

                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
            else:
                if self._scaler is not None:
                    self._scaler.step(self.optimizer)
                    self._scaler.update()
                else:
                    self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batch_loss,
                batch_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )

            if self._master:
                # Update tqdm only for the master, since the other workers do not have a progress bar
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)
                self._tensorboard.log_batch(
                    self.model,
                    self.optimizer,
                    0.,  # batch_grad_norm,
                    metrics,
                    batch_group,
                    param_updates,
                )

                self._checkpointer.maybe_save_checkpoint(
                    self, epoch, batches_this_epoch)

            for callback in self._batch_callbacks:
                callback(
                    self,
                    batch_group,
                    batch_group_outputs,
                    epoch,
                    batches_this_epoch,
                    is_training=True,
                    is_master=self._master,
                )

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss=None,
            batch_reg_loss=None,
            num_batches=batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
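
The last trainer wraps its forward pass in amp.autocast and routes the optimizer step through a gradient scaler when one is configured. A minimal sketch of that mixed-precision pattern using the standard torch.cuda.amp API; model, optimizer, loss_fn, and the input tensors are placeholders, not the trainer's attributes:

# Minimal sketch of the autocast + GradScaler pattern used above; all names are
# placeholders for illustration.
import torch

use_amp = torch.cuda.is_available()
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)


def amp_train_step(model, optimizer, loss_fn, inputs, targets):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=use_amp):
        loss = loss_fn(model(inputs), targets)
    # scale() enlarges the loss so small fp16 gradients do not underflow;
    # step() unscales before applying the update, and update() adapts the scale.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return loss.item()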