Exemplo n.º 1
0
    def __init__(self,
                 model: Model,
                 training_data: Iterable[Instance],
                 iterator: DataIterator,
                 optimizer: torch.optim.Optimizer,
                 num_epochs: int = 20,
                 shuffle: bool = True,
                 serialization_dir: Optional[str] = None,
                 cuda_device: Union[int, List] = -1,
                 callbacks: Optional[List[Callback]] = None,
                 apex_opt_level: Optional[str] = None,
                 keep_batchnorm_fp32: Optional[bool] = False) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. It uses callbacks to handle various
        things ancillary to training, like tracking metrics, validation, early stopping,
        logging to tensorboard, and so on.

        It's easy to create your own callbacks; for example, if you wanted to get a Slack
        notification when training finishes. For more complicated variations, you might have
        to create your own subclass, in which case make sure to fire off all the training events.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use ``from_params`` this will be
            handled for you.)
        training_data : ``Iterable[Instance]``, required
            The instances that you want to train your model on.
        iterator : ``DataIterator``, required
            The iterator for batching / epoch-ing the instances.
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        num_epochs : int, optional (default=20)
            Number of training epochs.
        shuffle : bool, optional (default=True)
            Whether to shuffle the instances each epoch.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        cuda_device : ``Union[int, List[int]]``, optional (default=-1)
            An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
        callbacks : ``List[Callback]``, optional (default=None)
            A list of callbacks that will be called based on training events.
        apex_opt_level: ``str``, optional (default = None)
            If provided, we will use the apex library to do mixed-precision training with the specified
            opt_level. This will cause an error if apex is not installed.
            Allowed values are O0, O1, O2, and O3. (Note that is capital-O then a number.)
            An empty string is treated the same as ``None`` (apex is not used).
        keep_batchnorm_fp32 : ``bool``, optional (default = False)
            Forwarded unchanged to ``amp.initialize`` when ``apex_opt_level`` is set; ignored
            otherwise.
            NOTE(review): apex is understood to reject an explicit (non-None) value here when
            the opt_level is O1 -- confirm against the installed apex version.

        Raises
        ------
        ConfigurationError
            If ``apex_opt_level`` is given but apex could not be imported.
        """
        super().__init__(serialization_dir, cuda_device)

        logger.warning(
            "The CallbackTrainer should be considered 'experimental' code, "
            "and its behavior may change as we use it more and iterate on it.")

        if apex_opt_level and not _APEX_IMPORTED:
            raise ConfigurationError(
                "You specified an apex_opt_level, but we could not import apex. "
                "Is it installed? see https://github.com/NVIDIA/apex#quick-start"
            )

        # This is all state that the callbacks might want:
        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        if apex_opt_level:
            # Use the module logger (not the root ``logging`` module) so this message
            # follows the same handlers / levels as the rest of the trainer's output.
            logger.info("using apex.amp with opt_level %s", apex_opt_level)
            self.model, self.optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=apex_opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32)
        else:
            self.model, self.optimizer = model, optimizer

        # Must mirror the truthiness test above: the flag may only be set when
        # ``amp.initialize`` was actually called, otherwise the backward pass would
        # go through ``amp.scale_loss`` on an optimizer amp never saw.
        self._use_apex = bool(apex_opt_level)
        self.validate = False

        # For capturing mid / end-of-epoch metrics
        self.train_metrics: Dict[str, float] = {}
        self.val_metrics: Dict[str, float] = {}
        self.latest_val_metric = 0.0
        self.train_loss = 0.0

        # For capturing overall metrics
        self.metrics: Dict[str, Any] = {}

        self.batch_num_total = 0
        self.batch_group: List[TensorDict] = []
        self.batches_this_epoch = 0

        self.training_batches: Iterable[List[TensorDict]] = ()
        self.num_training_batches = 0

        self.should_stop_early = False
        self.num_epochs = num_epochs

        self.training_start_time = 0.0

        self.last_log = 0.0
        self.epoch_number = 0
        self.batch_grad_norm: Optional[float] = None

        self.training_data = training_data
        self.iterator = iterator
        self.shuffle = shuffle
        # The handler receives the trainer itself, so it is created only after the
        # state above has been initialised.
        self.handler = CallbackHandler(callbacks, self)

        # For capturing errors that occur during the train loop.
        self.exception: Optional[Exception] = None
Exemplo n.º 2
0
class CallbackApexTrainer(TrainerBase):
    """
    A callback-driven trainer that can optionally run the backward pass through
    apex ``amp`` for mixed-precision training.

    The training loop itself only does forward / backward / optimizer step; everything
    else is delegated to callbacks that are notified through ``self.handler`` at the
    events fired from the methods below.
    """
    def __init__(self,
                 model: Model,
                 training_data: Iterable[Instance],
                 iterator: DataIterator,
                 optimizer: torch.optim.Optimizer,
                 num_epochs: int = 20,
                 shuffle: bool = True,
                 serialization_dir: Optional[str] = None,
                 cuda_device: Union[int, List] = -1,
                 callbacks: Optional[List[Callback]] = None,
                 apex_opt_level: Optional[str] = None,
                 keep_batchnorm_fp32: Optional[bool] = False) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. It uses callbacks to handle various
        things ancillary to training, like tracking metrics, validation, early stopping,
        logging to tensorboard, and so on.

        It's easy to create your own callbacks; for example, if you wanted to get a Slack
        notification when training finishes. For more complicated variations, you might have
        to create your own subclass, in which case make sure to fire off all the training events.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use ``CallbackApexTrainer.from_params`` this will be
            handled for you.)
        training_data : ``Iterable[Instance]``, required
            The instances that you want to train your model on.
        iterator : ``DataIterator``, required
            The iterator for batching / epoch-ing the instances.
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        num_epochs : int, optional (default=20)
            Number of training epochs.
        shuffle : bool, optional (default=True)
            Whether to shuffle the instances each epoch.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        cuda_device : ``Union[int, List[int]]``, optional (default=-1)
            An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
        callbacks : ``List[Callback]``, optional (default=None)
            A list of callbacks that will be called based on training events.
        apex_opt_level: ``str``, optional (default = None)
            If provided, we will use the apex library to do mixed-precision training with the specified
            opt_level. This will cause an error if apex is not installed.
            Allowed values are O0, O1, O2, and O3. (Note that is capital-O then a number.)
            An empty string is treated the same as ``None`` (apex is not used).
        keep_batchnorm_fp32 : ``bool``, optional (default = False)
            Forwarded unchanged to ``amp.initialize`` when ``apex_opt_level`` is set; ignored
            otherwise.
            NOTE(review): apex is understood to reject an explicit (non-None) value here when
            the opt_level is O1 -- confirm against the installed apex version.

        Raises
        ------
        ConfigurationError
            If ``apex_opt_level`` is given but apex could not be imported.
        """
        super().__init__(serialization_dir, cuda_device)

        logger.warning(
            "The CallbackTrainer should be considered 'experimental' code, "
            "and its behavior may change as we use it more and iterate on it.")

        if apex_opt_level and not _APEX_IMPORTED:
            raise ConfigurationError(
                "You specified an apex_opt_level, but we could not import apex. "
                "Is it installed? see https://github.com/NVIDIA/apex#quick-start"
            )

        # This is all state that the callbacks might want:
        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        if apex_opt_level:
            # Use the module logger (not the root ``logging`` module) so this message
            # follows the same handlers / levels as the rest of the trainer's output.
            logger.info("using apex.amp with opt_level %s", apex_opt_level)
            self.model, self.optimizer = amp.initialize(
                model,
                optimizer,
                opt_level=apex_opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32)
        else:
            self.model, self.optimizer = model, optimizer

        # Must mirror the truthiness test above: the flag may only be set when
        # ``amp.initialize`` was actually called, otherwise ``train_one_batch_group``
        # would go through ``amp.scale_loss`` on an optimizer amp never saw.
        self._use_apex = bool(apex_opt_level)
        self.validate = False

        # For capturing mid / end-of-epoch metrics
        self.train_metrics: Dict[str, float] = {}
        self.val_metrics: Dict[str, float] = {}
        self.latest_val_metric = 0.0
        self.train_loss = 0.0

        # For capturing overall metrics
        self.metrics: Dict[str, Any] = {}

        self.batch_num_total = 0
        self.batch_group: List[TensorDict] = []
        self.batches_this_epoch = 0

        self.training_batches: Iterable[List[TensorDict]] = ()
        self.num_training_batches = 0

        self.should_stop_early = False
        self.num_epochs = num_epochs

        self.training_start_time = 0.0

        self.last_log = 0.0
        self.epoch_number = 0
        self.batch_grad_norm: Optional[float] = None

        self.training_data = training_data
        self.iterator = iterator
        self.shuffle = shuffle
        # The handler receives the trainer itself, so it is created only after the
        # state above has been initialised.
        self.handler = CallbackHandler(callbacks, self)

        # For capturing errors that occur during the train loop.
        self.exception: Optional[Exception] = None

    def generate_training_batches(self):
        """
        Generates one epoch worth of training data. Stores it in trainer instance variables
        so that callbacks can access it.

        Batches are grouped ``len(self._cuda_devices)`` at a time, so
        ``self.training_batches`` yields lists of tensor dicts and
        ``self.num_training_batches`` is the number of such groups (rounded up).
        """
        num_gpus = len(self._cuda_devices)

        raw_train_generator = self.iterator(self.training_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        self.training_batches = lazy_groups_of(raw_train_generator, num_gpus)
        self.num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.training_data) / num_gpus)

    def batch_loss(self, batch_group: List[TensorDict],
                   for_training: bool) -> Optional[torch.Tensor]:
        """
        Does a forward pass on the given batches and returns the ``loss`` value in the result.
        If ``for_training`` is `True` also applies regularization penalty.

        This is a method on the trainer so that it can be used both in training and validation
        (which are handled separately).

        Parameters
        ----------
        batch_group : ``List[TensorDict]``
            The batches to run. Unless ``self._multiple_gpu`` is set, the group must contain
            exactly one batch.
        for_training : ``bool``
            Whether a loss is mandatory and the regularization penalty should be added.

        Returns
        -------
        The loss from the model output, or ``None`` when the output has no "loss" key
        (only possible when ``for_training`` is false) or the model reported ``None``.

        Raises
        ------
        RuntimeError
            If ``for_training`` is true and the model output has no "loss" key.
        """
        if self._multiple_gpu:
            output_dict = training_util.data_parallel(batch_group, self.model,
                                                      self._cuda_devices)
        else:
            assert len(batch_group) == 1
            batch = batch_group[0]
            batch = nn_util.move_to_device(batch, self._cuda_devices[0])
            output_dict = self.model(**batch)

        # Only the key lookup is guarded: a KeyError raised while computing the
        # regularization penalty must not be mistaken for a missing "loss" key.
        try:
            loss = output_dict["loss"]
        except KeyError:
            if for_training:
                raise RuntimeError(
                    "The model you are trying to optimize does not contain a"
                    " 'loss' key in the output of model.forward(inputs).")
            return None

        if loss is not None and for_training:
            loss += self.model.get_regularization_penalty()

        return loss

    def train_one_batch_group(self, batch_group: List[TensorDict]) -> Optional[str]:
        """
        Handles the training for a single batch group.
        Fires off the events BATCH_START, FORWARD, BACKWARD, and BATCH_END.

        Returns
        -------
        A progress description built from the current training metrics, or ``None`` when
        the batch group was skipped because its loss was ``None`` or NaN. A skipped group
        has already been counted in ``batches_this_epoch`` / ``batch_num_total`` and has
        fired BATCH_START and FORWARD, but fires neither BACKWARD nor BATCH_END.
        """
        self.handler.fire_event(Events.BATCH_START)
        self.optimizer.zero_grad()

        self.batches_this_epoch += 1
        self.batch_num_total += 1

        self.handler.fire_event(Events.FORWARD)
        loss = self.batch_loss(batch_group, for_training=True)

        if loss is None:
            return None

        if torch.isnan(loss):
            # Skip the update instead of aborting the run.
            logger.warning("NaN loss encountered.")
            return None

        if self._use_apex:
            # Backpropagate through the loss as scaled by amp.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # Accumulate the unscaled loss for metric reporting.
        self.train_loss += loss.item()

        self.handler.fire_event(Events.BACKWARD)

        self.optimizer.step()

        # Update the description with the latest metrics
        self.train_metrics = training_util.get_metrics(self.model,
                                                       self.train_loss,
                                                       self.batches_this_epoch)

        self.handler.fire_event(Events.BATCH_END)

        return training_util.description_from_metrics(self.train_metrics)

    def train_one_epoch(self) -> None:
        """
        Trains the model for a single epoch.
        Fires off the events EPOCH_START, VALIDATE and EPOCH_END,
        and repeatedly calls self.train_one_batch_group().
        """
        self.handler.fire_event(Events.EPOCH_START)

        self.train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        self.last_log = time.time()

        logger.info("Training")
        self.batches_this_epoch = 0

        batch_groups_tqdm = Tqdm.tqdm(self.training_batches,
                                      total=self.num_training_batches)

        # The loop variable is the instance attribute on purpose: callbacks read the
        # current group from ``self.batch_group``.
        for self.batch_group in batch_groups_tqdm:
            description = self.train_one_batch_group(self.batch_group)
            if description is None:
                # Skipped group: keep the previous progress-bar description.
                continue
            batch_groups_tqdm.set_description(description, refresh=False)

        self.handler.fire_event(Events.VALIDATE)
        self.handler.fire_event(Events.EPOCH_END)

    @handle_errors
    def train(self) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.
        Fires off the events TRAINING_START and TRAINING END,
        and repeatedly calls `self.train_one_epoch()`.

        Training starts from the current value of ``self.epoch_number`` and runs up to
        ``self.num_epochs``, stopping sooner if ``self.should_stop_early`` becomes true.

        Returns
        -------
        ``self.metrics``; this method does not fill it in itself.
        """
        logger.info("Beginning training.")
        self.handler.fire_event(Events.TRAINING_START)

        self.training_start_time = time.time()
        starting_epoch = self.epoch_number

        # The loop variable is the instance attribute so callbacks can see the epoch.
        for self.epoch_number in range(self.epoch_number, self.num_epochs):
            epoch_start_time = time.time()

            self.generate_training_batches()
            self.train_one_epoch()

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s",
                        datetime.timedelta(seconds=epoch_elapsed_time))

            if self.epoch_number < self.num_epochs - 1:
                # Linear extrapolation: elapsed * (epochs to run / epochs done - 1).
                training_elapsed_time = time.time() - self.training_start_time
                estimated_time_remaining = training_elapsed_time * \
                    ((self.num_epochs - starting_epoch) /
                     float(self.epoch_number - starting_epoch + 1) - 1)
                formatted_time = str(
                    datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s",
                            formatted_time)

            if self.should_stop_early:
                logger.info("Ran out of patience.  Stopping training.")
                break

        self.handler.fire_event(Events.TRAINING_END)

        return self.metrics

    # Requires custom from_params.
    @classmethod
    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False,
            cache_directory: str = None,
            cache_prefix: str = None) -> 'CallbackApexTrainer':
        """
        Builds a trainer from a ``Params`` configuration.

        The model, datasets and iterators come from ``TrainerPieces.from_params``. The
        model is moved to the (first) configured CUDA device before the optimizer is
        built, and one callback is created per entry of the "callbacks" list.

        Parameters
        ----------
        params : ``Params``
            Configuration. The keys consumed here are "shuffle", "num_epochs",
            "apex_opt_level", "keep_batchnorm_fp32", "cuda_device", "optimizer" and
            "callbacks"; any key left over afterwards fails ``params.assert_empty``.
            Note that "keep_batchnorm_fp32" defaults to ``True`` here, whereas the
            constructor's own default is ``False``.
        serialization_dir : ``str``
            Passed to ``TrainerPieces.from_params``, to every callback and to the trainer.
        recover : ``bool``, optional (default=False)
            Passed to ``TrainerPieces.from_params``.
        cache_directory : ``str``, optional (default=None)
            Accepted but not used by this method.
        cache_prefix : ``str``, optional (default=None)
            Accepted but not used by this method.
        """
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        model = pieces.model
        params = pieces.params
        validation_iterator = pieces.validation_iterator or pieces.iterator

        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        apex_opt_level = params.pop('apex_opt_level', None)
        keep_batchnorm_fp32 = params.pop("keep_batchnorm_fp32", True)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))

        # With several devices the model lives on the first one.
        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        # Only parameters that require gradients are handed to the optimizer.
        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

        callbacks_params = params.pop("callbacks", [])
        callbacks: List[Callback] = [
            Callback.from_params(params=callback_params,
                                 model=model,
                                 optimizer=optimizer,
                                 instances=pieces.train_dataset,
                                 iterator=pieces.iterator,
                                 shuffle=shuffle,
                                 validation_data=pieces.validation_dataset,
                                 validation_iterator=validation_iterator,
                                 serialization_dir=serialization_dir)
            for callback_params in callbacks_params
        ]

        params.assert_empty(cls.__name__)
        return cls(model,
                   pieces.train_dataset,
                   pieces.iterator,
                   optimizer,
                   num_epochs=num_epochs,
                   shuffle=shuffle,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   callbacks=callbacks,
                   apex_opt_level=apex_opt_level,
                   keep_batchnorm_fp32=keep_batchnorm_fp32)
Exemplo n.º 3
0
    def __init__(
        self,
        model: Model,
        training_data: Iterable[Instance],
        iterator: DataIterator,
        optimizer: torch.optim.Optimizer,
        num_epochs: int = 20,
        shuffle: bool = True,
        serialization_dir: Optional[str] = None,
        cuda_device: int = -1,
        callbacks: Optional[List[Callback]] = None,
        distributed: bool = False,
        rank: int = 0,
        world_size: int = 1,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. It uses callbacks to handle various
        things ancillary to training, like tracking metrics, validation, early stopping,
        logging to tensorboard, and so on.

        It's easy to create your own callbacks; for example, if you wanted to get a Slack
        notification when training finishes. For more complicated variations, you might have
        to create your own subclass, in which case make sure to fire off all the training events.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use ``from_params`` this will be
            handled for you.)
        training_data : ``Iterable[Instance]``, required
            The instances that you want to train your model on.
        iterator : ``DataIterator``, required
            The iterator for batching / epoch-ing the instances.
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        num_epochs : int, optional (default=20)
            Number of training epochs.
        shuffle : bool, optional (default=True)
            Whether to shuffle the instances each epoch.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        cuda_device : ``int``, optional (default=-1)
            The index of the CUDA device to use. If -1, the CPU is used.
            Data parallelism is controlled at the allennlp train level, so each trainer will have a single
            GPU.
        callbacks : ``List[Callback]``, optional (default=None)
            A list of callbacks that will be called based on training events.
        distributed : ``bool``, optional (default=False)
            Forwarded to the base-class constructor. When distributed training is active the
            model is additionally wrapped in ``DistributedDataParallel`` for forward passes.
        rank : ``int``, optional (default=0)
            Forwarded to the base-class constructor.
        world_size : ``int``, optional (default=1)
            Forwarded to the base-class constructor.
        """
        super().__init__(serialization_dir, cuda_device, distributed, rank,
                         world_size)

        logger.warning(
            "The CallbackTrainer should be considered 'experimental' code, "
            "and its behavior may change as we use it more and iterate on it.")

        # This is all state that the callbacks might want:
        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model
        self.optimizer = optimizer
        self.validate = False

        # For capturing mid / end-of-epoch metrics
        self.train_metrics: Dict[str, float] = {}
        self.val_metrics: Dict[str, float] = {}
        self.latest_val_metric = 0.0
        self.train_loss = 0.0

        # For capturing overall metrics
        self.metrics: Dict[str, Any] = {}

        self.batch_num_total = 0
        self.batch: Optional[TensorDict] = None
        self.batches_this_epoch = 0

        self.training_batches: Iterable[TensorDict] = ()
        self.num_training_batches = 0

        self.should_stop_early = False
        self.num_epochs = num_epochs

        self.training_start_time = 0.0

        self.last_log = 0.0
        self.epoch_number = 0
        self.batch_grad_norm: Optional[float] = None

        self.training_data = training_data
        self.iterator = iterator
        self.shuffle = shuffle
        # The handler receives the trainer itself, so it is created only after the
        # state above has been initialised.
        self.handler = CallbackHandler(callbacks, self)

        # For capturing errors that occur during the train loop.
        self.exception: Optional[Exception] = None

        # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
        # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
        # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
        #
        # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
        # normal case, reference to `Model` is retained. This reference is only used in
        # these places: `model.__call__`, `model.train` and `model.eval`.
        if self._distributed:
            # ddp must be told the device the model actually lives on. That is this
            # trainer's ``cuda_device``, not the process rank: the two differ as soon as
            # the devices in use are not numbered 0..world_size-1. A CPU model (-1) must
            # be wrapped with ``device_ids=None``.
            ddp_device_ids = [self.cuda_device] if self.cuda_device >= 0 else None
            self._pytorch_model = DistributedDataParallel(
                self.model, device_ids=ddp_device_ids)
        else:
            self._pytorch_model = self.model
Exemplo n.º 4
0
class CallbackTrainer(TrainerBase):
    def __init__(
        self,
        model: Model,
        training_data: Iterable[Instance],
        iterator: DataIterator,
        optimizer: torch.optim.Optimizer,
        num_epochs: int = 20,
        shuffle: bool = True,
        serialization_dir: Optional[str] = None,
        cuda_device: int = -1,
        callbacks: Optional[List[Callback]] = None,
        distributed: bool = False,
        rank: int = 0,
        world_size: int = 1,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. It uses callbacks to handle various
        things ancillary to training, like tracking metrics, validation, early stopping,
        logging to tensorboard, and so on.

        It's easy to create your own callbacks; for example, if you wanted to get a Slack
        notification when training finishes. For more complicated variations, you might have
        to create your own subclass, in which case make sure to fire off all the training events.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use ``from_params`` this will be
            handled for you.)
        training_data : ``Iterable[Instance]``, required
            The instances that you want to train your model on.
        iterator : ``DataIterator``, required
            The iterator for batching / epoch-ing the instances.
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        num_epochs : int, optional (default=20)
            Number of training epochs.
        shuffle : bool, optional (default=True)
            Whether to shuffle the instances each epoch.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        cuda_device : ``int``, optional (default=-1)
            The index of the CUDA device to use. If -1, the CPU is used.
            Data parallelism is controlled at the allennlp train level, so each trainer will have a single
            GPU.
        callbacks : ``List[Callback]``, optional (default=None)
            A list of callbacks that will be called based on training events.
        distributed : ``bool``, optional (default=False)
            Forwarded to the base-class constructor. When distributed training is active the
            model is additionally wrapped in ``DistributedDataParallel`` for forward passes.
        rank : ``int``, optional (default=0)
            Forwarded to the base-class constructor.
        world_size : ``int``, optional (default=1)
            Forwarded to the base-class constructor.
        """
        super().__init__(serialization_dir, cuda_device, distributed, rank,
                         world_size)

        logger.warning(
            "The CallbackTrainer should be considered 'experimental' code, "
            "and its behavior may change as we use it more and iterate on it.")

        # This is all state that the callbacks might want:
        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model
        self.optimizer = optimizer
        self.validate = False

        # For capturing mid / end-of-epoch metrics
        self.train_metrics: Dict[str, float] = {}
        self.val_metrics: Dict[str, float] = {}
        self.latest_val_metric = 0.0
        self.train_loss = 0.0

        # For capturing overall metrics
        self.metrics: Dict[str, Any] = {}

        self.batch_num_total = 0
        self.batch: Optional[TensorDict] = None
        self.batches_this_epoch = 0

        self.training_batches: Iterable[TensorDict] = ()
        self.num_training_batches = 0

        self.should_stop_early = False
        self.num_epochs = num_epochs

        self.training_start_time = 0.0

        self.last_log = 0.0
        self.epoch_number = 0
        self.batch_grad_norm: Optional[float] = None

        self.training_data = training_data
        self.iterator = iterator
        self.shuffle = shuffle
        # The handler receives the trainer itself, so it is created only after the
        # state above has been initialised.
        self.handler = CallbackHandler(callbacks, self)

        # For capturing errors that occur during the train loop.
        self.exception: Optional[Exception] = None

        # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
        # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
        # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
        #
        # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
        # normal case, reference to `Model` is retained. This reference is only used in
        # these places: `model.__call__`, `model.train` and `model.eval`.
        if self._distributed:
            # ddp must be told the device the model actually lives on. That is this
            # trainer's ``cuda_device`` (the same device ``batch_loss`` moves each batch
            # to), not the process rank: the two differ as soon as the devices in use are
            # not numbered 0..world_size-1. A CPU model (-1) must be wrapped with
            # ``device_ids=None``.
            ddp_device_ids = [self.cuda_device] if self.cuda_device >= 0 else None
            self._pytorch_model = DistributedDataParallel(
                self.model, device_ids=ddp_device_ids)
        else:
            self._pytorch_model = self.model

    def generate_training_batches(self):
        """
        Prepare the batches for a single pass over the training data.

        The batch iterable and the batch count are stored on the trainer
        (``self.training_batches`` / ``self.num_training_batches``) rather than returned,
        so that callbacks can read them.
        """
        # One epoch only: this method is expected to be called again for every epoch.
        self.training_batches = self.iterator(
            self.training_data, num_epochs=1, shuffle=self.shuffle)

        batch_count = self.iterator.get_num_batches(self.training_data)
        self.num_training_batches = batch_count

    def batch_loss(self, batch: TensorDict,
                   for_training: bool) -> Optional[torch.Tensor]:
        """
        Does a forward pass on the given batch and returns the ``loss`` value in the result.
        If ``for_training`` is `True` also applies regularization penalty.

        This is a method on the trainer so that it can be used both in training and validation
        (which are handled separately).

        Parameters
        ----------
        batch : ``TensorDict``
            The batch to run; it is moved to ``self.cuda_device`` before the forward pass.
        for_training : ``bool``
            Whether a loss is mandatory and the regularization penalty should be added.

        Returns
        -------
        The loss from the model output, or ``None`` when the output has no "loss" key
        (only possible when ``for_training`` is false).

        Raises
        ------
        RuntimeError
            If ``for_training`` is true and the model output has no "loss" key.
        """
        batch = nn_util.move_to_device(batch, self.cuda_device)
        # The forward pass goes through ``_pytorch_model``; the regularization penalty
        # below is taken from ``self.model``.
        output_dict = self._pytorch_model(**batch)

        # Only the key lookup is guarded: a KeyError raised while computing the
        # regularization penalty must not be mistaken for a missing "loss" key.
        try:
            loss = output_dict["loss"]
        except KeyError:
            if for_training:
                raise RuntimeError(
                    "The model you are trying to optimize does not contain a"
                    " 'loss' key in the output of model.forward(inputs).")
            return None

        if for_training:
            loss += self.model.get_regularization_penalty()

        return loss

    def train_one_batch(self, batch: TensorDict) -> str:
        """
        Handles the training for a single batch.
        Fires off the events BATCH_START, FORWARD, BACKWARD, and BATCH_END.

        Parameters
        ----------
        batch : ``TensorDict``
            The batch to run the forward and backward pass on.

        Returns
        -------
        A description string built from the updated ``self.train_metrics``.

        Raises
        ------
        ValueError
            If the computed loss is NaN.
        """
        self.handler.fire_event(Events.BATCH_START)
        self.optimizer.zero_grad()

        # Counters are advanced before the forward pass, so handlers of FORWARD and
        # later events already see this batch counted.
        self.batches_this_epoch += 1
        self.batch_num_total += 1

        # FORWARD is fired before the forward pass is actually run.
        self.handler.fire_event(Events.FORWARD)
        loss = self.batch_loss(batch, for_training=True)

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()
        self.train_loss += loss.item()

        # BACKWARD is fired after gradients are computed but before the optimizer step.
        self.handler.fire_event(Events.BACKWARD)

        self.optimizer.step()

        # Update the description with the latest metrics
        self.train_metrics = training_util.get_metrics(self.model,
                                                       self.train_loss,
                                                       self.batches_this_epoch)

        self.handler.fire_event(Events.BATCH_END)

        return training_util.description_from_metrics(self.train_metrics)

    def train_one_epoch(self) -> None:
        """
        Trains the model for a single epoch.
        Fires off the event EPOCH_START, repeatedly calls ``self.train_one_batch()``
        for every batch in ``self.training_batches``, and then fires VALIDATE
        followed by EPOCH_END.

        ``self.training_batches`` and ``self.num_training_batches`` are read as they
        are; this method does not generate them.
        """
        self.handler.fire_event(Events.EPOCH_START)

        # Reset the per-epoch loss accumulator.
        self.train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Timestamp stored on the trainer; it is not read again in this method.
        self.last_log = time.time()

        logger.info("Training")
        self.batches_this_epoch = 0

        batches_tqdm = Tqdm.tqdm(self.training_batches,
                                 total=self.num_training_batches)

        # The loop variable is an attribute, so the batch currently being processed
        # stays reachable as ``self.batch``.
        for self.batch in batches_tqdm:
            description = self.train_one_batch(self.batch)
            batches_tqdm.set_description(description, refresh=False)

        # VALIDATE is fired before EPOCH_END.
        self.handler.fire_event(Events.VALIDATE)
        self.handler.fire_event(Events.EPOCH_END)

    @handle_errors
    def train(self) -> Dict[str, Any]:
        """
        Runs the full training loop and returns ``self.metrics``.

        Fires off the events TRAINING_START and TRAINING_END. In between, every epoch
        from the current ``self.epoch_number`` up to ``self.num_epochs`` generates its
        batches and calls ``self.train_one_epoch()``. The loop ends early as soon as
        ``self.should_stop_early`` is set.
        """
        logger.info("Beginning training.")
        self.handler.fire_event(Events.TRAINING_START)

        self.training_start_time = time.time()
        starting_epoch = self.epoch_number

        for self.epoch_number in range(starting_epoch, self.num_epochs):
            epoch_began = time.time()

            self.generate_training_batches()
            self.train_one_epoch()

            epoch_duration = datetime.timedelta(
                seconds=time.time() - epoch_began)
            logger.info("Epoch duration: %s", epoch_duration)

            # No estimate is logged after the final scheduled epoch.
            final_epoch = self.epoch_number >= self.num_epochs - 1
            if not final_epoch:
                elapsed = time.time() - self.training_start_time
                # Extrapolate from the average time per epoch completed in this run.
                epochs_planned = self.num_epochs - starting_epoch
                epochs_finished = self.epoch_number - starting_epoch + 1
                remaining = elapsed * (
                    epochs_planned / float(epochs_finished) - 1)
                logger.info("Estimated training time remaining: %s",
                            str(datetime.timedelta(seconds=int(remaining))))

            if self.should_stop_early:
                logger.info("Ran out of patience.  Stopping training.")
                break

        self.handler.fire_event(Events.TRAINING_END)

        return self.metrics

    # Requires custom from_params.
    @classmethod
    def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        cache_directory: str = None,
        cache_prefix: str = None,
    ) -> "CallbackTrainer":
        """
        Builds a trainer from a configuration.

        The model, datasets and iterators come from ``TrainerPieces.from_params``; the
        optimizer and the callbacks are then constructed from the remaining parameters,
        which must all be consumed by the time the trainer is instantiated.
        """
        pieces = TrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
        model = pieces.model
        params = pieces.params
        # Without a dedicated validation iterator the training iterator is reused.
        validation_iterator = pieces.validation_iterator or pieces.iterator

        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        trainable = [[name, param]
                     for name, param in model.named_parameters()
                     if param.requires_grad]
        optimizer = Optimizer.from_params(trainable, params.pop("optimizer"))

        callbacks_params = params.pop("callbacks", [])
        # Every callback is offered the same extras next to its own parameters.
        shared_extras = dict(
            model=model,
            optimizer=optimizer,
            instances=pieces.train_dataset,
            iterator=pieces.iterator,
            shuffle=shuffle,
            validation_data=pieces.validation_dataset,
            validation_iterator=validation_iterator,
            serialization_dir=serialization_dir,
        )
        callbacks: List[Callback] = []
        for callback_params in callbacks_params:
            callbacks.append(
                Callback.from_params(params=callback_params, **shared_extras))

        distributed = params.pop_bool("distributed", False)
        world_size = params.pop_int("world_size", 1)

        # The device index is passed as the rank when running distributed.
        rank = cuda_device if distributed else 0

        params.assert_empty(cls.__name__)
        return cls(model,
                   pieces.train_dataset,
                   pieces.iterator,
                   optimizer,
                   num_epochs=num_epochs,
                   shuffle=shuffle,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   callbacks=callbacks,
                   distributed=distributed,
                   rank=rank,
                   world_size=world_size)
Exemplo n.º 5
0
    def __init__(self,
                 model: Model,
                 optimizer: torch.optim.Optimizer,
                 num_epochs: int = 20,
                 serialization_dir: Optional[str] = None,
                 model_save_interval: Optional[float] = None,
                 cuda_device: Union[int, List] = -1,
                 callbacks: Optional[List[Callback]] = None) -> None:
        """
        A trainer for doing supervised learning, driven by callbacks. The constructor
        only stores the model and optimizer, initializes the bookkeeping state that
        callbacks may read or update, and creates the ``CallbackHandler``.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use `Trainer.from_params` this will be
            handled for you.)
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        num_epochs : int, optional (default = 20)
            Number of training epochs.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        model_save_interval : ``float``, optional (default=None)
            If provided, then serialize models every ``model_save_interval``
            seconds within single epochs.  In all cases, models are also saved
            at the end of every epoch if ``serialization_dir`` is provided.
        cuda_device : ``Union[int, List[int]]``, optional (default = -1)
            An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
        callbacks : ``List[Callback]``, optional (default=None)
            The callbacks handed, together with this trainer, to the ``CallbackHandler``
            created at the end of construction.
        """
        super().__init__(serialization_dir, cuda_device)

        # This is all state that the callbacks might want:
        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model
        self.optimizer = optimizer
        self.validate = False

        # For capturing mid / end-of-epoch metrics
        self.train_metrics: Dict[str, float] = {}
        self.val_metrics: Dict[str, float] = {}
        self.latest_val_metric = 0.0
        self.train_loss = 0.0

        # For capturing overall metrics
        self.metrics: Dict[str, Any] = {}

        # Batch counters and the batch group currently being processed.
        self.batch_num_total = 0
        self.batch_group: List[TensorDict] = []
        self.batches_this_epoch = 0

        # Starts out empty; nothing in the constructor fills it in.
        self.training_batches: Iterable[List[TensorDict]] = ()
        self.num_training_batches = 0

        self.should_stop_early = False
        self.num_epochs = num_epochs

        self.training_start_time = 0.0
        # Typed as int or str; initialized to the integer 0.
        self.checkpoint_epoch: Union[int, str] = 0
        self.model_save_interval = model_save_interval

        self.last_log = 0.0
        self.epoch_number = 0
        self.batch_grad_norm: Optional[float] = None
        # Created last, after every attribute above is already set on the trainer
        # it receives.
        self.handler = CallbackHandler(callbacks, self)
Exemplo n.º 6
0
class CallbackTrainer(TrainerBase):
    def __init__(self,
                 model: Model,
                 optimizer: torch.optim.Optimizer,
                 num_epochs: int = 20,
                 serialization_dir: Optional[str] = None,
                 model_save_interval: Optional[float] = None,
                 cuda_device: Union[int, List] = -1,
                 callbacks: Optional[List[Callback]] = None) -> None:
        """
        A trainer for doing supervised learning, driven by callbacks. The constructor
        only stores the model and optimizer, initializes the bookkeeping state that
        callbacks may read or update, and creates the ``CallbackHandler`` through which
        ``train`` fires its events.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use `Trainer.from_params` this will be
            handled for you.)
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        num_epochs : int, optional (default = 20)
            Number of training epochs.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        model_save_interval : ``float``, optional (default=None)
            If provided, ``train`` fires a SAVE_CHECKPOINT event within an epoch whenever
            more than ``model_save_interval`` seconds have passed since the last such
            save.  In all cases, SAVE_CHECKPOINT is also fired at the end of every epoch.
        cuda_device : ``Union[int, List[int]]``, optional (default = -1)
            An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
        callbacks : ``List[Callback]``, optional (default=None)
            The callbacks handed, together with this trainer, to the ``CallbackHandler``
            created at the end of construction.
        """
        super().__init__(serialization_dir, cuda_device)

        # This is all state that the callbacks might want:
        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model
        self.optimizer = optimizer
        self.validate = False

        # For capturing mid / end-of-epoch metrics
        self.train_metrics: Dict[str, float] = {}
        self.val_metrics: Dict[str, float] = {}
        self.latest_val_metric = 0.0
        self.train_loss = 0.0

        # For capturing overall metrics
        self.metrics: Dict[str, Any] = {}

        # Batch counters and the batch group currently being processed.
        self.batch_num_total = 0
        self.batch_group: List[TensorDict] = []
        self.batches_this_epoch = 0

        # Starts out empty; nothing in the constructor fills it in.
        self.training_batches: Iterable[List[TensorDict]] = ()
        self.num_training_batches = 0

        self.should_stop_early = False
        self.num_epochs = num_epochs

        self.training_start_time = 0.0
        # An int for end-of-epoch checkpoints, a string for mid-epoch ones.
        self.checkpoint_epoch: Union[int, str] = 0
        self.model_save_interval = model_save_interval

        self.last_log = 0.0
        self.epoch_number = 0
        self.batch_grad_norm: Optional[float] = None
        # Created last, after every attribute above is already set on the trainer
        # it receives.
        self.handler = CallbackHandler(callbacks, self)

    def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> Optional[torch.Tensor]:
        """
        Does a forward pass on the given batches and returns the ``loss`` value in the result.
        If ``for_training`` is `True` also applies regularization penalty.

        This is a method on the trainer so that it can be used both in training and validation
        (which are handled separately).

        Parameters
        ----------
        batch_group : ``List[TensorDict]``
            The batches to run. With multiple GPUs the whole group is handed to
            ``training_util.data_parallel``; otherwise it must contain exactly one batch.
        for_training : ``bool``
            Whether the loss is going to be optimized. When ``True`` the regularization
            penalty is added and a missing loss is an error.

        Returns
        -------
        The loss from the model output, or ``None`` when the output has no ``"loss"`` key
        and ``for_training`` is ``False``.

        Raises
        ------
        RuntimeError
            If ``for_training`` is ``True`` and the model output has no ``"loss"`` key.
        """
        if self._multiple_gpu:
            output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices)
        else:
            # On a single device the group is expected to hold exactly one batch.
            assert len(batch_group) == 1
            batch = batch_group[0]
            batch = nn_util.move_to_device(batch, self._cuda_devices[0])
            output_dict = self.model(**batch)

        # Only the key lookup is guarded: a ``KeyError`` raised while computing the
        # regularization penalty must surface as-is instead of being misreported as
        # a missing 'loss' key.
        try:
            loss = output_dict["loss"]
        except KeyError:
            if for_training:
                raise RuntimeError("The model you are trying to optimize does not contain a"
                                   " 'loss' key in the output of model.forward(inputs).")
            return None

        if for_training:
            loss += self.model.get_regularization_penalty()

        return loss

    def train(self) -> Dict[str, Any]:
        """
        Trains the supplied model with the supplied parameters.

        This method is only the event loop: it fires RESTORE_CHECKPOINT and
        TRAINING_START, then for every epoch fires EPOCH_START, per batch group
        BATCH_START / FORWARD / BACKWARD / (optionally SAVE_CHECKPOINT) / BATCH_END,
        then VALIDATE, EPOCH_END and SAVE_CHECKPOINT, and finally TRAINING_END.
        It does not call the model, the optimizer or ``batch_loss`` itself.
        NOTE(review): the actual forward/backward passes, metric updates and checkpoint
        writing are presumably done by the registered callbacks -- confirm.

        Returns
        -------
        ``self.metrics`` as it stands after TRAINING_END has been fired.
        """
        # Fired before reading ``self.epoch_number`` so that a handler can change the
        # epoch training starts from.
        self.handler.fire_event(Events.RESTORE_CHECKPOINT)
        starting_epoch = self.epoch_number

        logger.info("Beginning training.")
        self.handler.fire_event(Events.TRAINING_START)

        self.training_start_time = time.time()

        # The loop variable is an attribute, so the current epoch is visible on the trainer.
        for self.epoch_number in range(starting_epoch, self.num_epochs):
            epoch_start_time = time.time()
            # Start-of-epoch bookkeeping.
            self.handler.fire_event(Events.EPOCH_START)

            self.train_loss = 0.0
            # Set the model to "train" mode.
            self.model.train()

            self.last_log = time.time()
            last_save_time = time.time()

            logger.info("Training")
            self.batches_this_epoch = 0

            # ``self.training_batches`` is not populated anywhere in this method; the loop
            # iterates whatever is stored there at this point.
            batch_groups_tqdm = Tqdm.tqdm(self.training_batches, total=self.num_training_batches)

            for self.batch_group in batch_groups_tqdm:
                self.handler.fire_event(Events.BATCH_START)

                self.batches_this_epoch += 1
                self.batch_num_total += 1

                # Fired back to back; no loss is computed in this method.
                self.handler.fire_event(Events.FORWARD)
                self.handler.fire_event(Events.BACKWARD)

                description = training_util.description_from_metrics(self.train_metrics)

                batch_groups_tqdm.set_description(description, refresh=False)

                # Save model if needed.
                if self.model_save_interval is not None and (
                        time.time() - last_save_time > self.model_save_interval
                ):
                    last_save_time = time.time()
                    # Mid-epoch checkpoints are labelled "<epoch>.<formatted save time>".
                    self.checkpoint_epoch = f"{self.epoch_number}.{training_util.time_to_str(int(last_save_time))}"
                    self.handler.fire_event(Events.SAVE_CHECKPOINT)

                self.handler.fire_event(Events.BATCH_END)

            self.handler.fire_event(Events.VALIDATE)

            epoch_elapsed_time = time.time() - epoch_start_time
            logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time))

            # No estimate is logged after the final scheduled epoch.
            if self.epoch_number < self.num_epochs - 1:
                training_elapsed_time = time.time() - self.training_start_time
                # Extrapolate from the average time per epoch completed in this run:
                # elapsed * (epochs planned / epochs finished - 1).
                estimated_time_remaining = training_elapsed_time * \
                    ((self.num_epochs - starting_epoch) / float(self.epoch_number - starting_epoch + 1) - 1)
                formatted_time = str(datetime.timedelta(seconds=int(estimated_time_remaining)))
                logger.info("Estimated training time remaining: %s", formatted_time)

            self.handler.fire_event(Events.EPOCH_END)

            # End-of-epoch checkpoints are labelled with the plain epoch number.
            self.checkpoint_epoch = self.epoch_number
            self.handler.fire_event(Events.SAVE_CHECKPOINT)

            # Checked only after the end-of-epoch checkpoint event has been fired.
            if self.should_stop_early:
                logger.info("Ran out of patience.  Stopping training.")
                break

        self.handler.fire_event(Events.TRAINING_END)

        return self.metrics

    # Requires custom from_params.
    @classmethod
    def from_params(cls,  # type: ignore
                    params: Params,
                    serialization_dir: str,
                    recover: bool = False) -> 'CallbackTrainer':
        """
        Builds a trainer from a configuration.

        The model, datasets and iterators come from ``TrainerPieces.from_params``; the
        optimizer and the callbacks are then constructed from the remaining parameters,
        which must all be consumed by the time the trainer is instantiated.
        """
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        model = pieces.model
        params = pieces.params
        # Without a dedicated validation iterator the training iterator is reused.
        validation_iterator = pieces.validation_iterator or pieces.iterator

        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))

        # With several devices the model itself lives on the first one.
        model_device = cuda_device[0] if isinstance(cuda_device, list) else cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        trainable = [[name, param]
                     for name, param in model.named_parameters()
                     if param.requires_grad]
        optimizer = Optimizer.from_params(trainable, params.pop("optimizer"))

        model_save_interval = params.pop_float("model_save_interval", None)

        callbacks_params = params.pop("callbacks", [])
        # Every callback is offered the same extras next to its own parameters.
        shared_extras = dict(model=model,
                             optimizer=optimizer,
                             instances=pieces.train_dataset,
                             iterator=pieces.iterator,
                             shuffle=shuffle,
                             validation_data=pieces.validation_dataset,
                             validation_iterator=validation_iterator,
                             serialization_dir=serialization_dir)
        callbacks: List[Callback] = []
        for callback_params in callbacks_params:
            callbacks.append(Callback.from_params(params=callback_params, **shared_extras))

        params.assert_empty(cls.__name__)
        return cls(model,
                   optimizer,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   model_save_interval=model_save_interval,
                   callbacks=callbacks)