Example #1
 def test_configuration_error_when_passed_as_conflicting_argument_to_trainer(self):
     """
     Users should initialize Trainer either with an instance of Checkpointer or by specifying
     parameter values for num_serialized_models_to_keep and keep_serialized_model_every_num_seconds.
     Check that Trainer raises a ConfigurationError if both methods are used at the same time.
     """
     with self.assertRaises(ConfigurationError):
         Trainer(None, None, None, None,
                 num_serialized_models_to_keep=30,
                 keep_serialized_model_every_num_seconds=None,
                 checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                           num_serialized_models_to_keep=40,
                                           keep_serialized_model_every_num_seconds=2))
     with self.assertRaises(ConfigurationError):
         Trainer(None, None, None, None,
                 num_serialized_models_to_keep=20,
                 keep_serialized_model_every_num_seconds=2,
                 checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                           num_serialized_models_to_keep=40,
                                           keep_serialized_model_every_num_seconds=2))
     try:
         Trainer(None, None, None, None,
                 checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                           num_serialized_models_to_keep=40,
                                           keep_serialized_model_every_num_seconds=2))
     except ConfigurationError:
         self.fail("Configuration Error raised for passed checkpointer")
Example #2
    def default_callbacks(
        self,
        validation_metric: str = "-loss",
        patience: int = None,
        max_checkpoints: int = 20,
        checkpoint_every: int = None,
        model_save_interval: float = None,
        serialization_dir: str = "__DEFAULT__",
        validation_data: Iterable[Instance] = None,
        validation_iterator: DataIterator = None,
        batch_size: int = 2,
    ):
        if serialization_dir == "__DEFAULT__":
            serialization_dir = self.TEST_DIR
        checkpointer = Checkpointer(serialization_dir, checkpoint_every,
                                    max_checkpoints)
        tensorboard = TensorboardWriter(get_batch_num_total=lambda: None)

        if validation_iterator is None:
            validation_iterator = BasicIterator(batch_size=batch_size)
            validation_iterator.index_with(self.vocab)

        return [
            LogToTensorboard(log_batch_size_period=10,
                             tensorboard=tensorboard),
            Checkpoint(checkpointer, model_save_interval),
            Validate(
                validation_data=self.instances
                if validation_data is None else validation_data,
                validation_iterator=validation_iterator,
            ),
            TrackMetrics(patience, validation_metric),
            GradientNormAndClip(),
        ]
Example #3
 def test_with_time(self):
     """
     Tests that keep_serialized_model_every_num_seconds parameter causes a checkpoint to be saved
     after enough time has elapsed between epochs.
     """
     num_to_keep = 10
     num_epochs = 30
     target = list(range(num_epochs - num_to_keep, num_epochs))
     pauses = [5, 18, 26]
     target = sorted(set(target + pauses))
     checkpointer = Checkpointer(
         serialization_dir=self.TEST_DIR,
         num_serialized_models_to_keep=num_to_keep,
         keep_serialized_model_every_num_seconds=1,
     )
     for e in range(num_epochs):
         if e in pauses:
             time.sleep(2)
         checkpointer.save_checkpoint(
             epoch=e,
             model_state={"epoch": e},
             training_states={"epoch": e},
             is_best_so_far=False,
         )
     models, training = self.retrieve_and_delete_saved()
     assert models == training == target
Example #4
    def default_callbacks(self,
                          validation_metric: str = "-loss",
                          patience: int = None,
                          max_checkpoints: int = 20,
                          checkpoint_every: int = None,
                          serialization_dir: str = "__DEFAULT__",
                          iterator: DataIterator = None,
                          validation_data: Iterable[Instance] = None,
                          validation_iterator: DataIterator = None,
                          batch_size: int = 2):
        if serialization_dir == "__DEFAULT__":
            serialization_dir = self.TEST_DIR
        checkpointer = Checkpointer(serialization_dir,
                                    checkpoint_every,
                                    max_checkpoints)
        tensorboard = TensorboardWriter(get_batch_num_total=lambda: None)

        if iterator is None:
            iterator = BasicIterator(batch_size=batch_size)
            iterator.index_with(self.vocab)

        return [
                LogToTensorboard(log_batch_size_period=10, tensorboard=tensorboard),
                Checkpoint(checkpointer),
                Validate(validation_data=self.instances if validation_data is None else validation_data,
                         validation_iterator=iterator if validation_iterator is None else validation_iterator),
                TrackMetrics(patience, validation_metric),
                TrainSupervised(),
                GenerateTrainingBatches(self.instances, iterator, True)
        ]
Example #5
    def train(self) -> Dict[str, Any]:
        self.model.vocab.save_to_files(os.path.join(self._serialization_dir, "vocabulary"))

        checkpointer = Checkpointer(self._serialization_dir)
        checkpointer.save_checkpoint(
            epoch=0, model_state=self.model.state_dict(), training_states={}, is_best_so_far=True
        )
        return {}
Example #6
 def train(self) -> Dict[str, Any]:
     assert self._serialization_dir is not None
     self.model.vocab.save_to_files(
         os.path.join(self._serialization_dir, "vocabulary"))
     checkpointer = Checkpointer(self._serialization_dir)
     checkpointer.save_checkpoint(epoch=0,
                                  trainer=self,
                                  is_best_so_far=True)
     return {}
Example #7
 def test_keep_zero(self):
     checkpointer = Checkpointer(serialization_dir=self.TEST_DIR,
                                 num_serialized_models_to_keep=0)
     for e in range(10):
         checkpointer.save_checkpoint(epoch=e,
                                      model_state={"epoch": e},
                                      training_states={"epoch": e},
                                      is_best_so_far=True)
     files = os.listdir(self.TEST_DIR)
     assert "model_state_epoch_1.th" not in files
     assert "training_state_epoch_1.th" not in files
Example #8
 def __init__(self,
              serialization_dir: str,
              task_infos,
              num_epochs,
              num_serialized_models_to_keep: int = 10) -> None:
     """
     task1 and task2 should be dictionaries that hold the model;
     the training, validation, and test iterators for batches; and
     the metrics, learning rate, current score, etc. for each of the tasks.
     """
     super().__init__(serialization_dir)
     self.task_infos = task_infos
     self.num_epochs = num_epochs
     self.serialization_dir = serialization_dir
     self.swag_checkpointer = Checkpointer(
         serialization_dir + "/swag/",
         num_serialized_models_to_keep=num_serialized_models_to_keep)
     self.conll_checkpointer = Checkpointer(
         serialization_dir + "/conll/",
         num_serialized_models_to_keep=num_serialized_models_to_keep)
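A hypothetical sketch of the task_infos structure the docstring above describes. The key names and placeholder values are assumptions, not an AllenNLP API; substitute real Model and DataIterator objects for the None entries.

task_infos = {
    "swag": {
        "model": None,                 # the task's Model
        "train_iterator": None,        # DataIterator over training batches
        "validation_iterator": None,   # DataIterator over validation batches
        "test_iterator": None,         # DataIterator over test batches
        "metrics": {},                 # running metric values for the task
        "learning_rate": 1e-3,
        "current_score": 0.0,
    },
    "conll": {
        "model": None,
        "train_iterator": None,
        "validation_iterator": None,
        "test_iterator": None,
        "metrics": {},
        "learning_rate": 1e-3,
        "current_score": 0.0,
    },
}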
Example #9
def single_worker(logger, model, reader, out_feature_key, optimizer, iterator,
                  train_dataset, validation_dataset):

    _cudart = U.get_cudart()
    device = torch.device("cuda" if FLAGS.use_cuda else "cpu")
    if _cudart is None:
        logger.warning(
            "No cudart, probably means you do not have cuda on this machine.")
    model = model.to(device)
    cuda_device = 0 if FLAGS.use_cuda else -1  # TODO: multi GPU
    # NOTE: this checkpointing mechanism only checkpoints at the end of every epoch;
    # if an epoch takes more than a day, you will have to handle that yourself :P
    ckpter = Checkpointer(serialization_dir=FLAGS.ckpt_dir,
                          num_serialized_models_to_keep=1)

    if FLAGS.profile_only:
        raw_train_generator = iterator(train_dataset,
                                       num_epochs=1,
                                       shuffle=False)
        train_generator = lazy_groups_of(raw_train_generator, 1)
        _prof_input = next(train_generator)[0]
        stats = counter.profile(model,
                                input_size=(FLAGS.batch_size, ),
                                logger=logger,
                                is_cnn=False,
                                rnn_input=_prof_input)
        logger.info("DNN_Features: %s", str(stats))
        sys.exit(0)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      serialization_dir=FLAGS.ckpt_dir,
                      validation_dataset=validation_dataset,
                      num_epochs=FLAGS.max_epochs,
                      checkpointer=ckpter,
                      log_batch_size_period=10,
                      cuda_device=cuda_device)

    start_time = time.time()
    try:
        status = None
        if _cudart:
            status = _cudart.cudaProfilerStart()
        trainer.train()
    finally:
        if status == 0:
            _cudart.cudaProfilerStop()
    final_time = time.time() - start_time
    logger.info("Finished training: ran for %d secs", final_time)
    final_output(FLAGS.flag_values_dict(), model, device, logger, reader,
                 out_feature_key, start_time)
Example #10
    def train(self) -> Dict[str, Any]:
        assert self._serialization_dir is not None
        self.model.vocab.save_to_files(
            os.path.join(self._serialization_dir, "vocabulary"))
        checkpointer = Checkpointer(self._serialization_dir)
        checkpointer.save_checkpoint(self)

        best_model_filename = os.path.join(self._serialization_dir, "best.th")
        torch.save(self.model.state_dict(), best_model_filename)
        self._best_model_filename = best_model_filename

        return {}
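A minimal follow-up sketch, not part of the example above: restoring the weights that train() wrote to best.th. It assumes model and serialization_dir refer to the same objects and path used during training.

import os
import torch

# Load the saved weights back into the model and switch to evaluation mode.
state_dict = torch.load(os.path.join(serialization_dir, "best.th"), map_location="cpu")
model.load_state_dict(state_dict)
model.eval()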
Example #11
 def __init__(self,
              serialization_dir: str,
              model: Model,
              data_loader: DataLoader,
              validation_data_loader: Optional[DataLoader] = None,
              checkpointer: Optional[Checkpointer] = None) -> None:
     super().__init__(serialization_dir)
     self.model = model
     self.data_loader = data_loader
     self.validation_data_loader = validation_data_loader
     self.checkpointer = checkpointer or Checkpointer(
         self._serialization_dir)
Example #12
    def __init__(
        self,
        model: Model,
        optimizer: torch.optim.Optimizer,
        train_dataset: Iterable[Instance],
        validation_dataset: Optional[Iterable[Instance]] = None,
        batch_size: int = 1,
        validation_metric: str = "-loss",
        shuffle: bool = True,
        num_epochs: int = 20,
        serialization_dir: Optional[str] = None,
        num_serialized_models_to_keep: int = 20,
        checkpointer: Checkpointer = None,
        cuda_device: Union[int, List] = -1,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[LearningRateScheduler] = None
    ) -> None:
        super().__init__(serialization_dir, cuda_device)

        self.local_rank = dist.get_rank()
        self.local_device = torch.device("cuda", self.local_rank)
        self.model = DDP(model,
                         device_ids=[self.local_rank],
                         output_device=self.local_rank)

        self.batch_size = batch_size

        self.shuffle = shuffle
        self.optimizer = optimizer
        self.train_data = train_dataset
        self._validation_data = validation_dataset

        self._metric_tracker = MetricTracker(metric_name=validation_metric)
        self._validation_metric = validation_metric[1:]

        self._num_epochs = num_epochs

        if checkpointer is not None:
            self._checkpointer = checkpointer
        else:
            self._checkpointer = Checkpointer(serialization_dir, None,
                                              num_serialized_models_to_keep)

        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler

        self._batch_num_total = 0

        self._last_log = 0.0  # time of last logging
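Because the constructor above calls dist.get_rank(), the default process group must already be initialized before this trainer is built. A minimal sketch, assuming the usual environment-variable rendezvous (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE set by the launcher):

import torch.distributed as dist

# Initialize the default process group before constructing the distributed trainer.
dist.init_process_group(backend="nccl", init_method="env://")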
Example #13
    def from_params(cls, params: Params,
                    serialization_dir: str) -> 'Checkpoint':  # type: ignore
        # pylint: disable=arguments-differ
        checkpointer_params = params.pop("checkpointer", None)
        if checkpointer_params:
            checkpointer = Checkpointer.from_params(
                checkpointer_params, serialization_dir=serialization_dir)
        else:
            checkpointer = Checkpointer(serialization_dir=serialization_dir)

        state_dict_attrs = params.pop("state_dict_attrs", None)
        other_attrs = params.pop("other_attrs", None)

        return Checkpoint(checkpointer, state_dict_attrs, other_attrs)
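Two hypothetical parameter blocks this from_params would accept, shown only to illustrate the two branches above (explicit checkpointer vs. the default); the key values are assumptions.

with_checkpointer = Params({
    "checkpointer": {"num_serialized_models_to_keep": 1},
    "state_dict_attrs": ["optimizer"],
})
without_checkpointer = Params({})

# Either block yields a Checkpoint callback wired to the given serialization_dir.
callback = Checkpoint.from_params(with_checkpointer, serialization_dir="/tmp/ckpt")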
Example #14
    def from_params(cls, params: Params,
                    serialization_dir: str) -> "Checkpoint":  # type: ignore

        checkpointer_params = params.pop("checkpointer", None)
        if checkpointer_params:
            checkpointer = Checkpointer.from_params(
                checkpointer_params, serialization_dir=serialization_dir)
        else:
            checkpointer = Checkpointer(serialization_dir=serialization_dir)

        state_dict_attrs = params.pop("state_dict_attrs", None)
        other_attrs = params.pop("other_attrs", None)

        return Checkpoint(checkpointer, state_dict_attrs, other_attrs)
Example #15
    def test_default(self):
        """
        Tests that the default behavior keeps just the last 20 checkpoints.
        """
        default_num_to_keep = 20
        num_epochs = 30
        target = list(range(num_epochs - default_num_to_keep, num_epochs))

        checkpointer = Checkpointer(serialization_dir=self.TEST_DIR)

        for e in range(num_epochs):
            checkpointer.save_checkpoint(epoch=e,
                                         model_state={"epoch": e},
                                         training_states={"epoch": e},
                                         is_best_so_far=False)
        models, training = self.retrieve_and_delete_saved()
        assert models == training == target
Example #16
    def from_params(  # type: ignore
            cls, params: Params, serialization_dir: str,
            **extras) -> "Checkpoint":

        checkpointer_params = params.pop("checkpointer", None)

        if checkpointer_params:
            checkpointer = Checkpointer.from_params(
                checkpointer_params, serialization_dir=serialization_dir)
        else:
            checkpointer = Checkpointer(serialization_dir=serialization_dir)
        model_save_interval = params.pop("model_save_interval", None)
        state_dict_attrs = params.pop("state_dict_attrs", None)
        other_attrs = params.pop("other_attrs", None)

        return Checkpoint(checkpointer, model_save_interval, state_dict_attrs,
                          other_attrs)
Example #17
def build_trainer(model: Model,
                  serialization_dir: str,
                  train_loader: DataLoader,
                  dev_loader: DataLoader,
                  num_epochs: int,
                  learning_rate: float = 0.001,
                  cuda_device=None) -> Trainer:
    """
    Builds instance of Trainer class with specified training hyperparameters
    Adapted from https://guide.allennlp.org/training-and-prediction

    Parameters
        model : Model
            The model to train
        serialization_dir : str
            Directory to save checkpoints and results
        train_loader : DataLoader
            Previously built dataset loader for training data
        dev_loader : DataLoader
            Previously built loader for dev data
        num_epochs : int
            Number of epochs to train for
        learning_rate : float (default: 0.001)
        cuda_device : int (default: None)
            >=0 if using GPU

    Returns
        trainer : Trainer
    """
    parameters = [(n, p) for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=learning_rate)  # type: ignore
    trainer = GradientDescentTrainer(model=model,
                                     checkpointer=Checkpointer(
                                         serialization_dir,
                                         num_serialized_models_to_keep=-1),
                                     serialization_dir=serialization_dir,
                                     data_loader=train_loader,
                                     validation_data_loader=dev_loader,
                                     num_epochs=num_epochs,
                                     optimizer=optimizer,
                                     cuda_device=cuda_device)
    print("Will train for", num_epochs, "epochs")
    return trainer
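A hedged usage sketch for build_trainer: model, train_loader, and dev_loader are assumed to have been built earlier (for example, following the AllenNLP guide referenced in the docstring), and the serialization path is made up for illustration.

import torch

trainer = build_trainer(
    model=model,
    serialization_dir="runs/experiment-1",
    train_loader=train_loader,
    dev_loader=dev_loader,
    num_epochs=5,
    learning_rate=3e-4,
    cuda_device=0 if torch.cuda.is_available() else -1,
)
metrics = trainer.train()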
Example #18
 def __init__(self,
              model: Model,
              serialization_dir: str,
              iterator: DataIterator,
              mingler: DatasetMingler,
              optimizer: torch.optim.Optimizer,
              datasets: Dict[str, Iterable[Instance]],
              num_epochs: int = 10,
              num_serialized_models_to_keep: int = 10) -> None:
     super().__init__(serialization_dir)
     self.model = model
     self.iterator = iterator
     self.mingler = mingler
     self.optimizer = optimizer
     self.datasets = datasets
     self.num_epochs = num_epochs
     self.checkpointer = Checkpointer(
         serialization_dir,
         num_serialized_models_to_keep=num_serialized_models_to_keep)
Example #19
    def from_params(cls,  # type: ignore
                    model: Model,
                    serialization_dir: str,
                    iterator: DataIterator,
                    train_data: Iterable[Instance],
                    validation_data: Optional[Iterable[Instance]],
                    params: Params,
                    validation_iterator: DataIterator = None) -> 'Trainer':
        # pylint: disable=arguments-differ
        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
        else:
            model_device = cuda_device
        if model_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(model_device)

        parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(params.pop("moving_average"), parameters=parameters)
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if 'checkpointer' in params:
            if 'keep_serialized_model_every_num_seconds' in params or \
                    'num_serialized_models_to_keep' in params:
                raise ConfigurationError(
                        "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                        "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                        " but the passed config uses both methods.")
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                    "keep_serialized_model_every_num_seconds", None)
            checkpointer = Checkpointer(
                    serialization_dir=serialization_dir,
                    num_serialized_models_to_keep=num_serialized_models_to_keep,
                    keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        params.assert_empty(cls.__name__)
        return cls(model, optimizer, iterator,
                   train_data, validation_data,
                   patience=patience,
                   validation_metric=validation_metric,
                   validation_iterator=validation_iterator,
                   shuffle=shuffle,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   grad_norm=grad_norm,
                   grad_clipping=grad_clipping,
                   learning_rate_scheduler=lr_scheduler,
                   momentum_scheduler=momentum_scheduler,
                   checkpointer=checkpointer,
                   model_save_interval=model_save_interval,
                   summary_interval=summary_interval,
                   histogram_interval=histogram_interval,
                   should_log_parameter_statistics=should_log_parameter_statistics,
                   should_log_learning_rate=should_log_learning_rate,
                   log_batch_size_period=log_batch_size_period,
                   moving_average=moving_average)
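For illustration, two hypothetical "trainer" config blocks that exercise the checkpointer branch above. The first configures the checkpointer explicitly, the second uses the legacy keys; putting both kinds of keys in the same block would trigger the ConfigurationError raised in from_params.

trainer_with_checkpointer = {
    "optimizer": "adam",
    "num_epochs": 10,
    "checkpointer": {"num_serialized_models_to_keep": 2},
}
trainer_with_legacy_keys = {
    "optimizer": "adam",
    "num_epochs": 10,
    "num_serialized_models_to_keep": 2,
    "keep_serialized_model_every_num_seconds": 3600,
}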
Example #20
    def __init__(self,
                 model: Model,
                 optimizer: torch.optim.Optimizer,
                 iterator: DataIterator,
                 train_dataset: Iterable[Instance],
                 validation_dataset: Optional[Iterable[Instance]] = None,
                 patience: Optional[int] = None,
                 validation_metric: str = "-loss",
                 validation_iterator: DataIterator = None,
                 shuffle: bool = True,
                 num_epochs: int = 20,
                 serialization_dir: Optional[str] = None,
                 num_serialized_models_to_keep: int = 20,
                 keep_serialized_model_every_num_seconds: int = None,
                 checkpointer: Checkpointer = None,
                 model_save_interval: float = None,
                 cuda_device: Union[int, List] = -1,
                 grad_norm: Optional[float] = None,
                 grad_clipping: Optional[float] = None,
                 learning_rate_scheduler: Optional[LearningRateScheduler] = None,
                 momentum_scheduler: Optional[MomentumScheduler] = None,
                 summary_interval: int = 100,
                 histogram_interval: int = None,
                 should_log_parameter_statistics: bool = True,
                 should_log_learning_rate: bool = False,
                 log_batch_size_period: Optional[int] = None,
                 moving_average: Optional[MovingAverage] = None,
                 callbacks: List[allennlp_callback.Callback] = None,
                 early_stopping_by_batch: bool = True,
                 estimator: Estimator = None,
                 ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the weights
        for your model over some fixed number of epochs. You can also pass in a validation
        dataset and enable early stopping. There are many other bells and whistles as well.

        Parameters
        ----------
        model : ``Model``, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their ``forward`` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use `Trainer.from_params` this will be
            handled for you.)
        optimizer : ``torch.optim.Optimizer``, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        iterator : ``DataIterator``, required.
            A method for iterating over a ``Dataset``, yielding padded indexed batches.
        train_dataset : ``Dataset``, required.
            A ``Dataset`` to train on. The dataset should have already been indexed.
        validation_dataset : ``Dataset``, optional, (default = None).
            A ``Dataset`` to evaluate on. The dataset should have already been indexed.
        patience : Optional[int] > 0, optional (default=None)
            Number of epochs to be patient before early stopping: the training is stopped
            after ``patience`` epochs with no improvement. If given, it must be ``> 0``.
            If None, early stopping is disabled.
        validation_metric : str, optional (default="-loss")
            Validation metric to measure for whether to stop training using patience
            and whether to serialize an ``is_best`` model each epoch. The metric name
            must be prepended with either "+" or "-", which specifies whether the metric
            is an increasing or decreasing function.
        validation_iterator : ``DataIterator``, optional (default=None)
            An iterator to use for the validation set.  If ``None``, then
            use the training `iterator`.
        shuffle: ``bool``, optional (default=True)
            Whether to shuffle the instances in the iterator or not.
        num_epochs : int, optional (default = 20)
            Number of training epochs.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        num_serialized_models_to_keep : ``int``, optional (default=20)
            Number of previous model checkpoints to retain.  Default is to keep 20 checkpoints.
            A value of None or -1 means all checkpoints will be kept.
        keep_serialized_model_every_num_seconds : ``int``, optional (default=None)
            If num_serialized_models_to_keep is not None, then occasionally it's useful to
            save models at a given interval in addition to the last num_serialized_models_to_keep.
            To do so, specify keep_serialized_model_every_num_seconds as the number of seconds
            between permanently saved checkpoints.  Note that this option is only used if
            num_serialized_models_to_keep is not None, otherwise all checkpoints are kept.
        checkpointer : ``Checkpointer``, optional (default=None)
            An instance of class Checkpointer to use instead of the default. If a checkpointer is specified,
            the arguments num_serialized_models_to_keep and keep_serialized_model_every_num_seconds should
            not be specified. The caller is responsible for initializing the checkpointer so that it is
            consistent with serialization_dir.
        model_save_interval : ``float``, optional (default=None)
            If provided, then serialize models every ``model_save_interval``
            seconds within single epochs.  In all cases, models are also saved
            at the end of every epoch if ``serialization_dir`` is provided.
        cuda_device : ``Union[int, List[int]]``, optional (default = -1)
            An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used.
        grad_norm : ``float``, optional, (default = None).
            If provided, gradient norms will be rescaled to have a maximum of this value.
        grad_clipping : ``float``, optional (default = ``None``).
            If provided, gradients will be clipped `during the backward pass` to have an (absolute)
            maximum of this value.  If you are getting ``NaNs`` in your gradients during training
            that are not solved by using ``grad_norm``, you may need this.
        learning_rate_scheduler : ``LearningRateScheduler``, optional (default = None)
            If specified, the learning rate will be decayed with respect to
            this schedule at the end of each epoch (or batch, if the scheduler implements
            the ``step_batch`` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`,
            this will use the ``validation_metric`` provided to determine if learning has plateaued.
            To support updating the learning rate on every batch, this can optionally implement
            ``step_batch(batch_num_total)`` which updates the learning rate given the batch number.
        momentum_scheduler : ``MomentumScheduler``, optional (default = None)
            If specified, the momentum will be updated at the end of each batch or epoch
            according to the schedule.
        summary_interval: ``int``, optional, (default = 100)
            Number of batches between logging scalars to tensorboard
        histogram_interval : ``int``, optional, (default = ``None``)
            If not None, then log histograms to tensorboard every ``histogram_interval`` batches.
            When this parameter is specified, the following additional logging is enabled:
                * Histograms of model parameters
                * The ratio of parameter update norm to parameter norm
                * Histogram of layer activations
            We log histograms of the parameters returned by
            ``model.get_parameters_for_histogram_tensorboard_logging``.
            The layer activations are logged for any modules in the ``Model`` that have
            the attribute ``should_log_activations`` set to ``True``.  Logging
            histograms requires a number of GPU-CPU copies during training and is typically
            slow, so we recommend logging histograms relatively infrequently.
            Note: only Modules that return tensors, tuples of tensors or dicts
            with tensors as values currently support activation logging.
        should_log_parameter_statistics : ``bool``, optional, (default = True)
            Whether to send parameter statistics (mean and standard deviation
            of parameters and gradients) to tensorboard.
        should_log_learning_rate : ``bool``, optional, (default = False)
            Whether to send parameter specific learning rate to tensorboard.
        log_batch_size_period : ``int``, optional, (default = ``None``)
            If defined, how often to log the average batch size.
        moving_average: ``MovingAverage``, optional, (default = None)
            If provided, we will maintain moving averages for all parameters. During training, we
            employ a shadow variable for each parameter, which maintains the moving average. During
            evaluation, we backup the original parameters and assign the moving averages to corresponding
            parameters. Be careful that when saving the checkpoint, we will save the moving averages of
            parameters. This is necessary because we want the saved model to perform as well as the validated
            model if we load it later. But this may cause problems if you restart the training from checkpoint.
        """
        super().__init__(serialization_dir, cuda_device)

        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model

        self.iterator = iterator
        self._validation_iterator = validation_iterator
        self.shuffle = shuffle
        self.optimizer = optimizer
        self.train_data = train_dataset
        self._validation_data = validation_dataset

        if patience is None:  # no early stopping
            if validation_dataset:
                logger.warning('You provided a validation dataset but patience was set to None, '
                               'meaning that early stopping is disabled')
        elif (not isinstance(patience, int)) or patience <= 0:
            raise ConfigurationError('{} is an invalid value for "patience": it must be a positive integer '
                                     'or None (if you want to disable early stopping)'.format(patience))

        # For tracking is_best_so_far and should_stop_early
        self._metric_tracker = MetricTracker(patience, validation_metric)
        # Get rid of + or -
        self._validation_metric = validation_metric[1:]

        self._num_epochs = num_epochs

        if checkpointer is not None:
            # We can't easily check if these parameters were passed in, so check against their default values.
            # We don't check against serialization_dir since it is also used by the parent class.
            if num_serialized_models_to_keep != 20 or \
                    keep_serialized_model_every_num_seconds is not None:
                raise ConfigurationError(
                        "When passing a custom Checkpointer, you may not also pass in separate checkpointer "
                        "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'.")
            self._checkpointer = checkpointer
        else:
            self._checkpointer = Checkpointer(serialization_dir,
                                              keep_serialized_model_every_num_seconds,
                                              num_serialized_models_to_keep)

        self._model_save_interval = model_save_interval

        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._moving_average = moving_average

        # We keep the total batch number as an instance variable because it
        # is used inside a closure for the hook which logs activations in
        # ``_enable_activation_logging``.
        self._batch_num_total = 0

        self._tensorboard = TensorboardWriter(
                get_batch_num_total=lambda: self._batch_num_total,
                serialization_dir=serialization_dir,
                summary_interval=summary_interval,
                histogram_interval=histogram_interval,
                should_log_parameter_statistics=should_log_parameter_statistics,
                should_log_learning_rate=should_log_learning_rate)

        self._log_batch_size_period = log_batch_size_period

        self._last_log = 0.0  # time of last logging

        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self.model)
        self.callbacks = callbacks

        self._early_stopping_by_batch = early_stopping_by_batch

        self._estimator = estimator
Example #21
    def from_partial_objects(
        cls,
        model: Model,
        serialization_dir: str,
        data_loader: DataLoader,
        validation_data_loader: DataLoader = None,
        local_rank: int = 0,
        patience: int = None,
        validation_metric: str = "-loss",
        num_epochs: int = 20,
        cuda_device: int = -1,
        grad_norm: float = None,
        grad_clipping: float = None,
        distributed: bool = None,
        world_size: int = 1,
        num_gradient_accumulation_steps: int = 1,
        opt_level: Optional[str] = None,
        no_grad: List[str] = None,
        optimizer: Lazy[Optimizer] = None,
        learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
        momentum_scheduler: Lazy[MomentumScheduler] = None,
        tensorboard_writer: Lazy[TensorboardWriter] = None,
        moving_average: Lazy[MovingAverage] = None,
        checkpointer: Lazy[Checkpointer] = None,
        batch_callbacks: List[BatchCallback] = None,
        epoch_callbacks: List[EpochCallback] = None,
    ) -> "Trainer":
        """
        This method exists so that we can have a documented method to construct this class using
        `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
        method.

        The reason we can't just use `__init__` with `FromParams` here is because there are
        sequential dependencies to this class's arguments.  Anything that has a `Lazy[]` type
        annotation needs something from one of the non-`Lazy` arguments.  The `Optimizer` needs to
        have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
        have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
        doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

        If you're not using `FromParams`, you can just construct these arguments in the right order
        yourself in your code and call the constructor directly.
        """

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        if no_grad:
            for name, parameter in model.named_parameters():
                if any(re.search(regex, name) for regex in no_grad):
                    parameter.requires_grad_(False)

        common_util.log_frozen_and_tunable_parameter_names(model)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer_ = optimizer.construct(model_parameters=parameters)
        if not optimizer_:
            optimizer_ = Optimizer.default(parameters)

        try:
            batches_per_epoch = len(data_loader)
        except TypeError:
            # If the dataset is lazy, it won't have a length.
            batches_per_epoch = None

        moving_average_ = moving_average.construct(parameters=parameters)
        learning_rate_scheduler_ = learning_rate_scheduler.construct(
            optimizer=optimizer_,
            num_epochs=num_epochs,
            num_steps_per_epoch=batches_per_epoch)
        momentum_scheduler_ = momentum_scheduler.construct(
            optimizer=optimizer_)

        checkpointer_ = checkpointer.construct() or Checkpointer(
            serialization_dir)
        tensorboard_writer_ = tensorboard_writer.construct(
        ) or TensorboardWriter(serialization_dir)

        return cls(
            model,
            optimizer_,
            data_loader,
            patience=patience,
            validation_metric=validation_metric,
            validation_data_loader=validation_data_loader,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=learning_rate_scheduler_,
            momentum_scheduler=momentum_scheduler_,
            tensorboard_writer=tensorboard_writer_,
            checkpointer=checkpointer_,
            moving_average=moving_average_,
            batch_callbacks=batch_callbacks,
            epoch_callbacks=epoch_callbacks,
            distributed=distributed,
            local_rank=local_rank,
            world_size=world_size,
            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
            opt_level=opt_level,
        )
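The Lazy[] pattern the docstring describes boils down to deferring construction until the missing arguments are available. A toy stand-in, purely illustrative and not AllenNLP's actual Lazy class:

from typing import Callable, Generic, TypeVar

T = TypeVar("T")

class LazySketch(Generic[T]):
    """Wraps a constructor and calls it only once the remaining kwargs are known."""

    def __init__(self, constructor: Callable[..., T]) -> None:
        self._constructor = constructor

    def construct(self, **kwargs) -> T:
        # e.g. a lazily wrapped optimizer is constructed only after the model's
        # parameters exist: lazy_optimizer.construct(model_parameters=parameters)
        return self._constructor(**kwargs)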
Example #22
    def __init__(
        self,
        model: Model,
        optimizer: torch.optim.Optimizer,
        data_loader: torch.utils.data.DataLoader,
        patience: Optional[int] = None,
        validation_metric: str = "-loss",
        validation_data_loader: torch.utils.data.DataLoader = None,
        num_epochs: int = 20,
        serialization_dir: Optional[str] = None,
        checkpointer: Checkpointer = None,
        cuda_device: int = -1,
        grad_norm: Optional[float] = None,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[LearningRateScheduler] = None,
        momentum_scheduler: Optional[MomentumScheduler] = None,
        tensorboard_writer: TensorboardWriter = None,
        moving_average: Optional[MovingAverage] = None,
        batch_callbacks: List[BatchCallback] = None,
        epoch_callbacks: List[EpochCallback] = None,
        distributed: bool = False,
        local_rank: int = 0,
        world_size: int = 1,
        num_gradient_accumulation_steps: int = 1,
        opt_level: Optional[str] = None,
    ) -> None:
        super().__init__(serialization_dir, cuda_device, distributed,
                         local_rank, world_size)

        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model

        self.data_loader = data_loader
        self._validation_data_loader = validation_data_loader
        self.optimizer = optimizer

        if patience is None:  # no early stopping
            if validation_data_loader:
                logger.warning(
                    "You provided a validation dataset but patience was set to None, "
                    "meaning that early stopping is disabled")
        elif (not isinstance(patience, int)) or patience <= 0:
            raise ConfigurationError(
                '{} is an invalid value for "patience": it must be a positive integer '
                "or None (if you want to disable early stopping)".format(
                    patience))

        # For tracking is_best_so_far and should_stop_early
        self._metric_tracker = MetricTracker(patience, validation_metric)
        # Get rid of + or -
        self._validation_metric = validation_metric[1:]

        self._num_epochs = num_epochs

        if checkpointer is not None:
            self._checkpointer = checkpointer
        else:
            self._checkpointer = Checkpointer(serialization_dir)

        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._moving_average = moving_average
        self._batch_callbacks = batch_callbacks or []
        self._epoch_callbacks = epoch_callbacks or []

        # We keep the total batch number as an instance variable because it
        # is used inside a closure for the hook which logs activations in
        # `_enable_activation_logging`.
        self._batch_num_total = 0

        self._tensorboard = tensorboard_writer or TensorboardWriter(
            serialization_dir)
        self._tensorboard.get_batch_num_total = lambda: self._batch_num_total
        self._tensorboard.enable_activation_logging(self.model)

        self._last_log = 0.0  # time of last logging

        self._num_gradient_accumulation_steps = num_gradient_accumulation_steps

        # Enable automatic mixed precision training with NVIDIA Apex.
        self._opt_level = opt_level
        if self._opt_level is not None:
            if amp is None:
                raise ConfigurationError((
                    "Apex not installed but opt_level was provided. Please install NVIDIA's Apex to enable"
                    " automatic mixed precision (AMP) training. See: https://github.com/NVIDIA/apex."
                ))

            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level=self._opt_level)

        # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
        # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
        # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
        #
        # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
        # normal case, reference to `Model` is retained. This reference is only used in
        # these places: `model.__call__`, `model.train` and `model.eval`.
        if self._distributed:
            self._pytorch_model = DistributedDataParallel(
                self.model,
                device_ids=[self.cuda_device],
                find_unused_parameters=True)
        else:
            self._pytorch_model = self.model
Example #23
    def __init__(
        self,
        model: Model,
        optimizer: torch.optim.Optimizer,
        iterator: DataIterator,
        train_dataset: Iterable[Instance],
        validation_dataset: Optional[Iterable[Instance]] = None,
        patience: Optional[int] = None,
        validation_metric: str = "-loss",
        validation_iterator: DataIterator = None,
        shuffle: bool = True,
        num_epochs: int = 20,
        serialization_dir: Optional[str] = None,
        num_serialized_models_to_keep: int = 20,
        keep_serialized_model_every_num_seconds: int = None,
        checkpointer: Checkpointer = None,
        model_save_interval: float = None,
        cuda_device: int = -1,
        grad_norm: Optional[float] = None,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[LearningRateScheduler] = None,
        momentum_scheduler: Optional[MomentumScheduler] = None,
        summary_interval: int = 100,
        histogram_interval: int = None,
        should_log_parameter_statistics: bool = True,
        should_log_learning_rate: bool = False,
        log_batch_size_period: Optional[int] = None,
        moving_average: Optional[MovingAverage] = None,
        distributed: bool = False,
        local_rank: int = 0,
        world_size: int = 1,
        num_gradient_accumulation_steps: int = 1,
        writer: WandBWriter = None,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
        and a `DataIterator`, and uses the supplied `Optimizer` to learn the weights
        for your model over some fixed number of epochs. You can also pass in a validation
        dataset and enable early stopping. There are many other bells and whistles as well.

        # Parameters

        model : `Model`, required.
            An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
            their `forward` method returns a dictionary with a "loss" key, containing a
            scalar tensor representing the loss function to be optimized.

            If you are training your model using GPUs, your model should already be
            on the correct device. (If you use `Trainer.from_params` this will be
            handled for you.)
        optimizer : `torch.optim.Optimizer`, required.
            An instance of a Pytorch Optimizer, instantiated with the parameters of the
            model to be optimized.
        iterator : `DataIterator`, required.
            A method for iterating over a `Dataset`, yielding padded indexed batches.
        train_dataset : `Dataset`, required.
            A `Dataset` to train on. The dataset should have already been indexed.
        validation_dataset : `Dataset`, optional, (default = None).
            A `Dataset` to evaluate on. The dataset should have already been indexed.
        patience : Optional[int] > 0, optional (default=None)
            Number of epochs to be patient before early stopping: the training is stopped
            after `patience` epochs with no improvement. If given, it must be `> 0`.
            If None, early stopping is disabled.
        validation_metric : str, optional (default="-loss")
            Validation metric to measure for whether to stop training using patience
            and whether to serialize an `is_best` model each epoch. The metric name
            must be prepended with either "+" or "-", which specifies whether the metric
            is an increasing or decreasing function.
        validation_iterator : `DataIterator`, optional (default=None)
            An iterator to use for the validation set.  If `None`, then
            use the training `iterator`.
        shuffle : `bool`, optional (default=True)
            Whether to shuffle the instances in the iterator or not.
        num_epochs : int, optional (default = 20)
            Number of training epochs.
        serialization_dir : str, optional (default=None)
            Path to directory for saving and loading model files. Models will not be saved if
            this parameter is not passed.
        num_serialized_models_to_keep : `int`, optional (default=20)
            Number of previous model checkpoints to retain.  Default is to keep 20 checkpoints.
            A value of None or -1 means all checkpoints will be kept.
        keep_serialized_model_every_num_seconds : `int`, optional (default=None)
            If num_serialized_models_to_keep is not None, then occasionally it's useful to
            save models at a given interval in addition to the last num_serialized_models_to_keep.
            To do so, specify keep_serialized_model_every_num_seconds as the number of seconds
            between permanently saved checkpoints.  Note that this option is only used if
            num_serialized_models_to_keep is not None, otherwise all checkpoints are kept.
        checkpointer : `Checkpointer`, optional (default=None)
            An instance of class Checkpointer to use instead of the default. If a checkpointer is specified,
            the arguments num_serialized_models_to_keep and keep_serialized_model_every_num_seconds should
            not be specified. The caller is responsible for initializing the checkpointer so that it is
            consistent with serialization_dir.
        model_save_interval : `float`, optional (default=None)
            If provided, then serialize models every `model_save_interval`
            seconds within single epochs.  In all cases, models are also saved
            at the end of every epoch if `serialization_dir` is provided.
        cuda_device : `int`, optional (default = -1)
            An integer specifying the CUDA device(s) to use for this process. If -1, the CPU is used.
            Data parallelism is controlled at the allennlp train level, so each trainer will have a single
            GPU.
        grad_norm : `float`, optional, (default = None).
            If provided, gradient norms will be rescaled to have a maximum of this value.
        grad_clipping : `float`, optional (default = `None`).
            If provided, gradients will be clipped `during the backward pass` to have an (absolute)
            maximum of this value.  If you are getting `NaNs` in your gradients during training
            that are not solved by using `grad_norm`, you may need this.
        learning_rate_scheduler : `LearningRateScheduler`, optional (default = None)
            If specified, the learning rate will be decayed with respect to
            this schedule at the end of each epoch (or batch, if the scheduler implements
            the `step_batch` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`,
            this will use the `validation_metric` provided to determine if learning has plateaued.
            To support updating the learning rate on every batch, this can optionally implement
            `step_batch(batch_num_total)` which updates the learning rate given the batch number.
        momentum_scheduler : `MomentumScheduler`, optional (default = None)
            If specified, the momentum will be updated at the end of each batch or epoch
            according to the schedule.
        summary_interval : `int`, optional, (default = 100)
            Number of batches between logging scalars to tensorboard
        histogram_interval : `int`, optional, (default = `None`)
            If not None, then log histograms to tensorboard every `histogram_interval` batches.
            When this parameter is specified, the following additional logging is enabled:
                * Histograms of model parameters
                * The ratio of parameter update norm to parameter norm
                * Histogram of layer activations
            We log histograms of the parameters returned by
            `model.get_parameters_for_histogram_tensorboard_logging`.
            The layer activations are logged for any modules in the `Model` that have
            the attribute `should_log_activations` set to `True`.  Logging
            histograms requires a number of GPU-CPU copies during training and is typically
            slow, so we recommend logging histograms relatively infrequently.
            Note: only Modules that return tensors, tuples of tensors or dicts
            with tensors as values currently support activation logging.
        should_log_parameter_statistics : `bool`, optional, (default = True)
            Whether to send parameter statistics (mean and standard deviation
            of parameters and gradients) to tensorboard.
        should_log_learning_rate : `bool`, optional, (default = False)
            Whether to send parameter specific learning rate to tensorboard.
        log_batch_size_period : `int`, optional, (default = `None`)
            If defined, how often to log the average batch size.
        moving_average : `MovingAverage`, optional, (default = None)
            If provided, we will maintain moving averages for all parameters. During training, we
            employ a shadow variable for each parameter, which maintains the moving average. During
            evaluation, we backup the original parameters and assign the moving averages to corresponding
            parameters. Be careful that when saving the checkpoint, we will save the moving averages of
            parameters. This is necessary because we want the saved model to perform as well as the validated
            model if we load it later. But this may cause problems if you restart the training from checkpoint.
        distributed : `bool`, optional, (default = False)
            If set, PyTorch's `DistributedDataParallel` is used to train the model in multiple GPUs. This also
            requires `world_size` to be greater than 1.
        local_rank : `int`, optional, (default = 0)
            This is the unique identifier of the `Trainer` in a distributed process group. The GPU device id is
            used as the rank.
        world_size : `int`, (default = 1)
            The number of `Trainer` workers participating in the distributed training.
        num_gradient_accumulation_steps : `int`, optional, (default = 1)
            Gradients are accumulated for the given number of steps before doing an optimizer step. This can
            be useful to accommodate batches that are larger than the RAM size. Refer Thomas Wolf's
            [post](https://tinyurl.com/y5mv44fw) for details on Gradient Accumulation.
        """
        super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size)

        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model

        self.iterator = iterator
        self._validation_iterator = validation_iterator
        self.shuffle = shuffle
        self.optimizer = optimizer
        self.train_data = train_dataset
        self._validation_data = validation_dataset

        if patience is None:  # no early stopping
            if validation_dataset:
                logger.warning(
                    "You provided a validation dataset but patience was set to None, "
                    "meaning that early stopping is disabled"
                )
        elif (not isinstance(patience, int)) or patience <= 0:
            raise ConfigurationError(
                '{} is an invalid value for "patience": it must be a positive integer '
                "or None (if you want to disable early stopping)".format(patience)
            )

        # For tracking is_best_so_far and should_stop_early
        self._metric_tracker = MetricTracker(patience, validation_metric)
        # Get rid of + or -
        self._validation_metric = validation_metric[1:]

        self._num_epochs = num_epochs

        if checkpointer is not None:
            # We can't easily check if these parameters were passed in, so check against their default values.
            # We don't check against serialization_dir since it is also used by the parent class.
            if (
                num_serialized_models_to_keep != 20
                or keep_serialized_model_every_num_seconds is not None
            ):
                raise ConfigurationError(
                    "When passing a custom Checkpointer, you may not also pass in separate checkpointer "
                    "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'."
                )
            self._checkpointer = checkpointer
        else:
            self._checkpointer = Checkpointer(
                serialization_dir,
                keep_serialized_model_every_num_seconds,
                num_serialized_models_to_keep,
            )

        self._model_save_interval = model_save_interval

        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._moving_average = moving_average

        # We keep the total batch number as an instance variable because it
        # is used inside a closure for the hook which logs activations in
        # `_enable_activation_logging`.
        self._batch_num_total = 0

        if writer is not None:
            self._writer = writer
        else:
            self._writer = TensorboardWriter(
                    get_batch_num_total=lambda: self._batch_num_total,
                    serialization_dir=serialization_dir,
                    summary_interval=summary_interval,
                    histogram_interval=histogram_interval,
                    should_log_parameter_statistics=should_log_parameter_statistics,
                    should_log_learning_rate=should_log_learning_rate)

        self._log_batch_size_period = log_batch_size_period

        self._last_log = 0.0  # time of last logging

        self._num_gradient_accumulation_steps = num_gradient_accumulation_steps

        # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
        # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
        # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
        #
        # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
        # normal case, reference to `Model` is retained. This reference is only used in
        # these places: `model.__call__`, `model.train` and `model.eval`.
        if self._distributed:
            self._pytorch_model = DistributedDataParallel(
                self.model, device_ids=[self.cuda_device], find_unused_parameters=True
            )
        else:
            self._pytorch_model = self.model
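
# --- Illustrative sketch (not part of the source above): opting modules into activation logging ---
# As the `histogram_interval` docstring notes, activation histograms are only logged for modules
# whose `should_log_activations` attribute is True. A minimal sketch of flipping that attribute on
# selected submodules before constructing the trainer (the model and the module names here are
# placeholders, not names from this codebase):

def enable_activation_logging(model, module_names=("encoder",)):
    for name, module in model.named_modules():
        if name in module_names:
            # The activation-logging hook looks for this attribute when histogram logging is enabled.
            module.should_log_activations = True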
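
# --- Illustrative sketch (not part of the source above): moving averages around validation ---
# As the `moving_average` docstring explains, averaged "shadow" weights are swapped in for
# evaluation and the raw training weights are restored afterwards. A minimal sketch, assuming
# AllenNLP's ExponentialMovingAverage and its apply()/assign_average_value()/restore() methods:

from allennlp.training.moving_average import ExponentialMovingAverage

def build_moving_average(model, decay=0.9999):
    parameters = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
    return ExponentialMovingAverage(parameters, decay=decay)

def validate_with_moving_average(model, moving_average, run_validation):
    moving_average.assign_average_value()   # load the averaged weights into the model
    try:
        return run_validation(model)
    finally:
        moving_average.restore()            # put the raw training weights back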
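
# --- Illustrative sketch (not part of the source above): gradient accumulation ---
# As the `num_gradient_accumulation_steps` docstring describes, gradients from several small
# batches are summed before a single optimizer step, mimicking a larger batch that would not fit
# in memory. A minimal generic PyTorch sketch of the idea (all arguments are placeholders):

def train_with_accumulation(model, data_loader, optimizer, loss_fn, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(data_loader):
        loss = loss_fn(model(inputs), targets)
        # Scale so the accumulated gradient matches a single large-batch update.
        (loss / accumulation_steps).backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()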
Пример #24
0
    def __init__(
            self,
            model: Model,
            optimizer: torch.optim.Optimizer,
            data_loader: DataLoader,
            patience: Optional[int] = None,
            validation_metric: Union[str, List[str]] = "-loss",
            validation_data_loader: DataLoader = None,
            num_epochs: int = 20,
            serialization_dir: Optional[str] = None,
            checkpointer: Checkpointer = None,
            cuda_device: Optional[Union[int, torch.device]] = None,
            grad_norm: Optional[float] = None,
            grad_clipping: Optional[float] = None,
            learning_rate_scheduler: Optional[LearningRateScheduler] = None,
            momentum_scheduler: Optional[MomentumScheduler] = None,
            moving_average: Optional[MovingAverage] = None,
            callbacks: List[TrainerCallback] = None,
            distributed: bool = False,
            local_rank: int = 0,
            world_size: int = 1,
            num_gradient_accumulation_steps: int = 1,
            use_amp: bool = False,
            enable_default_callbacks: bool = True,
            run_sanity_checks: bool = True,
            val_loss_steps: int = 50
    ) -> None:
        super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size)

        # I am not calling move_to_gpu here, because if the model is
        # not already on the GPU then the optimizer is going to be wrong.
        self.model = model

        self.data_loader = data_loader
        self.data_loader.set_target_device(self.cuda_device)
        self._validation_data_loader = validation_data_loader
        if self._validation_data_loader is not None:
            self._validation_data_loader.set_target_device(self.cuda_device)
        self.optimizer = optimizer

        if patience is None:  # no early stopping
            if validation_data_loader is not None:
                logger.warning(
                    "You provided a validation dataset but patience was set to None, "
                    "meaning that early stopping is disabled"
                )
        elif (not isinstance(patience, int)) or patience <= 0:
            raise ConfigurationError(
                '{} is an invalid value for "patience": it must be a positive integer '
                "or None (if you want to disable early stopping)".format(patience)
            )

        # For tracking is_best_so_far and should_stop_early
        self._metric_tracker = MetricTracker(validation_metric, patience)

        self._num_epochs = num_epochs

        self._checkpointer: Optional[Checkpointer] = checkpointer
        if checkpointer is None and serialization_dir is not None:
            self._checkpointer = Checkpointer(serialization_dir)

        self._grad_norm = grad_norm
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._moving_average = moving_average

        self._callbacks = callbacks or []
        default_callbacks = list(DEFAULT_CALLBACKS) if enable_default_callbacks else []
        if run_sanity_checks:
            default_callbacks.append(SanityChecksCallback)
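        # For each default callback class, add an instance only if the user did not already pass
        # one of that class (the `else` branch runs when the inner loop finds no match).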
        for callback_cls in default_callbacks:
            for callback in self._callbacks:
                if callback.__class__ == callback_cls:
                    break
            else:
                self._callbacks.append(callback_cls(self._serialization_dir))

        self._batch_num_total = 0
        self._last_log = 0.0  # time of last logging
        self._num_gradient_accumulation_steps = num_gradient_accumulation_steps

        # Enable automatic mixed precision training.
        self._scaler: Optional[amp.GradScaler] = None
        self._use_amp = use_amp
        if self._use_amp:
            if self.cuda_device == torch.device("cpu"):
                raise ValueError("Using AMP requires a cuda device")
            self._scaler = amp.GradScaler()

        # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
        # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
        # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
        #
        # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
        # normal case, reference to `Model` is retained. This reference is only used in
        # these places: `model.__call__`, `model.train` and `model.eval`.
        if self._distributed:
            self._pytorch_model = DistributedDataParallel(
                self.model,
                device_ids=None if self.cuda_device == torch.device("cpu") else [self.cuda_device],
                find_unused_parameters=True,
            )
        else:
            self._pytorch_model = self.model

        self.val_loss_steps = val_loss_steps
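
# --- Illustrative sketch (not part of the source above): the two model references under DDP ---
# The comment above explains why the trainer keeps both `self.model` (the plain AllenNLP `Model`,
# used for get_metrics(), get_regularization_penalty(), ...) and `self._pytorch_model` (the
# possibly DistributedDataParallel-wrapped module, used only for forward calls and train()/eval()).
# A condensed sketch of one training step using both references; the batch format and the "loss"
# key are placeholders rather than the exact AllenNLP API:

def training_step(trainer, batch):
    trainer._pytorch_model.train()             # mode switches go through the (possibly wrapped) module
    output = trainer._pytorch_model(**batch)   # forward pass goes through DDP so gradients are synced
    loss = output["loss"]
    loss.backward()
    metrics = trainer.model.get_metrics()      # AllenNLP-specific calls use the unwrapped Model
    return loss, metrics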
Пример #25
0
    def from_params(  # type: ignore
        cls,
        params: Params,
        serialization_dir: str,
        recover: bool = False,
        local_rank: int = 0,
    ) -> "Trainer":

        from allennlp.training.trainer import Trainer
        from allennlp.training.trainer_pieces import TrainerPieces

        config = dict(as_flat_dict(params.as_dict()))
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)
        model = pieces.model
        iterator = pieces.iterator
        train_data = pieces.train_dataset
        validation_data = pieces.validation_dataset
        params = pieces.params
        validation_iterator = pieces.validation_iterator

        patience = params.pop_int("patience", None)
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_norm = params.pop_float("grad_norm", None)
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
        if "moving_average" in params:
            moving_average = MovingAverage.from_params(
                params.pop("moving_average"), parameters=parameters
            )
        else:
            moving_average = None

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None
        if momentum_scheduler_params:
            momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
        else:
            momentum_scheduler = None

        if "checkpointer" in params:
            if (
                "keep_serialized_model_every_num_seconds" in params
                or "num_serialized_models_to_keep" in params
            ):
                raise ConfigurationError(
                    "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                    "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                    " but the passed config uses both methods."
                )
            checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
        else:
            num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
            keep_serialized_model_every_num_seconds = params.pop_int(
                "keep_serialized_model_every_num_seconds", None
            )
            checkpointer = Checkpointer(
                serialization_dir=serialization_dir,
                num_serialized_models_to_keep=num_serialized_models_to_keep,
                keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
            )
        model_save_interval = params.pop_float("model_save_interval", None)
        summary_interval = params.pop_int("summary_interval", 100)
        histogram_interval = params.pop_int("histogram_interval", None)
        should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
        should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
        log_batch_size_period = params.pop_int("log_batch_size_period", None)

        distributed = params.pop_bool("distributed", False)
        world_size = params.pop_int("world_size", 1)

        num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)
        lang_mean_dir = params.pop("ft_lang_mean_dir", None)
        if lang_mean_dir:
            try:
                assert model._lang_means is not None
                lang_mean = get_lang_mean(lang_mean_dir)
                model.add_ft_lang_mean_to_lang_means(lang_mean)
            except (AttributeError, AssertionError):
                # The model has no language-mean support; skip adding the fine-tuning language mean.
                pass

        writer = None
        wandb_config = params.pop("wandb", None)
        if wandb_config is not None:
            writer = WandBWriter(config, model, wandb_config)

        params.assert_empty(cls.__name__)
        return cls(
            model,
            optimizer,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=lr_scheduler,
            momentum_scheduler=momentum_scheduler,
            checkpointer=checkpointer,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
            distributed=distributed,
            local_rank=local_rank,
            world_size=world_size,
            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
            writer=writer,
        )
Пример #26
0
    def from_params(cls,
                    params: Params,
                    serialization_dir: str,
                    recover: bool = False,
                    cache_directory: str = None,
                    cache_prefix: str = None) -> 'PtDistTrainer':
        all_datasets = training_util.datasets_from_params(
            params, cache_directory, cache_prefix)
        vocab = Vocabulary.from_files(params.vocabulary.directory_path)

        model = Model.from_params(vocab=vocab, params=params.pop('model'))
        model.extend_embedder_vocab()
        if is_master_rank():
            vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

        train_data = all_datasets['train']
        validation_data = all_datasets.get('validation')

        batch_size = params.iterator.batch_size

        trainer_params = params.pop("trainer")
        keys = [key for key in params]
        for key in keys:
            params.pop(key)
        params = trainer_params
        validation_metric = params.pop("validation_metric", "-loss")
        shuffle = params.pop_bool("shuffle", True)
        num_epochs = params.pop_int("num_epochs", 20)
        cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        pretrain_file = params.pop("pretrain_file", None)

        no_grad_regexes = params.pop("no_grad", ())
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad_regexes):
                parameter.requires_grad_(False)

        frozen_parameter_names, tunable_parameter_names = \
            get_frozen_and_tunable_parameter_names(model)
        logger.info("Following parameters are Frozen  (without gradient):")
        for name in frozen_parameter_names:
            logger.info(name)
        logger.info("Following parameters are Tunable (with gradient):")
        for name in tunable_parameter_names:
            logger.info(name)

        model = model.cuda(dist.get_rank())
        if pretrain_file:
            model_state = torch.load(pretrain_file,
                                     map_location=nn_util.device_mapping(
                                         dist.get_rank()))
            model.load_state_dict(model_state)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        # print([n for n, p in model.named_parameters() if p.requires_grad])
        optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

        if lr_scheduler_params:
            lr_scheduler = LearningRateScheduler.from_params(
                optimizer, lr_scheduler_params)
        else:
            lr_scheduler = None

        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=None)

        return cls(model,
                   optimizer,
                   train_data,
                   validation_data,
                   batch_size=batch_size,
                   validation_metric=validation_metric,
                   shuffle=shuffle,
                   num_epochs=num_epochs,
                   serialization_dir=serialization_dir,
                   cuda_device=cuda_device,
                   grad_clipping=grad_clipping,
                   learning_rate_scheduler=lr_scheduler,
                   checkpointer=checkpointer)
Пример #27
0
            total += labels.size(0)
            correct += (predicted_labels == labels).sum().item()
        f1 = f1_score(y_true, y_pred, average="binary")
        print(f"Epcoh {ep} Validation accuracy: {correct/total}, f1: {f1}")
        # print_cm(confusion_matrix(y_true, y_pred, labels=range(len(le.classes_))), labels=[l[-5:] for l in le.classes_.tolist()])
        return correct/total, f1


# ### Training

# In[10]:


checkpointer = Checkpointer(serialization_dir="Checkpoint_act_clf", 
                            keep_serialized_model_every_num_seconds=3600*2, 
                            num_serialized_models_to_keep=5)


# In[11]:


# optimizer
num_epochs = 10
num_gradients_accumulation = 1
num_train_optimization_steps = len(train_dataset) * num_epochs // batch_size // num_gradients_accumulation

param_optimizer = list(model_A.named_parameters()) 
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
Пример #28
0
    def from_partial_objects(
        cls,
        model: Model,
        serialization_dir: str,
        iterator: DataIterator,
        train_data: Iterable[Instance],
        validation_iterator: DataIterator = None,
        validation_data: Iterable[Instance] = None,
        local_rank: int = 0,
        patience: int = None,
        validation_metric: str = "-loss",
        shuffle: bool = True,
        num_epochs: int = 20,
        cuda_device: int = -1,
        grad_norm: float = None,
        grad_clipping: float = None,
        model_save_interval: float = None,
        summary_interval: int = 100,
        histogram_interval: int = None,
        should_log_parameter_statistics: bool = True,
        should_log_learning_rate: bool = False,
        log_batch_size_period: int = None,
        distributed: bool = None,
        world_size: int = 1,
        num_gradient_accumulation_steps: int = 1,
        no_grad: List[str] = None,
        optimizer: Lazy[Optimizer] = None,
        learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
        momentum_scheduler: Lazy[MomentumScheduler] = None,
        moving_average: Lazy[MovingAverage] = None,
        checkpointer: Lazy[Checkpointer] = None,
    ) -> "Trainer":
        """
        This method exists so that we can have a documented method to construct this class using
        `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
        method.

        The reason we can't just use `__init__` with `FromParams` here is because there are
        sequential dependencies to this class's arguments.  Anything that has a `Lazy[]` type
        annotation needs something from one of the non-`Lazy` arguments.  The `Optimizer` needs to
        have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
        have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
        doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

        If you're not using `FromParams`, you can just construct these arguments in the right order
        yourself in your code and call the constructor directly. (A minimal ordering sketch follows
        this example.)
        """

        check_for_gpu(cuda_device)
        if cuda_device >= 0:
            # Moving model to GPU here so that the optimizer state gets constructed on
            # the right device.
            model = model.cuda(cuda_device)

        if no_grad:
            for name, parameter in model.named_parameters():
                if any(re.search(regex, name) for regex in no_grad):
                    parameter.requires_grad_(False)

        common_util.log_frozen_and_tunable_parameter_names(model)

        parameters = [[n, p] for n, p in model.named_parameters()
                      if p.requires_grad]
        optimizer_ = optimizer.construct(model_parameters=parameters)
        if not optimizer_:
            optimizer_ = Optimizer.default(parameters)

        batches_per_epoch = iterator.get_num_batches(train_data)
        if batches_per_epoch == 1:  # get_num_batches returns 1 when it can't determine the answer
            batches_per_epoch = None
        moving_average_ = moving_average.construct(parameters=parameters)
        learning_rate_scheduler_ = learning_rate_scheduler.construct(
            optimizer=optimizer_,
            num_epochs=num_epochs,
            num_steps_per_epoch=batches_per_epoch)
        momentum_scheduler_ = momentum_scheduler.construct(
            optimizer=optimizer_)

        checkpointer_ = checkpointer.construct() or Checkpointer(
            serialization_dir)
        return cls(
            model,
            optimizer_,
            iterator,
            train_data,
            validation_data,
            patience=patience,
            validation_metric=validation_metric,
            validation_iterator=validation_iterator,
            shuffle=shuffle,
            num_epochs=num_epochs,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
            grad_norm=grad_norm,
            grad_clipping=grad_clipping,
            learning_rate_scheduler=learning_rate_scheduler_,
            momentum_scheduler=momentum_scheduler_,
            checkpointer=checkpointer_,
            model_save_interval=model_save_interval,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average_,
            distributed=distributed,
            local_rank=local_rank,
            world_size=world_size,
            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        )
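
# --- Illustrative sketch (not part of the source above): building the pieces in dependency order ---
# The from_partial_objects docstring notes that, without FromParams/Lazy, you simply construct the
# dependent objects yourself in order: model parameters -> optimizer -> scheduler -> trainer.
# A minimal sketch of that ordering; the concrete optimizer/scheduler classes and hyperparameters
# are illustrative assumptions (a plain torch scheduler stands in for a LearningRateScheduler):

import torch

def build_training_pieces(model):
    # 1. The optimizer needs the model's trainable parameters first.
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(parameters, lr=1e-3)
    # 2. The scheduler can only be built once the optimizer exists.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
    return optimizer, scheduler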
Пример #29
0
def distribute_worker(gpu_index, ngpus_per_node, world_size, program_flags):

    # at this point, rank is just machine rank.
    rank = program_flags['rank']

    ckpter = None
    if rank == 0:
        # only ever first rank do the ckpt to save time.
        ckpter = Checkpointer(serialization_dir=program_flags['ckpt_dir'],
                              num_serialized_models_to_keep=2)

    if world_size > 1:
        # NOTE: however here, we need to convert rank to be global rank among processes
        # machine * gpus per node + our current gpu index
        # see https://github.com/pytorch/examples/blob/master/imagenet/main.py
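        # For example (illustrative numbers only): with 4 GPUs per node, machine rank 1 and
        # local gpu_index 2 map to global rank 1 * 4 + 2 = 6.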
        if program_flags['assume_same_gpus']:
            rank = rank * ngpus_per_node + gpu_index
        else:
            rank = rank * program_flags['rank_scale_factor'] + gpu_index

    program_flags['run_name'] = program_flags['run_name'] + str(rank)
    logger, model, reader, out_feature_key, optimizer, iterator, train_dataset, validation_dataset = pre_init(
        program_flags, ngpus_per_node)

    dist.init_process_group(backend=program_flags['dist_backend'],
                            init_method=program_flags['dist_method'],
                            world_size=world_size,
                            rank=rank)

    logger.info("Rank %d --- preparing to start training", rank)
    # Set cuda to a single gpu context
    torch.cuda.set_device(gpu_index)
    device = torch.device("cuda:%d" % gpu_index)
    model.cuda(gpu_index)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[gpu_index])

    trainer = DistributeTrainer(
        rank=rank,
        worldsize=world_size,
        ngpus_per_node=ngpus_per_node,
        cuda_device=[gpu_index],
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        serialization_dir=program_flags['ckpt_dir'],
        checkpointer=ckpter,
        log_batch_size_period=20,
    )

    logger.info(device)
    start_time = time.time()
    trainer.train()
    final_time = time.time() - start_time
    logger.info("Rank %d Finished training: ran for %d secs", rank, final_time)
    if rank == 0:
        # only 1 worker need to do an output check.
        final_output(program_flags, model, device, logger, reader,
                     out_feature_key, start_time)
Пример #30
0
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    manager.init_training(model, optimizer)

    update_count = 0
    if manager.is_main_rank():
        progress_bar = tqdm.tqdm
    else:
        progress_bar = iter

    if manager.is_main_rank():
        checkpointer = Checkpointer(
            "Checkpoint",
            keep_serialized_model_every_num_seconds=3600 * 4,
            num_serialized_models_to_keep=10)
        writer = SummaryWriter()
        start = time.time()
        update_loss = 0.0
        update_kl = 0.0

    for ep in range(args.num_train_epochs):
        pbar = progress_bar(train_dataloader)

        for batch in pbar:
            batch = batch_to_device(batch, args.device)

            loss, kl = model.train_one_step(batch)
            manager.backward_loss(loss, model, optimizer)
            update_count += 1