def test_configuration_error_when_passed_as_conflicting_argument_to_trainer(self):
    """
    Users should initialize Trainer either with an instance of Checkpointer
    or by specifying parameter values for num_serialized_models_to_keep and
    keep_serialized_model_every_num_seconds. Check that Trainer raises a
    ConfigurationError if both methods are used at the same time.
    """
    with self.assertRaises(ConfigurationError):
        Trainer(None, None, None, None,
                num_serialized_models_to_keep=30,
                keep_serialized_model_every_num_seconds=None,
                checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                          num_serialized_models_to_keep=40,
                                          keep_serialized_model_every_num_seconds=2))
    with self.assertRaises(ConfigurationError):
        Trainer(None, None, None, None,
                num_serialized_models_to_keep=20,
                keep_serialized_model_every_num_seconds=2,
                checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                          num_serialized_models_to_keep=40,
                                          keep_serialized_model_every_num_seconds=2))
    try:
        Trainer(None, None, None, None,
                checkpointer=Checkpointer(serialization_dir=self.TEST_DIR,
                                          num_serialized_models_to_keep=40,
                                          keep_serialized_model_every_num_seconds=2))
    except ConfigurationError:
        self.fail("ConfigurationError raised for passed checkpointer")
def default_callbacks(
    self,
    validation_metric: str = "-loss",
    patience: int = None,
    max_checkpoints: int = 20,
    checkpoint_every: int = None,
    model_save_interval: float = None,
    serialization_dir: str = "__DEFAULT__",
    validation_data: Iterable[Instance] = None,
    validation_iterator: DataIterator = None,
    batch_size: int = 2,
):
    if serialization_dir == "__DEFAULT__":
        serialization_dir = self.TEST_DIR
    checkpointer = Checkpointer(serialization_dir, checkpoint_every, max_checkpoints)
    tensorboard = TensorboardWriter(get_batch_num_total=lambda: None)

    if validation_iterator is None:
        validation_iterator = BasicIterator(batch_size=batch_size)
        validation_iterator.index_with(self.vocab)

    return [
        LogToTensorboard(log_batch_size_period=10, tensorboard=tensorboard),
        Checkpoint(checkpointer, model_save_interval),
        Validate(
            validation_data=self.instances if validation_data is None else validation_data,
            validation_iterator=validation_iterator,
        ),
        TrackMetrics(patience, validation_metric),
        GradientNormAndClip(),
    ]
def test_with_time(self):
    """
    Tests that the keep_serialized_model_every_num_seconds parameter causes
    a checkpoint to be saved after enough time has elapsed between epochs.
    """
    num_to_keep = 10
    num_epochs = 30
    target = list(range(num_epochs - num_to_keep, num_epochs))
    pauses = [5, 18, 26]
    target = sorted(set(target + pauses))
    checkpointer = Checkpointer(
        serialization_dir=self.TEST_DIR,
        num_serialized_models_to_keep=num_to_keep,
        keep_serialized_model_every_num_seconds=1,
    )
    for e in range(num_epochs):
        if e in pauses:
            time.sleep(2)
        checkpointer.save_checkpoint(
            epoch=e,
            model_state={"epoch": e},
            training_states={"epoch": e},
            is_best_so_far=False,
        )
    models, training = self.retrieve_and_delete_saved()
    assert models == training == target
def default_callbacks(self,
                      validation_metric: str = "-loss",
                      patience: int = None,
                      max_checkpoints: int = 20,
                      checkpoint_every: int = None,
                      serialization_dir: str = "__DEFAULT__",
                      iterator: DataIterator = None,
                      validation_data: Iterable[Instance] = None,
                      validation_iterator: DataIterator = None,
                      batch_size: int = 2):
    if serialization_dir == "__DEFAULT__":
        serialization_dir = self.TEST_DIR
    checkpointer = Checkpointer(serialization_dir, checkpoint_every, max_checkpoints)
    tensorboard = TensorboardWriter(get_batch_num_total=lambda: None)

    if iterator is None:
        iterator = BasicIterator(batch_size=batch_size)
        iterator.index_with(self.vocab)

    return [
        LogToTensorboard(log_batch_size_period=10, tensorboard=tensorboard),
        Checkpoint(checkpointer),
        Validate(validation_data=self.instances if validation_data is None else validation_data,
                 validation_iterator=iterator if validation_iterator is None else validation_iterator),
        TrackMetrics(patience, validation_metric),
        TrainSupervised(),
        GenerateTrainingBatches(self.instances, iterator, True),
    ]
def train(self) -> Dict[str, Any]:
    self.model.vocab.save_to_files(os.path.join(self._serialization_dir, "vocabulary"))
    checkpointer = Checkpointer(self._serialization_dir)
    checkpointer.save_checkpoint(
        epoch=0, model_state=self.model.state_dict(), training_states={}, is_best_so_far=True
    )
    return {}
def train(self) -> Dict[str, Any]:
    assert self._serialization_dir is not None
    self.model.vocab.save_to_files(os.path.join(self._serialization_dir, "vocabulary"))
    checkpointer = Checkpointer(self._serialization_dir)
    checkpointer.save_checkpoint(epoch=0, trainer=self, is_best_so_far=True)
    return {}
def test_keep_zero(self):
    checkpointer = Checkpointer(serialization_dir=self.TEST_DIR,
                                num_serialized_models_to_keep=0)
    for e in range(10):
        checkpointer.save_checkpoint(epoch=e,
                                     model_state={"epoch": e},
                                     training_states={"epoch": e},
                                     is_best_so_far=True)
    files = os.listdir(self.TEST_DIR)
    assert "model_state_epoch_1.th" not in files
    assert "training_state_epoch_1.th" not in files
def __init__(self,
             serialization_dir: str,
             task_infos,
             num_epochs,
             num_serialized_models_to_keep: int = 10) -> None:
    """
    task1 and task2 should be dictionaries that hold the model; the training,
    validation, and test iterators for batches; and the metrics, learning rate,
    current score, etc. for each of the tasks.
    """
    super().__init__(serialization_dir)
    self.task_infos = task_infos
    self.num_epochs = num_epochs
    self.serialization_dir = serialization_dir
    self.swag_checkpointer = Checkpointer(
        serialization_dir + "/swag/",
        num_serialized_models_to_keep=num_serialized_models_to_keep)
    self.conll_checkpointer = Checkpointer(
        serialization_dir + "/conll/",
        num_serialized_models_to_keep=num_serialized_models_to_keep)
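# Usage sketch (not from the original source): how the two per-task
# checkpointers above might be invoked once per epoch from the enclosing
# trainer class. The "swag"/"conll" task_infos layout and the `is_best`
# flag are assumptions for illustration.
def save_all_tasks(self, epoch: int, is_best: bool) -> None:
    for name, ckpter in (("swag", self.swag_checkpointer),
                         ("conll", self.conll_checkpointer)):
        ckpter.save_checkpoint(
            epoch=epoch,
            model_state=self.task_infos[name]["model"].state_dict(),
            training_states={"epoch": epoch},
            is_best_so_far=is_best,
        )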
def single_worker(logger, model, reader, out_feature_key, optimizer, iterator,
                  train_dataset, validation_dataset):
    _cudart = U.get_cudart()
    device = torch.device("cuda" if FLAGS.use_cuda else "cpu")
    if _cudart is None:
        logger.warning("No cudart, probably means you do not have cuda on this machine.")
    model = model.to(device)
    cuda_device = 0 if FLAGS.use_cuda else -1  # TODO: multi GPU

    # NOTE: this checkpoint mechanism only saves at the end of every epoch.
    # If an epoch is more than 1 day, then you take care of it yourself :P
    ckpter = Checkpointer(serialization_dir=FLAGS.ckpt_dir,
                          num_serialized_models_to_keep=1)

    if FLAGS.profile_only:
        raw_train_generator = iterator(train_dataset, num_epochs=1, shuffle=False)
        train_generator = lazy_groups_of(raw_train_generator, 1)
        _prof_input = next(train_generator)[0]
        stats = counter.profile(model,
                                input_size=(FLAGS.batch_size, ),
                                logger=logger,
                                is_cnn=False,
                                rnn_input=_prof_input)
        logger.info("DNN_Features: %s", str(stats))
        sys.exit(0)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      serialization_dir=FLAGS.ckpt_dir,
                      validation_dataset=validation_dataset,
                      num_epochs=FLAGS.max_epochs,
                      checkpointer=ckpter,
                      log_batch_size_period=10,
                      cuda_device=cuda_device)
    start_time = time.time()
    try:
        status = None
        if _cudart:
            status = _cudart.cudaProfilerStart()
        trainer.train()
    finally:
        if status == 0:
            _cudart.cudaProfilerStop()
    final_time = time.time() - start_time
    logger.info("Finished training: ran for %d secs", final_time)
    final_output(FLAGS.flag_values_dict(), model, device, logger, reader,
                 out_feature_key, start_time)
def train(self) -> Dict[str, Any]:
    assert self._serialization_dir is not None
    self.model.vocab.save_to_files(os.path.join(self._serialization_dir, "vocabulary"))
    checkpointer = Checkpointer(self._serialization_dir)
    checkpointer.save_checkpoint(self)
    best_model_filename = os.path.join(self._serialization_dir, "best.th")
    torch.save(self.model.state_dict(), best_model_filename)
    self._best_model_filename = best_model_filename
    return {}
def __init__(self,
             serialization_dir: str,
             model: Model,
             data_loader: DataLoader,
             validation_data_loader: Optional[DataLoader] = None,
             checkpointer: Optional[Checkpointer] = None) -> None:
    super().__init__(serialization_dir)
    self.model = model
    self.data_loader = data_loader
    self.validation_data_loader = validation_data_loader
    self.checkpointer = checkpointer or Checkpointer(self._serialization_dir)
def __init__(
    self,
    model: Model,
    optimizer: torch.optim.Optimizer,
    train_dataset: Iterable[Instance],
    validation_dataset: Optional[Iterable[Instance]] = None,
    batch_size: int = 1,
    validation_metric: str = "-loss",
    shuffle: bool = True,
    num_epochs: int = 20,
    serialization_dir: Optional[str] = None,
    num_serialized_models_to_keep: int = 20,
    checkpointer: Checkpointer = None,
    cuda_device: Union[int, List] = -1,
    grad_clipping: Optional[float] = None,
    learning_rate_scheduler: Optional[LearningRateScheduler] = None,
) -> None:
    super().__init__(serialization_dir, cuda_device)
    self.local_rank = dist.get_rank()
    self.local_device = torch.device("cuda", self.local_rank)
    self.model = DDP(model, device_ids=[self.local_rank],
                     output_device=self.local_rank)
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.optimizer = optimizer
    self.train_data = train_dataset
    self._validation_data = validation_dataset
    self._metric_tracker = MetricTracker(metric_name=validation_metric)
    self._validation_metric = validation_metric[1:]
    self._num_epochs = num_epochs
    if checkpointer is not None:
        self._checkpointer = checkpointer
    else:
        self._checkpointer = Checkpointer(serialization_dir, None,
                                          num_serialized_models_to_keep)
    self._grad_clipping = grad_clipping
    self._learning_rate_scheduler = learning_rate_scheduler
    self._batch_num_total = 0
    self._last_log = 0.0  # time of last logging
def from_params(cls, params: Params, serialization_dir: str) -> 'Checkpoint':  # type: ignore
    # pylint: disable=arguments-differ
    checkpointer_params = params.pop("checkpointer", None)
    if checkpointer_params:
        checkpointer = Checkpointer.from_params(checkpointer_params,
                                                serialization_dir=serialization_dir)
    else:
        checkpointer = Checkpointer(serialization_dir=serialization_dir)
    state_dict_attrs = params.pop("state_dict_attrs", None)
    other_attrs = params.pop("other_attrs", None)
    return Checkpoint(checkpointer, state_dict_attrs, other_attrs)
def from_params(cls, params: Params, serialization_dir: str) -> "Checkpoint": # type: ignore checkpointer_params = params.pop("checkpointer", None) if checkpointer_params: checkpointer = Checkpointer.from_params( checkpointer_params, serialization_dir=serialization_dir) else: checkpointer = Checkpointer(serialization_dir=serialization_dir) state_dict_attrs = params.pop("state_dict_attrs", None) other_attrs = params.pop("other_attrs", None) return Checkpoint(checkpointer, state_dict_attrs, other_attrs)
def test_default(self):
    """
    Tests that the default behavior keeps just the last 20 checkpoints.
    """
    default_num_to_keep = 20
    num_epochs = 30
    target = list(range(num_epochs - default_num_to_keep, num_epochs))
    checkpointer = Checkpointer(serialization_dir=self.TEST_DIR)
    for e in range(num_epochs):
        checkpointer.save_checkpoint(epoch=e,
                                     model_state={"epoch": e},
                                     training_states={"epoch": e},
                                     is_best_so_far=False)
    models, training = self.retrieve_and_delete_saved()
    assert models == training == target
def from_params(  # type: ignore
        cls, params: Params, serialization_dir: str, **extras) -> "Checkpoint":
    checkpointer_params = params.pop("checkpointer", None)
    if checkpointer_params:
        checkpointer = Checkpointer.from_params(checkpointer_params,
                                                serialization_dir=serialization_dir)
    else:
        checkpointer = Checkpointer(serialization_dir=serialization_dir)
    model_save_interval = params.pop("model_save_interval", None)
    state_dict_attrs = params.pop("state_dict_attrs", None)
    other_attrs = params.pop("other_attrs", None)
    return Checkpoint(checkpointer, model_save_interval, state_dict_attrs, other_attrs)
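# Illustrative sketch (not from the original source): driving the from_params
# method above with an explicit Params blob. The key values are hypothetical;
# a nested "checkpointer" dict takes the Checkpointer.from_params branch,
# everything else falls through to the pop() defaults.
from allennlp.common import Params

params = Params({
    "checkpointer": {
        "num_serialized_models_to_keep": 5,
        "keep_serialized_model_every_num_seconds": 3600,
    },
    "model_save_interval": 60.0,
})
callback = Checkpoint.from_params(params, serialization_dir="/tmp/demo")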
def build_trainer(model: Model,
                  serialization_dir: str,
                  train_loader: DataLoader,
                  dev_loader: DataLoader,
                  num_epochs: int,
                  learning_rate: float = 0.001,
                  cuda_device=None) -> Trainer:
    """
    Builds an instance of the Trainer class with the specified training
    hyperparameters. Adapted from https://guide.allennlp.org/training-and-prediction

    Parameters
    ----------
    model : Model
        The model to train
    serialization_dir : str
        Directory to save checkpoints and results
    train_loader : DataLoader
        Previously built dataset loader for training data
    dev_loader : DataLoader
        Previously built loader for dev data
    num_epochs : int
        Number of epochs to train for
    learning_rate : float (default: 0.001)
    cuda_device : int (default: None)
        >= 0 if using GPU

    Returns
    -------
    trainer : Trainer
    """
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=learning_rate)  # type: ignore
    trainer = GradientDescentTrainer(
        model=model,
        checkpointer=Checkpointer(serialization_dir, num_serialized_models_to_keep=-1),
        serialization_dir=serialization_dir,
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=num_epochs,
        optimizer=optimizer,
        cuda_device=cuda_device)
    print("Will train for", num_epochs, "epochs")
    return trainer
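# Hypothetical call to build_trainer above; `model`, `train_loader`, and
# `dev_loader` are assumed to have been built earlier in the pipeline.
trainer = build_trainer(
    model=model,
    serialization_dir="runs/exp1",
    train_loader=train_loader,
    dev_loader=dev_loader,
    num_epochs=10,
    learning_rate=1e-3,
    cuda_device=0,  # or None to stay on CPU
)
trainer.train()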
def __init__(self,
             model: Model,
             serialization_dir: str,
             iterator: DataIterator,
             mingler: DatasetMingler,
             optimizer: torch.optim.Optimizer,
             datasets: Dict[str, Iterable[Instance]],
             num_epochs: int = 10,
             num_serialized_models_to_keep: int = 10) -> None:
    super().__init__(serialization_dir)
    self.model = model
    self.iterator = iterator
    self.mingler = mingler
    self.optimizer = optimizer
    self.datasets = datasets
    self.num_epochs = num_epochs
    self.checkpointer = Checkpointer(
        serialization_dir,
        num_serialized_models_to_keep=num_serialized_models_to_keep)
def from_params(cls,  # type: ignore
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'Trainer':
    # pylint: disable=arguments-differ
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(params.pop("moving_average"),
                                                   parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
           'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)
    return cls(model, optimizer, iterator,
               train_data, validation_data,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average)
def __init__(self,
             model: Model,
             optimizer: torch.optim.Optimizer,
             iterator: DataIterator,
             train_dataset: Iterable[Instance],
             validation_dataset: Optional[Iterable[Instance]] = None,
             patience: Optional[int] = None,
             validation_metric: str = "-loss",
             validation_iterator: DataIterator = None,
             shuffle: bool = True,
             num_epochs: int = 20,
             serialization_dir: Optional[str] = None,
             num_serialized_models_to_keep: int = 20,
             keep_serialized_model_every_num_seconds: int = None,
             checkpointer: Checkpointer = None,
             model_save_interval: float = None,
             cuda_device: Union[int, List] = -1,
             grad_norm: Optional[float] = None,
             grad_clipping: Optional[float] = None,
             learning_rate_scheduler: Optional[LearningRateScheduler] = None,
             momentum_scheduler: Optional[MomentumScheduler] = None,
             summary_interval: int = 100,
             histogram_interval: int = None,
             should_log_parameter_statistics: bool = True,
             should_log_learning_rate: bool = False,
             log_batch_size_period: Optional[int] = None,
             moving_average: Optional[MovingAverage] = None,
             callbacks: List[allennlp_callback.Callback] = None,
             early_stopping_by_batch: bool = True,
             estimator: Estimator = None) -> None:
    """
    A trainer for doing supervised learning. It just takes a labeled dataset
    and a ``DataIterator``, and uses the supplied ``Optimizer`` to learn the
    weights for your model over some fixed number of epochs. You can also pass
    in a validation dataset and enable early stopping. There are many other
    bells and whistles as well.

    Parameters
    ----------
    model : ``Model``, required.
        An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
        their ``forward`` method returns a dictionary with a "loss" key, containing a
        scalar tensor representing the loss function to be optimized.

        If you are training your model using GPUs, your model should already be
        on the correct device. (If you use `Trainer.from_params` this will be
        handled for you.)
    optimizer : ``torch.nn.Optimizer``, required.
        An instance of a Pytorch Optimizer, instantiated with the parameters of the
        model to be optimized.
    iterator : ``DataIterator``, required.
        A method for iterating over a ``Dataset``, yielding padded indexed batches.
    train_dataset : ``Dataset``, required.
        A ``Dataset`` to train on. The dataset should have already been indexed.
    validation_dataset : ``Dataset``, optional, (default = None).
        A ``Dataset`` to evaluate on. The dataset should have already been indexed.
    patience : Optional[int] > 0, optional (default=None)
        Number of epochs to be patient before early stopping: the training is stopped
        after ``patience`` epochs with no improvement. If given, it must be ``> 0``.
        If None, early stopping is disabled.
    validation_metric : str, optional (default="-loss")
        Validation metric to measure for whether to stop training using patience
        and whether to serialize an ``is_best`` model each epoch. The metric name
        must be prepended with either "+" or "-", which specifies whether the metric
        is an increasing or decreasing function.
    validation_iterator : ``DataIterator``, optional (default=None)
        An iterator to use for the validation set. If ``None``, then
        use the training `iterator`.
    shuffle : ``bool``, optional (default=True)
        Whether to shuffle the instances in the iterator or not.
    num_epochs : int, optional (default = 20)
        Number of training epochs.
    serialization_dir : str, optional (default=None)
        Path to directory for saving and loading model files. Models will not be saved if
        this parameter is not passed.
    num_serialized_models_to_keep : ``int``, optional (default=20)
        Number of previous model checkpoints to retain. Default is to keep 20 checkpoints.
        A value of None or -1 means all checkpoints will be kept.
    keep_serialized_model_every_num_seconds : ``int``, optional (default=None)
        If num_serialized_models_to_keep is not None, then occasionally it's useful to
        save models at a given interval in addition to the last num_serialized_models_to_keep.
        To do so, specify keep_serialized_model_every_num_seconds as the number of seconds
        between permanently saved checkpoints. Note that this option is only used if
        num_serialized_models_to_keep is not None, otherwise all checkpoints are kept.
    checkpointer : ``Checkpointer``, optional (default=None)
        An instance of class Checkpointer to use instead of the default. If a checkpointer
        is specified, the arguments num_serialized_models_to_keep and
        keep_serialized_model_every_num_seconds should not be specified. The caller is
        responsible for initializing the checkpointer so that it is consistent with
        serialization_dir.
    model_save_interval : ``float``, optional (default=None)
        If provided, then serialize models every ``model_save_interval``
        seconds within single epochs. In all cases, models are also saved
        at the end of every epoch if ``serialization_dir`` is provided.
    cuda_device : ``Union[int, List[int]]``, optional (default = -1)
        An integer or list of integers specifying the CUDA device(s) to use. If -1,
        the CPU is used.
    grad_norm : ``float``, optional, (default = None).
        If provided, gradient norms will be rescaled to have a maximum of this value.
    grad_clipping : ``float``, optional (default = ``None``).
        If provided, gradients will be clipped `during the backward pass` to have an (absolute)
        maximum of this value. If you are getting ``NaNs`` in your gradients during training
        that are not solved by using ``grad_norm``, you may need this.
    learning_rate_scheduler : ``LearningRateScheduler``, optional (default = None)
        If specified, the learning rate will be decayed with respect to
        this schedule at the end of each epoch (or batch, if the scheduler implements
        the ``step_batch`` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`,
        this will use the ``validation_metric`` provided to determine if learning has plateaued.
        To support updating the learning rate on every batch, this can optionally implement
        ``step_batch(batch_num_total)`` which updates the learning rate given the batch number.
    momentum_scheduler : ``MomentumScheduler``, optional (default = None)
        If specified, the momentum will be updated at the end of each batch or epoch
        according to the schedule.
    summary_interval : ``int``, optional, (default = 100)
        Number of batches between logging scalars to tensorboard
    histogram_interval : ``int``, optional, (default = ``None``)
        If not None, then log histograms to tensorboard every ``histogram_interval`` batches.
        When this parameter is specified, the following additional logging is enabled:
            * Histograms of model parameters
            * The ratio of parameter update norm to parameter norm
            * Histogram of layer activations
        We log histograms of the parameters returned by
        ``model.get_parameters_for_histogram_tensorboard_logging``.
        The layer activations are logged for any modules in the ``Model`` that have
        the attribute ``should_log_activations`` set to ``True``. Logging
        histograms requires a number of GPU-CPU copies during training and is typically
        slow, so we recommend logging histograms relatively infrequently.
        Note: only Modules that return tensors, tuples of tensors or dicts
        with tensors as values currently support activation logging.
    should_log_parameter_statistics : ``bool``, optional, (default = True)
        Whether to send parameter statistics (mean and standard deviation
        of parameters and gradients) to tensorboard.
    should_log_learning_rate : ``bool``, optional, (default = False)
        Whether to send parameter specific learning rate to tensorboard.
    log_batch_size_period : ``int``, optional, (default = ``None``)
        If defined, how often to log the average batch size.
    moving_average : ``MovingAverage``, optional, (default = None)
        If provided, we will maintain moving averages for all parameters. During training,
        we employ a shadow variable for each parameter, which maintains the moving average.
        During evaluation, we backup the original parameters and assign the moving averages
        to corresponding parameters. Be careful that when saving the checkpoint, we will save
        the moving averages of parameters. This is necessary because we want the saved model
        to perform as well as the validated model if we load it later. But this may cause
        problems if you restart the training from checkpoint.
    """
    super().__init__(serialization_dir, cuda_device)

    # I am not calling move_to_gpu here, because if the model is
    # not already on the GPU then the optimizer is going to be wrong.
    self.model = model
    self.iterator = iterator
    self._validation_iterator = validation_iterator
    self.shuffle = shuffle
    self.optimizer = optimizer
    self.train_data = train_dataset
    self._validation_data = validation_dataset

    if patience is None:  # no early stopping
        if validation_dataset:
            logger.warning('You provided a validation dataset but patience was set to None, '
                           'meaning that early stopping is disabled')
    elif (not isinstance(patience, int)) or patience <= 0:
        raise ConfigurationError('{} is an invalid value for "patience": it must be a positive integer '
                                 'or None (if you want to disable early stopping)'.format(patience))

    # For tracking is_best_so_far and should_stop_early
    self._metric_tracker = MetricTracker(patience, validation_metric)
    # Get rid of + or -
    self._validation_metric = validation_metric[1:]

    self._num_epochs = num_epochs

    if checkpointer is not None:
        # We can't easily check if these parameters were passed in, so check against their default values.
        # We don't check against serialization_dir since it is also used by the parent class.
        if num_serialized_models_to_keep != 20 or \
           keep_serialized_model_every_num_seconds is not None:
            raise ConfigurationError(
                "When passing a custom Checkpointer, you may not also pass in separate checkpointer "
                "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'.")
        self._checkpointer = checkpointer
    else:
        self._checkpointer = Checkpointer(serialization_dir,
                                          keep_serialized_model_every_num_seconds,
                                          num_serialized_models_to_keep)

    self._model_save_interval = model_save_interval

    self._grad_norm = grad_norm
    self._grad_clipping = grad_clipping

    self._learning_rate_scheduler = learning_rate_scheduler
    self._momentum_scheduler = momentum_scheduler
    self._moving_average = moving_average

    # We keep the total batch number as an instance variable because it
    # is used inside a closure for the hook which logs activations in
    # ``_enable_activation_logging``.
    self._batch_num_total = 0

    self._tensorboard = TensorboardWriter(
        get_batch_num_total=lambda: self._batch_num_total,
        serialization_dir=serialization_dir,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate)

    self._log_batch_size_period = log_batch_size_period

    self._last_log = 0.0  # time of last logging

    # Enable activation logging.
    if histogram_interval is not None:
        self._tensorboard.enable_activation_logging(self.model)

    self.callbacks = callbacks
    self._early_stopping_by_batch = early_stopping_by_batch
    self._estimator = estimator
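# Minimal construction sketch for the iterator-based Trainer above (names are
# assumed to exist and the datasets to be indexed). Note that passing a custom
# Checkpointer requires leaving num_serialized_models_to_keep and
# keep_serialized_model_every_num_seconds at their defaults, otherwise
# __init__ raises a ConfigurationError.
trainer = Trainer(
    model,
    optimizer,
    iterator,
    train_dataset,
    validation_dataset=validation_dataset,
    patience=5,
    serialization_dir="runs/exp1",
    checkpointer=Checkpointer("runs/exp1", None, 10),
)
metrics = trainer.train()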
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    data_loader: DataLoader,
    validation_data_loader: DataLoader = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: str = "-loss",
    num_epochs: int = 20,
    cuda_device: int = -1,
    grad_norm: float = None,
    grad_clipping: float = None,
    distributed: bool = None,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    opt_level: Optional[str] = None,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = None,
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    tensorboard_writer: Lazy[TensorboardWriter] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = None,
    batch_callbacks: List[BatchCallback] = None,
    epoch_callbacks: List[EpochCallback] = None,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore this
    method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need to
    have the `Optimizer`. Because of this, the typical way we construct things `FromParams`
    doesn't work, so we use `Lazy` to allow for constructing the objects sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right order
    yourself in your code and call the constructor directly.
    """
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    common_util.log_frozen_and_tunable_parameter_names(model)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)
    if not optimizer_:
        optimizer_ = Optimizer.default(parameters)

    try:
        batches_per_epoch = len(data_loader)
    except TypeError:
        # If the dataset is lazy, it won't have a length.
        batches_per_epoch = None

    moving_average_ = moving_average.construct(parameters=parameters)
    learning_rate_scheduler_ = learning_rate_scheduler.construct(
        optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch)
    momentum_scheduler_ = momentum_scheduler.construct(optimizer=optimizer_)

    checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir)
    tensorboard_writer_ = tensorboard_writer.construct() or TensorboardWriter(serialization_dir)

    return cls(
        model,
        optimizer_,
        data_loader,
        patience=patience,
        validation_metric=validation_metric,
        validation_data_loader=validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        tensorboard_writer=tensorboard_writer_,
        checkpointer=checkpointer_,
        moving_average=moving_average_,
        batch_callbacks=batch_callbacks,
        epoch_callbacks=epoch_callbacks,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        opt_level=opt_level,
    )
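# Sketch of the Lazy mechanics described in the docstring above (illustrative,
# not from the original source): the optimizer cannot be built until the
# model's parameters exist, so it is wrapped and constructed later, exactly
# as from_partial_objects does internally with optimizer.construct(...).
# `model` is assumed to exist already.
from allennlp.common.lazy import Lazy

lazy_optimizer = Lazy(lambda model_parameters: torch.optim.Adam(
    [p for _, p in model_parameters], lr=1e-3))

parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
optimizer_ = lazy_optimizer.construct(model_parameters=parameters)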
def __init__(
    self,
    model: Model,
    optimizer: torch.optim.Optimizer,
    data_loader: torch.utils.data.DataLoader,
    patience: Optional[int] = None,
    validation_metric: str = "-loss",
    validation_data_loader: torch.utils.data.DataLoader = None,
    num_epochs: int = 20,
    serialization_dir: Optional[str] = None,
    checkpointer: Checkpointer = None,
    cuda_device: int = -1,
    grad_norm: Optional[float] = None,
    grad_clipping: Optional[float] = None,
    learning_rate_scheduler: Optional[LearningRateScheduler] = None,
    momentum_scheduler: Optional[MomentumScheduler] = None,
    tensorboard_writer: TensorboardWriter = None,
    moving_average: Optional[MovingAverage] = None,
    batch_callbacks: List[BatchCallback] = None,
    epoch_callbacks: List[EpochCallback] = None,
    distributed: bool = False,
    local_rank: int = 0,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    opt_level: Optional[str] = None,
) -> None:
    super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size)

    # I am not calling move_to_gpu here, because if the model is
    # not already on the GPU then the optimizer is going to be wrong.
    self.model = model

    self.data_loader = data_loader
    self._validation_data_loader = validation_data_loader
    self.optimizer = optimizer

    if patience is None:  # no early stopping
        if validation_data_loader:
            logger.warning(
                "You provided a validation dataset but patience was set to None, "
                "meaning that early stopping is disabled")
    elif (not isinstance(patience, int)) or patience <= 0:
        raise ConfigurationError(
            '{} is an invalid value for "patience": it must be a positive integer '
            "or None (if you want to disable early stopping)".format(patience))

    # For tracking is_best_so_far and should_stop_early
    self._metric_tracker = MetricTracker(patience, validation_metric)
    # Get rid of + or -
    self._validation_metric = validation_metric[1:]

    self._num_epochs = num_epochs

    if checkpointer is not None:
        self._checkpointer = checkpointer
    else:
        self._checkpointer = Checkpointer(serialization_dir)

    self._grad_norm = grad_norm
    self._grad_clipping = grad_clipping

    self._learning_rate_scheduler = learning_rate_scheduler
    self._momentum_scheduler = momentum_scheduler
    self._moving_average = moving_average
    self._batch_callbacks = batch_callbacks or []
    self._epoch_callbacks = epoch_callbacks or []

    # We keep the total batch number as an instance variable because it
    # is used inside a closure for the hook which logs activations in
    # `_enable_activation_logging`.
    self._batch_num_total = 0

    self._tensorboard = tensorboard_writer or TensorboardWriter(serialization_dir)
    self._tensorboard.get_batch_num_total = lambda: self._batch_num_total
    self._tensorboard.enable_activation_logging(self.model)

    self._last_log = 0.0  # time of last logging

    self._num_gradient_accumulation_steps = num_gradient_accumulation_steps

    # Enable automatic mixed precision training with NVIDIA Apex.
    self._opt_level = opt_level
    if self._opt_level is not None:
        if amp is None:
            raise ConfigurationError(
                "Apex not installed but opt_level was provided. Please install NVIDIA's Apex to enable"
                " automatic mixed precision (AMP) training. See: https://github.com/NVIDIA/apex.")
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, opt_level=self._opt_level)

    # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
    # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
    # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
    #
    # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
    # normal case, reference to `Model` is retained. This reference is only used in
    # these places: `model.__call__`, `model.train` and `model.eval`.
    if self._distributed:
        self._pytorch_model = DistributedDataParallel(
            self.model, device_ids=[self.cuda_device], find_unused_parameters=True)
    else:
        self._pytorch_model = self.model
def __init__(
    self,
    model: Model,
    optimizer: torch.optim.Optimizer,
    iterator: DataIterator,
    train_dataset: Iterable[Instance],
    validation_dataset: Optional[Iterable[Instance]] = None,
    patience: Optional[int] = None,
    validation_metric: str = "-loss",
    validation_iterator: DataIterator = None,
    shuffle: bool = True,
    num_epochs: int = 20,
    serialization_dir: Optional[str] = None,
    num_serialized_models_to_keep: int = 20,
    keep_serialized_model_every_num_seconds: int = None,
    checkpointer: Checkpointer = None,
    model_save_interval: float = None,
    cuda_device: int = -1,
    grad_norm: Optional[float] = None,
    grad_clipping: Optional[float] = None,
    learning_rate_scheduler: Optional[LearningRateScheduler] = None,
    momentum_scheduler: Optional[MomentumScheduler] = None,
    summary_interval: int = 100,
    histogram_interval: int = None,
    should_log_parameter_statistics: bool = True,
    should_log_learning_rate: bool = False,
    log_batch_size_period: Optional[int] = None,
    moving_average: Optional[MovingAverage] = None,
    distributed: bool = False,
    local_rank: int = 0,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    writer: WandBWriter = None,
) -> None:
    """
    A trainer for doing supervised learning. It just takes a labeled dataset
    and a `DataIterator`, and uses the supplied `Optimizer` to learn the weights
    for your model over some fixed number of epochs. You can also pass in a
    validation dataset and enable early stopping. There are many other bells
    and whistles as well.

    # Parameters

    model : `Model`, required.
        An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
        their `forward` method returns a dictionary with a "loss" key, containing a
        scalar tensor representing the loss function to be optimized.

        If you are training your model using GPUs, your model should already be
        on the correct device. (If you use `Trainer.from_params` this will be
        handled for you.)
    optimizer : `torch.nn.Optimizer`, required.
        An instance of a Pytorch Optimizer, instantiated with the parameters of the
        model to be optimized.
    iterator : `DataIterator`, required.
        A method for iterating over a `Dataset`, yielding padded indexed batches.
    train_dataset : `Dataset`, required.
        A `Dataset` to train on. The dataset should have already been indexed.
    validation_dataset : `Dataset`, optional, (default = None).
        A `Dataset` to evaluate on. The dataset should have already been indexed.
    patience : Optional[int] > 0, optional (default=None)
        Number of epochs to be patient before early stopping: the training is stopped
        after `patience` epochs with no improvement. If given, it must be `> 0`.
        If None, early stopping is disabled.
    validation_metric : str, optional (default="-loss")
        Validation metric to measure for whether to stop training using patience
        and whether to serialize an `is_best` model each epoch. The metric name
        must be prepended with either "+" or "-", which specifies whether the metric
        is an increasing or decreasing function.
    validation_iterator : `DataIterator`, optional (default=None)
        An iterator to use for the validation set. If `None`, then
        use the training `iterator`.
    shuffle : `bool`, optional (default=True)
        Whether to shuffle the instances in the iterator or not.
    num_epochs : int, optional (default = 20)
        Number of training epochs.
    serialization_dir : str, optional (default=None)
        Path to directory for saving and loading model files. Models will not be saved if
        this parameter is not passed.
    num_serialized_models_to_keep : `int`, optional (default=20)
        Number of previous model checkpoints to retain. Default is to keep 20 checkpoints.
        A value of None or -1 means all checkpoints will be kept.
    keep_serialized_model_every_num_seconds : `int`, optional (default=None)
        If num_serialized_models_to_keep is not None, then occasionally it's useful to
        save models at a given interval in addition to the last num_serialized_models_to_keep.
        To do so, specify keep_serialized_model_every_num_seconds as the number of seconds
        between permanently saved checkpoints. Note that this option is only used if
        num_serialized_models_to_keep is not None, otherwise all checkpoints are kept.
    checkpointer : `Checkpointer`, optional (default=None)
        An instance of class Checkpointer to use instead of the default. If a checkpointer
        is specified, the arguments num_serialized_models_to_keep and
        keep_serialized_model_every_num_seconds should not be specified. The caller is
        responsible for initializing the checkpointer so that it is consistent with
        serialization_dir.
    model_save_interval : `float`, optional (default=None)
        If provided, then serialize models every `model_save_interval`
        seconds within single epochs. In all cases, models are also saved
        at the end of every epoch if `serialization_dir` is provided.
    cuda_device : `int`, optional (default = -1)
        An integer specifying the CUDA device(s) to use for this process. If -1, the CPU
        is used. Data parallelism is controlled at the allennlp train level, so each
        trainer will have a single GPU.
    grad_norm : `float`, optional, (default = None).
        If provided, gradient norms will be rescaled to have a maximum of this value.
    grad_clipping : `float`, optional (default = `None`).
        If provided, gradients will be clipped `during the backward pass` to have an (absolute)
        maximum of this value. If you are getting `NaNs` in your gradients during training
        that are not solved by using `grad_norm`, you may need this.
    learning_rate_scheduler : `LearningRateScheduler`, optional (default = None)
        If specified, the learning rate will be decayed with respect to
        this schedule at the end of each epoch (or batch, if the scheduler implements
        the `step_batch` method). If you use :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`,
        this will use the `validation_metric` provided to determine if learning has plateaued.
        To support updating the learning rate on every batch, this can optionally implement
        `step_batch(batch_num_total)` which updates the learning rate given the batch number.
    momentum_scheduler : `MomentumScheduler`, optional (default = None)
        If specified, the momentum will be updated at the end of each batch or epoch
        according to the schedule.
    summary_interval : `int`, optional, (default = 100)
        Number of batches between logging scalars to tensorboard
    histogram_interval : `int`, optional, (default = `None`)
        If not None, then log histograms to tensorboard every `histogram_interval` batches.
        When this parameter is specified, the following additional logging is enabled:
            * Histograms of model parameters
            * The ratio of parameter update norm to parameter norm
            * Histogram of layer activations
        We log histograms of the parameters returned by
        `model.get_parameters_for_histogram_tensorboard_logging`.
        The layer activations are logged for any modules in the `Model` that have
        the attribute `should_log_activations` set to `True`. Logging
        histograms requires a number of GPU-CPU copies during training and is typically
        slow, so we recommend logging histograms relatively infrequently.
        Note: only Modules that return tensors, tuples of tensors or dicts
        with tensors as values currently support activation logging.
    should_log_parameter_statistics : `bool`, optional, (default = True)
        Whether to send parameter statistics (mean and standard deviation
        of parameters and gradients) to tensorboard.
    should_log_learning_rate : `bool`, optional, (default = False)
        Whether to send parameter specific learning rate to tensorboard.
    log_batch_size_period : `int`, optional, (default = `None`)
        If defined, how often to log the average batch size.
    moving_average : `MovingAverage`, optional, (default = None)
        If provided, we will maintain moving averages for all parameters. During training,
        we employ a shadow variable for each parameter, which maintains the moving average.
        During evaluation, we backup the original parameters and assign the moving averages
        to corresponding parameters. Be careful that when saving the checkpoint, we will save
        the moving averages of parameters. This is necessary because we want the saved model
        to perform as well as the validated model if we load it later. But this may cause
        problems if you restart the training from checkpoint.
    distributed : `bool`, optional, (default = False)
        If set, PyTorch's `DistributedDataParallel` is used to train the model in multiple
        GPUs. This also requires `world_size` to be greater than 1.
    local_rank : `int`, optional, (default = 0)
        This is the unique identifier of the `Trainer` in a distributed process group. The GPU
        device id is used as the rank.
    world_size : `int`, (default = 1)
        The number of `Trainer` workers participating in the distributed training.
    num_gradient_accumulation_steps : `int`, optional, (default = 1)
        Gradients are accumulated for the given number of steps before doing an optimizer step.
        This can be useful to accommodate batches that are larger than the RAM size. Refer
        Thomas Wolf's [post](https://tinyurl.com/y5mv44fw) for details on Gradient Accumulation.
    """
    super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size)

    # I am not calling move_to_gpu here, because if the model is
    # not already on the GPU then the optimizer is going to be wrong.
    self.model = model
    self.iterator = iterator
    self._validation_iterator = validation_iterator
    self.shuffle = shuffle
    self.optimizer = optimizer
    self.train_data = train_dataset
    self._validation_data = validation_dataset

    if patience is None:  # no early stopping
        if validation_dataset:
            logger.warning(
                "You provided a validation dataset but patience was set to None, "
                "meaning that early stopping is disabled"
            )
    elif (not isinstance(patience, int)) or patience <= 0:
        raise ConfigurationError(
            '{} is an invalid value for "patience": it must be a positive integer '
            "or None (if you want to disable early stopping)".format(patience)
        )

    # For tracking is_best_so_far and should_stop_early
    self._metric_tracker = MetricTracker(patience, validation_metric)
    # Get rid of + or -
    self._validation_metric = validation_metric[1:]

    self._num_epochs = num_epochs

    if checkpointer is not None:
        # We can't easily check if these parameters were passed in, so check against their default values.
        # We don't check against serialization_dir since it is also used by the parent class.
        if (
            num_serialized_models_to_keep != 20
            or keep_serialized_model_every_num_seconds is not None
        ):
            raise ConfigurationError(
                "When passing a custom Checkpointer, you may not also pass in separate checkpointer "
                "args 'num_serialized_models_to_keep' or 'keep_serialized_model_every_num_seconds'."
            )
        self._checkpointer = checkpointer
    else:
        self._checkpointer = Checkpointer(
            serialization_dir,
            keep_serialized_model_every_num_seconds,
            num_serialized_models_to_keep,
        )

    self._model_save_interval = model_save_interval

    self._grad_norm = grad_norm
    self._grad_clipping = grad_clipping

    self._learning_rate_scheduler = learning_rate_scheduler
    self._momentum_scheduler = momentum_scheduler
    self._moving_average = moving_average

    # We keep the total batch number as an instance variable because it
    # is used inside a closure for the hook which logs activations in
    # `_enable_activation_logging`.
    self._batch_num_total = 0

    if writer is not None:
        self._writer = writer
    else:
        self._writer = TensorboardWriter(
            get_batch_num_total=lambda: self._batch_num_total,
            serialization_dir=serialization_dir,
            summary_interval=summary_interval,
            histogram_interval=histogram_interval,
            should_log_parameter_statistics=should_log_parameter_statistics,
            should_log_learning_rate=should_log_learning_rate)

    self._log_batch_size_period = log_batch_size_period

    self._last_log = 0.0  # time of last logging

    self._num_gradient_accumulation_steps = num_gradient_accumulation_steps

    # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
    # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
    # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
    #
    # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
    # normal case, reference to `Model` is retained. This reference is only used in
    # these places: `model.__call__`, `model.train` and `model.eval`.
    if self._distributed:
        self._pytorch_model = DistributedDataParallel(
            self.model, device_ids=[self.cuda_device], find_unused_parameters=True
        )
    else:
        self._pytorch_model = self.model
def __init__(
    self,
    model: Model,
    optimizer: torch.optim.Optimizer,
    data_loader: DataLoader,
    patience: Optional[int] = None,
    validation_metric: Union[str, List[str]] = "-loss",
    validation_data_loader: DataLoader = None,
    num_epochs: int = 20,
    serialization_dir: Optional[str] = None,
    checkpointer: Checkpointer = None,
    cuda_device: Optional[Union[int, torch.device]] = None,
    grad_norm: Optional[float] = None,
    grad_clipping: Optional[float] = None,
    learning_rate_scheduler: Optional[LearningRateScheduler] = None,
    momentum_scheduler: Optional[MomentumScheduler] = None,
    moving_average: Optional[MovingAverage] = None,
    callbacks: List[TrainerCallback] = None,
    distributed: bool = False,
    local_rank: int = 0,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    use_amp: bool = False,
    enable_default_callbacks: bool = True,
    run_sanity_checks: bool = True,
    val_loss_steps: int = 50,
) -> None:
    super().__init__(serialization_dir, cuda_device, distributed, local_rank, world_size)

    # I am not calling move_to_gpu here, because if the model is
    # not already on the GPU then the optimizer is going to be wrong.
    self.model = model

    self.data_loader = data_loader
    self.data_loader.set_target_device(self.cuda_device)
    self._validation_data_loader = validation_data_loader
    if self._validation_data_loader is not None:
        self._validation_data_loader.set_target_device(self.cuda_device)
    self.optimizer = optimizer

    if patience is None:  # no early stopping
        if validation_data_loader is not None:
            logger.warning(
                "You provided a validation dataset but patience was set to None, "
                "meaning that early stopping is disabled"
            )
    elif (not isinstance(patience, int)) or patience <= 0:
        raise ConfigurationError(
            '{} is an invalid value for "patience": it must be a positive integer '
            "or None (if you want to disable early stopping)".format(patience)
        )

    # For tracking is_best_so_far and should_stop_early
    self._metric_tracker = MetricTracker(validation_metric, patience)

    self._num_epochs = num_epochs

    self._checkpointer: Optional[Checkpointer] = checkpointer
    if checkpointer is None and serialization_dir is not None:
        self._checkpointer = Checkpointer(serialization_dir)

    self._grad_norm = grad_norm
    self._grad_clipping = grad_clipping

    self._learning_rate_scheduler = learning_rate_scheduler
    self._momentum_scheduler = momentum_scheduler
    self._moving_average = moving_average

    self._callbacks = callbacks or []
    default_callbacks = list(DEFAULT_CALLBACKS) if enable_default_callbacks else []
    if run_sanity_checks:
        default_callbacks.append(SanityChecksCallback)
    for callback_cls in default_callbacks:
        for callback in self._callbacks:
            if callback.__class__ == callback_cls:
                break
        else:
            self._callbacks.append(callback_cls(self._serialization_dir))

    self._batch_num_total = 0
    self._last_log = 0.0  # time of last logging

    self._num_gradient_accumulation_steps = num_gradient_accumulation_steps

    # Enable automatic mixed precision training.
    self._scaler: Optional[amp.GradScaler] = None
    self._use_amp = use_amp
    if self._use_amp:
        if self.cuda_device == torch.device("cpu"):
            raise ValueError("Using AMP requires a cuda device")
        self._scaler = amp.GradScaler()

    # Using `DistributedDataParallel`(ddp) brings in a quirk wrt AllenNLP's `Model` interface and its
    # usage. A `Model` object is wrapped by `ddp`, but assigning the wrapped model to `self.model`
    # will break the usages such as `Model.get_regularization_penalty`, `Model.get_metrics`, etc.
    #
    # Hence a reference to Pytorch's object is maintained in the case of distributed training and in the
    # normal case, reference to `Model` is retained. This reference is only used in
    # these places: `model.__call__`, `model.train` and `model.eval`.
    if self._distributed:
        self._pytorch_model = DistributedDataParallel(
            self.model,
            device_ids=None if self.cuda_device == torch.device("cpu") else [self.cuda_device],
            find_unused_parameters=True,
        )
    else:
        self._pytorch_model = self.model

    self.val_loss_steps = val_loss_steps
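# Minimal construction sketch for the AMP-capable trainer above; `model`,
# `optimizer`, and the AllenNLP data loaders are assumed to exist, and a CUDA
# device must be available since use_amp raises on CPU (see __init__ above).
# trainer.train() is assumed, as in the other Trainer variants in this section.
trainer = Trainer(
    model,
    optimizer,
    train_loader,
    validation_data_loader=dev_loader,
    patience=3,
    serialization_dir="runs/amp_exp",
    cuda_device=0,
    use_amp=True,
)
trainer.train()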
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    local_rank: int = 0,
) -> "Trainer":
    from allennlp.training.trainer import Trainer
    from allennlp.training.trainer_pieces import TrainerPieces

    config = dict(as_flat_dict(params.as_dict()))
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    model = pieces.model
    iterator = pieces.iterator
    train_data = pieces.train_dataset
    validation_data = pieces.validation_dataset
    params = pieces.params
    validation_iterator = pieces.validation_iterator

    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters
        )
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if "checkpointer" in params:
        if (
            "keep_serialized_model_every_num_seconds" in params
            or "num_serialized_models_to_keep" in params
        ):
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods."
            )
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None
        )
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
        )

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    distributed = params.pop_bool("distributed", False)
    world_size = params.pop_int("world_size", 1)
    num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)

    lang_mean_dir = params.pop("ft_lang_mean_dir", None)
    if lang_mean_dir:
        try:
            assert model._lang_means is not None
            lang_mean = get_lang_mean(lang_mean_dir)
            model.add_ft_lang_mean_to_lang_means(lang_mean)
        except (AttributeError, AssertionError):
            pass

    writer = None
    wandb_config = params.pop("wandb", None)
    if wandb_config is not None:
        writer = WandBWriter(config, model, wandb_config)

    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        writer=writer,
    )
def from_params(cls,
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'PtDistTrainer':
    all_datasets = training_util.datasets_from_params(params, cache_directory, cache_prefix)
    vocab = Vocabulary.from_files(params.vocabulary.directory_path)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    model.extend_embedder_vocab()
    if is_master_rank():
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')

    batch_size = params.iterator.batch_size

    trainer_params = params.pop("trainer")
    keys = [key for key in params]
    for key in keys:
        params.pop(key)
    params = trainer_params

    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    pretrain_file = params.pop("pretrain_file", None)

    no_grad_regexes = params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    model = model.cuda(dist.get_rank())
    if pretrain_file:
        model_state = torch.load(pretrain_file,
                                 map_location=nn_util.device_mapping(dist.get_rank()))
        model.load_state_dict(model_state)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    # print([n for n, p in model.named_parameters() if p.requires_grad])
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None

    num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
    checkpointer = Checkpointer(
        serialization_dir=serialization_dir,
        num_serialized_models_to_keep=num_serialized_models_to_keep,
        keep_serialized_model_every_num_seconds=None)

    return cls(model, optimizer,
               train_data, validation_data,
               batch_size=batch_size,
               validation_metric=validation_metric,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               checkpointer=checkpointer)
        total += labels.size(0)
        correct += (predicted_labels == labels).sum().item()

    f1 = f1_score(y_true, y_pred, average="binary")
    print(f"Epoch {ep} Validation accuracy: {correct/total}, f1: {f1}")
    # print_cm(confusion_matrix(y_true, y_pred, labels=range(len(le.classes_))),
    #          labels=[l[-5:] for l in le.classes_.tolist()])
    return correct/total, f1


# ### Training

# In[10]:

checkpointer = Checkpointer(serialization_dir="Checkpoint_act_clf",
                            keep_serialized_model_every_num_seconds=3600 * 2,
                            num_serialized_models_to_keep=5)

# In[11]:

# optimizer
num_epochs = 10
num_gradients_accumulation = 1
num_train_optimization_steps = (len(train_dataset) * num_epochs
                                // batch_size // num_gradients_accumulation)

param_optimizer = list(model_A.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    # No weight decay for bias and LayerNorm parameters.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
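# The grouped parameters above are normally handed to an AdamW-style optimizer
# together with the warmup schedule computed from num_train_optimization_steps.
# A minimal sketch, assuming torch.optim.AdamW and an illustrative learning
# rate; the notebook's actual optimizer class and hyperparameters are not shown:
from torch.optim import AdamW

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)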
def from_partial_objects(
    cls,
    model: Model,
    serialization_dir: str,
    iterator: DataIterator,
    train_data: Iterable[Instance],
    validation_iterator: DataIterator = None,
    validation_data: Iterable[Instance] = None,
    local_rank: int = 0,
    patience: int = None,
    validation_metric: str = "-loss",
    shuffle: bool = True,
    num_epochs: int = 20,
    cuda_device: int = -1,
    grad_norm: float = None,
    grad_clipping: float = None,
    model_save_interval: float = None,
    summary_interval: int = 100,
    histogram_interval: int = None,
    should_log_parameter_statistics: bool = True,
    should_log_learning_rate: bool = False,
    log_batch_size_period: int = None,
    distributed: bool = None,
    world_size: int = 1,
    num_gradient_accumulation_steps: int = 1,
    no_grad: List[str] = None,
    optimizer: Lazy[Optimizer] = None,
    learning_rate_scheduler: Lazy[LearningRateScheduler] = None,
    momentum_scheduler: Lazy[MomentumScheduler] = None,
    moving_average: Lazy[MovingAverage] = None,
    checkpointer: Lazy[Checkpointer] = None,
) -> "Trainer":
    """
    This method exists so that we can have a documented method to construct this class using
    `FromParams`. If you are not using `FromParams` or config files, you can safely ignore
    this method.

    The reason we can't just use `__init__` with `FromParams` here is because there are
    sequential dependencies to this class's arguments. Anything that has a `Lazy[]` type
    annotation needs something from one of the non-`Lazy` arguments. The `Optimizer` needs to
    have the parameters from the `Model` before it's constructed, and the `Schedulers` need
    to have the `Optimizer`. Because of this, the typical way we construct things
    `FromParams` doesn't work, so we use `Lazy` to allow for constructing the objects
    sequentially.

    If you're not using `FromParams`, you can just construct these arguments in the right
    order yourself in your code and call the constructor directly.
    """
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)

    if no_grad:
        for name, parameter in model.named_parameters():
            if any(re.search(regex, name) for regex in no_grad):
                parameter.requires_grad_(False)

    common_util.log_frozen_and_tunable_parameter_names(model)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer_ = optimizer.construct(model_parameters=parameters)
    if not optimizer_:
        optimizer_ = Optimizer.default(parameters)

    batches_per_epoch = iterator.get_num_batches(train_data)
    if batches_per_epoch == 1:
        # get_num_batches returns 1 when it can't determine the answer
        batches_per_epoch = None

    moving_average_ = moving_average.construct(parameters=parameters)
    learning_rate_scheduler_ = learning_rate_scheduler.construct(
        optimizer=optimizer_, num_epochs=num_epochs, num_steps_per_epoch=batches_per_epoch
    )
    momentum_scheduler_ = momentum_scheduler.construct(optimizer=optimizer_)

    checkpointer_ = checkpointer.construct() or Checkpointer(serialization_dir)

    return cls(
        model,
        optimizer_,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=learning_rate_scheduler_,
        momentum_scheduler=momentum_scheduler_,
        checkpointer=checkpointer_,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average_,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
    )
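# To make the docstring's sequential-dependency point concrete: constructing the
# same pieces without Lazy / FromParams just means building them in dependency
# order and calling the constructor directly. A minimal sketch with placeholder
# names (my_model, my_iterator, train_data, scheduler_params are assumptions):
parameters = [[n, p] for n, p in my_model.named_parameters() if p.requires_grad]
optimizer = Optimizer.default(parameters)        # needs the model's parameters
scheduler = LearningRateScheduler.from_params(   # needs the optimizer
    optimizer, scheduler_params)
checkpointer = Checkpointer(serialization_dir)   # depends on neither
trainer = Trainer(my_model, optimizer, my_iterator, train_data,
                  learning_rate_scheduler=scheduler,
                  checkpointer=checkpointer,
                  serialization_dir=serialization_dir)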
def distribute_worker(gpu_index, ngpus_per_node, world_size, program_flags):
    # At this point, rank is just the machine rank.
    rank = program_flags['rank']
    ckpter = None
    if rank == 0:
        # Only the first rank ever checkpoints, to save time.
        ckpter = Checkpointer(serialization_dir=program_flags['ckpt_dir'],
                              num_serialized_models_to_keep=2)
    if world_size > 1:
        # NOTE: here we need to convert rank to the global rank among processes:
        # machine rank * gpus per node + our current gpu index.
        # See https://github.com/pytorch/examples/blob/master/imagenet/main.py
        if program_flags['assume_same_gpus']:
            rank = rank * ngpus_per_node + gpu_index
        else:
            rank = rank * program_flags['rank_scale_factor'] + gpu_index
    program_flags['run_name'] = program_flags['run_name'] + str(rank)
    logger, model, reader, out_feature_key, optimizer, iterator, \
        train_dataset, validation_dataset = pre_init(program_flags, ngpus_per_node)
    dist.init_process_group(backend=program_flags['dist_backend'],
                            init_method=program_flags['dist_method'],
                            world_size=world_size,
                            rank=rank)
    logger.info("Rank %d --- preparing to start training", rank)
    # Set CUDA to a single-GPU context.
    torch.cuda.set_device(gpu_index)
    device = torch.device("cuda:%d" % gpu_index)
    model.cuda(gpu_index)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu_index])
    trainer = DistributeTrainer(
        rank=rank,
        worldsize=world_size,
        ngpus_per_node=ngpus_per_node,
        cuda_device=[gpu_index],
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        serialization_dir=program_flags['ckpt_dir'],
        checkpointer=ckpter,
        log_batch_size_period=20,
    )
    logger.info(device)
    start_time = time.time()
    trainer.train()
    final_time = time.time() - start_time
    logger.info("Rank %d finished training: ran for %d secs", rank, final_time)
    if rank == 0:
        # Only one worker needs to do an output check.
        final_output(program_flags, model, device, logger, reader,
                     out_feature_key, start_time)
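# distribute_worker's gpu_index-first signature matches what
# torch.multiprocessing.spawn passes to its target function, so one worker per
# local GPU can be launched as below. A sketch only: 'nmachines' is an assumed
# flag name, not taken from the source.
import torch
import torch.multiprocessing as mp

ngpus_per_node = torch.cuda.device_count()
world_size = program_flags['nmachines'] * ngpus_per_node
# spawn invokes distribute_worker(gpu_index, ngpus_per_node, world_size,
# program_flags) once per local GPU, supplying gpu_index itself.
mp.spawn(distribute_worker,
         args=(ngpus_per_node, world_size, program_flags),
         nprocs=ngpus_per_node)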
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=args.warmup_steps,
                                 t_total=num_train_optimization_steps)
manager.init_training(model, optimizer)

update_count = 0
if manager.is_main_rank():
    progress_bar = tqdm.tqdm
else:
    progress_bar = iter

if manager.is_main_rank():
    checkpointer = Checkpointer("Checkpoint",
                                keep_serialized_model_every_num_seconds=3600 * 4,
                                num_serialized_models_to_keep=10)
    writer = SummaryWriter()

start = time.time()
update_loss = 0.0
update_kl = 0.0

for ep in range(args.num_train_epochs):
    pbar = progress_bar(train_dataloader)
    for batch in pbar:
        batch = batch_to_device(batch, args.device)
        loss, kl = model.train_one_step(batch)
        manager.backward_loss(loss, model, optimizer)
        update_count += 1
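# After each epoch, the Checkpointer built above can be driven by calling
# save_checkpoint on the main rank. A minimal sketch; what to store and the
# is_best_so_far logic are assumptions, not code from this training script:
if manager.is_main_rank():
    checkpointer.save_checkpoint(
        epoch=ep,
        model_state=model.state_dict(),
        training_states={"optimizer": optimizer.state_dict(), "epoch": ep},
        is_best_so_far=False,  # replace with a real validation-metric check
    )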