def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) plugins = [ NLPDDPPlugin( no_ddp_communication_hook=(megatron_amp_o2 and cfg.trainer.precision == 'bf16' ), # Only bf16 uses fp32_grad_accum. gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, find_unused_parameters=False, ) ] if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) if megatron_amp_o2: plugins.append( MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) else: plugins.append( NativeMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path logging.info( f'Resuming training from checkpoint: {resume_from_checkpoint}') trainer.checkpoint_connector = CheckpointConnector( trainer, resume_from_checkpoint=resume_from_checkpoint) # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, ) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision model = MegatronXNlIModel(cfg.model, trainer) trainer.fit(model) trainer.test(model)
def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)] if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), ) plugins.append( NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=scaler)) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path if resume_from_checkpoint is not None: # inject mp_rank into resume_from_checkpoint if cfg.model.tensor_model_parallel_size is not None and cfg.model.tensor_model_parallel_size > 1: mp_rank = compute_model_parallel_rank( trainer.local_rank, cfg.model.tensor_model_parallel_size) resume_from_checkpoint = Path(resume_from_checkpoint) resume_from_checkpoint = resume_from_checkpoint.parent.parent.joinpath( f'mp_rank_{mp_rank:02d}').joinpath(resume_from_checkpoint.name) resume_from_checkpoint = str(resume_from_checkpoint) logging.info( f'Resuming training from checkpoint: {resume_from_checkpoint}') trainer.checkpoint_connector = CheckpointConnector( trainer, resume_from_checkpoint=resume_from_checkpoint) # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, ) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision model = MegatronT5Model(cfg.model, trainer) trainer.fit(model)
def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') plugins = [NLPDDPPlugin(num_nodes=cfg.trainer.num_nodes)] if cfg.trainer.precision == 16: plugins.append( NLPNativeMixedPrecisionPlugin( init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), )) elif cfg.trainer.precision == 'bf16': plugins.append(NLPNativeBfloat16PrecisionPlugin()) else: plugins.append(NLPPrecisionPlugin()) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager resume_from_checkpoint = trainer.resume_from_checkpoint if resume_from_checkpoint is not None: mp_rank = compute_model_parallel_rank( trainer.local_rank, cfg.model.tensor_model_parallel_size) resume_from_checkpoint = Path(resume_from_checkpoint) resume_from_checkpoint = resume_from_checkpoint.parent.parent.joinpath( f'mp_rank_{mp_rank:02d}').joinpath(resume_from_checkpoint.name) resume_from_checkpoint = str(resume_from_checkpoint) logging.info( f'Resuming training from checkpoint: {resume_from_checkpoint}') trainer.checkpoint_connector = CheckpointConnector( trainer, resume_from_checkpoint=resume_from_checkpoint) # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, ) model = MegatronGPTModel(cfg.model, trainer) trainer.fit(model)
def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') plugins = [NLPDDPPlugin(find_unused_parameters=False)] if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), ) plugins.append( NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=scaler)) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager resume_from_checkpoint = trainer.checkpoint_connector.resume_from_checkpoint_fit_path logging.info( f'Resuming training from checkpoint: {resume_from_checkpoint}') trainer.checkpoint_connector = CheckpointConnector( trainer, resume_from_checkpoint=resume_from_checkpoint) # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, ) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision model = MegatronBertModel(cfg.model, trainer) trainer.fit(model)
def __init__( self, logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True, checkpoint_callback: bool = True, callbacks: Optional[Union[List[Callback], Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: Optional[int] = None, overfit_batches: Union[int, float] = 0.0, track_grad_norm: Union[int, float, str] = -1, check_val_every_n_epoch: int = 1, fast_dev_run: Union[int, bool] = False, accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1, max_epochs: Optional[int] = None, min_epochs: Optional[int] = None, max_steps: Optional[int] = None, min_steps: Optional[int] = None, limit_train_batches: Union[int, float] = 1.0, limit_val_batches: Union[int, float] = 1.0, limit_test_batches: Union[int, float] = 1.0, limit_predict_batches: Union[int, float] = 1.0, val_check_interval: Union[int, float] = 1.0, flush_logs_every_n_steps: int = 100, log_every_n_steps: int = 50, accelerator: Optional[Union[str, Accelerator]] = None, sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = 'top', weights_save_path: Optional[str] = None, num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, resume_from_checkpoint: Optional[Union[Path, str]] = None, profiler: Optional[Union[BaseProfiler, bool, str]] = None, benchmark: bool = False, deterministic: bool = False, reload_dataloaders_every_epoch: bool = False, auto_lr_find: Union[bool, str] = False, replace_sampler_ddp: bool = True, terminate_on_nan: bool = False, auto_scale_batch_size: Union[str, bool] = False, prepare_data_per_node: bool = True, plugins: Optional[Union[str, list]] = None, amp_backend: str = 'native', amp_level: str = 'O2', distributed_backend: Optional[str] = None, automatic_optimization: Optional[bool] = None, move_metrics_to_cpu: bool = False, enable_pl_optimizer: bool = None, # todo: remove in v1.3 multiple_trainloader_mode: str = 'max_size_cycle', ): r""" Customize every aspect of training via flags Args: accelerator: Previously known as distributed_backend (dp, ddp, ddp2, etc...). Can also take in an accelerator object for custom hardware. accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict. amp_backend: The mixed precision backend to use ("native" or "apex") amp_level: The optimization level to use (O1, O2, etc...). auto_lr_find: If set to True, will make trainer.tune() run a learning rate finder, trying to optimize initial learning for faster convergence. trainer.tune() method will set the suggested learning rate in self.lr or self.learning_rate in the LightningModule. To use a different key set a string instead of True with the key name. auto_scale_batch_size: If set to True, will `initially` run a batch size finder trying to find the largest batch size that fits into memory. The result will be stored in self.batch_size in the LightningModule. Additionally, can be set to either `power` that estimates the batch size through a power search or `binsearch` that estimates the batch size through a binary search. auto_select_gpus: If enabled and `gpus` is an integer, pick available gpus automatically. This is especially useful when GPUs are configured to be in "exclusive mode", such that only one process at a time can access them. benchmark: If true enables cudnn.benchmark. callbacks: Add a callback or list of callbacks. checkpoint_callback: If ``True``, enable checkpointing. It will configure a default ModelCheckpoint callback if there is no user-defined ModelCheckpoint in :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks`. Default: ``True``. .. warning:: Passing a ModelCheckpoint instance to this argument is deprecated since v1.1 and will be unsupported from v1.3. Use `callbacks` argument instead. check_val_every_n_epoch: Check val every n train epochs. default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed. Default: ``os.getcwd()``. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' deterministic: If true enables cudnn.deterministic. distributed_backend: deprecated. Please use 'accelerator' fast_dev_run: runs n if set to ``n`` (int) else 1 if set to ``True`` batch(es) of train, val and test to find any bugs (ie: a sort of unit test). flush_logs_every_n_steps: How often to flush logs to disk (defaults to every 100 steps). gpus: number of gpus to train on (int) or which GPUs to train on (list or str) applied per node gradient_clip_val: 0 means don't clip. limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches) limit_val_batches: How much of validation dataset to check (floats = percent, int = num_batches) limit_test_batches: How much of test dataset to check (floats = percent, int = num_batches) logger: Logger (or iterable collection of loggers) for experiment tracking. log_gpu_memory: None, 'min_max', 'all'. Might slow performance log_every_n_steps: How often to log within steps (defaults to every 50 steps). automatic_optimization: If False you are responsible for calling .backward, .step, zero_grad in LightningModule. This argument has been moved to LightningModule. It is deprecated here in v1.1 and will be removed in v1.3. prepare_data_per_node: If True, each LOCAL_RANK=0 will call prepare data. Otherwise only NODE_RANK=0, LOCAL_RANK=0 will prepare data process_position: orders the progress bar when running multiple models on same machine. progress_bar_refresh_rate: How often to refresh progress bar (in steps). Value ``0`` disables progress bar. Ignored when a custom progress bar is passed to :paramref:`~Trainer.callbacks`. Default: None, means a suitable value will be chosen based on the environment (terminal, Google COLAB, etc.). profiler: To profile individual steps during training and assist in identifying bottlenecks. Passing bool value is deprecated in v1.1 and will be removed in v1.3. overfit_batches: Overfit a percent of training data (float) or a set number of batches (int). Default: 0.0 plugins: Plugins allow modification of core behavior like ddp and amp, and enable custom lightning plugins. precision: Full precision (32), half precision (16). Can be used on CPU, GPU or TPUs. max_epochs: Stop training once this number of epochs is reached. Disabled by default (None). If both max_epochs and max_steps are not specified, defaults to ``max_epochs`` = 1000. min_epochs: Force training for at least these many epochs. Disabled by default (None). If both min_epochs and min_steps are not specified, defaults to ``min_epochs`` = 1. max_steps: Stop training after this number of steps. Disabled by default (None). min_steps: Force training for at least these number of steps. Disabled by default (None). num_nodes: number of GPU nodes for distributed training. num_processes: number of processes for distributed training with distributed_backend="ddp_cpu" num_sanity_val_steps: Sanity check runs n validation batches before starting the training routine. Set it to `-1` to run all batches in all validation dataloaders. Default: 2 reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch. replace_sampler_ddp: Explicitly enables or disables sampler replacement. If not specified this will toggled automatically when DDP is used. By default it will add ``shuffle=True`` for train sampler and ``shuffle=False`` for val/test sampler. If you want to customize it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. resume_from_checkpoint: Path/URL of the checkpoint from which training is resumed. If there is no checkpoint file at the path, start from scratch. If resuming from mid-epoch checkpoint, training will start from the beginning of the next epoch. sync_batchnorm: Synchronize batch norm layers between process groups/whole world. terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the end of each training batch, if any of the parameters or the loss are NaN or +/-inf. tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1] track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. truncated_bptt_steps: Truncated back prop breaks performs backprop every k steps of much longer sequence. val_check_interval: How often to check the validation set. Use float to check within a training epoch, use int to check every n steps (batches). weights_summary: Prints a summary of the weights when training begins. weights_save_path: Where to save weights if specified. Will override default_root_dir for checkpoints only. Use this if for whatever reason you need the checkpoints stored in a different place than the logs written in `default_root_dir`. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' Defaults to `default_root_dir`. move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu. This can save some gpu memory, but can make training slower. Use with attention. enable_pl_optimizer: If True, each optimizer will be wrapped by `pytorch_lightning.core.optimizer.LightningOptimizer`. It allows Lightning to handle AMP, TPU, accumulated_gradients, etc. .. warning:: Currently deprecated and it will be removed in v1.3 multiple_trainloader_mode: How to loop over the datasets when there are multiple train loaders. In 'max_size_cycle' mode, the trainer ends one epoch when the largest dataset is traversed, and smaller datasets reload when running out of their data. In 'min_size' mode, all the datasets reload when reaching the minimum length of datasets. """ super().__init__() self._running_stage = None distributed_backend = distributed_backend or accelerator # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = BackendConnector( num_processes, tpu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, benchmark, replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins) self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) self.callback_connector = CallbackConnector(self) self.debugging_connector = DebuggingConnector(self) self.training_tricks_connector = TrainingTricksConnector(self) self.profile_connector = ProfilerConnector(self) self.checkpoint_connector = CheckpointConnector(self) self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) self.evaluation_loop = EvaluationLoop(self) self.predict_loop = PredictLoop(self) # training state self.weights_summary = weights_summary self.shown_warnings = set() # init callbacks # Declare attributes to be set in callback_connector on_trainer_init self.callback_connector.on_trainer_init( callbacks, checkpoint_callback, progress_bar_refresh_rate, process_position, default_root_dir, weights_save_path, resume_from_checkpoint, ) # hook self.on_init_start() # init optimizer + lr scheduler related flags self.optimizer_connector.on_trainer_init(enable_pl_optimizer) # init data flags self.data_connector.on_trainer_init(check_val_every_n_epoch, reload_dataloaders_every_epoch, prepare_data_per_node) # init training tricks self.training_tricks_connector.on_trainer_init( gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan) # init train loop related flags # TODO: remove in 1.3.0 if automatic_optimization is None: automatic_optimization = True else: rank_zero_warn( "Disable automatic optimization with the trainer flag is deprecated and will be removed in v1.3.0!" "Please use the property on the LightningModule for disabling automatic optimization" ) self.train_loop.on_trainer_init( max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps, automatic_optimization, weights_summary, ) self.evaluation_loop.on_trainer_init() # configure tuner self.tuner.on_trainer_init(auto_lr_find, auto_scale_batch_size) # configure profiler self.profile_connector.on_trainer_init(profiler) # init logger flags self.logger_connector.on_trainer_init( logger, flush_logs_every_n_steps, log_every_n_steps, move_metrics_to_cpu, ) # init debugging flags self.debugging_connector.on_init_start( limit_train_batches, limit_val_batches, limit_test_batches, limit_predict_batches, val_check_interval, overfit_batches, fast_dev_run, ) # Callback system self.on_init_end()
def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') # setup the data processor for processor_config in cfg.model.task_processors: processor = TemplateProcessor( template=processor_config.template, limit_length_field=processor_config.limit_length_field ) register_taskdata_processor(processor_config.taskname, processor) plugins = [NLPDDPPlugin()] if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), ) plugins.append(NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=scaler)) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}') trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint) # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision model = MegatronT5PTuneModel(cfg.model, trainer) trainer.fit(model) if cfg.model.data.test_ds.file_path: logging.info("===========================================================================================") logging.info("Starting the testing of the trained model on test set...") trainer.test(model) logging.info("Testing finished!") logging.info("===========================================================================================") # extract the path of the best checkpoint from the training, you may update it to any checkpoint checkpoint_path = trainer.checkpoint_callback.best_model_path tensor_parallel_size = cfg.model.tensor_model_parallel_size pathobj = Path(checkpoint_path) checkpoint_folder = str(pathobj.parent) checkpoint_name = str(pathobj.name) rank = trainer.accelerator.training_type_plugin.local_rank if tensor_parallel_size > 1: # inject model parallel rank checkpoint_path = os.path.join(checkpoint_folder, f'mp_rank_{rank:02d}', checkpoint_name) else: checkpoint_path = os.path.join(checkpoint_folder, checkpoint_name) # Load the checkpoint best_eval_model = MegatronT5PTuneModel.load_from_checkpoint( checkpoint_path=checkpoint_path, strict=False, trainer=trainer ) logging.info(f'Best checkpoint path: {checkpoint_path}') logging.info("Running Test with best EVAL checkpoint!") # setup the test dataset # best_eval_model.setup_test_data(test_data_config=cfg.model.data.test_ds) if torch.distributed.is_initialized(): torch.distributed.barrier() trainer.test(model=best_eval_model, ckpt_path=None, verbose=False) logging.info("Beset EVAL Testing finished!") logging.info("===========================================================================================") if cfg.model.nemo_path: # '.nemo' file contains the last checkpoint and the params to initialize the model best_eval_model.save_to(cfg.model.nemo_path) logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}') # perform inference on a list of queries. if "infer_samples" in cfg.model and cfg.model.infer_samples: logging.info("===========================================================================================") logging.info("Starting the inference on some sample queries...") # max_seq_length=512 is the maximum length BERT supports. results = best_eval_model.cuda().ptune_inference( queries=cfg.model.infer_samples, batch_size=1, decode_token_len=5 ) logging.info('The prediction results of some sample queries with the trained model:') for query, result in zip(cfg.model.infer_samples, results): logging.info(f'Query : {query}') logging.info(f'Predicted label: {result}') logging.info("Inference finished!") logging.info("===========================================================================================")
def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) plugins = [ NLPDDPPlugin( no_ddp_communication_hook=True, gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, find_unused_parameters=False, ) ] if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2**32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) if megatron_amp_o2: plugins.append( MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) else: plugins.append( PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) # tokenizers will be trained and and tarred training data will be created if needed # model config is then updated if cfg.model.preproc_out_dir is not None: MTDataPreproc(cfg=cfg.model, trainer=trainer) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager if cfg.model.resume_from_checkpoint is not None: resume_from_checkpoint = cfg.model.resume_from_checkpoint else: resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path logging.info( f'Resuming training from checkpoint: {resume_from_checkpoint}') trainer._checkpoint_connector = CheckpointConnector( trainer, resume_from_checkpoint=resume_from_checkpoint) # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time, ) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision if hasattr(cfg.model, 'pretrained_model_path' ) and cfg.model.pretrained_model_path is not None: if not hasattr(cfg.model, 'pretrained_model_type'): raise ValueError(f"Pretrained model type must be in [T5, BART].") assert cfg.model.pretrained_model_type in ['T5', 'BART'] if cfg.model.pretrained_model_type == 'T5': pretrained_cfg = MegatronT5Model.restore_from( cfg.model.pretrained_model_path, trainer=trainer, return_config=True) else: pretrained_cfg = MegatronBARTModel.restore_from( cfg.model.pretrained_model_path, trainer=trainer, return_config=True) OmegaConf.set_struct(pretrained_cfg, True) with open_dict(pretrained_cfg): pretrained_cfg.masked_softmax_fusion = False # Set source and target language/multilingual pretrained_cfg.src_language = cfg.model.src_language pretrained_cfg.tgt_language = cfg.model.tgt_language pretrained_cfg.multilingual = cfg.model.multilingual pretrained_cfg.shared_tokenizer = True # Max generation delta pretrained_cfg.max_generation_delta = cfg.model.max_generation_delta # Set label smoothing pretrained_cfg.label_smoothing = cfg.model.label_smoothing # Set tokenizer paths: pretrained_cfg.encoder_tokenizer = pretrained_cfg.tokenizer pretrained_cfg.decoder_tokenizer = pretrained_cfg.tokenizer # Pre-trained models should use the legacy sentencepiece tokenizer ex: mT5 pretrained_cfg.encoder_tokenizer.sentencepiece_legacy = True pretrained_cfg.decoder_tokenizer.sentencepiece_legacy = True # Override dropout pretrained_cfg.hidden_dropout = cfg.model.hidden_dropout pretrained_cfg.attention_dropout = cfg.model.attention_dropout # Override precision pretrained_cfg.precision = cfg.model.precision # Set above from trainer.precision # Override data and global/micro batch size. pretrained_cfg.train_ds = cfg.model.train_ds pretrained_cfg.validation_ds = cfg.model.validation_ds pretrained_cfg.test_ds = cfg.model.test_ds pretrained_cfg.micro_batch_size = cfg.model.micro_batch_size pretrained_cfg.global_batch_size = cfg.model.global_batch_size # Class target for the new class being restored. pretrained_cfg.target = ( "nemo.collections.nlp.models.machine_translation.megatron_nmt_model.MegatronNMTModel" ) # Optimizer overrides. pretrained_cfg.optim = cfg.model.optim model = MegatronNMTModel.restore_from( cfg.model.pretrained_model_path, trainer=trainer, override_config_path=pretrained_cfg, save_restore_connector=NLPSaveRestoreConnector(), ) else: model = MegatronNMTModel(cfg.model, trainer) if cfg.do_training: trainer.fit(model) if cfg.do_testing: trainer.test(model)
def __init__( self, logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True, checkpoint_callback: Union[ModelCheckpoint, bool] = True, early_stop_callback: Optional[Union[ EarlyStopping, bool]] = False, # todo: remove in v1.0.0 callbacks: Optional[List[Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: int = 1, overfit_batches: Union[int, float] = 0.0, track_grad_norm: Union[int, float, str] = -1, check_val_every_n_epoch: int = 1, fast_dev_run: bool = False, accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1, max_epochs: int = 1000, min_epochs: int = 1, max_steps: Optional[int] = None, min_steps: Optional[int] = None, limit_train_batches: Union[int, float] = 1.0, limit_val_batches: Union[int, float] = 1.0, limit_test_batches: Union[int, float] = 1.0, val_check_interval: Union[int, float] = 1.0, log_save_interval: int = 100, row_log_interval: int = 50, distributed_backend: Optional[str] = None, sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT, weights_save_path: Optional[str] = None, num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, resume_from_checkpoint: Optional[str] = None, profiler: Optional[Union[BaseProfiler, bool]] = None, benchmark: bool = False, deterministic: bool = False, reload_dataloaders_every_epoch: bool = False, auto_lr_find: Union[bool, str] = False, replace_sampler_ddp: bool = True, terminate_on_nan: bool = False, auto_scale_batch_size: Union[str, bool] = False, prepare_data_per_node: bool = True, cluster_environment: ClusterEnvironment = None, amp_backend: str = 'native', amp_level: str = 'O2', # backward compatible, todo: remove in v1.0.0 overfit_pct: float = None, # backward compatible, todo: remove in v1.0.0 ): r""" Customize every aspect of training via flags Args: accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict. amp_backend: The mixed precision backend to use ("native" or "apex") amp_level: The optimization level to use (O1, O2, etc...). auto_lr_find: If set to True, will `initially` run a learning rate finder, trying to optimize initial learning for faster convergence. Sets learning rate in self.lr or self.learning_rate in the LightningModule. To use a different key, set a string instead of True with the key name. auto_scale_batch_size: If set to True, will `initially` run a batch size finder trying to find the largest batch size that fits into memory. The result will be stored in self.batch_size in the LightningModule. Additionally, can be set to either `power` that estimates the batch size through a power search or `binsearch` that estimates the batch size through a binary search. auto_select_gpus: If enabled and `gpus` is an integer, pick available gpus automatically. This is especially useful when GPUs are configured to be in "exclusive mode", such that only one process at a time can access them. benchmark: If true enables cudnn.benchmark. callbacks: Add a list of callbacks. checkpoint_callback: Callback for checkpointing. check_val_every_n_epoch: Check val every n train epochs. cluster_environment: Environment config to link up arbitrary clusters default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed. Default: ``os.getcwd()``. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' deterministic: If true enables cudnn.deterministic. distributed_backend: The distributed backend to use (dp, ddp, ddp2, ddp_spawn, ddp_cpu) early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`). Deprecated since v0.10.0 and will be removed in v1.0. fast_dev_run: runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test). gpus: number of gpus to train on (int) or which GPUs to train on (list or str) applied per node gradient_clip_val: 0 means don't clip. limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches) limit_val_batches: How much of validation dataset to check (floats = percent, int = num_batches) limit_test_batches: How much of test dataset to check (floats = percent, int = num_batches) logger: Logger (or iterable collection of loggers) for experiment tracking. log_gpu_memory: None, 'min_max', 'all'. Might slow performance log_save_interval: Writes logs to disk this often prepare_data_per_node: If True, each LOCAL_RANK=0 will call prepare data. Otherwise only NODE_RANK=0, LOCAL_RANK=0 will prepare data process_position: orders the progress bar when running multiple models on same machine. progress_bar_refresh_rate: How often to refresh progress bar (in steps). Value ``0`` disables progress bar. Ignored when a custom callback is passed to :paramref:`~Trainer.callbacks`. profiler: To profile individual steps during training and assist in identifying bottlenecks. overfit_batches: Overfit a percent of training data (float) or a set number of batches (int). Default: 0.0 precision: Full precision (32), half precision (16). Can be used on CPU, GPU or TPUs. max_epochs: Stop training once this number of epochs is reached. min_epochs: Force training for at least these many epochs max_steps: Stop training after this number of steps. Disabled by default (None). min_steps: Force training for at least these number of steps. Disabled by default (None). num_nodes: number of GPU nodes for distributed training. num_sanity_val_steps: Sanity check runs n validation batches before starting the training routine. Set it to `-1` to run all batches in all validation dataloaders. Default: 2 reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch. replace_sampler_ddp: Explicitly enables or disables sampler replacement. If not specified this will toggled automatically when DDP is used. By default it will add ``shuffle=True`` for train sampler and ``shuffle=False`` for val/test sampler. If you want to customize it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here. This can be a URL. row_log_interval: How often to add logging rows (does not write to disk) sync_batchnorm: Synchronize batch norm layers between process groups/whole world. terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the end of each training batch, if any of the parameters or the loss are NaN or +/-inf. tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1] track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. truncated_bptt_steps: Truncated back prop breaks performs backprop every k steps of much longer sequence. val_check_interval: How often to check the validation set. Use float to check within a training epoch, use int to check every n steps (batches). weights_summary: Prints a summary of the weights when training begins. weights_save_path: Where to save weights if specified. Will override default_root_dir for checkpoints only. Use this if for whatever reason you need the checkpoints stored in a different place than the logs written in `default_root_dir`. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' Defaults to `default_root_dir`. """ super().__init__() # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector(self) self.logger_connector = LoggerConnector(self) self.model_connector = ModelConnector(self) self.precision_connector = PrecisionConnector(self) self.callback_connector = CallbackConnector(self) self.debugging_connector = DebuggingConnector(self) self.training_tricks_connector = TrainingTricksConnector(self) self.profile_connector = ProfilerConnector(self) self.checkpoint_connector = CheckpointConnector(self) self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) self.accelerator_backend = None self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self) # training state self.weights_summary = weights_summary self.model = None self.shown_warnings = set() # init callbacks # Declare attributes to be set in callback_connector on_trainer_init self.checkpoint_callback: Union[ModelCheckpoint, bool] = checkpoint_callback self.early_stop_callback: Optional[Union[EarlyStopping, bool]] = early_stop_callback self.callback_connector.on_trainer_init( callbacks, early_stop_callback, checkpoint_callback, progress_bar_refresh_rate, process_position, default_root_dir, weights_save_path, resume_from_checkpoint) # hook self.on_init_start() # init optimizer + lr scheduler related flags self.optimizer_connector.on_trainer_init() # init data flags self.data_connector.on_trainer_init(check_val_every_n_epoch, reload_dataloaders_every_epoch, prepare_data_per_node) # init training tricks self.training_tricks_connector.on_trainer_init( gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan) # init accelerator related flags self.accelerator_connector.on_trainer_init( num_processes, tpu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, log_gpu_memory, sync_batchnorm, benchmark, replace_sampler_ddp, deterministic, cluster_environment) # init train loop related flags self.train_loop.on_trainer_init(max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps) self.evaluation_loop.on_trainer_init() # configure tuner self.tuner.on_trainer_init(auto_lr_find, auto_scale_batch_size) # configure profiler self.profile_connector.on_trainer_init(profiler) # init logger flags self.logger_connector.on_trainer_init(logger, log_save_interval, row_log_interval) # init debugging flags self.debugging_connector.on_init_start(overfit_pct, limit_train_batches, limit_val_batches, limit_test_batches, val_check_interval, overfit_batches, fast_dev_run) # set precision self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) # Callback system self.on_init_end()
def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) plugins = [ NLPDDPPlugin( no_ddp_communication_hook=True, gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, find_unused_parameters=False, ) ] if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), hysteresis=cfg.model.get('hysteresis', 2), ) if megatron_amp_o2: plugins.append(MegatronHalfPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) else: plugins.append(PipelineMixedPrecisionPlugin(precision=cfg.trainer.precision, device='cuda', scaler=scaler)) if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) trainer = Trainer(plugins=plugins, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager if cfg.model.resume_from_checkpoint is not None: resume_from_checkpoint = cfg.model.resume_from_checkpoint else: resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}') trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint) # Override timer callback to a stateless one for idx, callback in enumerate(trainer.callbacks): if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) # Get the T5 Base configuration. t5_cfg = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) # Override the T5 configuration with the one from the config file. OmegaConf.set_struct(t5_cfg, True) with open_dict(t5_cfg): t5_cfg.masked_softmax_fusion = False t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1) t5_cfg.data = cfg.model.data t5_cfg.precision = cfg.trainer.precision t5_cfg.optim = cfg.model.optim t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size # XNLI has eval languages in the yaml config. if hasattr(cfg.model, 'eval_languages'): t5_cfg.eval_languages = cfg.model.eval_languages if hasattr(cfg.model.data.train_ds, 'task_name'): model = MegatronT5GLUEModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg, save_restore_connector=NLPSaveRestoreConnector(), ) else: model = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg, save_restore_connector=NLPSaveRestoreConnector(), ) trainer.fit(model) trainer.validate(model) if hasattr(cfg.model.data, 'test_ds'): trainer.test(model)
def __init__( self, logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True, checkpoint_callback: Union[ModelCheckpoint, bool] = True, early_stop_callback: Optional[Union[EarlyStopping, bool]] = False, callbacks: Optional[List[Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], str, int]] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: int = 1, overfit_batches: Union[int, float] = 0.0, track_grad_norm: Union[int, float, str] = -1, check_val_every_n_epoch: int = 1, fast_dev_run: bool = False, accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1, max_epochs: int = 1000, min_epochs: int = 1, max_steps: Optional[int] = None, min_steps: Optional[int] = None, limit_train_batches: Union[int, float] = 1.0, limit_val_batches: Union[int, float] = 1.0, limit_test_batches: Union[int, float] = 1.0, val_check_interval: Union[int, float] = 1.0, log_save_interval: int = 100, row_log_interval: int = 50, distributed_backend: Optional[str] = None, sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT, weights_save_path: Optional[str] = None, num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, resume_from_checkpoint: Optional[str] = None, profiler: Optional[Union[BaseProfiler, bool]] = None, benchmark: bool = False, deterministic: bool = False, reload_dataloaders_every_epoch: bool = False, auto_lr_find: Union[bool, str] = False, replace_sampler_ddp: bool = True, terminate_on_nan: bool = False, auto_scale_batch_size: Union[str, bool] = False, prepare_data_per_node: bool = True, amp_backend: str = 'native', amp_level: str = 'O2', # backward compatible, todo: remove in v1.0.0 overfit_pct: float = None, # backward compatible, todo: remove in v1.0.0 ): super().__init__() # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector(self) self.logger_connector = LoggerConnector(self) self.model_connector = ModelConnector(self) self.precision_connector = PrecisionConnector(self) self.callback_connector = CallbackConnector(self) self.debugging_connector = DebuggingConnector(self) self.training_tricks_connector = TrainingTricksConnector(self) self.profile_connector = ProfilerConnector(self) self.checkpoint_connector = CheckpointConnector(self) self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) self.accelerator_backend = None self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self) # training state self.weights_summary = weights_summary self.model = None self.shown_warnings = set() # init callbacks self.callback_connector.on_trainer_init( callbacks, early_stop_callback, checkpoint_callback, progress_bar_refresh_rate, process_position, default_root_dir, weights_save_path, resume_from_checkpoint) # hook self.on_init_start() # init optimizer + lr scheduler related flags self.optimizer_connector.on_trainer_init() # init data flags self.data_connector.on_trainer_init(check_val_every_n_epoch, reload_dataloaders_every_epoch, prepare_data_per_node) # init training tricks self.training_tricks_connector.on_trainer_init( gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan) # init accelerator related flags self.accelerator_connector.on_trainer_init( num_processes, tpu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, log_gpu_memory, sync_batchnorm, benchmark, replace_sampler_ddp, deterministic) # init train loop related flags self.train_loop.on_trainer_init(max_epochs, min_epochs, max_steps, min_steps, num_sanity_val_steps) self.evaluation_loop.on_trainer_init() # configure tuner self.tuner.on_trainer_init(auto_lr_find, auto_scale_batch_size) # configure profiler self.profile_connector.on_trainer_init(profiler) # init logger flags self.logger_connector.on_trainer_init(logger, log_save_interval, row_log_interval) # init debugging flags self.debugging_connector.on_init_start(overfit_pct, limit_train_batches, limit_val_batches, limit_test_batches, val_check_interval, overfit_batches, fast_dev_run) # set precision self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) # Callback system self.on_init_end()