def on_trainer_init(self, profiler: Union[BaseProfiler, bool, str]):
    if profiler and not isinstance(profiler, (bool, str, BaseProfiler)):
        # TODO: Update exception on removal of bool
        raise MisconfigurationException(
            "Only None, bool, str and subclasses of `BaseProfiler`"
            " are valid values for `Trainer`'s `profiler` parameter."
            f" Received {profiler} which is of type {type(profiler)}.")

    if isinstance(profiler, bool):
        rank_zero_warn(
            "Passing a bool value as a `profiler` argument to `Trainer` is deprecated"
            " and will be removed in v1.3. Use str ('simple' or 'advanced') instead.",
            DeprecationWarning)
        if profiler:
            profiler = SimpleProfiler()
    elif isinstance(profiler, str):
        if profiler.lower() in PROFILERS:
            profiler_class = PROFILERS[profiler.lower()]
            profiler = profiler_class()
        else:
            raise ValueError(
                "When passing string value for the `profiler` parameter of"
                " `Trainer`, it can only be 'simple' or 'advanced'")
    self.trainer.profiler = profiler or PassThroughProfiler()
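# The branches above admit several spellings of the `profiler` argument. A
# minimal sketch of each, assuming a PyTorch Lightning build of the same era
# (the import paths are the usual ones, an assumption rather than taken from
# this snippet):
from pytorch_lightning import Trainer
from pytorch_lightning.profiler import SimpleProfiler

Trainer(profiler="simple")          # looked up in the PROFILERS registry
Trainer(profiler="advanced")        # ditto, resolves to AdvancedProfiler
Trainer(profiler=SimpleProfiler())  # a BaseProfiler instance is used as-is
Trainer(profiler=True)              # deprecated bool path: SimpleProfiler + warning
Trainer(profiler=None)              # falls through to PassThroughProfiler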
def test_simple_profiler_distributed_files(tmpdir):
    """Ensure the proper files are saved in distributed"""
    profiler = SimpleProfiler(dirpath=tmpdir, filename='profiler')
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=2,
        accelerator="ddp_cpu",
        num_processes=2,
        profiler=profiler,
        logger=False,
    )
    trainer.fit(model)
    trainer.validate(model)
    trainer.test(model)

    actual = set(os.listdir(profiler.dirpath))
    expected = {f"{stage}-profiler-{rank}.txt" for stage in ("fit", "validate", "test") for rank in (0, 1)}
    assert actual == expected

    for f in profiler.dirpath.listdir():
        assert f.read_text('utf-8')
def test_simple_profiler_iterable_durations(tmpdir, action: str, expected: list):
    """Ensure the reported durations are reasonably accurate."""

    def _sleep_generator(durations):
        """the profile_iterable method needs an iterable in which we can ensure
        that we're properly timing how long it takes to call __next__"""
        for duration in durations:
            time.sleep(duration)
            yield duration

    def _get_python_cprofile_total_duration(profile):
        return sum(x.inlinetime for x in profile.getstats())

    simple_profiler = SimpleProfiler()
    iterable = _sleep_generator(expected)
    with pytest.deprecated_call(
        match="`SimpleProfiler.profile_iterable` is deprecated in v1.6 and will be removed in v1.8."
    ):
        for _ in simple_profiler.profile_iterable(iterable, action):
            pass

    # we exclude the last item in the recorded durations since that's when StopIteration is raised
    np.testing.assert_allclose(simple_profiler.recorded_durations[action][:-1], expected, rtol=0.2)

    advanced_profiler = AdvancedProfiler(dirpath=tmpdir, filename="profiler")
    iterable = _sleep_generator(expected)
    with pytest.deprecated_call(
        match="`AdvancedProfiler.profile_iterable` is deprecated in v1.6 and will be removed in v1.8."
    ):
        for _ in advanced_profiler.profile_iterable(iterable, action):
            pass

    recorded_total_duration = _get_python_cprofile_total_duration(advanced_profiler.profiled_actions[action])
    expected_total_duration = np.sum(expected)
    np.testing.assert_allclose(recorded_total_duration, expected_total_duration, rtol=0.2)
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)
    model = LightningModel(cfg)
    checkpoint_callback = ModelCheckpoint(
        filepath=f"{cfg.checkpoint_path}/{cfg.name}/{cfg.version}/"
        f"{cfg.name}_{cfg.version}_{{epoch}}_{{avg_val_loss:.3f}}_{{ade:.3f}}_{{fde:.3f}}_{{fiou:.3f}}",
        save_last=True,
        save_top_k=8,
        verbose=True,
        monitor='fiou',
        mode='max',
        prefix='')
    lr_logger_callback = LearningRateLogger(logging_interval='step')
    logger = TensorBoardLogger(save_dir=cfg.log_path, name=cfg.name, version=cfg.version)
    logger.log_hyperparams(model.hparams)
    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()
    check_val_every_n_epoch = cfg.check_val_every_n_epoch if hasattr(cfg, 'check_val_every_n_epoch') else 1
    trainer = pl.Trainer(
        gpus=cfg.num_gpus,
        max_epochs=cfg.max_epochs,
        logger=logger,
        profiler=profiler,  # this line won't work in multi-gpu setting.
        weights_summary="top",
        gradient_clip_val=cfg.gradient_clip_val,
        callbacks=[lr_logger_callback],
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        accumulate_grad_batches=cfg.batch_size_times,
        check_val_every_n_epoch=check_val_every_n_epoch)
    if (not (args.train or args.test)) or args.train:
        shutil.copy(
            args.config,
            os.path.join(cfg.log_path, cfg.name, cfg.version, args.config.split('/')[-1]))
        if cfg.load_from_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.load_from_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.fit(model)
    if args.test:
        if cfg.test_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.test_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.test(model)
def test_simple_profiler_log_dir(tmpdir):
    """Ensure the profiler dirpath defaults to `trainer.log_dir` when not present"""
    profiler = SimpleProfiler(filename="profiler")
    assert profiler._log_dir is None

    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, profiler=profiler)
    trainer.fit(model)

    expected = tmpdir / "lightning_logs" / "version_0"
    assert trainer.log_dir == expected
    assert profiler._log_dir == trainer.log_dir
    assert expected.join("fit-profiler.txt").exists()
def test_simple_profiler_with_nonexisting_dirpath(tmpdir):
    """Ensure the profiler creates non-existing dirpath."""
    nonexisting_tmpdir = tmpdir / "nonexisting"
    profiler = SimpleProfiler(dirpath=nonexisting_tmpdir, filename="profiler")
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir, max_epochs=1, limit_train_batches=1, limit_val_batches=1, profiler=profiler
    )
    trainer.fit(model)
    assert nonexisting_tmpdir.exists()
    assert nonexisting_tmpdir.join("fit-profiler.txt").exists()
def create_profiler(profiler_params, checkpoint_path):
    if profiler_params is None:
        return None

    if profiler_params.save_profile:
        output_filename = checkpoint_path / 'profile.log'
    else:
        output_filename = None

    if profiler_params.name == 'simple':
        return SimpleProfiler(output_filename)
    elif profiler_params.name == 'advanced':
        return AdvancedProfiler(output_filename)
    else:
        raise ValueError(
            'Given type of profiler is not supported. Use `simple` or `advanced`'
        )
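# A hypothetical call to create_profiler. `profiler_params` only needs `.name`
# and `.save_profile` attributes, so a SimpleNamespace stands in here; the
# concrete values are illustrative, not from the original project:
from pathlib import Path
from types import SimpleNamespace

params = SimpleNamespace(name='simple', save_profile=True)
profiler = create_profiler(params, Path('checkpoints'))
# -> SimpleProfiler writing its report to checkpoints/profile.log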
def test_simple_profiler_log_dir(tmpdir):
    """Ensure the profiler dirpath defaults to `trainer.log_dir` when not present"""
    profiler = SimpleProfiler(filename="profiler")
    assert profiler._log_dir is None

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        profiler=profiler,
    )
    trainer.fit(model)

    expected = profiler.dirpath
    assert trainer.log_dir == expected
    assert profiler._log_dir == trainer.log_dir
    assert Path(os.path.join(profiler.dirpath, "fit-profiler.txt")).exists()
def test_simple_profiler_with_nonexisting_log_dir(tmpdir):
    """Ensure the profiler dirpath defaults to `trainer.log_dir` and creates it when not present."""
    nonexisting_tmpdir = tmpdir / "nonexisting"
    profiler = SimpleProfiler(filename="profiler")
    assert profiler.dirpath is None

    model = BoringModel()
    trainer = Trainer(
        default_root_dir=nonexisting_tmpdir,
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=1,
        profiler=profiler,
    )
    trainer.fit(model)

    expected = nonexisting_tmpdir / "lightning_logs" / "version_0"
    assert expected.exists()
    assert trainer.log_dir == expected
    assert profiler.dirpath == trainer.log_dir
    assert expected.join("fit-profiler.txt").exists()
def init_trainer(project_config: dict, run_config: dict, logger, callbacks: list) -> pl.Trainer:
    """Initialize PyTorch Lightning Trainer"""
    # Get path to checkpoint you want to resume with if it was set in the run config
    resume_from_checkpoint = run_config.get("resume_training", {}).get("checkpoint_path", None)

    trainer = pl.Trainer(
        # whether to use gpu and how many
        gpus=project_config["num_of_gpus"],
        # experiment logging
        logger=logger,
        # useful callbacks
        callbacks=callbacks,
        # resume training from checkpoint if it was set in the run config
        resume_from_checkpoint=resume_from_checkpoint
        if resume_from_checkpoint != "None" and resume_from_checkpoint != "False" and resume_from_checkpoint is not False
        else None,
        # print related
        progress_bar_refresh_rate=project_config["printing"]["progress_bar_refresh_rate"],
        profiler=SimpleProfiler() if project_config["printing"]["profiler"] else None,
        weights_summary=project_config["printing"]["weights_summary"],
        # number of validation sanity checks
        num_sanity_val_steps=3,
        # default log dir if no logger is found
        default_root_dir=os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs/lightning_logs"),
        # insert all other trainer parameters specified in run config
        **run_config["trainer"])
    return trainer
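# A sketch of the two config dicts init_trainer expects, inferred purely from
# the keys it reads above; every concrete value is an illustrative assumption:
project_config = {
    "num_of_gpus": 1,
    "printing": {
        "progress_bar_refresh_rate": 20,
        "profiler": True,            # -> SimpleProfiler()
        "weights_summary": "top",
    },
}
run_config = {
    "resume_training": {"checkpoint_path": "None"},  # "None"/"False" mean no resume
    "trainer": {"max_epochs": 10},   # forwarded verbatim via **run_config["trainer"]
}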
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)
    model = LightningTransformer(cfg)
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(
            cfg.checkpoint_path, cfg.name, cfg.version,
            "{}_{}_{{epoch}}_{{val_loss_per_word}}".format(cfg.name, cfg.version)),
        save_last=True,
        save_top_k=8,
        verbose=True,
        monitor='val_loss_per_word',
        mode='min',
        prefix='')
    lr_logger_callback = LearningRateLogger(logging_interval='step')
    logger = TensorBoardLogger(save_dir=cfg.log_path, name=cfg.name, version=cfg.version)
    logger.log_hyperparams(model.hparams)
    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()
    trainer = pl.Trainer(
        gpus=cfg.num_gpus,
        max_epochs=cfg.max_epochs,
        logger=logger,
        profiler=profiler,
        weights_summary="top",
        callbacks=[lr_logger_callback],
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        accumulate_grad_batches=cfg.batch_size_times)
    if cfg.load_from_checkpoint is not None:
        ckpt = torch.load(cfg.load_from_checkpoint, map_location=lambda storage, loc: storage)
        model.load_state_dict(ckpt['state_dict'])
    trainer.fit(model)
def _create_pl_profiler(self):
    # Only if an experiment directory exists
    if self.exp_main_dir:
        prof_out_file = os.path.join(self.cfg["setup_cfg"]["exp_main_dir"], "runtime_profiling.txt")
    else:
        return None

    # the original mixed "training_cfg" and "train_cfg" for the same lookup; unified here
    which_profiler = self.cfg["train_cfg"]["pl_which_profiler"].lower()
    if which_profiler == "simple":
        return SimpleProfiler(output_filename=prof_out_file, extended=True)
    elif which_profiler == "advanced":
        return AdvancedProfiler(output_filename=prof_out_file, line_count_restriction=1.0)
    elif which_profiler in ["none", ""]:
        return None
    else:
        raise NotImplementedError
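# A sketch of the config slice _create_pl_profiler reads, with illustrative
# values; the key layout mirrors the lookups above (using the unified
# "train_cfg" key) and nothing more:
cfg = {
    "setup_cfg": {"exp_main_dir": "experiments/run_001"},
    "train_cfg": {"pl_which_profiler": "simple"},  # "advanced", "none" or "" also accepted
}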
def test_simple_profiler_deepcopy(tmpdir):
    simple_profiler = SimpleProfiler(dirpath=tmpdir, filename="test")
    simple_profiler.describe()
    assert deepcopy(simple_profiler)
def simple_profiler():
    profiler = SimpleProfiler()
    return profiler
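# A minimal sketch of what the fixture's SimpleProfiler records, using only the
# `profile()` context manager and `recorded_durations` dict that the
# surrounding tests rely on; the action name "my_step" is illustrative:
import time

profiler = SimpleProfiler()
with profiler.profile("my_step"):
    time.sleep(0.01)
assert profiler.recorded_durations["my_step"]  # one wall-clock duration recorded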
def __init__(
        self,
        logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True,
        checkpoint_callback: Union[ModelCheckpoint, bool] = True,
        early_stop_callback: Optional[Union[EarlyStopping, bool]] = False,
        callbacks: Optional[List[Callback]] = None,
        default_root_dir: Optional[str] = None,
        gradient_clip_val: float = 0,
        process_position: int = 0,
        num_nodes: int = 1,
        num_processes: int = 1,
        gpus: Optional[Union[List[int], str, int]] = None,
        auto_select_gpus: bool = False,
        tpu_cores: Optional[Union[List[int], int]] = None,
        log_gpu_memory: Optional[str] = None,
        progress_bar_refresh_rate: int = 1,
        overfit_pct: float = 0.0,
        track_grad_norm: int = -1,
        check_val_every_n_epoch: int = 1,
        fast_dev_run: bool = False,
        accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
        max_epochs: int = 1000,
        min_epochs: int = 1,
        max_steps: Optional[int] = None,
        min_steps: Optional[int] = None,
        train_percent_check: float = 1.0,
        val_percent_check: float = 1.0,
        test_percent_check: float = 1.0,
        val_check_interval: float = 1.0,
        log_save_interval: int = 100,
        row_log_interval: int = 10,
        add_row_log_interval=None,  # backward compatible, todo: remove in v0.8.0
        distributed_backend: Optional[str] = None,
        precision: int = 32,
        print_nan_grads: bool = False,  # backward compatible, todo: remove in v0.9.0
        weights_summary: Optional[str] = 'full',
        weights_save_path: Optional[str] = None,
        num_sanity_val_steps: int = 2,
        truncated_bptt_steps: Optional[int] = None,
        resume_from_checkpoint: Optional[str] = None,
        profiler: Optional[Union[BaseProfiler, bool]] = None,
        benchmark: bool = False,
        deterministic: bool = False,
        reload_dataloaders_every_epoch: bool = False,
        auto_lr_find: Union[bool, str] = False,
        replace_sampler_ddp: bool = True,
        terminate_on_nan: bool = False,
        auto_scale_batch_size: Union[str, bool] = False,
        num_tpu_cores: Optional[int] = None,  # backward compatible, todo: remove in v0.9.0
        amp_level: str = 'O1',  # backward compatible, todo: remove in v0.8.0
        default_save_path=None,  # backward compatible, todo: remove in v0.8.0
        gradient_clip=None,  # backward compatible, todo: remove in v0.8.0
        nb_gpu_nodes=None,  # backward compatible, todo: remove in v0.8.0
        max_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
        min_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
        use_amp=None,  # backward compatible, todo: remove in v0.9.0
        show_progress_bar=None,  # backward compatible, todo: remove in v0.9.0
        nb_sanity_val_steps=None,  # backward compatible, todo: remove in v0.8.0
):
    r"""Customize every aspect of training via flags.

    Args:
        logger: Logger (or iterable collection of loggers) for experiment tracking.
        checkpoint_callback: Callback for checkpointing.
        early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`):
        callbacks: Add a list of callbacks.
        default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed
        default_save_path: .. warning:: .. deprecated:: 0.7.3
            Use `default_root_dir` instead. Will remove 0.9.0.
        gradient_clip_val: 0 means don't clip.
        gradient_clip: .. warning:: .. deprecated:: 0.7.0
            Use `gradient_clip_val` instead. Will remove 0.9.0.
        process_position: orders the progress bar when running multiple models on same machine.
        num_nodes: number of GPU nodes for distributed training.
        nb_gpu_nodes: .. warning:: .. deprecated:: 0.7.0
            Use `num_nodes` instead. Will remove 0.9.0.
        gpus: Which GPUs to train on.
        auto_select_gpus: If enabled and `gpus` is an integer, pick available gpus automatically.
            This is especially useful when GPUs are configured to be in "exclusive mode",
            such that only one process at a time can access them.
        tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1]
        num_tpu_cores: How many TPU cores to train on (1 or 8)
            .. warning:: .. deprecated:: 0.7.6. Will remove 0.9.0.
        log_gpu_memory: None, 'min_max', 'all'. Might slow performance
        show_progress_bar: .. warning:: .. deprecated:: 0.7.2
            Set `progress_bar_refresh_rate` to positive integer to enable. Will remove 0.9.0.
        progress_bar_refresh_rate: How often to refresh progress bar (in steps).
            Value ``0`` disables progress bar. Ignored when a custom callback is passed to
            :paramref:`~Trainer.callbacks`.
        overfit_pct: How much of training-, validation-, and test dataset to check.
        track_grad_norm: -1 no tracking. Otherwise tracks that norm
        check_val_every_n_epoch: Check val every n train epochs.
        fast_dev_run: runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
        accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict.
        max_epochs: Stop training once this number of epochs is reached.
        max_nb_epochs: .. warning:: .. deprecated:: 0.7.0
            Use `max_epochs` instead. Will remove 0.9.0.
        min_epochs: Force training for at least this many epochs
        min_nb_epochs: .. warning:: .. deprecated:: 0.7.0
            Use `min_epochs` instead. Will remove 0.9.0.
        max_steps: Stop training after this number of steps. Disabled by default (None).
        min_steps: Force training for at least this number of steps. Disabled by default (None).
        train_percent_check: How much of training dataset to check.
        val_percent_check: How much of validation dataset to check.
        test_percent_check: How much of test dataset to check.
        val_check_interval: How often within one training epoch to check the validation set
        log_save_interval: Writes logs to disk this often
        row_log_interval: How often to add logging rows (does not write to disk)
        add_row_log_interval: .. warning:: .. deprecated:: 0.7.0
            Use `row_log_interval` instead. Will remove 0.9.0.
        distributed_backend: The distributed backend to use.
        use_amp: .. warning:: .. deprecated:: 0.7.0
            Use `precision` instead. Will remove 0.9.0.
        precision: Full precision (32), half precision (16).
        print_nan_grads: .. warning:: .. deprecated:: 0.7.2
            Has no effect. When detected, NaN grads will be printed automatically.
            Will remove 0.9.0.
        weights_summary: Prints a summary of the weights when training begins.
        weights_save_path: Where to save weights if specified. Will override default_root_dir
            for checkpoints only. Use this if for whatever reason you need the checkpoints
            stored in a different place than the logs written in `default_root_dir`.
        amp_level: The optimization level to use (O1, O2, etc...).
        num_sanity_val_steps: Sanity check runs n batches of val before starting the training routine.
        nb_sanity_val_steps: .. warning:: .. deprecated:: 0.7.0
            Use `num_sanity_val_steps` instead. Will remove 0.8.0.
        truncated_bptt_steps: Truncated back prop performs backprop every k steps of
            a much longer sequence.
        resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
        profiler: To profile individual steps during training and assist in
            identifying bottlenecks.
        reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch
        auto_lr_find: If set to True, will `initially` run a learning rate finder,
            trying to optimize initial learning for faster convergence. Sets learning
            rate in self.lr or self.learning_rate in the LightningModule.
            To use a different key, set a string instead of True with the key name.
        replace_sampler_ddp: Explicitly enables or disables sampler replacement.
            If not specified, this will be toggled automatically when ddp is used
        benchmark: If true enables cudnn.benchmark.
        deterministic: If true enables cudnn.deterministic
        terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`)
            at the end of each training batch, if any of the parameters or the loss are NaN or +/-inf.
        auto_scale_batch_size: If set to True, will `initially` run a batch size finder trying
            to find the largest batch size that fits into memory. The result will be stored
            in self.batch_size in the LightningModule. Additionally, can be set to either
            `power` that estimates the batch size through a power search or `binsearch` that
            estimates the batch size through a binary search.
    """
    super().__init__()

    self.deterministic = deterministic
    torch.backends.cudnn.deterministic = self.deterministic
    if self.deterministic:
        # fixing non-deterministic part of horovod
        # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
        os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

    # Init callbacks
    self.callbacks = callbacks or []
    self.on_init_start()

    # benchmarking
    self.benchmark = benchmark
    torch.backends.cudnn.benchmark = self.benchmark

    # Transfer params
    self.num_nodes = num_nodes
    # Backward compatibility, TODO: remove in v0.8.0
    if nb_gpu_nodes is not None:
        rank_zero_warn(
            "Argument `nb_gpu_nodes` has renamed to `num_nodes` since v0.5.0"
            " and this method will be removed in v0.8.0", DeprecationWarning)
        self.num_gpu_nodes = nb_gpu_nodes
    self.log_gpu_memory = log_gpu_memory

    self.gradient_clip_val = gradient_clip_val
    # Backward compatibility, TODO: remove in v0.8.0
    if gradient_clip is not None:
        rank_zero_warn(
            "Argument `gradient_clip` has renamed to `gradient_clip_val` since v0.5.0"
            " and this method will be removed in v0.8.0", DeprecationWarning)
        self.gradient_clip = gradient_clip

    self.check_val_every_n_epoch = check_val_every_n_epoch
    self.track_grad_norm = track_grad_norm
    self.on_gpu = True if (gpus and torch.cuda.is_available()) else False

    # tpu config
    if num_tpu_cores is not None:
        rank_zero_warn(
            "Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6"
            " and this argument will be removed in v0.9.0", DeprecationWarning)
        if tpu_cores is None:
            tpu_cores = num_tpu_cores
    self.on_tpu = tpu_cores is not None
    self.tpu_cores = tpu_cores
    assert self.tpu_cores in (1, 8, None) or (
        isinstance(self.tpu_cores, (list, tuple, set)) and len(self.tpu_cores) == 1
    ), '`tpu_cores` can only be 1, 8 or [<1-8>]'

    self.tpu_id = tpu_cores[0] if isinstance(tpu_cores, list) else None

    if num_processes != 1 and distributed_backend != "ddp_cpu":
        rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.")
    self.num_processes = num_processes

    self.weights_summary = weights_summary

    self.max_epochs = max_epochs
    # Backward compatibility, TODO: remove in v0.8.0
    if max_nb_epochs is not None:
        rank_zero_warn(
            "Argument `max_nb_epochs` has renamed to `max_epochs` since v0.5.0"
            " and this method will be removed in v0.8.0", DeprecationWarning)
        self.max_nb_epochs = max_nb_epochs

    self.min_epochs = min_epochs
    # Backward compatibility, TODO: remove in v0.8.0
    if min_nb_epochs is not None:
        rank_zero_warn(
            "Argument `min_nb_epochs` has renamed to `min_epochs` since v0.5.0"
            " and this method will be removed in v0.8.0", DeprecationWarning)
        self.min_nb_epochs = min_nb_epochs

    self.max_steps = max_steps
    self.min_steps = min_steps

    self.num_sanity_val_steps = num_sanity_val_steps
    # Backward compatibility, TODO: remove in v0.8.0
    if nb_sanity_val_steps is not None:
        rank_zero_warn(
            "Argument `nb_sanity_val_steps` has renamed to "
            "`num_sanity_val_steps` since v0.5.0"
            " and this method will be removed in v0.8.0", DeprecationWarning)
        self.nb_sanity_val_steps = nb_sanity_val_steps

    # Backward compatibility, TODO: remove in v0.9.0
    if print_nan_grads:
        rank_zero_warn(
            "Argument `print_nan_grads` has no effect and will be removed in v0.9.0."
            " NaN grads will be printed automatically when detected.", DeprecationWarning)

    self.reload_dataloaders_every_epoch = reload_dataloaders_every_epoch
    self.auto_lr_find = auto_lr_find
    self.auto_scale_batch_size = auto_scale_batch_size
    self._is_data_prepared = False
    self.replace_sampler_ddp = replace_sampler_ddp

    self.truncated_bptt_steps = truncated_bptt_steps
    self.resume_from_checkpoint = resume_from_checkpoint
    self.terminate_on_nan = terminate_on_nan
    self.shown_warnings = set()

    self.fast_dev_run = fast_dev_run
    if self.fast_dev_run:
        self.num_sanity_val_steps = 0
        self.max_epochs = 1
        log.info('Running in fast_dev_run mode: will run a full train,'
                 ' val and test loop using a single batch')

    # set default save path if user didn't provide one
    self.default_root_dir = default_root_dir
    # Backward compatibility, TODO: remove in v0.8.0
    if default_save_path is not None:
        self.default_root_dir = default_save_path
    if self.default_root_dir is None:
        self.default_root_dir = os.getcwd()

    # training bookkeeping
    self.total_batch_idx = 0
    self.running_loss = TensorRunningAccum(window_length=20)
    self.batch_idx = 0
    self.progress_bar_metrics = {}
    self.callback_metrics = {}
    self.num_val_batches = 0
    self.num_training_batches = 0
    self.num_test_batches = 0
    self.train_dataloader = None
    self.test_dataloaders = None
    self.val_dataloaders = None

    # training state
    self.model = None
    self.testing = False
    self.disable_validation = False
    self.lr_schedulers = []
    self.optimizers = None
    self.optimizer_frequencies = []
    self.global_step = 0
    self.current_epoch = 0
    self.interrupted = False

    # configure logger
    self.configure_logger(logger)

    # configure profiler
    if profiler is True:
        profiler = SimpleProfiler()
    self.profiler = profiler or PassThroughProfiler()

    # configure early stop callback
    # creates a default one if none passed in
    self.configure_early_stopping(early_stop_callback)

    # configure checkpoint callback
    self.checkpoint_callback = checkpoint_callback
    self.weights_save_path = weights_save_path

    # accumulated grads
    self.accumulate_grad_batches = accumulate_grad_batches
    self.configure_accumulated_gradients(accumulate_grad_batches)

    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.gpus = pick_multiple_gpus(gpus)
    else:
        self.gpus = gpus

    self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
    self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)
    self.root_device = torch.device("cpu")

    # tpu state flags
    self.use_tpu = False
    self.tpu_local_core_rank = None
    self.tpu_global_core_rank = None

    # distributed backend choice
    self.distributed_backend = distributed_backend
    self.set_distributed_mode(distributed_backend)

    # override dist backend when using tpus
    if self.on_tpu:
        self.init_tpu()

    # init flags for SLURM+ddp to work
    self.proc_rank = 0
    self.world_size = 1
    self.configure_slurm_ddp(self.num_nodes)
    self.node_rank = self.determine_ddp_node_rank()

    # nvidia setup
    self.set_nvidia_flags(self.is_slurm_managing_tasks, self.data_parallel_device_ids)

    # backward compatibility
    if show_progress_bar is not None:
        self.show_progress_bar = show_progress_bar

    self._progress_bar_callback = self.configure_progress_bar(progress_bar_refresh_rate, process_position)

    # logging
    self.log_save_interval = log_save_interval
    self.val_check_interval = val_check_interval

    # backward compatibility
    if add_row_log_interval is not None:
        rank_zero_warn(
            "`add_row_log_interval` has renamed to `row_log_interval` since v0.5.0"
            " and this method will be removed in v0.8.0", DeprecationWarning)
        if not row_log_interval:  # in case you did not set the proper value
            row_log_interval = add_row_log_interval
    self.row_log_interval = row_log_interval

    # how much of the data to use
    self.overfit_pct = overfit_pct
    self.determine_data_use_amount(train_percent_check, val_percent_check,
                                   test_percent_check, overfit_pct)

    # AMP init
    # These are the only lines needed after v0.8.0
    # we wrap the user's forward with autocast and give it back at the end of fit
    self.autocast_original_forward = None
    self.use_native_amp = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast")
    self.precision = precision
    self.scaler = None

    # TODO: remove for v0.8.0
    self.amp_level = amp_level
    self.init_amp(use_amp)

    self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE')

    # Callback system
    self.on_init_end()
def simple_profiler():
    return SimpleProfiler()
def objective(self, trial):
    self.modify_config(trial)
    if not os.path.exists(self.study_dir):
        os.mkdir(self.study_dir)
    if not os.path.exists(os.path.join(self.study_dir, "trial_{}".format(trial.number))):
        os.mkdir(os.path.join(self.study_dir, "trial_{}".format(trial.number)))
    logger = TensorBoardLogger(self.study_dir, name="trial_{}".format(trial.number), default_hp_metric=False)
    log_folder = logger.log_dir
    if not os.path.exists(log_folder):
        os.makedirs(log_folder, exist_ok=True)

    trainer_args = self.trainer_args
    checkpoint_callback = ModelCheckpoint(
        dirpath=log_folder,
        filename='{epoch}-{val_loss:.2f}',
        monitor="val_loss")
    trainer_args["logger"] = logger
    trainer_args["default_root_dir"] = self.study_dir
    set_default_trainer_args(trainer_args, self.config)
    if trainer_args["profiler"]:
        profiler = SimpleProfiler(output_filename=os.path.join(log_folder, "profile_results.txt"))
        trainer_args["profiler"] = profiler
    save_config(self.config, log_folder, "trial_{}".format(trial.number), "config")
    # save_config(DictionaryUtility.to_object(trainer_args), log_folder,
    #             "trial_{}".format(trial.number), "train_args")

    cbs = [LoggingCallback(), PruningCallback(), checkpoint_callback]
    # trainer_args["early_stop_callback"] = PyTorchLightningPruningCallback(trial, monitor="val_early_stop_on")
    if self.config.run_config.run_class == "LitZ":
        cbs.append(EarlyStopping(monitor='val_loss', min_delta=.00, verbose=True, mode="min", patience=5))
    else:
        cbs.append(EarlyStopping(monitor='val_loss', min_delta=.00, verbose=True, mode="min", patience=4))

    trainer = pl.Trainer(**trainer_args, callbacks=cbs)
    modules = ModuleUtility(self.config.run_config.imports)
    model = modules.retrieve_class(self.config.run_config.run_class)(self.config, trial)
    data_module = PSDDataModule(self.config, model.device)
    try:
        trainer.fit(model, datamodule=data_module)
        loss = trainer.checkpoint_callback.best_model_score
        self.log.info("best loss found for trial {0} is {1}".format(trial.number, loss))
    except RuntimeError as e:
        print("Caught error during trial {0}, moving to next trial. Error message below.".format(trial.number))
        print(e)
        self.log.info("Trial {0} failed with error {1}".format(trial.number, e))
        gc.collect()
        loss = None
    return loss
def __init__(
        self,
        logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True,
        checkpoint_callback: Union[ModelCheckpoint, bool] = True,
        early_stop_callback: Optional[Union[EarlyStopping, bool]] = False,
        callbacks: Optional[List[Callback]] = None,
        default_root_dir: Optional[str] = None,
        gradient_clip_val: float = 0,
        process_position: int = 0,
        num_nodes: int = 1,
        num_processes: int = 1,
        gpus: Optional[Union[List[int], str, int]] = None,
        auto_select_gpus: bool = False,
        tpu_cores: Optional[Union[List[int], str, int]] = None,
        log_gpu_memory: Optional[str] = None,
        progress_bar_refresh_rate: int = 1,
        overfit_batches: Union[int, float] = 0.0,
        track_grad_norm: Union[int, float, str] = -1,
        check_val_every_n_epoch: int = 1,
        fast_dev_run: bool = False,
        accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
        max_epochs: int = 1000,
        min_epochs: int = 1,
        max_steps: Optional[int] = None,
        min_steps: Optional[int] = None,
        limit_train_batches: Union[int, float] = 1.0,
        limit_val_batches: Union[int, float] = 1.0,
        limit_test_batches: Union[int, float] = 1.0,
        val_check_interval: Union[int, float] = 1.0,
        log_save_interval: int = 100,
        row_log_interval: int = 50,
        distributed_backend: Optional[str] = None,
        sync_batchnorm: bool = False,
        precision: int = 32,
        weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT,
        weights_save_path: Optional[str] = None,
        num_sanity_val_steps: int = 2,
        truncated_bptt_steps: Optional[int] = None,
        resume_from_checkpoint: Optional[str] = None,
        profiler: Optional[Union[BaseProfiler, bool]] = None,
        benchmark: bool = False,
        deterministic: bool = False,
        reload_dataloaders_every_epoch: bool = False,
        auto_lr_find: Union[bool, str] = False,
        replace_sampler_ddp: bool = True,
        terminate_on_nan: bool = False,
        auto_scale_batch_size: Union[str, bool] = False,
        prepare_data_per_node: bool = True,
        amp_backend: str = 'native',
        amp_level: str = 'O2',  # backward compatible, todo: remove in v1.0.0
        val_percent_check: float = None,  # backward compatible, todo: remove in v0.10.0
        test_percent_check: float = None,  # backward compatible, todo: remove in v0.10.0
        train_percent_check: float = None,  # backward compatible, todo: remove in v0.10.0
        overfit_pct: float = None,  # backward compatible, todo: remove in v1.0.0
):
    super().__init__()

    self.deterministic = deterministic
    torch.backends.cudnn.deterministic = self.deterministic
    if self.deterministic:
        # fixing non-deterministic part of horovod
        # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
        os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)

    # init the default rank if exists
    # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
    # this way we only show it on rank 0
    if 'LOCAL_RANK' in os.environ:
        rank_zero_only.rank = int(os.environ['LOCAL_RANK'])

    # tracks internal state for debugging
    self.dev_debugger = InternalDebugger(self)
    self.config_validator = ConfigValidator(self)
    self.data_connector = DataConnector(self)
    self.lr_scheduler_connector = LRSchedulerConnector(self)
    self.accelerator_connector = AcceleratorConnector(self)
    self.logger_connector = LoggerConnector(self)
    self.model_connector = ModelConnector(self)
    self.initializer = Initializer(self)
    self.tuner = Tuner(self)
    self.accelerator_backend = None

    # loops
    self.evaluation_loop = EvaluationLoop(self)
    self.train_loop = TrainLoop(self)

    # training bookkeeping
    self.total_batch_idx = 0
    self.running_loss = TensorRunningAccum(window_length=20)
    self.batch_idx = 0
    self.num_training_batches = 0
    self.num_val_batches = []
    self.num_sanity_val_batches = []
    self.num_test_batches = []
    self.train_dataloader = None
    self.test_dataloaders = None
    self.val_dataloaders = None

    # when true, prints test results
    self.verbose_test = True

    # when .test() is called, it sets this
    self.tested_ckpt_path = None

    # training state
    self.model = None
    self.datamodule = None
    self.testing = False
    self.prepare_data_per_node = prepare_data_per_node
    self.lr_schedulers = []
    self.optimizers = None
    self.optimizer_frequencies = []
    self.global_step = 0
    self.current_epoch = 0
    self.interrupted = False
    self.should_stop = False
    self.running_sanity_check = False
    self._state = TrainerState.INITIALIZING

    self._default_root_dir = default_root_dir or os.getcwd()
    self._weights_save_path = weights_save_path or self._default_root_dir

    # init callbacks
    self.callbacks = callbacks or []

    # configure early stop callback
    # creates a default one if none passed in
    early_stop_callback = self.configure_early_stopping(early_stop_callback)
    if early_stop_callback:
        self.callbacks.append(early_stop_callback)

    # configure checkpoint callback
    # it is important that this is the last callback to run
    # pass through the required args to figure out defaults
    checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback)
    if checkpoint_callback:
        self.callbacks.append(checkpoint_callback)

    # TODO refactor codebase (tests) to not directly reach into these callbacks
    self.checkpoint_callback = checkpoint_callback
    self.early_stop_callback = early_stop_callback

    self.on_init_start()

    # benchmarking
    self.benchmark = benchmark
    torch.backends.cudnn.benchmark = self.benchmark

    # Transfer params
    self.num_nodes = num_nodes
    self.log_gpu_memory = log_gpu_memory

    # sync-bn backend
    self.sync_batchnorm = sync_batchnorm

    self.gradient_clip_val = gradient_clip_val
    self.check_val_every_n_epoch = check_val_every_n_epoch

    if not isinstance(track_grad_norm, (int, float)) and track_grad_norm != 'inf':
        raise MisconfigurationException("track_grad_norm can be an int, a float or 'inf' (infinity norm).")
    self.track_grad_norm = float(track_grad_norm)

    self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)
    self.on_tpu = self.tpu_cores is not None
    self.tpu_id = self.tpu_cores[0] if isinstance(self.tpu_cores, list) else None

    if num_processes != 1 and distributed_backend != "ddp_cpu":
        rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.")
    self.num_processes = num_processes

    self.weights_summary = weights_summary

    self.max_epochs = max_epochs
    self.min_epochs = min_epochs
    self.max_steps = max_steps
    self.min_steps = min_steps

    if num_sanity_val_steps == -1:
        self.num_sanity_val_steps = float('inf')
    else:
        self.num_sanity_val_steps = num_sanity_val_steps

    self.reload_dataloaders_every_epoch = reload_dataloaders_every_epoch

    self.auto_lr_find = auto_lr_find
    self.auto_scale_batch_size = auto_scale_batch_size
    self._is_data_prepared = False
    self.replace_sampler_ddp = replace_sampler_ddp

    self.truncated_bptt_steps = truncated_bptt_steps
    self.resume_from_checkpoint = resume_from_checkpoint
    self.terminate_on_nan = terminate_on_nan
    self.shown_warnings = set()

    self.fast_dev_run = fast_dev_run
    if self.fast_dev_run:
        limit_train_batches = 1
        limit_val_batches = 1
        limit_test_batches = 1
        self.num_sanity_val_steps = 0
        self.max_epochs = 1
        rank_zero_info('Running in fast_dev_run mode: will run a full train,'
                       ' val and test loop using a single batch')

    # configure profiler
    if profiler is True:
        profiler = SimpleProfiler()
    self.profiler = profiler or PassThroughProfiler()

    # accumulated grads
    self.accumulate_grad_batches = accumulate_grad_batches
    self.configure_accumulated_gradients(accumulate_grad_batches)

    # override with environment flag
    gpus = os.environ.get('PL_TRAINER_GPUS', gpus)

    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.gpus = self.tuner.pick_multiple_gpus(gpus)
    else:
        self.gpus = gpus

    self.data_parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
    self.root_gpu = device_parser.determine_root_gpu_device(self.data_parallel_device_ids)
    self.root_device = torch.device("cpu")

    self.on_gpu = True if (self.data_parallel_device_ids and torch.cuda.is_available()) else False

    # tpu state flags
    self.use_tpu = False
    self.tpu_local_core_rank = None
    self.tpu_global_core_rank = None

    # distributed backend choice
    self.distributed_backend = distributed_backend
    self.set_distributed_mode(distributed_backend)

    # override dist backend when using tpus
    if self.on_tpu:
        self.distributed_backend = 'tpu'
        self.init_tpu()

    # init flags for SLURM+DDP to work
    self.world_size = 1
    self.interactive_ddp_procs = []
    self.configure_slurm_ddp(self.num_nodes)
    self.node_rank = self.determine_ddp_node_rank()
    self.local_rank = self.determine_local_rank()
    self.global_rank = 0

    # NVIDIA setup
    self.set_nvidia_flags(self.is_slurm_managing_tasks, self.data_parallel_device_ids)

    self._progress_bar_callback = self.configure_progress_bar(progress_bar_refresh_rate, process_position)

    # logging
    self.configure_logger(logger)
    self.log_save_interval = log_save_interval
    self.row_log_interval = row_log_interval

    # how much of the data to use
    # TODO: remove in 0.10.0
    if overfit_pct is not None:
        rank_zero_warn(
            "Argument `overfit_pct` is now set by `overfit_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        overfit_batches = overfit_pct

    # TODO: remove in 0.10.0
    if val_percent_check is not None:
        rank_zero_warn(
            "Argument `val_percent_check` is now set by `limit_val_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        limit_val_batches = val_percent_check

    # TODO: remove in 0.10.0
    if test_percent_check is not None:
        rank_zero_warn(
            "Argument `test_percent_check` is now set by `limit_test_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        limit_test_batches = test_percent_check

    # TODO: remove in 0.10.0
    if train_percent_check is not None:
        rank_zero_warn(
            "Argument `train_percent_check` is now set by `limit_train_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        limit_train_batches = train_percent_check

    self.limit_train_batches = _determine_batch_limits(limit_train_batches, 'limit_train_batches')
    self.limit_val_batches = _determine_batch_limits(limit_val_batches, 'limit_val_batches')
    self.limit_test_batches = _determine_batch_limits(limit_test_batches, 'limit_test_batches')
    self.val_check_interval = _determine_batch_limits(val_check_interval, 'val_check_interval')
    self.overfit_batches = _determine_batch_limits(overfit_batches, 'overfit_batches')
    self.determine_data_use_amount(self.overfit_batches)

    # AMP init
    # These are the only lines needed after v0.8.0
    # we wrap the user's forward with autocast and give it back at the end of fit
    self.autocast_original_forward = None
    self.precision = precision
    self.scaler = None

    self.amp_level = amp_level
    self.initializer.init_amp(amp_backend)

    self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE')

    # Callback system
    self.on_init_end()
    # ... tail of a training_step method:
    if batch_idx % self.hparams.sync_batches == 0:
        self.model.alpha_sync(self.hparams.polyak)
    return actor_loss_v

def validation_step(self, batch, batch_idx):
    to_log = dict()
    for k, v in batch.items():
        to_log[k] = v.detach().cpu().numpy()
    to_log['epoch_nr'] = int(self.current_epoch)
    if self.logger is not None:
        self.logger.experiment.log(to_log)

if __name__ == '__main__':
    mp.set_start_method('spawn')
    hparams = get_args()
    if hparams.debug:
        hparams.logger = None
        hparams.profiler = SimpleProfiler()
    else:
        hparams.logger = WandbLogger(project=hparams.project)
    seed_everything(hparams.seed)
    her = HER(hparams)
    trainer = pl.Trainer.from_argparse_args(hparams)
    trainer.callbacks.append(SpawnCallback())
    trainer.fit(her)
def __init__(
        self,
        logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True,
        checkpoint_callback: Union[ModelCheckpoint, bool] = True,
        early_stop_callback: Optional[Union[EarlyStopping, bool]] = False,
        callbacks: List[Callback] = [],
        default_root_dir: Optional[str] = None,
        gradient_clip_val: float = 0,
        process_position: int = 0,
        num_nodes: int = 1,
        gpus: Optional[Union[List[int], str, int]] = None,
        auto_select_gpus: bool = False,
        num_tpu_cores: Optional[int] = None,
        log_gpu_memory: Optional[str] = None,
        progress_bar_refresh_rate: int = 1,
        overfit_pct: float = 0.0,
        track_grad_norm: int = -1,
        check_val_every_n_epoch: int = 1,
        fast_dev_run: bool = False,
        accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
        max_epochs: int = 1000,
        min_epochs: int = 1,
        max_steps: Optional[int] = None,
        min_steps: Optional[int] = None,
        train_percent_check: float = 1.0,
        val_percent_check: float = 1.0,
        test_percent_check: float = 1.0,
        val_check_interval: float = 1.0,
        log_save_interval: int = 100,
        row_log_interval: int = 10,
        add_row_log_interval=None,  # backward compatible, todo: remove in v0.8.0
        distributed_backend: Optional[str] = None,
        precision: int = 32,
        print_nan_grads: bool = False,  # backward compatible, todo: remove in v0.9.0
        weights_summary: Optional[str] = 'full',
        weights_save_path: Optional[str] = None,
        amp_level: str = 'O1',
        num_sanity_val_steps: int = 5,
        truncated_bptt_steps: Optional[int] = None,
        resume_from_checkpoint: Optional[str] = None,
        profiler: Optional[BaseProfiler] = None,
        benchmark: bool = False,
        reload_dataloaders_every_epoch: bool = False,
        auto_lr_find: Union[bool, str] = False,
        default_save_path=None,  # backward compatible, todo: remove in v0.8.0
        gradient_clip=None,  # backward compatible, todo: remove in v0.8.0
        nb_gpu_nodes=None,  # backward compatible, todo: remove in v0.8.0
        max_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
        min_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
        use_amp=None,  # backward compatible, todo: remove in v0.9.0
        show_progress_bar=None,  # backward compatible, todo: remove in v0.9.0
        nb_sanity_val_steps=None,  # backward compatible, todo: remove in v0.8.0
        terminate_on_nan: bool = False,
        **kwargs
):
    r"""Customize every aspect of training via flags.

    Args:
        logger: Logger (or iterable collection of loggers) for experiment tracking.
        checkpoint_callback: Callback for checkpointing.
        early_stop_callback (:class:`pytorch_lightning.callbacks.EarlyStopping`):
        callbacks: Add a list of callbacks.
        default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed
        default_save_path: .. warning:: .. deprecated:: 0.7.3
            Use `default_root_dir` instead. Will remove 0.9.0.
        gradient_clip_val: 0 means don't clip.
        gradient_clip: .. warning:: .. deprecated:: 0.7.0
            Use `gradient_clip_val` instead. Will remove 0.9.0.
        process_position: orders the tqdm bar when running multiple models on same machine.
        num_nodes: number of GPU nodes for distributed training.
        nb_gpu_nodes: .. warning:: .. deprecated:: 0.7.0
            Use `num_nodes` instead. Will remove 0.9.0.
        gpus: Which GPUs to train on.
        auto_select_gpus: If enabled and `gpus` is an integer, pick available gpus automatically.
            This is especially useful when GPUs are configured to be in "exclusive mode",
            such that only one process at a time can access them.
        num_tpu_cores: How many TPU cores to train on (1 or 8).
        log_gpu_memory: None, 'min_max', 'all'. Might slow performance
        show_progress_bar: .. warning:: .. deprecated:: 0.7.2
            Set `progress_bar_refresh_rate` to positive integer to enable. Will remove 0.9.0.
        progress_bar_refresh_rate: How often to refresh progress bar (in steps).
            Value ``0`` disables progress bar.
        overfit_pct: How much of training-, validation-, and test dataset to check.
        track_grad_norm: -1 no tracking. Otherwise tracks that norm
        check_val_every_n_epoch: Check val every n train epochs.
        fast_dev_run: runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
        accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict.
        max_epochs: Stop training once this number of epochs is reached.
        max_nb_epochs: .. warning:: .. deprecated:: 0.7.0
            Use `max_epochs` instead. Will remove 0.9.0.
        min_epochs: Force training for at least this many epochs
        min_nb_epochs: .. warning:: .. deprecated:: 0.7.0
            Use `min_epochs` instead. Will remove 0.9.0.
        max_steps: Stop training after this number of steps. Disabled by default (None).
        min_steps: Force training for at least this number of steps. Disabled by default (None).
        train_percent_check: How much of training dataset to check.
        val_percent_check: How much of validation dataset to check.
        test_percent_check: How much of test dataset to check.
        val_check_interval: How often within one training epoch to check the validation set
        log_save_interval: Writes logs to disk this often
        row_log_interval: How often to add logging rows (does not write to disk)
        add_row_log_interval: .. warning:: .. deprecated:: 0.7.0
            Use `row_log_interval` instead. Will remove 0.9.0.
        distributed_backend: The distributed backend to use.
        use_amp: .. warning:: .. deprecated:: 0.7.0
            Use `precision` instead. Will remove 0.9.0.
        precision: Full precision (32), half precision (16).
        print_nan_grads: .. warning:: .. deprecated:: 0.7.2
            Has no effect. When detected, NaN grads will be printed automatically.
            Will remove 0.9.0.
        weights_summary: Prints a summary of the weights when training begins.
        weights_save_path: Where to save weights if specified. Will override default_root_dir
            for checkpoints only. Use this if for whatever reason you need the checkpoints
            stored in a different place than the logs written in `default_root_dir`.
        amp_level: The optimization level to use (O1, O2, etc...).
        num_sanity_val_steps: Sanity check runs n batches of val before starting the training routine.
        nb_sanity_val_steps: .. warning:: .. deprecated:: 0.7.0
            Use `num_sanity_val_steps` instead. Will remove 0.8.0.
        truncated_bptt_steps: Truncated back prop performs backprop every k steps of
            a much longer sequence.
        resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
        profiler: To profile individual steps during training and assist in
            identifying bottlenecks.
        reload_dataloaders_every_epoch: Set to True to reload dataloaders every epoch
        auto_lr_find: If set to True, will `initially` run a learning rate finder,
            trying to optimize initial learning for faster convergence. Sets learning
            rate in self.hparams.lr | self.hparams.learning_rate in the lightning module.
            To use a different key, set a string instead of True with the key name.
        benchmark: If true enables cudnn.benchmark.
        terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`)
            at the end of each training batch, if any of the parameters or the loss are NaN or +/-inf.
    """
    # Init callbacks
    self.callbacks = callbacks
    self.on_init_start()

    # benchmarking
    self.benchmark = benchmark
    torch.backends.cudnn.benchmark = self.benchmark

    # Transfer params
    self.num_nodes = num_nodes
    # Backward compatibility, TODO: remove in v0.8.0
    if nb_gpu_nodes is not None:
        rank_zero_warn("Argument `nb_gpu_nodes` has renamed to `num_nodes` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.num_gpu_nodes = nb_gpu_nodes
    self.log_gpu_memory = log_gpu_memory

    self.gradient_clip_val = gradient_clip_val
    # Backward compatibility, TODO: remove in v0.8.0
    if gradient_clip is not None:
        rank_zero_warn("Argument `gradient_clip` has renamed to `gradient_clip_val` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.gradient_clip = gradient_clip

    self.progress_bar_refresh_rate = progress_bar_refresh_rate
    self.check_val_every_n_epoch = check_val_every_n_epoch
    self.track_grad_norm = track_grad_norm
    self.on_gpu = True if (gpus and torch.cuda.is_available()) else False

    # tpu config
    self.on_tpu = num_tpu_cores is not None
    self.num_tpu_cores = num_tpu_cores
    assert num_tpu_cores in [1, 8, None], 'num_tpu_cores can only be 1 or 8'

    self.process_position = process_position
    self.weights_summary = weights_summary

    self.max_epochs = max_epochs
    # Backward compatibility, TODO: remove in v0.8.0
    if max_nb_epochs is not None:
        rank_zero_warn("Argument `max_nb_epochs` has renamed to `max_epochs` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.max_nb_epochs = max_nb_epochs

    self.min_epochs = min_epochs
    # Backward compatibility, TODO: remove in v0.8.0
    if min_nb_epochs is not None:
        rank_zero_warn("Argument `min_nb_epochs` has renamed to `min_epochs` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.min_nb_epochs = min_nb_epochs

    self.max_steps = max_steps
    self.min_steps = min_steps

    self.num_sanity_val_steps = num_sanity_val_steps
    # Backward compatibility, TODO: remove in v0.8.0
    if nb_sanity_val_steps is not None:
        rank_zero_warn("Argument `nb_sanity_val_steps` has renamed to "
                       "`num_sanity_val_steps` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        self.nb_sanity_val_steps = nb_sanity_val_steps

    # Backward compatibility, TODO: remove in v0.9.0
    if print_nan_grads:
        rank_zero_warn("Argument `print_nan_grads` has no effect and will be removed in v0.9.0."
                       " NaN grads will be printed automatically when detected.", DeprecationWarning)

    self.reload_dataloaders_every_epoch = reload_dataloaders_every_epoch
    self.auto_lr_find = auto_lr_find
    self.truncated_bptt_steps = truncated_bptt_steps
    self.resume_from_checkpoint = resume_from_checkpoint
    self.terminate_on_nan = terminate_on_nan
    self.shown_warnings = set()

    self.fast_dev_run = fast_dev_run
    if self.fast_dev_run:
        self.num_sanity_val_steps = 0
        self.max_epochs = 1
        log.info('Running in fast_dev_run mode: will run a full train,'
                 ' val and test loop using a single batch')

    # set default save path if user didn't provide one
    self.default_root_dir = default_root_dir
    # Backward compatibility, TODO: remove in v0.8.0
    if default_save_path is not None:
        self.default_root_dir = default_save_path
    if self.default_root_dir is None:
        self.default_root_dir = os.getcwd()

    # training bookkeeping
    self.total_batch_idx = 0
    self.running_loss = TensorRunningAccum(window_length=20)
    self.batch_idx = 0
    self.tqdm_metrics = {}
    self.callback_metrics = {}
    self.num_val_batches = 0
    self.num_training_batches = 0
    self.num_test_batches = 0
    self.train_dataloader = None
    self.test_dataloaders = None
    self.val_dataloaders = None

    # training state
    self.model = None
    self.testing = False
    self.disable_validation = False
    self.lr_schedulers = []
    self.optimizers = None
    self.optimizer_frequencies = []
    self.global_step = 0
    self.current_epoch = 0
    self.total_batches = 0
    self.interrupted = False

    # configure logger
    self.configure_logger(logger)

    # configure profiler
    if profiler is True:
        profiler = SimpleProfiler()
    self.profiler = profiler or PassThroughProfiler()

    # configure early stop callback
    # creates a default one if none passed in
    self.configure_early_stopping(early_stop_callback)

    # configure checkpoint callback
    self.checkpoint_callback = checkpoint_callback
    self.weights_save_path = weights_save_path

    # accumulated grads
    self.accumulate_grad_batches = accumulate_grad_batches
    self.configure_accumulated_gradients(accumulate_grad_batches)

    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.gpus = pick_multiple_gpus(gpus)
    else:
        self.gpus = gpus

    self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
    self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)
    self.root_device = torch.device("cpu")

    # tpu state flags
    self.use_tpu = False
    self.tpu_local_core_rank = None
    self.tpu_global_core_rank = None

    # distributed backend choice
    self.use_ddp = False
    self.use_ddp2 = False
    self.use_dp = False
    self.single_gpu = False
    self.distributed_backend = distributed_backend
    self.set_distributed_mode(distributed_backend, self.num_nodes)

    # override dist backend when using tpus
    if self.on_tpu:
        self.init_tpu()
        self.current_tpu_idx = None

    # init flags for SLURM+ddp to work
    self.proc_rank = 0
    self.world_size = 1
    self.node_rank = 0
    self.configure_slurm_ddp(self.num_nodes)

    # nvidia setup
    self.set_nvidia_flags(self.is_slurm_managing_tasks, self.data_parallel_device_ids)

    # can't init progress bar here because starting a new process
    # means the progress_bar won't survive pickling
    # backward compatibility
    if show_progress_bar is not None:
        self.show_progress_bar = show_progress_bar

    # logging
    self.log_save_interval = log_save_interval
    self.val_check_interval = val_check_interval

    # backward compatibility
    if add_row_log_interval is not None:
        rank_zero_warn("`add_row_log_interval` has renamed to `row_log_interval` since v0.5.0"
                       " and this method will be removed in v0.8.0", DeprecationWarning)
        if not row_log_interval:  # in case you did not set the proper value
            row_log_interval = add_row_log_interval
    self.row_log_interval = row_log_interval

    # how much of the data to use
    self.overfit_pct = overfit_pct
    self.determine_data_use_amount(train_percent_check, val_percent_check,
                                   test_percent_check, overfit_pct)

    # 16 bit mixed precision training using apex
    self.amp_level = amp_level
    self.precision = precision

    # Backward compatibility, TODO: remove in v0.9.0
    if use_amp is not None:
        rank_zero_warn("`use_amp` has been replaced by `precision` since v0.7.0"
                       " and this argument will be removed in v0.9.0", DeprecationWarning)
        self.precision = 16 if use_amp else 32

    assert self.precision in (16, 32), 'only 32 or 16 bit precision supported'

    if self.precision == 16 and self.num_tpu_cores is None:
        use_amp = True
    self.init_amp(use_amp)

    # Callback system
    self.on_init_end()
def test_simple_profiler_summary(tmpdir, extended):
    """Test the summary of `SimpleProfiler`."""
    profiler = SimpleProfiler(extended=extended)
    profiler.start_time = 63.0
    hooks = [
        "on_train_start",
        "on_train_end",
        "on_train_epoch_start",
        "on_train_epoch_end",
        "on_before_batch_transfer",
        "on_fit_start",
    ]
    sometime = 0.773434
    sep = os.linesep
    max_action_len = len("on_before_batch_transfer")

    for i, hook in enumerate(hooks):
        with profiler.profile(hook):
            pass
        profiler.recorded_durations[hook] = [sometime + i]

    if extended:
        header_string = (
            f"{sep}| {'Action':<{max_action_len}s}\t| {'Mean duration (s)':<15}\t| {'Num calls':<15}\t|"
            f" {'Total time (s)':<15}\t| {'Percentage %':<15}\t|"
        )
        output_string_len = len(header_string.expandtabs())
        sep_lines = f"{sep}{'-' * output_string_len}"
        expected_text = (
            f"Profiler Report{sep}"
            f"{sep_lines}"
            f"{sep}| Action | Mean duration (s) | Num calls | Total time (s) | Percentage % |"  # noqa: E501
            f"{sep_lines}"
            f"{sep}| Total | - | 6 | 7.0 | 100 % |"  # noqa: E501
            f"{sep_lines}"
            f"{sep}| on_fit_start | 5.7734 | 1 | 5.7734 | 82.478 |"  # noqa: E501
            f"{sep}| on_before_batch_transfer | 4.7734 | 1 | 4.7734 | 68.192 |"  # noqa: E501
            f"{sep}| on_train_epoch_end | 3.7734 | 1 | 3.7734 | 53.906 |"  # noqa: E501
            f"{sep}| on_train_epoch_start | 2.7734 | 1 | 2.7734 | 39.62 |"  # noqa: E501
            f"{sep}| on_train_end | 1.7734 | 1 | 1.7734 | 25.335 |"  # noqa: E501
            f"{sep}| on_train_start | 0.77343 | 1 | 0.77343 | 11.049 |"  # noqa: E501
            f"{sep_lines}{sep}"
        )
    else:
        header_string = (
            f"{sep}| {'Action':<{max_action_len}s}\t| {'Mean duration (s)':<15}\t| {'Total time (s)':<15}\t|"
        )
        output_string_len = len(header_string.expandtabs())
        sep_lines = f"{sep}{'-' * output_string_len}"
        expected_text = (
            f"Profiler Report{sep}"
            f"{sep_lines}"
            f"{sep}| Action | Mean duration (s) | Total time (s) |"
            f"{sep_lines}"
            f"{sep}| on_fit_start | 5.7734 | 5.7734 |"
            f"{sep}| on_before_batch_transfer | 4.7734 | 4.7734 |"
            f"{sep}| on_train_epoch_end | 3.7734 | 3.7734 |"
            f"{sep}| on_train_epoch_start | 2.7734 | 2.7734 |"
            f"{sep}| on_train_end | 1.7734 | 1.7734 |"
            f"{sep}| on_train_start | 0.77343 | 0.77343 |"
            f"{sep_lines}{sep}"
        )
    summary = profiler.summary().expandtabs()
    assert expected_text == summary
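# A usage sketch tying the pieces of the test above together: `summary()`
# renders the report being asserted on, while `describe()` (seen in the
# deepcopy tests) routes the same report to the configured dirpath/filename,
# or to the default output stream when no file is set. Only APIs exercised
# elsewhere in this section are assumed:
profiler = SimpleProfiler(extended=True)
with profiler.profile("forward"):
    pass
print(profiler.summary())  # one row per recorded action
profiler.describe()        # same report, written to the configured output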
def test_pytorch_profiler_deepcopy(tmpdir):
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profiler", schedule=None)
    pytorch_profiler.start("on_train_start")
    torch.tensor(1)
    pytorch_profiler.describe()
    assert deepcopy(pytorch_profiler)


@pytest.mark.parametrize(
    ["profiler", "expected"],
    [
        (None, PassThroughProfiler),
        (SimpleProfiler(), SimpleProfiler),
        (AdvancedProfiler(), AdvancedProfiler),
        ("simple", SimpleProfiler),
        ("Simple", SimpleProfiler),
        ("advanced", AdvancedProfiler),
        ("pytorch", PyTorchProfiler),
    ],
)
def test_trainer_profiler_correct_args(profiler, expected):
    kwargs = {"profiler": profiler} if profiler is not None else {}
    trainer = Trainer(**kwargs)
    assert isinstance(trainer.profiler, expected)


def test_trainer_profiler_incorrect_str_arg():
    with pytest.raises(
f"\n# of train examples: {n_train}" f"\n# of val examples: {n_val}" f"\n# of test examples: {len(test_dataset)}") # init model model = LitModel(args) if args.patience is not None: early_stop_ckpt = EarlyStopping(monitor='val_loss', verbose=True, patience=args.patience) else: early_stop_ckpt = None profiler = SimpleProfiler() lightning_log_pth = '/lightning_logs' if not os.path.isdir(lightning_log_pth): logger.warning(f"Unable to find {lightning_log_pth} to log to! " f"If not running Grid then ignore.") save_dir = '' else: save_dir = lightning_log_pth tensorboard = TensorBoardLogger(save_dir=save_dir) mdl_ckpt = ModelCheckpoint(filepath=save_dir, save_top_k = 1, verbose = True, monitor = 'val_loss')