def start_predicting(self, trainer):
    with ExitStack():
        # run the prediction routine
        self._results = trainer.run_predict()

    # Make sure all workers have finished predicting before returning to the user
    hvd.join()
def request_dataloader(self, dataloader_fx: Callable) -> DataLoader:
    """Handles downloading data in the GPU or TPU case.

    Args:
        dataloader_fx: The bound dataloader getter

    Returns:
        The dataloader
    """
    # call the bound getter to build the dataloader
    dataloader = dataloader_fx()

    # data download/load on GPU (DDP)
    if self.use_ddp or self.use_ddp2:
        # all processes wait until data download has happened
        torch_distrib.barrier()

    # data download/load on TPU
    elif self.use_tpu and XLA_AVAILABLE:
        # all processes wait until data download has happened
        torch_xla.core.xla_model.rendezvous('pl.TrainerDataLoadingMixin.get_dataloaders')

    elif self.use_horovod:
        # all processes wait until data download has happened
        hvd.join()

    return dataloader
def _init_model(self, model=None):
    """Load model desc from save path and parse to model."""
    if model is not None:
        return model
    model_cfg = ClassFactory.__configs__.get('model')
    if 'model_desc_file' in model_cfg and model_cfg.model_desc_file is not None:
        desc_file = model_cfg.model_desc_file.replace("{model_zoo}", self.model_zoo_path)
        desc_file = desc_file.replace("{local_base_path}", self.local_base_path)
        if ":" not in desc_file:
            desc_file = os.path.abspath(desc_file)
        if ":" in desc_file:
            local_desc_file = FileOps.join_path(
                self.local_output_path, os.path.basename(desc_file))
            FileOps.copy_file(desc_file, local_desc_file)
            desc_file = local_desc_file
        if self.horovod:
            hvd.join()
        model_desc = Config(desc_file)
        logging.info("net_desc:{}".format(model_desc))
    elif 'model_desc' in model_cfg and model_cfg.model_desc is not None:
        model_desc = model_cfg.model_desc
    else:
        return None
    if model_desc is not None:
        self.model_desc = model_desc
        net_desc = NetworkDesc(model_desc)
        model = net_desc.to_model()
        return model
    else:
        return None
def reduce(self, tensor, group: Optional[Any] = None,
           reduce_op: Optional[Union[ReduceOp, str]] = "mean"):
    """
    Reduces a tensor from several distributed processes to one aggregated tensor.

    Args:
        tensor: the tensor to sync and reduce
        group: the process group to gather results from. Defaults to all processes (world)
        reduce_op: the reduction operation. Defaults to 'mean'/'avg'.
            Can also be a string 'sum' to calculate the sum during reduction.

    Return:
        reduced value, except when the input was not a tensor the output remains unchanged
    """
    if group is not None:
        raise ValueError(
            "Horovod does not support allreduce using a subcommunicator at this time. "
            "Unset `group`."
        )

    if reduce_op in (None, "avg", "mean"):
        reduce_op = hvd.Average
    elif reduce_op == "sum":
        reduce_op = hvd.Sum
    else:
        raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

    # sync all processes before reduction
    hvd.join()
    return hvd.allreduce(tensor, op=reduce_op)
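A minimal standalone sketch (not part of the original sources) of what the `reduce_op` mapping above resolves to; it assumes the script is launched with `horovodrun` and that each worker calls `hvd.init()` first.

# hypothetical usage sketch: averaging vs. summing a per-worker scalar
import torch
import horovod.torch as hvd

hvd.init()
local_value = torch.tensor(float(hvd.rank()))

mean_value = hvd.allreduce(local_value, op=hvd.Average)  # "mean"/"avg"/None above
sum_value = hvd.allreduce(local_value, op=hvd.Sum)       # "sum" above

# with 4 workers (ranks 0..3), mean_value == 1.5 and sum_value == 6.0 on every rank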
def start_testing(self, trainer):
    with ExitStack():
        # run the test routine
        # self.trainer.train_loop.setup_training(self.trainer.model)
        self._results = trainer.run_test()

    # Make sure all workers have finished testing before returning to the user
    hvd.join()
def horovod_train(self, model):
    if torch.cuda.is_available() and self.on_gpu:
        # Horovod: pin GPU to local rank
        assert self.root_gpu == hvd.local_rank()
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)
        self.device = torch.device('cuda', self.root_gpu)

    # avoid duplicating progress bar
    if hvd.rank() != 0 and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in self.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    if self.use_amp:
        # An example
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.optimizers = [
        hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.optimizers
    ]

    # Update logger rank info from Horovod to avoid race conditions from different ranks
    # creating directories / writing files in the same locations.
    self.proc_rank = hvd.rank()
    rank_zero_only.rank = self.proc_rank

    with ExitStack() as stack:
        for optimizer in self.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        self.run_pretrain_routine(model)

    # Make sure all workers have finished training before returning to the user
    hvd.join()
def train(self):
    with ExitStack() as stack:
        for optimizer in self.trainer.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        result = self.trainer.run_pretrain_routine(self.trainer.model)

    # Make sure all workers have finished training before returning to the user
    hvd.join()
    return result
def start_training(self, trainer):
    with ExitStack() as stack:
        for optimizer in trainer.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        # set up training routine
        self._results = trainer.run_train()

    # Make sure all workers have finished training before returning to the user
    hvd.join()
def _init_dataloader(self):
    """Init dataloader from timm."""
    if self.distributed and hvd.local_rank() == 0 and 'remote_data_dir' in self.config.dataset:
        FileOps.copy_folder(self.config.dataset.remote_data_dir, self.config.dataset.data_dir)
    if self.distributed:
        hvd.join()
    args = self.config.dataset
    train_dir = os.path.join(self.config.dataset.data_dir, 'train')
    dataset_train = Dataset(train_dir)
    world_size, rank = None, None
    if self.distributed:
        world_size, rank = hvd.size(), hvd.rank()
    self.trainer.train_loader = create_loader(
        dataset_train,
        input_size=tuple(args.input_size),
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=self.config.prefetcher,
        rand_erase_prob=args.reprob,
        rand_erase_mode=args.remode,
        rand_erase_count=args.recount,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        interpolation='random',
        mean=tuple(args.mean),
        std=tuple(args.std),
        num_workers=args.workers,
        distributed=self.distributed,
        world_size=world_size,
        rank=rank)
    valid_dir = os.path.join(self.config.dataset.data_dir, 'val')
    dataset_eval = Dataset(valid_dir)
    self.trainer.valid_loader = create_loader(
        dataset_eval,
        input_size=tuple(args.input_size),
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=self.config.prefetcher,
        interpolation=args.interpolation,
        mean=tuple(args.mean),
        std=tuple(args.std),
        num_workers=args.workers,
        distributed=self.distributed,
        world_size=world_size,
        rank=rank)
    self.trainer.batch_num_train = len(self.trainer.train_loader)
    self.trainer.batch_num_valid = len(self.trainer.valid_loader)
def train(self):
    with ExitStack() as stack:
        for optimizer in self.trainer.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        # set up training routine
        self.trainer.train_loop.setup_training(self.trainer.model)

        # train or test
        results = self.train_or_test()

    # Make sure all workers have finished training before returning to the user
    hvd.join()
    return results
def gather_all_tensors(self, result: torch.Tensor, group: Optional[Any] = None):
    if group is not None:
        raise ValueError(
            "Horovod does not support allgather using a subcommunicator at this time. "
            "Unset `group`."
        )

    if len(result.shape) == 0:
        # Convert scalars to single dimension tensors
        result = result.reshape(1)

    # sync and gather all
    hvd.join()
    gathered = hvd.allgather(result)
    gathered_result = list(gathered.split(1, dim=0))
    return gathered_result
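For context, a small hedged sketch (not from the original code) of the underlying `hvd.allgather` call used above: each worker contributes a 1-element tensor, every worker receives the concatenation along dim 0, and `split(1, dim=0)` turns it back into a per-rank list.

# hypothetical usage sketch of hvd.allgather, assuming hvd.init() was called on each worker
import torch
import horovod.torch as hvd

hvd.init()
local = torch.tensor([float(hvd.rank())])

gathered = hvd.allgather(local)            # shape: (world_size,)
per_rank = list(gathered.split(1, dim=0))  # [tensor([0.]), tensor([1.]), ...]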
def reduce(self, output, group: Optional[Any] = None,
           reduce_op: Optional[Union[ReduceOp, str]] = None):
    if group is not None:
        raise ValueError(
            "Horovod does not support allreduce using a subcommunicator at this time. "
            "Unset `group`."
        )

    if reduce_op is None or reduce_op == "sum":
        reduce_op = hvd.Sum
    elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"):
        reduce_op = hvd.Average
    else:
        raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

    # sync all processes before reduction
    hvd.join()
    return hvd.allreduce(output, op=reduce_op)
def _init_model(self):
    """Initialize the model architecture for full train step.

    :return: train model
    :rtype: class
    """
    logging.info('Initializing model')
    if 'model_desc' in self.cfg and self.cfg.model_desc is not None:
        if self.horovod:
            hvd.join()
        model_desc = self.cfg.model_desc
        self.model_desc = self.cfg.model_desc
        net_desc = NetworkDesc(model_desc)
        model = net_desc.to_model()
        return model
    else:
        return None
def _init_dataloader(self, mode, loader=None):
    """Init dataloader."""
    if loader is not None:
        return loader
    if self.horovod:
        if hvd.local_rank() == 0:
            Dataset()
        hvd.join()
    if mode == "train" and self.hps is not None and self.hps.get("dataset") is not None:
        dataset = Dataset(mode=mode, hp=self.hps)
    else:
        dataset = Dataset(mode=mode)
    if self.horovod:
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset, num_replicas=hvd.size(), rank=hvd.rank())
        dataset.sampler = sampler
    return dataset.dataloader
def _init_model(self):
    """Initialize the model architecture for full train step.

    :return: train model
    :rtype: class
    """
    logging.info('Initializing model')
    if self.cfg.model_desc:
        logging.debug("model_desc: {}".format(self.cfg.model_desc))
        _file = FileOps.join_path(self.worker_path, "model_desc_{}.json".format(self._worker_id))
        with open(_file, "w") as f:
            json.dump(self.cfg.model_desc, f)
        if self.cfg.distributed:
            hvd.join()
        model_desc = self.cfg.model_desc
        net_desc = NetworkDesc(model_desc)
        model = net_desc.to_model()
        return model
    else:
        return None
def restore_weights(self, model: LightningModule):
    """
    We attempt to restore weights in this order:
    1. HPC weights.
    2. if no HPC weights restore checkpoint_path weights
    3. otherwise don't restore weights
    """
    # clear cache before restore
    if self.on_gpu:
        torch.cuda.empty_cache()

    # if script called from hpc resubmit, load weights
    did_restore_hpc_weights = self.restore_hpc_weights_if_needed(model)

    # clear cache after restore
    if self.on_gpu:
        torch.cuda.empty_cache()

    if not did_restore_hpc_weights:
        if self.resume_from_checkpoint is not None:
            self.restore(self.resume_from_checkpoint, on_gpu=self.on_gpu)

    # wait for all models to restore weights
    if self.use_ddp or self.use_ddp2:
        # wait for all processes to catch up
        torch_distrib.barrier()

    # wait for all models to restore weights
    if self.on_tpu and XLA_AVAILABLE:
        # wait for all processes to catch up
        torch_xla.core.xla_model.rendezvous("pl.TrainerIOMixin.restore_weights")

    elif self.use_horovod:
        # wait for all processes to catch up
        hvd.join()

    # clear cache after restore
    if self.on_gpu:
        torch.cuda.empty_cache()
def train_epoch(self, train_dataloader):
    self.model.train()
    torch.set_grad_enabled(True)
    self.current_step = 0
    self.callback_handler.handle(self, self.model, "on_train_epoch_start")
    epochs = self.settings["epochs"]
    dl_iter = iter(train_dataloader)
    pbar = tqdm(range(self.settings["steps_per_epoch"]),
                dynamic_ncols=True,
                disable=(hvd.rank() != 0))
    for step in pbar:
        self.current_step = step
        self.global_step = step + self.current_epoch * self.settings["steps_per_epoch"]
        try:
            data = next(dl_iter)
        except StopIteration:
            dl_iter = iter(train_dataloader)
            data = next(dl_iter)
        current_loss = self.train_step(data, step)
        avg_loss = self.metric_container["loss"].avg

        # set pbar description
        pbar.set_description(
            f"TRAIN hvd rank: {hvd.rank()} epoch {self.current_epoch+1}/{epochs} idx {step} "
            f"current loss {current_loss}, avg loss {avg_loss}")

    hvd.join()
    if hvd.rank() == 0:
        self.callback_handler.handle(self, self.model, "on_train_epoch_end")
def join(self):
    if self.on_gpu:
        hvd.join(self.local_rank)
    else:
        hvd.join()
import argparse
import logging
import os
import pickle

import horovod.torch as hvd

from vega.core.common.class_factory import ClassFactory
from vega.core.common.user_config import UserConfig
from vega.core.common.file_ops import FileOps

parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()

if 'VEGA_INIT_ENV' in os.environ:
    exec(os.environ.copy()['VEGA_INIT_ENV'])

logging.info('start horovod setting')
hvd.init()
try:
    # moxing is optional (ModelArts environment)
    import moxing as mox
    mox.file.set_auth(obs_client_log=False)
except Exception:
    pass
FileOps.copy_file(args.cf_file, './cf_file.pickle')
hvd.join()
with open('./cf_file.pickle', 'rb') as f:
    cf_content = pickle.load(f)
ClassFactory.__configs__ = cf_content.get('configs')
ClassFactory.__registry__ = cf_content.get('registry')
UserConfig().__data__ = cf_content.get('data')
cls_trainer = ClassFactory.get_cls('trainer')
trainer = cls_trainer(None, 0)
trainer.train_process()
def sync_horovod(self):
    if self.use_horovod:
        hvd.join(hvd.local_rank() if self.on_gpu else -1)
def barrier(self, name: Optional[str] = None):
    hvd.join()
def run_training_epoch(self):
    # get model
    model = self.get_model()

    # Epoch start events
    with self.profiler.profile('on_epoch_start'):
        # callbacks
        self.on_epoch_start()

        # model hooks
        if self.is_function_implemented('on_epoch_start'):
            model.on_epoch_start()

    # track local dataloader so TPU can wrap each epoch
    train_dataloader = self.train_dataloader

    # on TPU we have to wrap it under the ParallelLoader
    if self.use_tpu:
        device = xm.xla_device(self.tpu_id)
        train_dataloader = xla_pl.ParallelLoader(train_dataloader, [device])
        train_dataloader = train_dataloader.per_device_loader(device)

    # bookkeeping
    outputs = []

    # run epoch
    for batch_idx, (batch, is_last_batch) in self.profiler.profile_iterable(
            enumerate(_with_is_last(train_dataloader)), "get_train_batch"):
        # stop epoch if we limited the number of training batches
        if batch_idx >= self.num_training_batches:
            break

        self.batch_idx = batch_idx
        model.global_step = self.global_step

        # ---------------
        # RUN TRAIN STEP
        # ---------------
        _outputs = self.run_training_batch(batch, batch_idx)
        batch_result, grad_norm_dic, batch_step_metrics, batch_output = _outputs

        # only track outputs when user implements training_epoch_end
        # otherwise we will build up unnecessary memory
        if self.is_overridden('training_epoch_end', model=self.get_model()):
            outputs.append(batch_output)

        # when returning -1 from train_step, we end epoch early
        early_stop_epoch = batch_result == -1

        # TODO: consolidate all actions that need to take place only after
        # self.accumulate_grad_batches steps (optimizer step, lr update, global step increment)
        if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
            # update lr
            self.update_learning_rates(interval='step')

        # ---------------
        # RUN VAL STEP
        # ---------------
        is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0
        can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0
        can_check_val = not self.disable_validation and can_check_epoch
        should_check_val = is_val_check_batch or early_stop_epoch
        should_check_val = should_check_val or (is_last_batch and self.val_check_batch == float('inf'))
        should_check_val = can_check_val and should_check_val

        # ---------------
        # CHECKPOINTING, EARLY STOPPING
        # ---------------
        # fast_dev_run always forces val checking after train batch
        if self.fast_dev_run or should_check_val:
            self.run_evaluation(test_mode=self.testing)
            self.call_checkpoint_callback()

        # when logs should be saved
        should_save_log = (batch_idx + 1) % self.log_save_interval == 0 or early_stop_epoch
        if should_save_log or self.fast_dev_run:
            if self.proc_rank == 0 and self.logger is not None:
                self.logger.save()

        # when metrics should be logged
        should_log_metrics = batch_idx % self.row_log_interval == 0 or early_stop_epoch
        if should_log_metrics or self.fast_dev_run:
            # logs user requested information to logger
            self.log_metrics(batch_step_metrics, grad_norm_dic)

        # progress global step according to grads progress
        if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
            self.global_step += 1
        self.total_batch_idx += 1

        # max steps reached, end training
        if self.max_steps is not None and self.max_steps == self.global_step:
            break

        # end epoch early
        # stop when the flag is changed or we've gone past the amount
        # requested in the batches
        if early_stop_epoch or self.fast_dev_run:
            break

    if self.use_horovod:
        hvd.join(hvd.local_rank() if self.on_gpu else -1)

    # process epoch outputs
    model = self.get_model()
    if self.is_overridden('training_epoch_end', model=model):
        epoch_output = model.training_epoch_end(outputs)
        _processed_outputs = self.process_output(epoch_output)
        log_epoch_metrics = _processed_outputs[2]
        callback_epoch_metrics = _processed_outputs[3]
        self.log_metrics(log_epoch_metrics, {})
        self.callback_metrics.update(callback_epoch_metrics)
        self.add_progress_bar_metrics(_processed_outputs[1])

    # when no val loop is present or fast-dev-run still need to call checkpoints
    if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
        self.call_checkpoint_callback()

    # Epoch end events
    with self.profiler.profile('on_epoch_end'):
        # callbacks
        self.on_epoch_end()

        # model hooks
        if self.is_function_implemented('on_epoch_end'):
            model.on_epoch_end()
def run_pretrain_routine(self, model: LightningModule):
    """Sanity check a few things before starting actual training.

    Args:
        model: The model to run sanity test on.
    """
    ref_model = model
    if self.data_parallel:
        ref_model = model.module

    # give model convenience properties
    ref_model.trainer = self

    # set local properties on the model
    self.copy_trainer_model_properties(ref_model)

    # init amp. Must be done here instead of __init__ to allow ddp to work
    if self.use_native_amp and self.precision == 16:
        self.scaler = torch.cuda.amp.GradScaler()

    # log hyper-parameters
    if self.logger is not None:
        # save exp to get started
        self.logger.log_hyperparams(ref_model.module_arguments)
        self.logger.save()

    if self.use_ddp or self.use_ddp2:
        torch_distrib.barrier()

    # wait for all models to restore weights
    if self.on_tpu and XLA_AVAILABLE:
        # wait for all processes to catch up
        torch_xla.core.xla_model.rendezvous("pl.Trainer.run_pretrain_routine")

    elif self.use_horovod:
        # wait for all processes to catch up
        hvd.join()

    # register auto-resubmit when on SLURM
    self.register_slurm_signal_handlers()

    # print model summary
    # TODO: remove self.testing condition because model.summarize() is wiping out the weights
    if self.proc_rank == 0 and self.weights_summary is not None and not self.testing:
        if self.weights_summary in ['full', 'top']:
            ref_model.summarize(mode=self.weights_summary)
        else:
            raise MisconfigurationException("weights_summary can be None, 'full' or 'top'")

    # track model now.
    # if cluster resets state, the model will update with the saved weights
    self.model = model

    # set up checkpoint callback
    self.configure_checkpoint_callback()

    # restore training and model before hpc call
    self.restore_weights(model)

    # when testing requested only run test and return
    if self.testing:
        # only load test dataloader for testing
        # self.reset_test_dataloader(ref_model)
        self.run_evaluation(test_mode=True)
        return

    # check if we should run validation during training
    self.disable_validation = not (self.is_overridden('validation_step') and self.val_percent_check > 0) \
        and not self.fast_dev_run

    # run tiny validation (if validation defined)
    # to make sure program won't crash during val
    if not self.disable_validation and self.num_sanity_val_steps > 0:
        self.reset_val_dataloader(ref_model)

        # hook and callback
        ref_model.on_sanity_check_start()
        self.on_sanity_check_start()

        eval_results = self._evaluate(model, self.val_dataloaders, self.num_sanity_val_steps, False)
        _, _, _, callback_metrics, _ = self.process_output(eval_results)

        self.on_sanity_check_end()

        # verify that early stop has conditioned on a metric that exists
        if self.enable_early_stop:
            self.early_stop_callback._validate_condition_metric(callback_metrics)

    # clear cache before training
    if self.on_gpu:
        torch.cuda.empty_cache()

    # CORE TRAINING LOOP
    self.train()
def barrier(self, name: str = None):
    hvd.join()
def on_train_epoch_end(self):
    hvd.join(hvd.local_rank() if self.trainer.on_gpu else -1)
def validation_phase(self, dataset: torch.utils.data.Dataset):
    self.callback_handler.handle(self, self.model, "on_validation_epoch_start")
    validation_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())
    validation_dataloader = torch.utils.data.DataLoader(
        dataset,
        shuffle=False,
        batch_size=self.settings["validation_batch_size"],
        num_workers=self.settings["num_workers"],
        sampler=validation_sampler,
    )
    if self.settings["validation_steps"] is None:
        self.settings["validation_steps"] = len(validation_dataloader)
    epochs = self.settings["epochs"]
    validation_sum_loss = AverageMeter()
    val_metric = AverageMeter()
    self.model.eval()
    with torch.no_grad():
        dl_iter = iter(validation_dataloader)
        pbar = tqdm(range(self.settings["validation_steps"]),
                    dynamic_ncols=True,
                    disable=(hvd.rank() != 0))
        for step in pbar:
            self.current_step = step
            if hvd.rank() == 0:
                self.callback_handler.handle(self, self.model, "on_validation_step_start")
            try:
                data = next(dl_iter)
            except StopIteration:
                dl_iter = iter(validation_dataloader)
                data = next(dl_iter)
            loss, metrics = self.model.train_step(data)

            # add to average meter for loss
            validation_sum_loss.update(loss.item(), self.settings["validation_batch_size"])

            # validation metric update
            if hvd.rank() == 0:
                val_metric.update(metrics[self.settings["validation_metric"]],
                                  self.settings["validation_batch_size"])
                for _, (key, val) in enumerate(metrics.items()):
                    self.validation_metrics_container[key].update(val, self.settings["batch_size"])

            # set pbar description
            pbar.set_description((
                f"VALIDATION hvd rank: {hvd.rank()} epoch {self.current_epoch+1}/{epochs} step {step}"
                f" current loss {validation_sum_loss.current}, avg loss {validation_sum_loss.avg}"
                f", validation_metric {val_metric.avg}"))

            hvd.join()
            if hvd.rank() == 0:
                self.callback_handler.handle(self, self.model, "on_validation_step_end")

    hvd.join()
    if hvd.rank() == 0:
        logging.info(
            f"Validation result metric for epoch {self.current_epoch} = {val_metric.avg}")
        self.callback_handler.handle(self, self.model, "on_validation_epoch_end")
def start_testing(self, trainer):
    with ExitStack():
        self._results = trainer.run_test()

    # Make sure all workers have finished testing before returning to the user
    hvd.join()
def join(self) -> None:
    if self.root_device.type == "cuda":
        hvd.join(self.local_rank)
    else:
        hvd.join()
lr_scaler = hvd.size()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01 * lr_scaler)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

for epoch in range(args.epochs):
    start = time()
    print(f"Training epoch {epoch}")
    train_loss, y_pred, y = process_epoch(train_loader, model, train=True, optimizer=optimizer)
    hvd.join(gpu_to_use)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    print(f"Epoch {epoch:02d}. Train loss: {train_loss:.4f}.")
    hvd.join(gpu_to_use)
    t_final = time() - start
    total_rows = train_dataset.num_rows_processed
    print(f"run_time: {t_final} - rows: {total_rows} - "
          f"epochs: {epoch} - dl_thru: {total_rows / t_final}")

hvd.join(gpu_to_use)
if hvd.local_rank() == 0:
    print("Training complete")
def barrier(self, *args, **kwargs):
    if torch_distrib.is_initialized():
        hvd.join()