Example #1
    def start_predicting(self, trainer):
        with ExitStack():
            # run the prediction routine
            self._results = trainer.run_predict()

        # Make sure all workers have finished predicting before returning to the user
        hvd.join()
Example #2
    def request_dataloader(self, dataloader_fx: Callable) -> DataLoader:
        """Handles downloading data in the GPU or TPU case.

        Args:
            dataloader_fx: The bound dataloader getter

        Returns:
            The dataloader
        """
        # call the bound dataloader getter to build the dataloader
        dataloader = dataloader_fx()

        if self.use_ddp or self.use_ddp2:
            # all processes wait until data download has happened
            torch_distrib.barrier()

        # data download/load on TPU
        elif self.use_tpu and XLA_AVAILABLE:
            # all processes wait until data download has happened
            torch_xla.core.xla_model.rendezvous('pl.TrainerDataLoadingMixin.get_dataloaders')

        elif self.use_horovod:
            # all processes wait until data download has happened
            hvd.join()

        return dataloader
Example #3
 def _init_model(self, model=None):
     """Load model desc from save path and parse to model."""
     if model is not None:
         return model
     model_cfg = ClassFactory.__configs__.get('model')
     if 'model_desc_file' in model_cfg and model_cfg.model_desc_file is not None:
         desc_file = model_cfg.model_desc_file.replace(
             "{model_zoo}", self.model_zoo_path)
         desc_file = desc_file.replace("{local_base_path}",
                                       self.local_base_path)
         if ":" not in desc_file:
             desc_file = os.path.abspath(desc_file)
         if ":" in desc_file:
             local_desc_file = FileOps.join_path(
                 self.local_output_path, os.path.basename(desc_file))
             FileOps.copy_file(desc_file, local_desc_file)
             desc_file = local_desc_file
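         # synchronize all Horovod ranks before the model description file is parsed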
         if self.horovod:
             hvd.join()
         model_desc = Config(desc_file)
         logging.info("net_desc:{}".format(model_desc))
     elif 'model_desc' in model_cfg and model_cfg.model_desc is not None:
         model_desc = model_cfg.model_desc
     else:
         return None
     if model_desc is not None:
         self.model_desc = model_desc
         net_desc = NetworkDesc(model_desc)
         model = net_desc.to_model()
         return model
     else:
         return None
Example #4
    def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"):
        """
        Reduces a tensor from several distributed processes to one aggregated tensor.

        Args:
            tensor: the tensor to sync and reduce
            group: the process group to gather results from. Defaults to all processes (world)
            reduce_op: the reduction operation. Defaults to 'mean'/'avg'.
                Can also be a string 'sum' to calculate the sum during reduction.

        Return:
            reduced value, except when the input was not a tensor, in which case the output remains unchanged
        """
        if group is not None:
            raise ValueError(
                "Horovod does not support allreduce using a subcommunicator at this time. "
                "Unset `group`."
            )

        if reduce_op in (None, "avg", "mean"):
            reduce_op = hvd.Average
        elif reduce_op == "sum":
            reduce_op = hvd.Sum
        else:
            raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

        # sync all processes before reduction
        hvd.join()
        return hvd.allreduce(tensor, op=reduce_op)
Example #5
    def start_testing(self, trainer):
        with ExitStack() as stack:
            # run the test routine
            self._results = trainer.run_test()

        # Make sure all workers have finished testing before returning to the user
        hvd.join()
Example #6
    def horovod_train(self, model):
        if torch.cuda.is_available() and self.on_gpu:
            # Horovod: pin GPU to local rank
            assert self.root_gpu == hvd.local_rank()
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)
            self.device = torch.device('cuda', self.root_gpu)

        # avoid duplicating progress bar
        if hvd.rank() != 0 and self.progress_bar_callback is not None:
            self.progress_bar_callback.disable()

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in self.optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= hvd.size()

        if self.use_amp:
            # initialize Apex AMP via the model's configure_apex hook
            model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
            self.optimizers = optimizers

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        for optimizer in self.optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        def filter_named_parameters(model, optimizer):
            opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])])
            return [(name, p) for name, p in model.named_parameters() if p in opt_params]

        # Horovod: wrap optimizers to perform gradient aggregation via allreduce
        self.optimizers = [
            hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters(model, optimizer))
            for optimizer in self.optimizers
        ]

        # Update logger rank info from Horovod to avoid race conditions from different ranks
        # creating directories / writing files in the same locations.
        self.proc_rank = hvd.rank()
        rank_zero_only.rank = self.proc_rank

        with ExitStack() as stack:
            for optimizer in self.optimizers:
                # Synchronization will be performed explicitly following backward()
                stack.enter_context(optimizer.skip_synchronize())

            self.run_pretrain_routine(model)

        # Make sure all workers have finished training before returning to the user
        hvd.join()
Example #7
    def train(self):
        with ExitStack() as stack:
            for optimizer in self.trainer.optimizers:
                # Synchronization will be performed explicitly following backward()
                stack.enter_context(optimizer.skip_synchronize())

            result = self.trainer.run_pretrain_routine(self.trainer.model)

        # Make sure all workers have finished training before returning to the user
        hvd.join()
        return result
Example #8
    def start_training(self, trainer):
        with ExitStack() as stack:
            for optimizer in trainer.optimizers:
                # Synchronization will be performed explicitly following backward()
                stack.enter_context(optimizer.skip_synchronize())

            # set up training routine
            self._results = trainer.run_train()

        # Make sure all workers have finished training before returning to the user
        hvd.join()
Example #9
 def _init_dataloader(self):
     """Init dataloader from timm."""
     if (self.distributed and hvd.local_rank() == 0
             and 'remote_data_dir' in self.config.dataset):
         FileOps.copy_folder(self.config.dataset.remote_data_dir,
                             self.config.dataset.data_dir)
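     # all ranks wait here until the data copy on local rank 0 has finished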
     if self.distributed:
         hvd.join()
     args = self.config.dataset
     train_dir = os.path.join(self.config.dataset.data_dir, 'train')
     dataset_train = Dataset(train_dir)
     world_size, rank = None, None
     if self.distributed:
         world_size, rank = hvd.size(), hvd.rank()
     self.trainer.train_loader = create_loader(
         dataset_train,
         input_size=tuple(args.input_size),
         batch_size=args.batch_size,
         is_training=True,
         use_prefetcher=self.config.prefetcher,
         rand_erase_prob=args.reprob,
         rand_erase_mode=args.remode,
         rand_erase_count=args.recount,
         color_jitter=args.color_jitter,
         auto_augment=args.aa,
         interpolation='random',
         mean=tuple(args.mean),
         std=tuple(args.std),
         num_workers=args.workers,
         distributed=self.distributed,
         world_size=world_size,
         rank=rank)
     valid_dir = os.path.join(self.config.dataset.data_dir, 'val')
     dataset_eval = Dataset(valid_dir)
     self.trainer.valid_loader = create_loader(
         dataset_eval,
         input_size=tuple(args.input_size),
         batch_size=4 * args.batch_size,
         is_training=False,
         use_prefetcher=self.config.prefetcher,
         interpolation=args.interpolation,
         mean=tuple(args.mean),
         std=tuple(args.std),
         num_workers=args.workers,
         distributed=self.distributed,
         world_size=world_size,
         rank=rank)
     self.trainer.batch_num_train = len(self.trainer.train_loader)
     self.trainer.batch_num_valid = len(self.trainer.valid_loader)
Example #10
    def train(self):
        with ExitStack() as stack:
            for optimizer in self.trainer.optimizers:
                # Synchronization will be performed explicitly following backward()
                stack.enter_context(optimizer.skip_synchronize())

            # set up training routine
            self.trainer.train_loop.setup_training(self.trainer.model)

            # train or test
            results = self.train_or_test()

        # Make sure all workers have finished training before returning to the user
        hvd.join()
        return results
Example #11
    def gather_all_tensors(self, result: torch.Tensor, group: Optional[Any] = None):
        if group is not None:
            raise ValueError(
                "Horovod does not support allgather using a subcommunicator at this time. "
                "Unset `group`."
            )

        if len(result.shape) == 0:
            # Convert scalars to single dimension tensors
            result = result.reshape(1)

        # sync and gather all
        hvd.join()
        gathered = hvd.allgather(result)
        gathered_result = list(gathered.split(1, dim=0))
        return gathered_result
Example #12
    def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None):
        if group is not None:
            raise ValueError(
                "Horovod does not support allreduce using a subcommunicator at this time. "
                "Unset `group`."
            )

        if reduce_op is None or reduce_op == "sum":
            reduce_op = hvd.Sum
        elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"):
            reduce_op = hvd.Average
        else:
            raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

        # sync all processes before reduction
        hvd.join()
        return hvd.allreduce(output, op=reduce_op)
Example #13
    def _init_model(self):
        """Initialize the model architecture for full train step.

        :return: train model
        :rtype: class
        """
        logging.info('Initializing model')
        if 'model_desc' in self.cfg and self.cfg.model_desc is not None:
            if self.horovod:
                hvd.join()
            model_desc = self.cfg.model_desc
            self.model_desc = self.cfg.model_desc
            net_desc = NetworkDesc(model_desc)
            model = net_desc.to_model()
            return model
        else:
            return None
Example #14
 def _init_dataloader(self, mode, loader=None):
     """Init dataloader."""
     if loader is not None:
         return loader
     if self.horovod:
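         # local rank 0 builds the Dataset first (e.g. to trigger any one-time download);
         # the other ranks block at hvd.join() until it is done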
         if hvd.local_rank() == 0:
             Dataset()
         hvd.join()
     if mode == "train" and self.hps is not None and self.hps.get(
             "dataset") is not None:
         dataset = Dataset(mode=mode, hp=self.hps)
     else:
         dataset = Dataset(mode=mode)
     if self.horovod:
         sampler = torch.utils.data.distributed.DistributedSampler(
             dataset, num_replicas=hvd.size(), rank=hvd.rank())
         dataset.sampler = sampler
     return dataset.dataloader
Example #15
    def _init_model(self):
        """Initialize the model architecture for full train step.

        :return: train model
        :rtype: class
        """
        logging.info('Initializing model')
        if self.cfg.model_desc:
            logging.debug("model_desc: {}".format(self.cfg.model_desc))
            _file = FileOps.join_path(self.worker_path, "model_desc_{}.json".format(self._worker_id))
            with open(_file, "w") as f:
                json.dump(self.cfg.model_desc, f)
            if self.cfg.distributed:
                hvd.join()
            model_desc = self.cfg.model_desc
            net_desc = NetworkDesc(model_desc)
            model = net_desc.to_model()
            return model
        else:
            return None
Example #16
    def restore_weights(self, model: LightningModule):
        """
        We attempt to restore weights in this order:
        1. HPC weights.
        2. if no HPC weights, restore checkpoint_path weights
        3. otherwise don't restore weights
        """
        # clear cache before restore
        if self.on_gpu:
            torch.cuda.empty_cache()

        # if script called from hpc resubmit, load weights
        did_restore_hpc_weights = self.restore_hpc_weights_if_needed(model)

        # clear cache after restore
        if self.on_gpu:
            torch.cuda.empty_cache()

        if not did_restore_hpc_weights:
            if self.resume_from_checkpoint is not None:
                self.restore(self.resume_from_checkpoint, on_gpu=self.on_gpu)

        # wait for all models to restore weights
        if self.use_ddp or self.use_ddp2:
            # wait for all processes to catch up
            torch_distrib.barrier()

        # wait for all models to restore weights
        if self.on_tpu and XLA_AVAILABLE:
            # wait for all processes to catch up
            torch_xla.core.xla_model.rendezvous(
                "pl.TrainerIOMixin.restore_weights")

        elif self.use_horovod:
            # wait for all processes to catch up
            hvd.join()

        # clear cache after restore
        if self.on_gpu:
            torch.cuda.empty_cache()
Example #17
    def train_epoch(self, train_dataloader):
        self.model.train()
        torch.set_grad_enabled(True)
        self.current_step = 0

        self.callback_handler.handle(self, self.model, "on_train_epoch_start")

        epochs = self.settings["epochs"]

        dl_iter = iter(train_dataloader)

        pbar = tqdm(range(self.settings["steps_per_epoch"]),
                    dynamic_ncols=True,
                    disable=(hvd.rank() != 0))

        for step in pbar:
            self.current_step = step
            self.global_step = step + self.current_epoch * self.settings[
                "steps_per_epoch"]
            try:
                data = next(dl_iter)
            except StopIteration:
                dl_iter = iter(train_dataloader)
                data = next(dl_iter)

            current_loss = self.train_step(data, step)

            avg_loss = self.metric_container["loss"].avg

            # set pbar description
            pbar.set_description(
                f"TRAIN hvd rank: {hvd.rank()} epoch {self.current_epoch+1}/{epochs} idx {step}"
                f" current loss {current_loss}, avg loss {avg_loss}")

        hvd.join()
        if hvd.rank() == 0:
            self.callback_handler.handle(self, self.model,
                                         "on_train_epoch_end")
Example #18
 def join(self):
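     # hvd.join() blocks until every rank reaches it; on GPU the local rank is
     # passed so Horovod allocates join()'s temporary tensors on that device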
     if self.on_gpu:
         hvd.join(self.local_rank)
     else:
         hvd.join()
Example #19
import argparse
import logging
import os
import pickle

import horovod.torch as hvd
from vega.core.common.class_factory import ClassFactory
from vega.core.common.user_config import UserConfig
from vega.core.common.file_ops import FileOps

parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()

if 'VEGA_INIT_ENV' in os.environ:
    exec(os.environ.copy()['VEGA_INIT_ENV'])
logging.info('start horovod setting')
hvd.init()
try:
    import moxing as mox

    mox.file.set_auth(obs_client_log=False)
except Exception:
    # moxing is optional; ignore if it is unavailable or cannot be configured
    pass
FileOps.copy_file(args.cf_file, './cf_file.pickle')
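# wait until every rank has finished copying the ClassFactory pickle before loading it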
hvd.join()
with open('./cf_file.pickle', 'rb') as f:
    cf_content = pickle.load(f)
ClassFactory.__configs__ = cf_content.get('configs')
ClassFactory.__registry__ = cf_content.get('registry')
UserConfig().__data__ = cf_content.get('data')
cls_trainer = ClassFactory.get_cls('trainer')
trainer = cls_trainer(None, 0)
trainer.train_process()
Example #20
 def sync_horovod(self):
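     # pass the local GPU id on GPU; -1 tells Horovod to keep join()'s temporary tensors on the CPU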
     if self.use_horovod:
         hvd.join(hvd.local_rank() if self.on_gpu else -1)
Example #21
 def barrier(self, name: Optional[str] = None):
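     # hvd.join() is used as a barrier here: it blocks until all ranks have reached it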
     hvd.join()
Example #22
    def run_training_epoch(self):

        # get model
        model = self.get_model()

        # Epoch start events
        with self.profiler.profile('on_epoch_start'):
            # callbacks
            self.on_epoch_start()

            # model hooks
            if self.is_function_implemented('on_epoch_start'):
                model.on_epoch_start()

        # track local dataloader so TPU can wrap each epoch
        train_dataloader = self.train_dataloader

        # on TPU we have to wrap it under the ParallelLoader
        if self.use_tpu:
            device = xm.xla_device(self.tpu_id)
            train_dataloader = xla_pl.ParallelLoader(train_dataloader,
                                                     [device])
            train_dataloader = train_dataloader.per_device_loader(device)

        # bookkeeping
        outputs = []

        # run epoch
        for batch_idx, (batch,
                        is_last_batch) in self.profiler.profile_iterable(
                            enumerate(_with_is_last(train_dataloader)),
                            "get_train_batch"):
            # stop epoch if we limited the number of training batches
            if batch_idx >= self.num_training_batches:
                break

            self.batch_idx = batch_idx

            model.global_step = self.global_step

            # ---------------
            # RUN TRAIN STEP
            # ---------------
            _outputs = self.run_training_batch(batch, batch_idx)
            batch_result, grad_norm_dic, batch_step_metrics, batch_output = _outputs

            # only track outputs when user implements training_epoch_end
            # otherwise we will build up unnecessary memory
            if self.is_overridden('training_epoch_end',
                                  model=self.get_model()):
                outputs.append(batch_output)

            # when returning -1 from train_step, we end epoch early
            early_stop_epoch = batch_result == -1

            # TODO: consolidate all actions that need to take place only after
            # self.accumulate_grad_batches steps (optimizer step, lr update, global step increment)
            if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
                # update lr
                self.update_learning_rates(interval='step')

            # ---------------
            # RUN VAL STEP
            # ---------------
            is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0
            can_check_epoch = (self.current_epoch +
                               1) % self.check_val_every_n_epoch == 0
            can_check_val = not self.disable_validation and can_check_epoch
            should_check_val = is_val_check_batch or early_stop_epoch
            should_check_val = should_check_val or (
                is_last_batch and self.val_check_batch == float('inf'))
            should_check_val = can_check_val and should_check_val

            # ---------------
            # CHECKPOINTING, EARLY STOPPING
            # ---------------
            # fast_dev_run always forces val checking after train batch
            if self.fast_dev_run or should_check_val:
                self.run_evaluation(test_mode=self.testing)
                self.call_checkpoint_callback()

            # when logs should be saved
            should_save_log = (
                batch_idx +
                1) % self.log_save_interval == 0 or early_stop_epoch
            if should_save_log or self.fast_dev_run:
                if self.proc_rank == 0 and self.logger is not None:
                    self.logger.save()

            # when metrics should be logged
            should_log_metrics = batch_idx % self.row_log_interval == 0 or early_stop_epoch
            if should_log_metrics or self.fast_dev_run:
                # logs user requested information to logger
                self.log_metrics(batch_step_metrics, grad_norm_dic)

            # progress global step according to grads progress
            if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
                self.global_step += 1
            self.total_batch_idx += 1

            # max steps reached, end training
            if self.max_steps is not None and self.max_steps == self.global_step:
                break

            # end epoch early
            # stop when the flag is changed or we've gone past the amount
            # requested in the batches
            if early_stop_epoch or self.fast_dev_run:
                break

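        # make sure every Horovod worker has finished the epoch before epoch-end processing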
        if self.use_horovod:
            hvd.join(hvd.local_rank() if self.on_gpu else -1)

        # process epoch outputs
        model = self.get_model()
        if self.is_overridden('training_epoch_end', model=model):
            epoch_output = model.training_epoch_end(outputs)
            _processed_outputs = self.process_output(epoch_output)
            log_epoch_metrics = _processed_outputs[2]
            callback_epoch_metrics = _processed_outputs[3]
            self.log_metrics(log_epoch_metrics, {})
            self.callback_metrics.update(callback_epoch_metrics)
            self.add_progress_bar_metrics(_processed_outputs[1])

        # when no val loop is present or fast-dev-run still need to call checkpoints
        if not self.is_overridden('validation_step') and not (
                self.fast_dev_run or should_check_val):
            self.call_checkpoint_callback()

        # Epoch end events
        with self.profiler.profile('on_epoch_end'):
            # callbacks
            self.on_epoch_end()
            # model hooks
            if self.is_function_implemented('on_epoch_end'):
                model.on_epoch_end()
Example #23
    def run_pretrain_routine(self, model: LightningModule):
        """Sanity check a few things before starting actual training.

        Args:
            model: The model to run sanity test on.
        """
        ref_model = model
        if self.data_parallel:
            ref_model = model.module

        # give model convenience properties
        ref_model.trainer = self

        # set local properties on the model
        self.copy_trainer_model_properties(ref_model)

        # init amp. Must be done here instead of __init__ to allow ddp to work
        if self.use_native_amp and self.precision == 16:
            self.scaler = torch.cuda.amp.GradScaler()

        # log hyper-parameters
        if self.logger is not None:
            # save exp to get started
            self.logger.log_hyperparams(ref_model.module_arguments)

            self.logger.save()

        if self.use_ddp or self.use_ddp2:
            torch_distrib.barrier()

        # wait for all models to restore weights
        if self.on_tpu and XLA_AVAILABLE:
            # wait for all processes to catch up
            torch_xla.core.xla_model.rendezvous(
                "pl.Trainer.run_pretrain_routine")

        elif self.use_horovod:
            # wait for all processes to catch up
            hvd.join()

        # register auto-resubmit when on SLURM
        self.register_slurm_signal_handlers()

        # print model summary
        # TODO: remove self.testing condition because model.summarize() is wiping out the weights
        if self.proc_rank == 0 and self.weights_summary is not None and not self.testing:
            if self.weights_summary in ['full', 'top']:
                ref_model.summarize(mode=self.weights_summary)
            else:
                raise MisconfigurationException(
                    "weights_summary can be None, 'full' or 'top'")

        # track model now.
        # if cluster resets state, the model will update with the saved weights
        self.model = model

        # set up checkpoint callback
        self.configure_checkpoint_callback()

        # restore training and model before hpc call
        self.restore_weights(model)

        # when testing requested only run test and return
        if self.testing:
            # only load test dataloader for testing
            # self.reset_test_dataloader(ref_model)
            self.run_evaluation(test_mode=True)
            return

        # check if we should run validation during training
        self.disable_validation = not (self.is_overridden('validation_step') and self.val_percent_check > 0) \
            and not self.fast_dev_run

        # run tiny validation (if validation defined)
        # to make sure program won't crash during val
        if not self.disable_validation and self.num_sanity_val_steps > 0:
            self.reset_val_dataloader(ref_model)

            # hook and callback
            ref_model.on_sanity_check_start()
            self.on_sanity_check_start()

            eval_results = self._evaluate(model, self.val_dataloaders,
                                          self.num_sanity_val_steps, False)
            _, _, _, callback_metrics, _ = self.process_output(eval_results)

            self.on_sanity_check_end()

            # verify that early stop has conditioned on a metric that exists
            if self.enable_early_stop:
                self.early_stop_callback._validate_condition_metric(
                    callback_metrics)

        # clear cache before training
        if self.on_gpu:
            torch.cuda.empty_cache()

        # CORE TRAINING LOOP
        self.train()
Example #24
 def barrier(self, name: str = None):
     hvd.join()
Example #25
 def on_train_epoch_end(self):
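     # wait for every rank to finish the epoch; the device id only matters when training on GPU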
     hvd.join(hvd.local_rank() if self.trainer.on_gpu else -1)
Example #26
    def validation_phase(self, dataset: torch.utils.data.Dataset):

        self.callback_handler.handle(self, self.model,
                                     "on_validation_epoch_start")

        validation_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset, num_replicas=hvd.size(), rank=hvd.rank())

        validation_dataloader = torch.utils.data.DataLoader(
            dataset,
            shuffle=False,
            batch_size=self.settings["validation_batch_size"],
            num_workers=self.settings["num_workers"],
            sampler=validation_sampler,
        )

        if self.settings["validation_steps"] is None:
            self.settings["validation_steps"] = len(validation_dataloader)

        epochs = self.settings["epochs"]
        validation_sum_loss = AverageMeter()
        val_metric = AverageMeter()
        self.model.eval()

        with torch.no_grad():
            dl_iter = iter(validation_dataloader)

            pbar = tqdm(range(self.settings["validation_steps"]),
                        dynamic_ncols=True,
                        disable=(hvd.rank() != 0))

            for step in pbar:
                self.current_step = step
                if hvd.rank() == 0:
                    self.callback_handler.handle(self, self.model,
                                                 "on_validation_step_start")
                try:
                    data = next(dl_iter)
                except StopIteration:
                    dl_iter = iter(validation_dataloader)
                    data = next(dl_iter)

                loss, metrics = self.model.train_step(data)

                # add to average meter for loss
                validation_sum_loss.update(
                    loss.item(), self.settings["validation_batch_size"])

                # validation metric update
                if hvd.rank() == 0:
                    val_metric.update(
                        metrics[self.settings["validation_metric"]],
                        self.settings["validation_batch_size"])

                    for _, (key, val) in enumerate(metrics.items()):
                        self.validation_metrics_container[key].update(
                            val, self.settings["batch_size"])

                # set pbar description
                pbar.set_description((
                    f"VALIDATION hvd rank: {hvd.rank()} epoch {self.current_epoch+1}/{epochs} step {step}"
                    f" current loss  {validation_sum_loss.current}, avg loss {validation_sum_loss.avg}"
                    f", validation_metric {val_metric.avg}"))
                hvd.join()
                if hvd.rank() == 0:
                    self.callback_handler.handle(self, self.model,
                                                 "on_validation_step_end")

            hvd.join()
            if hvd.rank() == 0:
                logging.info(
                    f"Validation result metric for epoch {self.current_epoch} = {val_metric.avg}"
                )
                self.callback_handler.handle(self, self.model,
                                             "on_validation_epoch_end")
Example #27
    def start_testing(self, trainer):
        with ExitStack():
            self._results = trainer.run_test()

        # Make sure all workers have finished testing before returning to the user
        hvd.join()
Example #28
 def join(self) -> None:
     if self.root_device.type == "cuda":
         hvd.join(self.local_rank)
     else:
         hvd.join()
Example #29
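# Snippet from a larger training script: `model`, `args`, `train_loader`, `train_dataset`,
# `process_epoch` and `gpu_to_use` (the device id passed to hvd.join) are assumed to be
# defined earlier, with horovod.torch imported as hvd and hvd.init() already called.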
lr_scaler = hvd.size()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01 * lr_scaler)

hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())

for epoch in range(args.epochs):
    start = time()
    print(f"Training epoch {epoch}")
    train_loss, y_pred, y = process_epoch(train_loader,
                                          model,
                                          train=True,
                                          optimizer=optimizer)
    hvd.join(gpu_to_use)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    print(f"Epoch {epoch:02d}. Train loss: {train_loss:.4f}.")
    hvd.join(gpu_to_use)
    t_final = time() - start
    total_rows = train_dataset.num_rows_processed
    print(f"run_time: {t_final} - rows: {total_rows} - "
          f"epochs: {epoch} - dl_thru: {total_rows / t_final}")

hvd.join(gpu_to_use)
if hvd.local_rank() == 0:
    print("Training complete")
Example #30
 def barrier(self, *args, **kwargs):
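     # only synchronize when a distributed context has actually been initialized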
     if torch_distrib.is_initialized():
         hvd.join()