Example #1
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

            # This option is available for cases where TF ignores `gpu_options.visible_device_list`.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            if env.experiment_config.get("data",
                                         {}).get("set_cuda_visible_devices",
                                                 False):
                logging.info(
                    "Setting the `CUDA_VISIBLE_DEVICES` environment variable "
                    "and disabling NCCL P2P via `NCCL_P2P_DISABLE`")
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                os.environ["NCCL_P2P_DISABLE"] = "1"

        # Initialize random seeds.
        if env.experiment_config.input_from_dataflow():
            logging.debug("Using tensorpack dataflows as input.")
            process_rank = 0 if not hvd_config.use else hvd.rank()
            EstimatorTrialController.set_random_seed(env.trial_seed +
                                                     process_rank)
        else:
            # Set identical random seeds on all training processes.
            # When using horovod, each worker will receive a unique
            # shard of the dataset.
            EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()
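
The `CUDA_VISIBLE_DEVICES` workaround in this hook is opt-in via the experiment config. As a hedged illustration, the fragment below shows only the keys the lookup above reads; everything else about the experiment config is omitted and the surrounding structure is an assumption:

    # Hypothetical config fragment; mirrors
    # env.experiment_config.get("data", {}).get("set_cuda_visible_devices", False).
    experiment_config = {
        "data": {
            "set_cuda_visible_devices": True,  # opt in to the CUDA_VISIBLE_DEVICES workaround
        },
    }
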
Example #2
    def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g., one
        for training and one for testing), users should wrap each dataset
        independently. E.g., If users instantiate their training dataset within
        ``build_train_spec()``, they should call ``dataset = wrap_dataset(dataset)``
        prior to passing it into ``tf.estimator.TrainSpec``.

        Args:
            dataset: tf.data.Dataset
            shard_dataset:
                When performing multi-slot (distributed) training, this
                controls whether the dataset is sharded so that each training process
                (one per slot) sees unique data. If set to False, users must manually
                configure each process to use unique data.

        """
        if not self.env.training:
            return dataset

        hvd.require_horovod_type("tensorflow",
                                 "EstimatorContext.wrap_dataset was called.")

        self.dataset_initialized = True
        if not self.hvd_config.use or self.input_from_dataflow or not shard_dataset:
            if self.hvd_config.use and not shard_dataset:
                logging.info("Dataset sharding skipped.")
            return dataset

        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(
            f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset
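
As the docstring describes, a user wraps the dataset immediately after building it inside ``build_train_spec()``. Below is a minimal sketch using synthetic placeholder data; only ``wrap_dataset``, ``get_per_slot_batch_size``, and ``tf.estimator.TrainSpec`` are taken from the text above, everything else (class, data, shapes) is illustrative:

    import numpy as np
    import tensorflow as tf

    class MyEstimatorTrial:  # sketch only; a real trial subclasses EstimatorTrial
        def __init__(self, context) -> None:
            self.context = context  # EstimatorContext providing wrap_dataset()

        def build_train_spec(self) -> tf.estimator.TrainSpec:
            def input_fn() -> tf.data.Dataset:
                features = np.random.rand(1024, 8).astype(np.float32)  # placeholder data
                labels = np.random.randint(0, 2, size=(1024,)).astype(np.int32)
                dataset = tf.data.Dataset.from_tensor_slices((features, labels))
                dataset = dataset.batch(self.context.get_per_slot_batch_size())
                # Wrap immediately after creation so each training process sees a unique shard.
                return self.context.wrap_dataset(dataset)

            return tf.estimator.TrainSpec(input_fn)
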
Example #3
    def wrap_dataset(self, dataset: Any, shard_dataset: bool = True) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g.,
        one for training and one for validation), users should wrap each dataset
        independently.

        Args:
            dataset: tf.data.Dataset
            shard_dataset:
                When performing multi-slot (distributed) training, this
                controls whether the dataset is sharded so that each training process
                (one per slot) sees unique data. If set to False, users must manually
                configure each process to use unique data.
        """
        if not self.env.managed_training:
            return dataset

        self.dataset_initialized = True
        if not self.hvd_config.use or not isinstance(dataset, tf.data.Dataset) or not shard_dataset:
            if self.hvd_config.use and not shard_dataset:
                logging.info("Dataset sharding skipped.")
            return dataset

        hvd.require_horovod_type("tensorflow.keras", "TFKerasContext.wrap_dataset was called.")
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset
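
By contrast, a user who arranges unique per-process data themselves can pass ``shard_dataset=False`` so the framework leaves the dataset alone. A hedged sketch follows; the trial class, method name, and data are placeholders, while ``wrap_dataset`` and ``shard_dataset`` are the names documented above:

    import numpy as np
    import tensorflow as tf

    class MyKerasTrial:  # sketch only; a real trial subclasses TFKerasTrial
        def __init__(self, context) -> None:
            self.context = context  # TFKerasContext providing wrap_dataset()

        def build_training_data_loader(self) -> tf.data.Dataset:
            # Placeholder data; in practice each process would already read distinct files.
            features = np.random.rand(1024, 8).astype(np.float32)
            labels = np.random.randint(0, 2, size=(1024,)).astype(np.int32)
            dataset = tf.data.Dataset.from_tensor_slices((features, labels))
            dataset = dataset.batch(self.context.get_per_slot_batch_size())
            # shard_dataset=False: the framework skips sharding, so the user is
            # responsible for giving every training process unique data.
            return self.context.wrap_dataset(dataset, shard_dataset=False)
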
Example #4
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)  # type: ignore

        self.batch_size = self.context.get_per_slot_batch_size()
        self.scheduling_unit = self.env.experiment_config.scheduling_unit()

        logging.debug("Starting LoopTrialController initialization.")

        if self.hvd_config.use:
            self.is_chief = hvd.rank() == 0
            training_process_rank = hvd.local_rank()
        else:
            self.is_chief = True
            training_process_rank = 0

        if self.hvd_config.use and not self.is_chief:
            log_level = (logging.DEBUG
                         if self.env.experiment_config.debug_enabled() else
                         logging.WARNING)
            logging.getLogger().setLevel(log_level)

        logging.debug(
            f"Training coordination initialized on local rank {training_process_rank}, "
            f"using hvd: {self.hvd_config.use}.")

        # Initialize communication directly between training processes.
        self.train_process_comm_chief = None  # type: Optional[ipc.ZMQBroadcastServer]
        self.train_process_comm_worker = None  # type: Optional[ipc.ZMQBroadcastClient]
        if self.hvd_config.use:
            self._initialize_train_process_comm()
Example #5
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)  # type: ignore

        self.batch_size = self.context.get_per_slot_batch_size()
        self.scheduling_unit = self.env.experiment_config.scheduling_unit()

        logging.debug("Starting LoopTrialController initialization.")

        if self.hvd_config.use:
            self.is_chief = hvd.rank() == 0
            rank = hvd.rank()
        else:
            self.is_chief = True
            rank = 0

        if self.hvd_config.use and not self.is_chief:
            log_level = (logging.DEBUG
                         if self.env.experiment_config.debug_enabled() else
                         logging.WARNING)
            logging.getLogger().setLevel(log_level)

        logging.debug(
            f"TrialController initialized on rank {rank}, using hvd: {self.hvd_config.use}."
        )
Example #6
    def _set_data_loaders(self) -> None:
        skip_batches = (self.env.first_step() - 1) * self.batches_per_step

        nreplicas = hvd.size() if self.hvd_config.use else 1
        rank = hvd.rank() if self.hvd_config.use else 0

        self.training_loader = self.trial.build_training_data_loader(
        ).get_data_loader(repeat=True,
                          skip=skip_batches,
                          num_replicas=nreplicas,
                          rank=rank)

        validation_dataset = self.trial.build_validation_data_loader()
        if self._evaluate_batch_defined():
            self.validation_loader = validation_dataset.get_data_loader(
                repeat=False, skip=0, num_replicas=nreplicas, rank=rank)
        elif self.is_chief:
            self.validation_loader = validation_dataset.get_data_loader(
                repeat=False, skip=0, num_replicas=1, rank=0)
Example #7
    def pre_execute_hook(env: det.EnvContext, hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

        # Initialize random seeds.
        if env.experiment_config.input_from_dataflow():
            logging.debug("Using tensorpack dataflows as input.")
            process_rank = 0 if not hvd_config.use else hvd.rank()
            EstimatorTrialController.set_random_seed(env.trial_seed + process_rank)
        else:
            # Set identical random seeds on all training processes.
            # When using horovod, each worker will receive a unique
            # shard of the dataset.
            EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()
Example #8
    def _set_data_loaders(self) -> None:
        skip_batches = self.env.initial_workload.total_batches_processed

        nreplicas = hvd.size() if self.hvd_config.use else 1
        rank = hvd.rank() if self.hvd_config.use else 0

        self.training_loader = self.trial.build_training_data_loader(
        ).get_data_loader(repeat=True,
                          skip=skip_batches,
                          num_replicas=nreplicas,
                          rank=rank)
        self.context._epoch_len = len(self.training_loader)

        validation_dataset = self.trial.build_validation_data_loader()
        if self._evaluate_batch_defined():
            self.validation_loader = validation_dataset.get_data_loader(
                repeat=False, skip=0, num_replicas=nreplicas, rank=rank)
        elif self.is_chief:
            self.validation_loader = validation_dataset.get_data_loader(
                repeat=False, skip=0, num_replicas=1, rank=0)
Example #9
    def wrap_dataset(self, dataset: Any) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g., one
        for training and one for testing), users should wrap each dataset
        independently. E.g., If users instantiate their training dataset within
        ``build_train_spec()``, they should call ``dataset = wrap_dataset(dataset)``
        prior to passing it into ``tf.estimator.TrainSpec``.
        """
        hvd.require_horovod_type("tensorflow",
                                 "EstimatorContext.wrap_dataset was called.")

        self.dataset_initialized = True
        if not self.hvd_config.use or self.input_from_dataflow:
            return dataset
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(
            f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset
Example #10
    def _set_data_loaders(self) -> None:
        skip_batches = self.env.initial_workload.total_batches_processed

        nreplicas = hvd.size() if self.hvd_config.use else 1
        rank = hvd.rank() if self.hvd_config.use else 0

        # TODO: the number of ways a user could get this wrong is alarming.  Right now we don't
        # have any validation, but we should add some.  Maybe deprecate the old way?  Or mark the
        # new way as "advanced"?
        train_data = self.trial.build_training_data_loader()
        if isinstance(train_data, pytorch.DataLoader):
            # Old-API, a user-provided det.pytorch.DataLoader.
            self.training_loader = train_data.get_data_loader(
                repeat=True,
                skip=skip_batches,
                num_replicas=nreplicas,
                rank=rank)
        else:
            # New-API, assume the user called context.make_training_batch_sampler.
            self.training_loader = train_data

        self.context._epoch_len = len(self.training_loader)

        validation_data = self.trial.build_validation_data_loader()
        if self._evaluate_batch_defined():
            if isinstance(validation_data, pytorch.DataLoader):
                # Old-API, a user-provided det.pytorch.DataLoader.
                self.validation_loader = validation_data.get_data_loader(
                    repeat=False, skip=0, num_replicas=nreplicas, rank=rank)
            else:
                # New-API, assume the user called context.make_validation_batch_sampler.
                self.validation_loader = validation_data
        elif self.is_chief:
            if isinstance(validation_data, pytorch.DataLoader):
                # Old-API, a user-provided det.pytorch.DataLoader.
                self.validation_loader = validation_data.get_data_loader(
                    repeat=False, skip=0, num_replicas=1, rank=0)
            else:
                # New-API. Here we assume the user did not call
                # make_validation_batch_sampler: the chief evaluates the full
                # validation set, so a sharded loader would be incorrect.
                self.validation_loader = validation_data
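
To make the old-API/new-API branch above concrete, the sketch below shows the two return shapes ``build_training_data_loader()`` can produce under that logic: a ``det.pytorch.DataLoader`` that the framework shards via ``get_data_loader(...)``, or an already-constructed loader that is passed through unchanged (the comments above assume the latter comes from ``context.make_training_batch_sampler``, whose signature is not shown here). The dataset, the ``use_old_api`` flag, and the plain ``torch`` loader are illustrative assumptions:

    import torch
    from determined import pytorch  # provides det.pytorch.DataLoader, as used above

    class MyPyTorchTrial:  # sketch only; a real trial subclasses PyTorchTrial
        def __init__(self, context, use_old_api: bool = True) -> None:
            self.context = context
            self.use_old_api = use_old_api  # illustrative flag, not part of any API

        def build_training_data_loader(self):
            dataset = torch.utils.data.TensorDataset(
                torch.randn(1024, 8), torch.randint(0, 2, (1024,))
            )  # placeholder data

            if self.use_old_api:
                # Old-API: return a det.pytorch.DataLoader; _set_data_loaders() then calls
                # get_data_loader(repeat=True, skip=..., num_replicas=nreplicas, rank=rank)
                # so each training process sees a unique shard.
                return pytorch.DataLoader(
                    dataset, batch_size=self.context.get_per_slot_batch_size()
                )

            # New-API: return an already-built loader, which _set_data_loaders() uses as-is.
            # The comments above assume it was built with context.make_training_batch_sampler
            # so the user handles sharding; this plain DataLoader is only a stand-in.
            return torch.utils.data.DataLoader(
                dataset, batch_size=self.context.get_per_slot_batch_size()
            )
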
Example #11
    def wrap_dataset(self, dataset: Any) -> Any:
        """
        This should be used to wrap ``tf.data.Dataset`` objects immediately after
        they have been created. Users should use the output of this wrapper as the
        new instance of their dataset. If users create multiple datasets (e.g.,
        one for training and one for testing), users should wrap each dataset
        independently.

        Args:
            dataset: tf.data.Dataset
        """
        self.dataset_initialized = True
        if not self.hvd_config.use or not isinstance(dataset, tf.data.Dataset):
            return dataset

        hvd.require_horovod_type("tensorflow.keras",
                                 "TFKerasContext.wrap_dataset was called.")
        dataset = dataset.shard(hvd.size(), hvd.rank())
        logging.debug(
            f"Sharded dataset to index {hvd.rank()} of {hvd.size()}.")
        return dataset
Example #12
    def _init_shard(self) -> None:
        if not self._hvd_config.use:
            return

        self._shard_rank = hvd.rank()
        self._num_shards = hvd.size()