Example #1
    def _configure_session(
        env: det.EnvContext,
        hvd_config: horovod.HorovodContext,
        session_config: tf.compat.v1.ConfigProto,
    ) -> Optional[tf.compat.v1.Session]:
        if not tf.executing_eagerly():
            session_config.gpu_options.allow_growth = True
            if hvd_config.use:
                # We launch a horovod process per GPU. Each process
                # needs to bind to a unique GPU.
                session_config.gpu_options.visible_device_list = str(
                    hvd.local_rank())

            session = tf.compat.v1.Session(
                graph=tf.compat.v1.get_default_graph(), config=session_config)

            tf.compat.v1.keras.backend.set_session(session)

            return session
        else:
            gpus = tf.config.experimental.list_physical_devices("GPU")

            if len(gpus) > 0:
                local_rank = hvd.local_rank() if hvd_config.use else 0
                gpu = gpus[local_rank]
                tf.config.set_visible_devices(gpu, "GPU")
                tf.config.experimental.set_memory_growth(gpu, True)

            return None
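
For comparison, the eager-mode branch above follows the standard Horovod pattern of binding each worker to one GPU. A minimal standalone sketch using only public TensorFlow 2.x and Horovod APIs (not part of the Determined code above):

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    if gpus:
        # One worker per GPU: pick the device that matches the local rank.
        gpu = gpus[hvd.local_rank() % len(gpus)]
        tf.config.set_visible_devices(gpu, "GPU")
        tf.config.experimental.set_memory_growth(gpu, True)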
Example #2
    def _init_paths(self) -> None:
        """
        Create a unique model directory for each training process. If
        a load path is provided, copy the checkpoint into the model
        directory of each training process. This model directory will
        be used to initialize an Estimator. We also update the paths in
        the CheckpointState metadata file to the new directory location.
        """
        # Add suffix so that horovod processes don't overwrite each other.
        suffix = str(hvd.local_rank()) if self.hvd_config.use else "0"
        if self.load_path is None:
            self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix))
            logging.debug(f"Estimator directory set to {self.estimator_dir}.")
            return

        for callback in self.train_hooks:
            if isinstance(callback, estimator.RunHook):
                callback.on_checkpoint_load(str(self.load_path))

        self.estimator_dir = pathlib.Path(tempfile.mkdtemp(suffix=suffix))
        # `shutil.copytree` requires that the destination does not already
        # exist, so remove the directory that `mkdtemp` just created.
        if self.estimator_dir.exists():
            shutil.rmtree(str(self.estimator_dir))
        logging.debug(
            f"Copying from {self.load_path} to {self.estimator_dir}.")
        shutil.copytree(str(self.load_path), str(self.estimator_dir))

        # Calibrate the CheckpointState metadata file to the new location.
        estimator._update_checkpoint_path_in_state_file(self.estimator_dir)
        logging.debug(f"Load path set to {self.estimator_dir}.")
Example #3
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

            # This option is available for when TF ignores `gpu_options.visible_device_list`.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            if env.experiment_config.get("data",
                                         {}).get("set_cuda_visible_devices",
                                                 False):
                logging.info(
                    "Setting the `CUDA_VISIBLE_DEVICES` environment variable "
                    "and disabling NCCL P2P (`NCCL_P2P_DISABLE=1`).")
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                os.environ["NCCL_P2P_DISABLE"] = "1"

        # Initialize random seeds.
        if env.experiment_config.input_from_dataflow():
            logging.debug("Using tensorpack dataflows as input.")
            process_rank = 0 if not hvd_config.use else hvd.rank()
            EstimatorTrialController.set_random_seed(env.trial_seed +
                                                     process_rank)
        else:
            # Set identical random seeds on all training processes.
            # When using horovod, each worker will receive a unique
            # shard of the dataset.
            EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()
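
The `set_cuda_visible_devices` gate is a Determined-specific config key, but the underlying idiom is plain Horovod plus environment variables. A sketch:

    import os
    import horovod.tensorflow as hvd

    hvd.init()
    # Restrict this process to the GPU matching its local rank and
    # disable NCCL peer-to-peer transfers.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
    os.environ["NCCL_P2P_DISABLE"] = "1"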
Example #4
    def __init__(self, *args: List[Any], **kwargs: Dict[str, Any]) -> None:
        super().__init__(*args, **kwargs)  # type: ignore

        self.batch_size = self.context.get_per_slot_batch_size()
        self.scheduling_unit = self.env.experiment_config.scheduling_unit()

        logging.debug("Starting LoopTrialController initialization.")

        if self.hvd_config.use:
            self.is_chief = hvd.rank() == 0
            training_process_rank = hvd.local_rank()
        else:
            self.is_chief = True
            training_process_rank = 0

        if self.hvd_config.use and not self.is_chief:
            log_level = (logging.DEBUG
                         if self.env.experiment_config.debug_enabled() else
                         logging.WARNING)
            logging.getLogger().setLevel(log_level)

        logging.debug(
            f"Training coordination initialized on local rank {training_process_rank}, "
            f"using hvd: {self.hvd_config.use}.")

        # Initialize communication directly between training processes.
        self.train_process_comm_chief = None  # type: Optional[ipc.ZMQBroadcastServer]
        self.train_process_comm_worker = None  # type: Optional[ipc.ZMQBroadcastClient]
        if self.hvd_config.use:
            self._initialize_train_process_comm()
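
The chief/worker log-level handling is the usual "only the chief is verbose" idiom. A minimal sketch with plain Horovod and the standard `logging` module:

    import logging
    import horovod.tensorflow as hvd

    hvd.init()
    if hvd.rank() != 0:
        # Non-chief workers only log warnings and errors.
        logging.getLogger().setLevel(logging.WARNING)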
Example #5
    def _init_session_config(
        session_config: tf.compat.v1.ConfigProto,
        env: det.EnvContext,
        hvd_config: horovod.HorovodContext,
    ) -> tf.compat.v1.ConfigProto:
        if session_config is None:
            session_config = tf.compat.v1.ConfigProto()
        session_config.gpu_options.allow_growth = True

        if not hvd_config.use:
            return session_config

        if version.parse(tf.__version__) >= version.parse("2.5.0"):
            gpus = tf.config.experimental.list_physical_devices("GPU")

            if len(gpus) > 0:
                local_rank = hvd.local_rank() if hvd_config.use else 0
                gpu = gpus[local_rank]
                tf.config.experimental.set_visible_devices(gpu, "GPU")
                tf.config.experimental.set_memory_growth(gpu, True)

        session_config.gpu_options.visible_device_list = str(
            horovod.hvd.local_rank())

        return session_config
Example #6
    def _init_session_config(
        cls: Type["EstimatorTrialController"],
        session_config: tf.compat.v1.ConfigProto,
        env: det.EnvContext,
        use_horovod: bool = False,
    ) -> tf.compat.v1.ConfigProto:
        if session_config is None:
            session_config = tf.compat.v1.ConfigProto()
        session_config.gpu_options.allow_growth = True

        if not use_horovod:
            return session_config

        if version.parse(tf.__version__) >= version.parse("2.5.0"):
            gpus = tf.config.experimental.list_physical_devices("GPU")

            if len(gpus) > 0:
                local_rank = hvd.local_rank() if use_horovod else 0
                gpu = gpus[local_rank]
                tf.config.experimental.set_visible_devices(gpu, "GPU")
                tf.config.experimental.set_memory_growth(gpu, True)

        session_config.gpu_options.visible_device_list = str(horovod.hvd.local_rank())

        return session_config
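
In graph mode the resulting `ConfigProto` is typically handed to the Estimator through a `RunConfig`. A hedged usage sketch (the `model_dir` value is a placeholder):

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()
    session_config = tf.compat.v1.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.RunConfig(
        model_dir="/tmp/model",  # placeholder path
        session_config=session_config,
    )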
Example #7
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

            # This option is available for when TF ignores `gpu_options.visible_device_list`.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            if env.experiment_config.get("data",
                                         {}).get("set_cuda_visible_devices",
                                                 False):
                logging.info(
                    "Setting the `CUDA_VISIBLE_DEVICES` environment variable "
                    "and disabling NCCL P2P (`NCCL_P2P_DISABLE=1`).")
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                os.environ["NCCL_P2P_DISABLE"] = "1"

        # Initialize random seeds.
        # Set identical random seeds on all training processes.
        # When using horovod, each worker will receive a unique
        # shard of the dataset.
        EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()

        # Set the default session before importing any user code. If the default session isn't
        # set and users call TF code that detects GPUs, each process would map itself to all of
        # the GPUs. Setting the default session first prevents this problem.
        EstimatorTrialController._set_default_tensorflow_session(
            env=env, hvd_config=hvd_config)
Example #8
    def _init_device(self) -> None:
        self.n_gpus = len(self.env.container_gpus)
        if self.hvd_config.use:
            check.gt(self.n_gpus, 0)
            # We launch a horovod process per GPU. Each process
            # needs to bind to a unique GPU.
            self.device = torch.device(hvd.local_rank())
            torch.cuda.set_device(self.device)
        elif self.n_gpus > 0:
            self.device = torch.device("cuda", 0)
        else:
            self.device = torch.device("cpu")
        check.is_not_none(self.device)
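
The same per-rank device binding with plain `horovod.torch`, without Determined's `check` helpers, can be sketched as:

    import torch
    import horovod.torch as hvd

    hvd.init()
    if torch.cuda.is_available():
        # Bind this worker to the GPU matching its local rank.
        device = torch.device("cuda", hvd.local_rank())
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")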
Example #9
    def pre_execute_hook(env: det.EnvContext,
                         hvd_config: horovod.HorovodContext) -> None:
        # Initialize the correct horovod.
        if hvd_config.use:
            hvd.require_horovod_type("tensorflow", "EstimatorTrial is in use.")
            hvd.init()

            # This option is available for when TF ignores `gpu_options.visible_device_list`.
            # TODO (DET-3762): Remove this once it's no longer necessary.
            if env.experiment_config.get("data",
                                         {}).get("set_cuda_visible_devices",
                                                 False):
                logging.info(
                    "Setting the `CUDA_VISIBLE_DEVICES` environment variable "
                    "and disabling NCCL P2P (`NCCL_P2P_DISABLE=1`).")
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())
                os.environ["NCCL_P2P_DISABLE"] = "1"

        # Initialize random seeds.
        # Set identical random seeds on all training processes.
        # When using horovod, each worker will receive a unique
        # shard of the dataset.
        EstimatorTrialController.set_random_seed(env.trial_seed)

        if version.parse(tf.__version__) >= version.parse("2.0.0"):
            tf.compat.v1.disable_v2_behavior()

        # Set the default session before importing any user code. If the default session isn't
        # set and users call TF code that detects GPUs, each process would map itself to all of
        # the GPUs. Setting the default session first prevents this problem. This default
        # session does not have any effect within the Estimator itself.
        EstimatorTrialController._set_default_tensorflow_session(
            env=env, hvd_config=hvd_config, session_config=None)

        logging.debug("Applying tf.estimator patches.")

        @monkey_patch.monkey_patch_decorator(_NewCheckpointListenerForEvaluate,
                                             "_evaluate")
        def patch_estimator_eval_on_checkpoint(original, *args,
                                               **kwargs):  # type: ignore
            # With a single worker and multiple devices,
            # `tf.estimator.train_and_evaluate` attempts to execute `eval_spec` even if
            # `input_fn` or `steps` is None, which causes an error when evaluating the
            # model function. Apply a monkey-patch to skip the internal function that
            # ultimately runs the evaluation.
            logging.info("Skipping %s(*%s, **%s)", original.__name__, args,
                         kwargs)
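
`monkey_patch.monkey_patch_decorator` is a Determined-internal helper; the general technique of swapping out a method while keeping a handle to the original can be sketched with the standard library (class and method names here are illustrative):

    import functools

    def patch_method(cls, name):
        original = getattr(cls, name)

        def decorator(replacement):
            @functools.wraps(original)
            def wrapper(*args, **kwargs):
                # Delegate to the replacement, passing the original along.
                return replacement(original, *args, **kwargs)

            setattr(cls, name, wrapper)
            return wrapper

        return decorator

    # Illustrative usage: make SomeListener._evaluate a no-op.
    # @patch_method(SomeListener, "_evaluate")
    # def skip_evaluate(original, *args, **kwargs):
    #     pass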
Example #10
    def _configure_storage(self) -> None:
        session_config = None  # type: Optional[tf.compat.v1.ConfigProto]
        if self._hvd_config.use:
            # For multi-GPU training, we map processes to individual GPUs. TF requires
            # that for each instantiation of `tf.Session`, the process is mapped
            # to the same GPU.
            session_config = tf.compat.v1.ConfigProto()
            session_config.gpu_options.visible_device_list = str(
                hvd.local_rank())

        scheme = "wss" if self._env.use_tls else "ws"
        rw_coordinator_url = (
            f"{scheme}://{self._env.master_addr}:{self._env.master_port}/ws/data-layer/"
        )
        data_layer_type = self._env.experiment_config.get_data_layer_type()

        if data_layer_type == StorageTypes.SHARED_FS.value:
            local_cache_dir_path = self._env.experiment_config[
                "data_layer"].get("container_storage_path")
            local_cache_path = init_container_storage_path(
                configured_storage_path=local_cache_dir_path)

            storage_config = storage.LFSConfigurations(
                storage_dir_path=str(local_cache_path))
            self._storage = storage.LFSStorage(
                storage_config, tensorflow_config=session_config)

        elif data_layer_type == StorageTypes.S3.value:
            local_cache_dir_path = self._env.experiment_config[
                "data_layer"].get("local_cache_container_path")
            local_cache_path = init_container_storage_path(
                configured_storage_path=local_cache_dir_path)

            storage_config = storage.S3Configurations(
                bucket=self._env.experiment_config["data_layer"]["bucket"],
                bucket_directory_path=self._env.experiment_config["data_layer"]
                ["bucket_directory_path"],
                url=rw_coordinator_url,
                local_cache_dir=str(local_cache_path),
                access_key=self._env.experiment_config["data_layer"].get(
                    "access_key"),
                secret_key=self._env.experiment_config["data_layer"].get(
                    "secret_key"),
                endpoint_url=self._env.experiment_config["data_layer"].get(
                    "endpoint_url"),
                coordinator_cert_file=self._env.master_cert_file,
                coordinator_cert_name=self._env.master_cert_name,
            )
            self._storage = storage.S3Storage(storage_config,
                                              tensorflow_config=session_config)

        elif data_layer_type == StorageTypes.GCS.value:
            local_cache_dir_path = self._env.experiment_config[
                "data_layer"].get("local_cache_container_path")
            local_cache_path = init_container_storage_path(
                configured_storage_path=local_cache_dir_path)
            storage_config = storage.GCSConfigurations(
                bucket=self._env.experiment_config["data_layer"]["bucket"],
                bucket_directory_path=self._env.experiment_config["data_layer"]
                ["bucket_directory_path"],
                url=rw_coordinator_url,
                local_cache_dir=str(local_cache_path),
                coordinator_cert_file=self._env.master_cert_file,
                coordinator_cert_name=self._env.master_cert_name,
            )
            self._storage = storage.GCSStorage(
                storage_config, tensorflow_config=session_config)

        else:
            raise AssertionError(
                "Please select a supported data_layer type. Supported types include: "
                f"{[i.value for i in StorageTypes]}")