Example #1
    def patch(args: argparse.Namespace) -> None:
        check_false(args.all and args.agent_id)

        if not (args.all or args.agent_id):
            print("Error: must specify exactly one of `--all` or agent_id",
                  file=sys.stderr)
            sys.exit(1)

        if args.agent_id:
            agent_ids = [args.agent_id]
        else:
            r = api.get(args.master, "agents")
            agent_ids = sorted(local_id(a) for a in r.json().keys())

        # `enabled` is bound in the enclosing scope (see the sketch after this
        # example); a drain mode only applies when agents are being disabled.
        drain_mode = None if enabled else args.drain

        for agent_id in agent_ids:
            action = "enable" if enabled else "disable"
            path = f"api/v1/agents/{agent_id}/{action}"

            payload = None
            if not enabled and drain_mode:
                payload = {
                    "drain": drain_mode,
                }

            api.post(args.master, path, payload)
            status = "Disabled" if not enabled else "Enabled"
            print(f"{status} agent {agent_id}.", file=sys.stderr)

        # When draining, check whether there are any tasks currently running
        # on these slots, and list them.
        if drain_mode:
            rsp = api.get(args.master, "tasks")
            tasks_data = {
                k: t
                for (k, t) in rsp.json().items()
                if any(a in agent_ids for r in t.get("resources", [])
                       for a in r["agent_devices"])
            }

            if not (args.json or args.csv):
                if tasks_data:
                    print("Tasks still in progress on draining nodes.")
                else:
                    print("No tasks in progress on draining nodes.")

            cli_task.render_tasks(args, tasks_data)
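
Note that `enabled` is a free variable in this handler (it drives `drain_mode`, `action`, and `status`). A minimal sketch of how it might be bound, assuming the handler is produced by a factory closure; the name `make_patch` is hypothetical, not from the original:

    def make_patch(enabled: bool) -> Callable[[argparse.Namespace], None]:
        # Bind `enabled` in a closure so one handler body serves both the
        # `enable` and `disable` subcommands.
        def patch(args: argparse.Namespace) -> None:
            ...  # body as in the example above

        return patch
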
Example #2
    def wrap_optimizer(self, optimizer: Any) -> Any:
        """
        This should be used to wrap optimizer objects immediately after they have
        been created. Users should use the output of this wrapper as the new instance
        of their optimizer. For example, if users create their optimizer within
        ``build_estimator()``, they should call ``optimizer = wrap_optimizer(optimizer)``
        prior to passing the optimizer into their Estimator.
        """
        if not self.env.managed_training:
            return optimizer

        self.optimizer_initialized = True
        if not self.hvd_config.use:
            return optimizer

        check.check_false(
            isinstance(optimizer, str),
            "Please specify an optimizer object instead of using a string name.",
        )

        hvd.require_horovod_type(
            "tensorflow", "EstimatorContext.wrap_optimizer was called.")
        use_compression = self.hvd_config.fp16_compression

        # The signature of our horovod optimizer changed after we rebased onto
        # 0.21: `aggregation_frequency` became `backward_passes_per_step`.
        hvd_sig = inspect.signature(hvd.DistributedOptimizer)
        horovod_kwargs = {
            "compression": (
                hvd.compression.Compression.fp16
                if use_compression
                else hvd.compression.Compression.none
            ),
            "average_aggregated_gradients": self.hvd_config.average_aggregated_gradients,
        }
        if "aggregation_frequency" in hvd_sig.parameters:
            horovod_kwargs[
                "aggregation_frequency"] = self.hvd_config.aggregation_frequency
        else:
            horovod_kwargs[
                "backward_passes_per_step"] = self.hvd_config.aggregation_frequency

        optimizer = hvd.DistributedOptimizer(optimizer, **horovod_kwargs)
        logging.debug(
            "Initialized optimizer for distributed and optimized parallel training."
        )
        return optimizer
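
A short usage sketch following the docstring's guidance; the `build_estimator` body, the `self.context` attribute, and the Adam optimizer are illustrative assumptions rather than part of the original:

    def build_estimator(self) -> tf.estimator.Estimator:
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        # Wrap immediately after creation and keep using the returned object.
        optimizer = self.context.wrap_optimizer(optimizer)
        ...  # build and return the Estimator using this optimizer
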
Example #3
    def patch(args: argparse.Namespace) -> None:
        check_false(args.all and args.agent_id)

        if not (args.all or args.agent_id):
            print("Error: must specify exactly one of `--all` or agent_id")
            sys.exit(1)

        if args.agent_id:
            agent_ids = [args.agent_id]
        else:
            r = api.get(args.master, "agents")
            agent_ids = sorted(local_id(a) for a in r.json().keys())

        for agent_id in agent_ids:
            path = "agents/{}/slots".format(agent_id)
            headers = {"Content-Type": "application/merge-patch+json"}
            payload = {"enabled": enabled}

            api.patch(args.master, path, body=payload, headers=headers)
            status = "Disabled" if not enabled else "Enabled"
            print("{} agent {}".format(status, agent_id))
Example #4
    def cache_dataset(
        self,
        dataset_id: str,
        dataset_version: str,
        shuffle: bool,
        skip_shuffle_at_epoch_end: bool,
    ) -> Callable:

        # Lazily initialize storage so that, if users are not using the data
        # layer, we do not create unused directories.
        self._configure_storage()

        if self._training:
            # We only check the training cacheable for reuse because, for
            # EstimatorTrial, the validation cacheable may be invoked every
            # time validation is performed.
            check.check_false(
                self._decorator_used,
                "Pleas use both `@context.experimental.cache_train_dataset(dataset_name, "
                "dataset_version)` and `@context.experimental.cache_validation_dataset("
                "dataset_name, dataset_version)` exactly once.",
            )
        self._decorator_used = True
        dataset_version += "_train" if self._training else "_val"

        def _wrap(make_dataset_fn: Callable) -> Callable:
            @functools.wraps(make_dataset_fn)
            def _decorated_fn(*args: Any, **kwargs: Any) -> Any:
                @self._storage.cacheable(  # type: ignore
                    dataset_id=dataset_id,
                    dataset_version=dataset_version,
                )
                def make_dataset() -> yogadl.DataRef:
                    return make_dataset_fn(*args, **kwargs)

                logging.info(
                    f"Preparing dataset: {dataset_id}:{dataset_version}.")
                logging.debug(
                    f"Calling make_dataset for {dataset_id}:{dataset_version} "
                    f"with start_offset: {self._offset}, "
                    f"shuffle: {shuffle}, shuffle_seed: {self._shuffle_seed}, "
                    f"shard_rank: {self._shard_rank}, world size: {self._num_shards}, "
                    f"training: {self._training}.")

                stream_from_cache = make_dataset().stream(
                    start_offset=self._offset,
                    shuffle=shuffle,
                    skip_shuffle_at_epoch_end=skip_shuffle_at_epoch_end,
                    shuffle_seed=self._shuffle_seed,
                    shard_rank=self._shard_rank,
                    num_shards=self._num_shards,
                    drop_shard_remainder=self._training,
                )
                self._dataset_length = len(stream_from_cache)
                logging.info(
                    f"Dataset {dataset_id}:{dataset_version} preparation finished."
                )

                return tensorflow.make_tf_dataset(stream_from_cache)

            return _decorated_fn

        return _wrap
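
A usage sketch based on the decorator names quoted in the error message above; `cache_train_dataset` is presumably a training-mode wrapper around `cache_dataset` that forwards `shuffle`, and the dataset name, version, and body here are illustrative:

    @context.experimental.cache_train_dataset("mnist-tfrecords", "v1", shuffle=True)
    def make_dataset() -> tf.data.Dataset:
        # Built once on a cache miss; later calls stream from the yogadl cache.
        return tf.data.TFRecordDataset("train.tfrecords")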