Example #1
def tune_mnist_mxnet(num_samples=10, num_epochs=10):
    logger.info("Downloading MNIST data...")
    mnist_data = mx.test_utils.get_mnist()
    logger.info("Got MNIST data, starting Ray Tune.")

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-3, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_mxnet, mnist=mnist_data, num_epochs=num_epochs
        ),
        resources_per_trial={
            "cpu": 1,
        },
        metric="mean_accuracy",
        mode="max",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        name="tune_mnist_mxnet",
    )
    return analysis
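
A minimal driver sketch for the function above, assuming the surrounding module defines `train_mnist_mxnet` and the usual imports (`ray.tune`, `mxnet`, `ASHAScheduler`); the sample counts are illustrative:

if __name__ == "__main__":
    # Small sweep for a quick smoke test; scale num_samples up for real tuning.
    analysis = tune_mnist_mxnet(num_samples=4, num_epochs=2)
    best_config = analysis.get_best_config(metric="mean_accuracy", mode="max")
    print("Best hyperparameters found:", best_config)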
Example #2
def _set_api_key(api_key_file: Optional[str] = None,
                 api_key: Optional[str] = None):
    """Set WandB API key from `wandb_config`. Will pop the
    `api_key_file` and `api_key` keys from `wandb_config` parameter"""
    if api_key_file:
        if api_key:
            raise ValueError("Both WandB `api_key_file` and `api_key` set.")
        with open(api_key_file, "rt") as fp:
            api_key = fp.readline().strip()
    # Try to get API key from external hook
    if not api_key and WANDB_SETUP_API_KEY_HOOK in os.environ:
        try:
            api_key = _load_class(os.environ[WANDB_SETUP_API_KEY_HOOK])()
        except Exception as e:
            logger.exception(
                f"Error executing {WANDB_SETUP_API_KEY_HOOK} to setup API key: {e}",
                exc_info=e,
            )
    if api_key:
        os.environ[WANDB_ENV_VAR] = api_key
    elif not os.environ.get(WANDB_ENV_VAR):
        try:
            # Check if user is already logged into wandb.
            wandb.ensure_configured()
            if wandb.api.api_key:
                logger.info("Already logged into W&B.")
                return
        except AttributeError:
            pass
        raise ValueError(
            "No WandB API key found. Either set the {} environment "
            "variable, pass `api_key` or `api_key_file` to the"
            "`WandbLoggerCallback` class as arguments, "
            "or run `wandb login` from the command line".format(WANDB_ENV_VAR))
Example #3
def tune_mnist_mxnet(num_samples=10, num_epochs=10):
    logger.info("Downloading MNIST data...")
    mnist_data = mx.test_utils.get_mnist()
    logger.info("Got MNIST data, starting Ray Tune.")

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-3, 1e-1),
        "batch_size": tune.choice([32, 64, 128])
    }

    scheduler = ASHAScheduler(metric="mean_accuracy",
                              mode="max",
                              max_t=num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(partial(train_mnist_mxnet,
                     mnist=mnist_data,
                     num_epochs=num_epochs),
             resources_per_trial={
                 "cpu": 1,
             },
             config=config,
             num_samples=num_samples,
             scheduler=scheduler,
             progress_reporter=reporter,
             name="tune_mnist_mxnet")
Example #4
File: wandb.py Project: srikalyan/ray
def _set_api_key(wandb_config):
    """Set WandB API key from `wandb_config`. Will pop the
    `api_key_file` and `api_key` keys from `wandb_config` parameter"""
    api_key_file = os.path.expanduser(wandb_config.pop("api_key_file", ""))
    api_key = wandb_config.pop("api_key", None)

    if api_key_file:
        if api_key:
            raise ValueError("Both WandB `api_key_file` and `api_key` set.")
        with open(api_key_file, "rt") as fp:
            api_key = fp.readline().strip()
    if api_key:
        os.environ[WANDB_ENV_VAR] = api_key
    elif not os.environ.get(WANDB_ENV_VAR):
        try:
            # Check if user is already logged into wandb.
            wandb.ensure_configured()
            if wandb.api.api_key:
                logger.info("Already logged into W&B.")
                return
        except AttributeError:
            pass
        raise ValueError(
            "No WandB API key found. Either set the {} environment "
            "variable, pass `api_key` or `api_key_file` in the config, "
            "or run `wandb login` from the command line".format(WANDB_ENV_VAR))
Example #5
    def on_episode_end(
        self,
        worker: RolloutWorker,
        base_env: BaseEnv,
        policies: Dict[str, Policy],
        episode: MultiAgentEpisode,
        **kwargs,
    ):
        ego_speed = episode.user_data["ego_speed"]
        mean_ego_speed = {
            agent_id: np.mean(speed_hist) for agent_id, speed_hist in ego_speed.items()
        }

        distance_travelled = dict()
        for _id, info in episode._agent_to_last_info.items():
            if info.get("_group_info"):
                for i, _info in enumerate(info["_group_info"]):
                    distance_travelled[f"{_id}:AGENT-{i}"] = np.mean(_info["score"])
            else:
                distance_travelled[_id] = np.mean(info["score"])

        speed_list = list(map(lambda x: round(x, 3), mean_ego_speed.values()))
        dist_list = list(map(lambda x: round(x, 3), distance_travelled.values()))
        reward_list = list(map(lambda x: round(x, 3), episode.agent_rewards.values()))

        episode.custom_metrics[f"mean_ego_speed"] = sum(speed_list) / max(
            1, len(speed_list)
        )
        episode.custom_metrics[f"distance_travelled"] = sum(dist_list) / max(
            1, len(dist_list)
        )

        logger.info(f"episode {episode.episode_id} ended with {episode.length} steps")
Example #6
File: wandb.py Project: smorad/ray
def _set_api_key(api_key_file: Optional[str] = None, api_key: Optional[str] = None):
    """Set WandB API key from `wandb_config`. Will pop the
    `api_key_file` and `api_key` keys from `wandb_config` parameter"""
    if api_key_file:
        if api_key:
            raise ValueError("Both WandB `api_key_file` and `api_key` set.")
        with open(api_key_file, "rt") as fp:
            api_key = fp.readline().strip()
    if api_key:
        os.environ[WANDB_ENV_VAR] = api_key
    elif not os.environ.get(WANDB_ENV_VAR):
        try:
            # Check if user is already logged into wandb.
            wandb.ensure_configured()
            if wandb.api.api_key:
                logger.info("Already logged into W&B.")
                return
        except AttributeError:
            pass
        raise ValueError(
            "No WandB API key found. Either set the {} environment "
            "variable, pass `api_key` or `api_key_file` to the"
            "`WandbLoggerCallback` class as arguments, "
            "or run `wandb login` from the command line".format(WANDB_ENV_VAR)
        )
Example #7
File: test_api.py Project: srikalyan/ray
def train(config, reporter):
    import sys
    from ray import logger
    for i in range(10):
        reporter(timesteps_total=i)
    print("PRINT_STDOUT")
    print("PRINT_STDERR", file=sys.stderr)
    logger.info("LOG_STDERR")
Example #8
        def step(self):
            self.iter += 1

            print("PRINT_STDOUT: {}".format(self.msg))
            print("PRINT_STDERR: {}".format(self.msg), file=sys.stderr)
            logger.info("LOG_STDERR: {}".format(self.msg))

            return {"num_resets": self.num_resets, "done": self.iter > 1}
Example #9
    def stop_all(self):
        if not self._start:
            self._start = time.time()
            return False

        now = time.time()
        if now - self._start >= self._timeout_seconds:
            logger.info(f"Reached timeout of {self._timeout_seconds} seconds. "
                        f"Stopping all trials.")
            return True
        return False
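
`stop_all` is the hook Tune polls on a `Stopper` subclass after each result; a wiring sketch, assuming the class above is named `TimeoutStopper` and takes the timeout through its constructor:

from ray import tune

tune.run(
    trainable,                                 # placeholder trainable
    stop=TimeoutStopper(timeout_seconds=300),  # assumed constructor signature
)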
Example #10
File: cloud.py Project: alipay/ray
    def upload(
        self,
        cloud_path: Optional[str] = None,
        local_path: Optional[str] = None,
        clean_before: bool = False,
    ):
        """Upload checkpoint to cloud.

        This will push the checkpoint directory from local storage
        to ``cloud_path``.

        If a ``cloud_path`` argument is provided and ``self.cloud_path``
        is unset, it will be set to ``cloud_path``.

        Args:
            cloud_path: Cloud path to upload the checkpoint to.
                Defaults to ``self.cloud_path``.
            local_path: Local path to upload the checkpoint from.
                Defaults to ``self.local_path``.
            clean_before: If True, deletes potentially existing
                cloud bucket before storing new data.

        """
        local_path = local_path or self.local_path
        if not local_path:
            raise RuntimeError(
                "Could not upload trial checkpoint: No local "
                "path is set. Fix this by either passing a "
                "`local_path` to your call to `upload()` or by "
                "passing a `local_path` into the constructor."
            )

        cloud_path = cloud_path or self.cloud_path
        if not cloud_path:
            raise RuntimeError(
                "Could not download trial checkpoint: No cloud "
                "path is set. Fix this by either passing a "
                "`cloud_path` to your call to `download()` or by "
                "passing a `cloud_path` into the constructor. The latter "
                "should automatically be done if you pass the correct "
                "`tune.SyncConfig`."
            )

        if not self.cloud_path:
            self.cloud_path = cloud_path

        if clean_before:
            logger.info(f"Clearing bucket contents before upload: {cloud_path}")
            delete_at_uri(cloud_path)

        # Actually upload
        upload_to_uri(local_path, cloud_path)

        return cloud_path
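
A hedged call-site sketch for `upload` (the `TrialCheckpoint` constructor and the bucket URI are assumptions for illustration):

checkpoint = TrialCheckpoint(local_path="/tmp/my_ckpt")  # assumed constructor
uri = checkpoint.upload(cloud_path="s3://my-bucket/exp/ckpt", clean_before=True)
print("Checkpoint pushed to", uri)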
Example #11
    def on_episode_start(
        self,
        worker: RolloutWorker,
        base_env: BaseEnv,
        policies: Dict[str, Policy],
        episode: MultiAgentEpisode,
        **kwargs,
    ):
        logger.info("episode {} started".format(episode.episode_id))
        episode.user_data["ego_speed"] = defaultdict(lambda: [])
        episode.user_data["step_heading_error"] = dict()
Example #12
def _predict(model: xgb.Booster,
             data: RayDMatrix,
             num_actors: int = 4,
             cpus_per_actor: int = 0,
             gpus_per_actor: int = 0,
             resources_per_actor: Optional[Dict] = None,
             **kwargs):
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor) for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for _, actor in enumerate(actors):
        wait_load.extend(_trigger_data_load(actor, data, []))

    try:
        ray.get(wait_load)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(actors, force=True)
        raise

    # Put model into object store
    model_ref = ray.put(model)

    logger.info("[RayXGBoost] Starting XGBoost prediction.")

    # Predict
    fut = [actor.predict.remote(model_ref, data, **kwargs) for actor in actors]

    try:
        actor_results = ray.get(fut)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(remote_workers=actors, force=True)
        raise

    _shutdown(remote_workers=actors, force=False)

    return combine_data(data.sharding, actor_results)
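
A minimal driver sketch for `_predict` (the model file and dataset path are placeholders; `RayDMatrix` comes from xgboost_ray):

import xgboost as xgb
from xgboost_ray import RayDMatrix

bst = xgb.Booster(model_file="model.xgb")  # placeholder: a previously trained model
data = RayDMatrix("data.parquet")          # placeholder dataset
predictions = _predict(bst, data, num_actors=4)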
Example #13
File: stopper.py Project: smorad/ray
    def stop_all(self):
        now = time.time()

        if self._last_check:
            taken = now - self._last_check
            self._budget -= taken

        self._last_check = now

        if self._budget <= 0:
            logger.info(f"Reached timeout of {self._timeout_seconds} seconds. "
                        f"Stopping all trials.")
            return True

        return False
Example #14
        def step(self):
            self.iter += 1

            print("PRINT_STDOUT: {}".format(self.msg))
            print("PRINT_STDERR: {}".format(self.msg), file=sys.stderr)
            logger.info("LOG_STDERR: {}".format(self.msg))

            if self.sleep:
                time.sleep(self.sleep)

            return {
                "id": self.config["id"],
                "num_resets": self.num_resets,
                "done": self.iter > 1,
                "iter": self.iter
            }
Example #15
File: rollout.py Project: valaxkong/SMARTS
def rollout(trainer, env_name, metrics_handler, num_steps, num_episodes,
            log_dir):
    """Reference: https://github.com/ray-project/ray/blob/master/rllib/rollout.py"""
    policy_agent_mapping = default_policy_agent_mapping
    assert hasattr(trainer, "workers") and isinstance(trainer.workers,
                                                      WorkerSet)
    env = trainer.workers.local_worker().env
    multiagent = isinstance(env, MultiAgentEnv)
    if trainer.workers.local_worker().multiagent:
        policy_agent_mapping = trainer.config["multiagent"][
            "policy_mapping_fn"]
    policy_map = trainer.workers.local_worker().policy_map
    state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
    use_lstm = {p: len(s) > 0 for p, s in state_init.items()}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    for episode in range(num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = trainer.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                        agent_states[agent_id] = p_state
                    else:
                        a_action = trainer.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)

            metrics_handler.log_step(
                episode=episode,
                observations=multi_obs,
                actions=action,
                rewards=reward,
                dones=done,
                infos=info,
            )

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            # filter dead agents
            if multiagent:
                next_obs = {
                    agent_id: obs
                    for agent_id, obs in next_obs.items() if not done[agent_id]
                }

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward

            step += 1
            obs = next_obs
        logger.info("\nEpisode #{}: steps: {} reward: {}".format(
            episode, step, reward_total))
    metrics_handler.write_to_csv(csv_dir=log_dir)
Example #16
File: main.py Project: amogkam/xgboost_ray
def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs):
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith("gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(int(_get_max_node_cpus() or 1),
                             int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, checkpoint_prefix, checkpoint_path,
                      checkpoint_frequency) for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for _, actor in enumerate(actors):
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    ray.get(wait_load)

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start tracker
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    try:
        ray.get(fut)
    except RayActorError:
        for actor in actors:
            ray.kill(actor)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]

    all_res = ray.get(fut)
    total_n = sum([res["train_n"] or 0 for res in all_res])

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    return bst, evals_result
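
A call-site sketch for the `_train` helper above (paths, label column, and parameters are illustrative):

dtrain = RayDMatrix("train.parquet", label="target")  # placeholder dataset/label
bst, evals_result = _train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},  # illustrative params
    dtrain,
    num_actors=4,
)
bst.save_model("model.xgb")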
Example #17
def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs) -> Tuple[xgb.Booster, Dict, Dict]:
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith("gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(int(_get_max_node_cpus() or 1),
                             int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Always create a queue for communicating results from the
    # worker actors back to the caller.
    queue = Queue()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, queue, checkpoint_prefix,
                      checkpoint_path, checkpoint_frequency)
        for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for _, actor in enumerate(actors):
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    try:
        ray.get(wait_load)
    except Exception:
        _shutdown(actors, queue, force=True)
        raise

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start tracker
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    callback_returns = [list() for _ in range(len(actors))]
    try:
        not_ready = fut
        while not_ready:
            if queue:
                while not queue.empty():
                    (actor_rank, item) = queue.get()
                    if isinstance(item, Callable):
                        item()
                    else:
                        callback_returns[actor_rank].append(item)
            ready, not_ready = ray.wait(not_ready, timeout=0)
            logger.debug("[RayXGBoost] Waiting for results...")
            ray.get(ready)
        # Once everything is ready
        ray.get(fut)
    # The inner loop should catch all exceptions
    except Exception:
        _shutdown(remote_workers=actors, queue=queue, force=True)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]
    additional_results = {}

    if callback_returns:
        additional_results["callback_returns"] = callback_returns

    all_res = ray.get(fut)
    total_n = sum(res["train_n"] or 0 for res in all_res)

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    _shutdown(remote_workers=actors, queue=queue, force=False)

    return bst, evals_result, additional_results
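
Example #17 extends #16 with a callback queue and a three-tuple return; a call-site sketch (dataset and parameters are placeholders):

bst, evals_result, additional = _train(
    {"objective": "reg:squarederror"},       # illustrative params
    RayDMatrix("train.parquet", label="y"),  # placeholder dataset
    num_actors=2,
)
print(additional.get("callback_returns"))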