예제 #1
0
def start_connected_cluster():
    """Yield a running, driver-connected Ray cluster; tear it down after."""
    cluster = Cluster(initialize_head=True, connect=True)
    yield cluster
    # Teardown: disconnect the driver, then stop all cluster processes.
    ray.shutdown()
    cluster.shutdown()
예제 #2
0
def create_cluster(num_nodes):
    """Build a `num_nodes`-node cluster and connect the driver to it.

    Each node advertises a custom resource named after its index.
    """
    cluster = Cluster()
    for node_index in range(num_nodes):
        cluster.add_node(
            resources={str(node_index): 100}, object_store_memory=10**9)

    ray.init(redis_address=cluster.redis_address)
    return cluster
예제 #3
0
def ray_start_reconstruction(request):
    """Start a `request.param`-node cluster whose 1 GB object store is split
    evenly across nodes; yield (total memory, node count, cluster)."""
    num_nodes = request.param

    plasma_store_memory = 10**9

    # Per-node settings shared by the head node and every worker node.
    per_node_store = plasma_store_memory // num_nodes
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200
    })

    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": per_node_store,
            "redis_max_memory": 10**7,
            "redirect_output": True,
            "_internal_config": internal_config,
        })
    for _ in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=per_node_store,
            redirect_output=True,
            _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()
예제 #4
0
def ray_start_empty_cluster():
    """Yield a cluster with no nodes; the caller adds nodes as needed."""
    empty_cluster = Cluster()
    yield empty_cluster

    # Teardown: disconnect any driver, then stop all cluster processes.
    ray.shutdown()
    empty_cluster.shutdown()
예제 #5
0
def ray_start_empty_cluster():
    """Fixture-style generator: yield a node-less cluster, clean up after."""
    c = Cluster()
    yield c

    # Runs once the consuming test has finished.
    ray.shutdown()
    c.shutdown()
예제 #6
0
    def test_redis_password_cluster(self, password, shutdown_only):
        """A remote task should run on a password-protected multi-node cluster."""

        @ray.remote
        def f():
            return 1

        password_args = {"redis_password": password}
        # One head node plus one worker, both sharing the same password.
        cluster = Cluster(
            initialize_head=True, connect=True, head_node_args=password_args)
        cluster.add_node(**password_args)

        ray.get(f.remote())
예제 #7
0
def cluster_start():
    """Yield a connected one-CPU cluster with a short heartbeat timeout."""
    head_args = {
        "resources": dict(CPU=1),
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # Teardown after the test body finishes.
    ray.shutdown()
    cluster.shutdown()
예제 #8
0
def start_connected_cluster():
    """Yield a connected single-CPU cluster (heartbeat timeout = 10)."""
    head_args = {
        "resources": dict(CPU=1),
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
예제 #9
0
def ray_start_workers_separate_multinode(request):
    """Start `request.param[0]` nodes with `request.param[1]` CPUs each."""
    node_count = request.param[0]
    workers_per_node = request.param[1]
    # Bring up the cluster and connect the driver.
    cluster = Cluster()
    for _ in range(node_count):
        cluster.add_node(num_cpus=workers_per_node)
    ray.init(redis_address=cluster.redis_address)

    yield node_count, workers_per_node
    # Teardown: disconnect the driver and stop every node.
    ray.shutdown()
    cluster.shutdown()
예제 #10
0
def start_connected_longer_cluster():
    """Creates a cluster with a longer timeout."""
    head_args = {
        "resources": dict(CPU=1),
        "_internal_config": json.dumps({"num_heartbeats_timeout": 20}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # Teardown runs once the consuming test is done.
    ray.shutdown()
    cluster.shutdown()
예제 #11
0
    def test_redis_password_cluster(self, password, shutdown_only):
        """Tasks must still execute when Redis requires a password."""

        @ray.remote
        def f():
            return 1

        shared_args = {"redis_password": password}
        cluster = Cluster(
            initialize_head=True, connect=True, head_node_args=shared_args)
        # Add a worker node using the same password settings.
        cluster.add_node(**shared_args)

        ray.get(f.remote())
예제 #12
0
def run(args, parser):
    """Build the experiment spec, start/connect to Ray, and run experiments.

    The spec comes either from a YAML config file (``args.config_file``) or
    from the individual CLI arguments. After validation, Ray is initialized —
    either as an in-process simulated multi-node cluster
    (``args.ray_num_nodes``) or as a regular single instance.

    Args:
        args: Parsed command-line namespace.
        parser: The argument parser, used only to report validation errors.
    """
    if args.config_file:
        with open(args.config_file) as f:
            # NOTE(review): yaml.load without an explicit Loader can execute
            # arbitrary Python tags from an untrusted file; consider
            # yaml.safe_load if the config is not fully trusted.
            experiments = yaml.load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
                # Only convert when resources were actually given (the `and`
                # short-circuits to None/falsy otherwise).
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # Every experiment must name an algorithm ("run") and an environment.
    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        # Simulate a multi-node cluster in a single process.
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            # NOTE(review): "num_cpus"/"num_gpus" are passed inside the
            # `resources` dict here; confirm add_node interprets these as
            # intended rather than as custom resource names.
            cluster.add_node(
                resources={
                    "num_cpus": args.ray_num_cpus or 1,
                    "num_gpus": args.ray_num_gpus or 0,
                },
                object_store_memory=args.ray_object_store_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(redis_address=cluster.redis_address)
    else:
        # Connect to an existing cluster (if redis_address is set) or start
        # a fresh local Ray instance.
        ray.init(
            redis_address=args.redis_address,
            object_store_memory=args.ray_object_store_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)
    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)
예제 #13
0
def test_cluster():
    """Nodes can be added to, and removed from, a head-less cluster."""
    cluster = Cluster(initialize_head=False)
    first = cluster.add_node()
    second = cluster.add_node()
    assert first.all_processes_alive()
    assert second.all_processes_alive()
    # Remove in reverse order of addition.
    cluster.remove_node(second)
    cluster.remove_node(first)
    assert not any(
        node.any_processes_alive() for node in cluster.list_all_nodes())
예제 #14
0
def cluster_start():
    """Yield a connected single-node cluster (1 CPU, heartbeat timeout 10)."""
    head_args = {
        "num_cpus": 1,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # Teardown after the consuming test completes.
    ray.shutdown()
    cluster.shutdown()
예제 #15
0
def start_connected_cluster():
    """Yield a connected 1-CPU cluster with a short heartbeat timeout."""
    head_args = {
        "num_cpus": 1,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
예제 #16
0
def start_connected_longer_cluster():
    """Creates a cluster with a longer timeout."""
    head_args = {
        "num_cpus": 1,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 20}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
예제 #17
0
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""

    head_args = {
        "resources": dict(CPU=0),
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    # Pytest doesn't play nicely with imports
    _register_all()
    yield cluster
    # Teardown runs after the consuming test.
    ray.shutdown()
    cluster.shutdown()
예제 #18
0
def test_cluster():
    """Basic test for adding and removing nodes in cluster."""
    cluster = Cluster(initialize_head=False)
    first, second = cluster.add_node(), cluster.add_node()
    assert first.remaining_processes_alive()
    assert second.remaining_processes_alive()
    # Remove in reverse order of addition.
    cluster.remove_node(second)
    cluster.remove_node(first)
    assert not any(n.any_processes_alive() for n in (first, second))
예제 #19
0
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""

    head_args = {
        "num_cpus": 0,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    # Pytest doesn't play nicely with imports
    _register_all()
    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
예제 #20
0
def ray_start_combination(request):
    """Start `request.param[0]` nodes with 10 CPUs each; yield the params."""
    node_count = request.param[0]
    workers_per_scheduler = request.param[1]
    # Head node first, then the remaining worker nodes.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={"num_cpus": 10, "redis_max_memory": 10**7})
    for _ in range(node_count - 1):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield node_count, workers_per_scheduler
    # Teardown: disconnect the driver and stop the cluster.
    ray.shutdown()
    cluster.shutdown()
예제 #21
0
def ray_initialize_cluster():
    """Bring up a 4-node cluster with 8 CPUs per node; yield None."""
    num_nodes = 4
    num_workers_per_scheduler = 8

    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_workers_per_scheduler,
            _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    yield None

    # Teardown: disconnect and stop every node.
    ray.shutdown()
    cluster.shutdown()
예제 #22
0
def _start_new_cluster():
    """Create and return a connected 1-CPU cluster (heartbeat timeout 10)."""
    head_args = {
        "resources": dict(CPU=1),
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    new_cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    # Pytest doesn't play nicely with imports
    _register_all()
    return new_cluster
예제 #23
0
def test_cluster():
    """Nodes added to a head-less cluster can be removed cleanly."""
    cluster = Cluster(initialize_head=False)
    node_a = cluster.add_node()
    node_b = cluster.add_node()
    assert node_a.all_processes_alive()
    assert node_b.all_processes_alive()
    # Remove in reverse order of addition.
    cluster.remove_node(node_b)
    cluster.remove_node(node_a)
    assert not any(node.any_processes_alive() for node in (node_a, node_b))
예제 #24
0
def ray_start_reconstruction(request):
    """Split a 1 GB object store evenly over `request.param` nodes and
    yield (total plasma memory, node count, cluster)."""
    num_nodes = request.param

    plasma_store_memory = 10**9

    # Identical settings for the head node and each worker node.
    store_per_node = plasma_store_memory // num_nodes
    reconstruction_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200
    })

    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": store_per_node,
            "redis_max_memory": 10**7,
            "redirect_output": True,
            "_internal_config": reconstruction_config,
        })
    for _ in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=store_per_node,
            redirect_output=True,
            _internal_config=reconstruction_config)
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()
예제 #25
0
def ray_start_workers_separate_multinode(request):
    """Start a multi-node cluster; yield (node count, workers per node)."""
    num_nodes = request.param[0]
    num_initial_workers = request.param[1]
    # Launch every node, then connect the driver.
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_initial_workers
    # Teardown code runs after the yield.
    ray.shutdown()
    cluster.shutdown()
예제 #26
0
def ray_start_two_nodes():
    """Yield a two-node, zero-CPU cluster with a long heartbeat timeout."""
    internal_config = json.dumps({"num_heartbeats_timeout": 40})
    cluster = Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=0, _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
예제 #27
0
def ray_start_cluster():
    """Yield a connected cluster of 1 head + 4 workers, 8 CPUs per node."""
    node_args = {
        "resources": dict(CPU=8),
        "_internal_config": json.dumps({
            "initial_reconstruction_timeout_milliseconds": 1000,
            "num_heartbeats_timeout": 10
        })
    }
    # Start with 4 worker nodes and 8 cores each.
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    workers = [cluster.add_node(**node_args) for _ in range(4)]
    # Block until every node has registered.
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()
예제 #28
0
def ray_start_combination(request):
    """Start `request.param[0]` 10-CPU nodes; yield params plus the cluster."""
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Head node first, then the remaining worker nodes.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={"num_cpus": 10, "redis_max_memory": 10**7})
    for _ in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_workers_per_scheduler, cluster
    # Teardown code runs after the yield.
    ray.shutdown()
    cluster.shutdown()
예제 #29
0
def ray_start_cluster():
    """Yield a connected cluster: head + 3 workers, 4 CPUs per node."""
    node_args = {
        "num_cpus": 4,
        "_internal_config": json.dumps({
            "initial_reconstruction_timeout_milliseconds": 1000,
            "num_heartbeats_timeout": 10
        })
    }
    # Start with 3 worker nodes and 4 cores each.
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    workers = [cluster.add_node(**node_args) for _ in range(3)]
    # Wait until every node has registered before handing off.
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()
예제 #30
0
def ray_start_cluster():
    """Yield a connected 5-node cluster (head + 4 workers), 8 CPUs each."""
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10
    })
    node_args = {"num_cpus": 8, "_internal_config": internal_config}
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    worker_nodes = []
    for _ in range(4):
        worker_nodes.append(cluster.add_node(**node_args))
    # Block until every node has registered.
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()
예제 #31
0
def ray_initialize_cluster():
    """Yield a 4-node cluster with 8 CPUs per node; clean up afterwards."""
    num_nodes = 4
    num_workers_per_scheduler = 8

    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_workers_per_scheduler,
            _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    yield cluster

    # Teardown: disconnect the driver and stop every node.
    ray.shutdown()
    cluster.shutdown()
예제 #32
0
def test_shutdown():
    """Cluster.shutdown must stop the processes of every node."""
    cluster = Cluster(initialize_head=False)
    nodes = [cluster.add_node(), cluster.add_node()]
    cluster.shutdown()
    assert not any(node.any_processes_alive() for node in nodes)
예제 #33
0
def run(args, parser):
    """Build the experiment spec, attach an episode-end callback, start or
    connect to Ray, and run the experiments.

    The spec comes either from a YAML config file (``args.config_file``)
    or from the individual CLI arguments.

    Args:
        args: Parsed command-line namespace.
        parser: The argument parser, used only to report validation errors.
    """
    if args.config_file:
        with open(args.config_file) as f:
            # NOTE(review): yaml.load without an explicit Loader can execute
            # arbitrary Python tags; consider yaml.safe_load for untrusted
            # config files.
            experiments = yaml.load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
                # Only convert when resources were actually given (the `and`
                # short-circuits to a falsy value otherwise).
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # NOTE(review): the four constants below appear unused in this function;
    # confirm whether they are dead code.
    # The default maximum number of bytes to allocate to the object store unless
    # overridden by the user.
    DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = 20 * 10**9
    # The smallest cap on the memory used by the object store that we allow.
    OBJECT_STORE_MINIMUM_MEMORY_BYTES = 10**7
    # The default maximum number of bytes that the non-primary Redis shards are
    # allowed to use unless overridden by the user.
    DEFAULT_REDIS_MAX_MEMORY_BYTES = 10**10
    # The smallest cap on the memory used by Redis that we allow.
    REDIS_MINIMUM_MEMORY_BYTES = 10**7

    def on_episode_end(info):
        # Record the episode's capital return as a custom metric when the
        # unwrapped env exposes `capital`/`initial_funds` attributes.
        episode = info["episode"]
        env = info['env'].get_unwrapped()[0]
        if hasattr(env, 'capital'):
            capital_return = (env.capital -
                              env.initial_funds) / env.initial_funds
            episode.custom_metrics['capital_return'] = capital_return

    # Attach the callback to the first (and presumably only) experiment.
    key = list(experiments.keys())[0]
    experiments[key]["config"]["callbacks"] = {
        "on_episode_end": tune.function(on_episode_end)
    }

    # Every experiment must name an algorithm ("run") and an environment.
    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        # Simulate a multi-node cluster in a single process.
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            # NOTE(review): "num_cpus"/"num_gpus" are passed inside the
            # `resources` dict; verify add_node treats these as intended
            # rather than as custom resource names.
            cluster.add_node(resources={
                "num_cpus": args.ray_num_cpus or 1,
                "num_gpus": args.ray_num_gpus or 0,
            },
                             object_store_memory=args.ray_object_store_memory,
                             redis_max_memory=args.ray_redis_max_memory)
        ray.init(redis_address=cluster.redis_address)
    else:
        print('init')
        # Connect to an existing cluster or start a local instance with
        # hard-coded 0.5 GB object-store and Redis memory caps.
        ray.init(redis_address=args.redis_address,
                 object_store_memory=int(0.5 * 10**9),
                 redis_max_memory=int(0.5 * 10**9),
                 num_cpus=args.ray_num_cpus,
                 num_gpus=args.ray_num_gpus)
    run_experiments(experiments,
                    scheduler=_make_scheduler(args),
                    queue_trials=args.queue_trials,
                    resume=args.resume)
예제 #34
0
def test_shutdown():
    """After shutdown, no node may have any processes left alive."""
    cluster = Cluster(initialize_head=False)
    node_a = cluster.add_node()
    node_b = cluster.add_node()
    cluster.shutdown()
    assert not any(n.any_processes_alive() for n in (node_a, node_b))