Exemplos de Cluster em Python, exemplos de ray.tests.cluster_utils.Cluster em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: test_object_manager.py Projeto: robertnishihara/ray

def create_cluster(num_nodes):
    """Build a ``Cluster`` with ``num_nodes`` nodes and connect Ray to it.

    Each node is added with a custom resource whose name is the node's index
    as a string (``"0"``, ``"1"``, ...) with quantity 100, and with
    ``object_store_memory=10**9``.

    Args:
        num_nodes: Number of nodes to add.

    Returns:
        The ``Cluster`` whose ``redis_address`` was passed to ``ray.init``.
    """
    new_cluster = Cluster()
    for resource_name in map(str, range(num_nodes)):
        new_cluster.add_node(
            resources={resource_name: 100},
            object_store_memory=10**9,
        )

    ray.init(redis_address=new_cluster.redis_address)
    return new_cluster

Exemplo n.º 2

0

Exibir arquivo

Arquivo: test_stress.py Projeto: robertnishihara/ray

def ray_start_reconstruction(request):
    """Generator fixture: start a multi-node cluster for reconstruction tests.

    ``request.param`` is read as the total number of nodes. The head node and
    each of the ``num_nodes - 1`` added nodes get one CPU and an equal integer
    share of ``plasma_store_memory``, and all of them receive the same
    serialized ``_internal_config``.

    Yields:
        A ``(plasma_store_memory, num_nodes, cluster)`` tuple.
    """
    num_nodes = request.param
    plasma_store_memory = int(0.5 * 10**9)

    # Values shared by the head node and every added node.
    memory_per_node = plasma_store_memory // num_nodes
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200
    })

    head_node_args = {
        "num_cpus": 1,
        "object_store_memory": memory_per_node,
        "redis_max_memory": 10**7,
        "_internal_config": internal_config,
    }
    cluster = Cluster(initialize_head=True, head_node_args=head_node_args)

    # The head node already counts as one node, so add the remaining ones.
    for _ in range(1, num_nodes):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=memory_per_node,
            _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Teardown: disconnect the driver, then stop the cluster.
    ray.shutdown()
    cluster.shutdown()

Exemplo n.º 3

0

Exibir arquivo

Arquivo: test_node_manager.py Projeto: robertnishihara/ray

def ray_start_empty_cluster():
    """Generator fixture yielding a ``Cluster`` built with no arguments.

    No nodes are added and ``ray.init`` is not called here; the consumer is
    expected to do whatever setup it needs on the yielded cluster.
    """
    empty_cluster = Cluster()
    yield empty_cluster

    # Teardown: everything below runs when the generator is resumed.
    ray.shutdown()
    empty_cluster.shutdown()

Exemplo n.º 4

0

Exibir arquivo

Arquivo: test_ray_init.py Projeto: robertnishihara/ray

    def test_redis_password_cluster(self, password, shutdown_only):
        """Run one remote task on a two-node cluster that uses a Redis password.

        ``password`` is forwarded as ``redis_password`` to both the head node
        and the extra node. ``shutdown_only`` is never referenced in the body;
        presumably it is a fixture that handles shutdown -- confirm.
        """
        @ray.remote
        def f():
            return 1

        shared_node_args = dict(redis_password=password)
        cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args=shared_node_args,
        )
        cluster.add_node(**shared_node_args)

        # Fetching the result only succeeds if the task actually ran.
        ray.get(f.remote())

Exemplo n.º 5

0

Exibir arquivo

Arquivo: test_component_failures.py Projeto: robertnishihara/ray

def ray_start_workers_separate_multinode(request):
    """Generator fixture: start ``num_nodes`` nodes and connect Ray to them.

    ``request.param[0]`` is the node count and ``request.param[1]`` is passed
    as ``num_cpus`` for every node.

    Yields:
        A ``(num_nodes, num_initial_workers)`` tuple.
    """
    params = request.param
    num_nodes, num_initial_workers = params[0], params[1]

    # Bring up the Ray processes.
    started_cluster = Cluster()
    for _ in range(num_nodes):
        started_cluster.add_node(num_cpus=num_initial_workers)
    ray.init(redis_address=started_cluster.redis_address)

    yield num_nodes, num_initial_workers

    # Teardown: runs after the consumer is done with the fixture.
    ray.shutdown()
    started_cluster.shutdown()

Exemplo n.º 6

0

Exibir arquivo

Arquivo: test_global_state.py Projeto: robertnishihara/ray

def cluster_start():
    """Generator fixture yielding a cluster with a one-CPU head node.

    The cluster is created with ``initialize_head=True`` and ``connect=True``;
    the head node's ``_internal_config`` sets ``num_heartbeats_timeout`` to 10.
    """
    head_node_args = {
        "num_cpus": 1,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    # Start the Ray processes.
    started_cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)
    yield started_cluster

    # Teardown.
    ray.shutdown()
    started_cluster.shutdown()

Exemplo n.º 7

0

Exibir arquivo

Arquivo: train.py Projeto: robertnishihara/ray

def run(args, parser):
    """Build the experiment spec, start Ray, and launch the experiments.

    Experiments come from the YAML file at ``args.config_file`` when it is
    set; otherwise a single experiment is assembled from the command-line
    arguments. Every experiment must define ``run`` and an ``env`` (either at
    the top level or inside ``config``); otherwise ``parser.error`` is called.

    Ray is started on a locally created ``Cluster`` when ``args.ray_num_nodes``
    is truthy, and through a plain ``ray.init`` call otherwise.

    Args:
        args: Parsed command-line namespace.
        parser: The argument parser, used only to report missing arguments.
    """
    if args.config_file:
        with open(args.config_file) as f:
            # safe_load: calling yaml.load without an explicit Loader is
            # deprecated in PyYAML 5.1+, rejected in 6.0, and can construct
            # arbitrary Python objects from the file.
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # Validate every experiment before any Ray process is started.
    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        # Build a multi-node cluster locally and connect to it.
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(redis_address=cluster.redis_address)
    else:
        ray.init(
            redis_address=args.redis_address,
            object_store_memory=args.ray_object_store_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)
    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: test_multi_node_2.py Projeto: robertnishihara/ray

def start_connected_longer_cluster():
    """Generator fixture: cluster whose head uses a heartbeat timeout of 20.

    The head node gets one CPU and an ``_internal_config`` that sets
    ``num_heartbeats_timeout`` to 20; the cluster is created with
    ``initialize_head=True`` and ``connect=True``.
    """
    head_node_args = {
        "num_cpus": 1,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 20}),
    }
    g = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)
    yield g

    # Teardown: runs after the consumer is done with the fixture.
    ray.shutdown()
    g.shutdown()

Exemplo n.º 9

0

Exibir arquivo

Arquivo: test_multi_node_2.py Projeto: robertnishihara/ray

def start_connected_cluster():
    """Generator fixture: cluster whose head uses a heartbeat timeout of 10.

    The head node gets one CPU and an ``_internal_config`` that sets
    ``num_heartbeats_timeout`` to 10; the cluster is created with
    ``initialize_head=True`` and ``connect=True``.
    """
    internal_config = json.dumps({"num_heartbeats_timeout": 10})
    # Start the Ray processes.
    g = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args=dict(num_cpus=1, _internal_config=internal_config),
    )
    yield g

    # Teardown: runs after the consumer is done with the fixture.
    ray.shutdown()
    g.shutdown()

Exemplo n.º 10

0

Exibir arquivo

Arquivo: test_failure.py Projeto: robertnishihara/ray

def ray_start_two_nodes():
    """Generator fixture: two zero-CPU nodes with Ray connected to them.

    Both nodes are added with ``num_cpus=0`` and an ``_internal_config`` that
    sets ``num_heartbeats_timeout`` to 40.
    """
    node_count = 2
    # Start the Ray processes.
    two_node_cluster = Cluster()
    for _ in range(node_count):
        internal_config = json.dumps({"num_heartbeats_timeout": 40})
        two_node_cluster.add_node(
            num_cpus=0, _internal_config=internal_config)
    ray.init(redis_address=two_node_cluster.redis_address)

    yield two_node_cluster

    # Teardown: runs after the consumer is done with the fixture.
    ray.shutdown()
    two_node_cluster.shutdown()

Exemplo n.º 11

0

Exibir arquivo

Arquivo: test_cluster.py Projeto: robertnishihara/ray

def start_connected_emptyhead_cluster():
    """Generator fixture: cluster whose head node is given zero CPUs.

    The head node's ``_internal_config`` sets ``num_heartbeats_timeout`` to
    10. ``_register_all()`` is called once before the cluster is yielded.
    """
    head_node_args = {
        "num_cpus": 0,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    emptyhead_cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)

    # Pytest doesn't play nicely with imports, so register explicitly here.
    _register_all()
    yield emptyhead_cluster

    # Teardown: runs after the consumer is done with the fixture.
    ray.shutdown()
    emptyhead_cluster.shutdown()

Exemplo n.º 12

0

Exibir arquivo

Arquivo: test_stress.py Projeto: robertnishihara/ray

def ray_start_combination(request):
    """Generator fixture: cluster of ``num_nodes`` nodes with 10 CPUs each.

    ``request.param[0]`` is the total node count (head included) and
    ``request.param[1]`` is passed through untouched to the consumer.

    Yields:
        A ``(num_nodes, num_workers_per_scheduler, cluster)`` tuple.
    """
    params = request.param
    num_nodes, num_workers_per_scheduler = params[0], params[1]

    # Start the Ray processes; the head node is the first of num_nodes.
    head_node_args = {"num_cpus": 10, "redis_max_memory": 10**7}
    cluster = Cluster(initialize_head=True, head_node_args=head_node_args)
    for _ in range(1, num_nodes):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_workers_per_scheduler, cluster

    # Teardown: runs after the consumer is done with the fixture.
    ray.shutdown()
    cluster.shutdown()

Exemplo n.º 13

0

Exibir arquivo

Arquivo: test_component_failures.py Projeto: robertnishihara/ray

def ray_initialize_cluster():
    """Generator fixture: four nodes with ``num_cpus=8`` each, Ray connected.

    Every node is given the same ``_internal_config``:
    ``initial_reconstruction_timeout_milliseconds`` of 1000 and
    ``num_heartbeats_timeout`` of 10.
    """
    num_nodes = 4
    num_workers_per_scheduler = 8
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })

    started_cluster = Cluster()
    for _ in range(num_nodes):
        started_cluster.add_node(
            num_cpus=num_workers_per_scheduler,
            _internal_config=internal_config)
    ray.init(redis_address=started_cluster.redis_address)

    yield started_cluster

    # Teardown.
    ray.shutdown()
    started_cluster.shutdown()

Exemplo n.º 14

0

Exibir arquivo

Arquivo: test_multi_node_2.py Projeto: robertnishihara/ray

def test_cluster():
    """Add two nodes to a headless cluster, remove them, check they stopped."""
    g = Cluster(initialize_head=False)
    first_node = g.add_node()
    second_node = g.add_node()

    # Both nodes should be running right after being added.
    assert first_node.remaining_processes_alive()
    assert second_node.remaining_processes_alive()

    # Remove in reverse order of creation.
    g.remove_node(second_node)
    g.remove_node(first_node)
    assert all(
        not n.any_processes_alive() for n in (first_node, second_node))

Exemplo n.º 15

0

Exibir arquivo

Arquivo: test_component_failures.py Projeto: robertnishihara/ray

def ray_start_cluster():
    """Generator fixture: head node plus three added nodes, four CPUs each.

    All four nodes share the same arguments, including an ``_internal_config``
    with ``initial_reconstruction_timeout_milliseconds`` of 1000 and
    ``num_heartbeats_timeout`` of 10. ``cluster.wait_for_nodes()`` is called
    before the cluster is yielded.
    """
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10
    })
    node_args = {"num_cpus": 4, "_internal_config": internal_config}

    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    # Keep references to the added nodes for the lifetime of the fixture.
    workers = [cluster.add_node(**node_args) for _ in range(3)]
    cluster.wait_for_nodes()
    yield cluster

    # Teardown.
    ray.shutdown()
    cluster.shutdown()

Exemplo n.º 16

0

Exibir arquivo

Arquivo: test_failure.py Projeto: robertnishihara/ray

def test_connect_with_disconnected_node(shutdown_only):
    """Check which node removals produce a REMOVED_NODE_ERROR.

    Two nodes removed with ``allow_graceful=False`` must each add one error;
    a node removed with ``allow_graceful=True`` must not add a third one, and
    no RAYLET_CONNECTION_ERROR may be reported at the end.
    """
    config = json.dumps({
        "num_heartbeats_timeout": 50,
        "heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(redis_address=cluster.redis_address)
    assert len(relevant_errors(ray_constants.REMOVED_NODE_ERROR)) == 0

    # Each of these nodes is killed by SIGKILL, so ray_monitor marks it dead
    # and the number of removed-node errors grows by one per node.
    for expected_error_count in (1, 2):
        dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
        cluster.remove_node(dead_node, allow_graceful=False)
        wait_for_errors(
            ray_constants.REMOVED_NODE_ERROR, expected_error_count, timeout=2)

    # This node is killed by SIGTERM, so ray_monitor does not mark it again
    # and waiting for a third error has to time out.
    removing_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(removing_node, allow_graceful=True)
    with pytest.raises(Exception, match=('Timing out of wait.')):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)

    # There is no connection error to a dead node.
    assert len(relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)) == 0

Exemplo n.º 17

0

Exibir arquivo

Arquivo: test_multi_node_2.py Projeto: robertnishihara/ray

def test_shutdown():
    """Shutting down the cluster must leave no process alive on its nodes."""
    g = Cluster(initialize_head=False)
    added_nodes = [g.add_node(), g.add_node()]
    g.shutdown()
    assert all(not n.any_processes_alive() for n in added_nodes)

Exemplo n.º 18

0

Exibir arquivo

Arquivo: train.py Projeto: xiangtju/RLRoboticAssembly

def run(args, parser):
    """Build the experiment spec, start Ray, and launch the experiments.

    With ``args.config_file`` set, experiments are read from that YAML file
    and the first experiment gets ``robot_demo_path`` and the episode/training
    callbacks injected into its config. Otherwise a single experiment is
    assembled from the command-line arguments.

    Args:
        args: Parsed command-line namespace.
        parser: The argument parser, used only to report missing arguments.

    Raises:
        ValueError: If ``args.trace`` is set for an experiment whose config
            does not enable ``eager``.
    """
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)

        # Add callbacks for the self-defined metrics and save successful
        # transitions from RL agents (first experiment in the file only).
        experiment_name = next(iter(experiments))
        first_config = experiments[experiment_name]["config"]
        first_config["optimizer"]["robot_demo_path"] = dir_path
        first_config["callbacks"] = {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
            "on_sample_end": on_sample_end,
            "on_train_result": on_train_result,
            "on_postprocess_traj": on_postprocess_traj,
        }
    else:
        # Note: keep this in sync with tune/config_parser.py
        resources_per_trial = (
            args.resources_per_trial and
            resources_to_json(args.resources_per_trial))
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": resources_per_trial,
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not (exp.get("env") or exp.get("config", {}).get("env")):
            parser.error("the following arguments are required: --env")
        if args.eager:
            exp["config"]["eager"] = True
        if args.trace:
            # Tracing is only valid on top of eager mode.
            if not exp["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

    if args.ray_num_nodes:
        # Build a multi-node cluster locally and connect to it.
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=cluster.address)
    else:
        # NOTE: log_to_driver=False could be passed here to silence worker
        # logs; see https://github.com/ray-project/ray/issues/5048
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)

Exemplo n.º 19

0

Exibir arquivo

def test_connect_with_disconnected_node(shutdown_only):
    """Verify removed-node errors after forced and graceful node removal.

    Forced removals (``allow_graceful=False``) are expected to raise the
    REMOVED_NODE_ERROR count to 1 and then 2; a graceful removal must not
    produce a third error, and no RAYLET_CONNECTION_ERROR may appear.
    """
    removed_error = ray_constants.REMOVED_NODE_ERROR
    config = json.dumps({
        "num_heartbeats_timeout": 50,
        "heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(redis_address=cluster.redis_address)
    info = relevant_errors(removed_error)
    assert len(info) == 0

    # Nodes killed by SIGKILL: ray_monitor marks each one as dead, so the
    # error count increases once per removal.
    for errors_so_far in range(1, 3):
        dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
        cluster.remove_node(dead_node, allow_graceful=False)
        wait_for_errors(removed_error, errors_so_far, timeout=2)

    # Node killed by SIGTERM: ray_monitor will not mark it again, so the wait
    # for a third error is expected to time out.
    removing_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(removing_node, allow_graceful=True)
    with pytest.raises(Exception, match=('Timing out of wait.')):
        wait_for_errors(removed_error, 3, timeout=2)

    # There is no connection error to a dead node.
    info = relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)
    assert len(info) == 0

Exemplo n.º 20

0

Exibir arquivo

Arquivo: test_multi_node_2.py Projeto: pangfd/ray-1

def test_shutdown():
    """After ``Cluster.shutdown()`` neither added node has live processes."""
    g = Cluster(initialize_head=False)
    first_node = g.add_node()
    second_node = g.add_node()
    g.shutdown()
    assert all(
        not n.any_processes_alive() for n in (first_node, second_node))

Exemplo n.º 21

0

Exibir arquivo

Arquivo: pbt.py Projeto: ray1201/ray-1

from ray.tune.schedulers import PopulationBasedTraining
from ray.tests.cluster_utils import Cluster

# Workload sizing knobs.
num_redis_shards = 5
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 3

# Refuse to start unless the configured Redis and object-store memory fits in
# half of the system memory reported by Ray.
message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
assert (num_redis_shards * redis_max_memory + num_nodes * object_store_memory
        < ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.

cluster = Cluster()
for i in range(num_nodes):
    # Only the first node is given the Redis port and shard count.
    is_first_node = i == 0
    cluster.add_node(
        redis_port=6379 if is_first_node else None,
        num_redis_shards=num_redis_shards if is_first_node else None,
        num_cpus=10,
        num_gpus=0,
        resources={str(i): 2},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory,
    )
ray.init(address=cluster.address)

# Run the workload.

pbt = PopulationBasedTraining(time_attr="training_iteration",
                              metric="episode_reward_mean",
                              mode="max",

Exemplo n.º 22

0

Exibir arquivo

Arquivo: impala.py Projeto: robertnishihara/ray

from ray.tune import run_experiments
from ray.tests.cluster_utils import Cluster

# Workload sizing knobs.
num_redis_shards = 5
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 1

# Refuse to start unless the configured Redis and object-store memory fits in
# half of the system memory reported by Ray.
message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
assert (num_redis_shards * redis_max_memory + num_nodes * object_store_memory
        < ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.

cluster = Cluster()
for i in range(num_nodes):
    # Only the first node is given the Redis port and shard count.
    is_first_node = i == 0
    cluster.add_node(
        redis_port=6379 if is_first_node else None,
        num_redis_shards=num_redis_shards if is_first_node else None,
        num_cpus=10,
        num_gpus=0,
        resources={str(i): 2},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory,
    )
ray.init(redis_address=cluster.redis_address)

# Run the workload.

run_experiments({
    "impala": {

Exemplo n.º 23

0

Exibir arquivo

Arquivo: node_failures.py Projeto: blockspacer/ray-2

import ray
from ray.tests.cluster_utils import Cluster

# Workload sizing knobs.
num_redis_shards = 5
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 10

# Refuse to start unless the configured Redis and object-store memory fits in
# half of the system memory reported by Ray.
message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
assert (num_redis_shards * redis_max_memory + num_nodes * object_store_memory
        < ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.

cluster = Cluster()
for i in range(num_nodes):
    # Only the first node is given the Redis port and shard count.
    is_first_node = i == 0
    cluster.add_node(
        redis_port=6379 if is_first_node else None,
        num_redis_shards=num_redis_shards if is_first_node else None,
        num_cpus=2,
        num_gpus=0,
        resources={str(i): 2},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory,
    )
ray.init(redis_address=cluster.redis_address)

# Run the workload.


@ray.remote
def f(*xs):

Exemplo n.º 24

0

Exibir arquivo

Arquivo: train.py Projeto: noncomposmentis/Arena-Baselines

def run(args, parser):
    """Build the Arena experiments, start Ray, then evaluate or train.

    Experiments are loaded from the YAML file at ``args.config_file`` when it
    is set, otherwise created from ``args``; both are then converted with
    ``create_arena_exps``. Ray is started on a locally created ``Cluster``
    when ``args.ray_num_nodes`` is truthy, and through a plain ``ray.init``
    call otherwise.

    With ``args.eval`` set, one experiment is selected (interactively if there
    are several), a ``RolloutWorker`` is built and timed on one sample, and
    after user confirmation a result matrix over the chosen checkpoints is
    computed and visualized. Without it, the experiments are handed to
    ``run_experiments``.

    Args:
        args: Parsed command-line namespace.
        parser: The argument parser, forwarded to ``create_arena_exps``.

    Raises:
        ValueError: If ``args.eval`` is set but there is no experiment.
        SystemExit: If the user declines the evaluation confirmation.
    """
    # create exps from configs
    if args.config_file:
        # load configs from yaml
        with open(args.config_file) as f:
            exps = yaml.safe_load(f)
    else:
        exps = create_exps(
            args=args,
        )

    arena_exps = create_arena_exps(
        exps=exps,
        args=args,
        parser=parser,
    )

    # config ray cluster
    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory,
            )
        ray.init(
            address=cluster.redis_address,
        )
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
        )

    num_arena_exps = len(arena_exps.keys())

    if num_arena_exps > 1:
        logger.warning(
            "There are multiple experiments scheduled, ray==0.7.4 will run them one by one, instead of cocurrently. "
            "However, recent ray can run them cocurrently. But the recent ray has failed our test (the rllib is broken)"
            "This is mainly due to there are grid search used in configs that is not supported by original rllib. "
        )

    if not args.eval:
        # Training path: hand every experiment to tune.
        run_experiments(
            arena_exps,
            scheduler=_make_scheduler(args),
            queue_trials=args.queue_trials,
            resume=args.resume,
        )
        return

    # evaluate policies

    if num_arena_exps < 1:
        raise ValueError("No experiment is available to evaluate.")

    if num_arena_exps > 1:
        # Several experiments: ask which one to evaluate.
        arena_exp_key = inquire_select(
            choices=list(arena_exps.keys()),
            key="arena_exp_key",
        )
    else:
        # if there is just one arena_exps
        arena_exp_key = list(arena_exps.keys())[0]

    logger.info("Evaluating arena_exp_key: {}".format(
        arena_exp_key,
    ))

    arena_exp = arena_exps[arena_exp_key]

    answers = prompt(
        [{
            'type': 'input',
            'name': 'eval_log_path',
            'message': 'Where do you want to log the results of this evaluation?',
            'default': '../eval_log_path/'
        }],
        style=custom_style_2,
    )

    prepare_path(answers['eval_log_path'])

    from ray.rllib.evaluation.rollout_worker import RolloutWorker

    # worker = ArenaRolloutWorker(
    # TODO: RolloutWorker does not support monitor for multi-agent envs
    worker = RolloutWorker(
        env_creator=lambda _: ArenaRllibEnv(
            env=arena_exp["env"],
            env_config=arena_exp["config"]["env_config"],
        ),
        policy=arena_exp["config"]["multiagent"]["policies"],
        policy_mapping_fn=arena_exp["config"]["multiagent"]["policy_mapping_fn"],
        batch_mode="complete_episodes",
        batch_steps=500,
        num_envs=1,
        monitor_path=answers['eval_log_path'],
    )

    # Time one sample so the total evaluation cost can be estimated below.
    logger.info("Testing worker...")
    sample_start = time.time()
    worker.sample()
    sample_time = time.time() - sample_start
    logger.info("Finish testing worker.")

    policy_ids = list(worker.policy_map.keys())

    checkpoints = inquire_checkpoints(
        local_dir=arena_exp["local_dir"],
        policy_ids=policy_ids,
    )

    checkpoint_paths = checkpoints_2_checkpoint_paths(checkpoints)

    # Number of checkpoints selected per policy; their product is the number
    # of samplings to run.
    num_checkpoint_paths = {
        policy_id: len(checkpoint_paths_per_policy_id)
        for policy_id, checkpoint_paths_per_policy_id
        in checkpoint_paths.items()
    }

    num_sampling = np.prod(list(num_checkpoint_paths.values()))

    confirm = inquire_confirm("You have scheduled {} sampling, each sampling will take {} minutes, which means {} hours in total.".format(
        num_sampling,
        sample_time / 60.0,
        num_sampling * sample_time / 60.0 / 60.0,
    ))
    if not confirm:
        # The os module has no exit(); sys.exit() is the supported way to
        # stop here.
        import sys
        sys.exit()

    result_matrix = run_result_matrix(
        checkpoint_paths=checkpoint_paths,
        worker=worker,
    )

    result_matrix = np.asarray(result_matrix)

    vis_result_matrix(
        result_matrix=result_matrix,
        log_path=answers['eval_log_path'],
    )