import json
import os
import unittest

import pytest

import ray
# In older Ray releases this class lived at ray.tests.cluster_utils.
from ray.cluster_utils import Cluster


def ray_start_empty_cluster():
    cluster = Cluster()
    yield cluster

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def ray_start_reconstruction(request):
    num_nodes = request.param

    plasma_store_memory = int(0.5 * 10**9)

    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": plasma_store_memory // num_nodes,
            "redis_max_memory": 10**7,
            "_internal_config": json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            })
        })
    for _ in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=plasma_store_memory // num_nodes,
            _internal_config=json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            }))
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()

def ray_start_workers_separate_multinode(request):
    num_nodes = request.param[0]
    num_initial_workers = request.param[1]
    # Start the Ray processes.
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_initial_workers

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

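# A minimal sketch (an assumption, not part of the original suite) of how a
# `request.param`-driven fixture like the one above is typically fed its
# parameters: register it with `@pytest.fixture` and pass values through
# `indirect=True`, so pytest delivers them on `request.param`.
@pytest.mark.parametrize(
    "ray_start_workers_separate_multinode", [(2, 4)], indirect=True)
def test_two_nodes_four_workers(ray_start_workers_separate_multinode):
    num_nodes, num_initial_workers = ray_start_workers_separate_multinode
    assert num_nodes == 2
    assert num_initial_workers == 4
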
def cluster_start():
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    yield cluster
    ray.shutdown()
    cluster.shutdown()

def start_connected_cluster():
    # Start the Ray processes.
    g = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    yield g
    # The code after the yield will run as teardown code.
    ray.shutdown()
    g.shutdown()

def ray_start_two_nodes():
    # Start the Ray processes.
    cluster = Cluster()
    for _ in range(2):
        cluster.add_node(
            num_cpus=0,
            _internal_config=json.dumps({
                "num_heartbeats_timeout": 40
            }))
    ray.init(redis_address=cluster.redis_address)

    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def start_connected_longer_cluster():
    """Creates a cluster with a longer timeout."""
    g = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 20
            })
        })
    yield g
    # The code after the yield will run as teardown code.
    ray.shutdown()
    g.shutdown()

def ray_start_cluster():
    node_args = {
        "num_cpus": 4,
        "_internal_config": json.dumps({
            "initial_reconstruction_timeout_milliseconds": 1000,
            "num_heartbeats_timeout": 10
        })
    }
    # Start a head node plus 3 worker nodes, each with 4 CPUs.
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    workers = []
    for _ in range(3):
        workers.append(cluster.add_node(**node_args))
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()

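# A minimal sketch (hypothetical test; it assumes `ray_start_cluster` above is
# registered with `@pytest.fixture`) showing how the fixture is consumed. The
# remote function is illustrative.
def test_simple_task_on_cluster(ray_start_cluster):
    @ray.remote
    def f():
        return 1

    # The fixture already connected the driver (connect=True), so remote
    # calls work directly.
    assert ray.get(f.remote()) == 1
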
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 0,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    # Pytest doesn't play nicely with imports
    _register_all()
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def ray_start_combination(request):
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 10,
            "redis_max_memory": 10**7
        })
    for _ in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_workers_per_scheduler, cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def test_temp_plasma_store_socket():
    ray.init(plasma_store_socket_name="/tmp/i_am_a_temp_socket")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket"), "Specified socket path not found."
    ray.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket")
    except OSError:
        pass  # It could have been removed by Ray.

    cluster = Cluster(initialize_head=True)
    cluster.add_node(plasma_store_socket_name="/tmp/i_am_a_temp_socket_2")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket_2"), "Specified socket path not found."
    cluster.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket_2")
    except OSError:
        pass  # It could have been removed by Ray.

def ray_initialize_cluster():
    # Start with 4 nodes, each with 8 CPUs.
    num_nodes = 4
    num_workers_per_scheduler = 8

    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_workers_per_scheduler,
            _internal_config=json.dumps({
                "initial_reconstruction_timeout_milliseconds": 1000,
                "num_heartbeats_timeout": 10,
            }))
    ray.init(redis_address=cluster.redis_address)

    yield cluster

    ray.shutdown()
    cluster.shutdown()

def _ray_start_cluster(**kwargs):
    init_kwargs = get_default_fixture_ray_kwargs()
    num_nodes = 0
    do_init = False
    # num_nodes & do_init are not arguments for ray.init, so delete them.
    if "num_nodes" in kwargs:
        num_nodes = kwargs["num_nodes"]
        del kwargs["num_nodes"]
    if "do_init" in kwargs:
        do_init = kwargs["do_init"]
        del kwargs["do_init"]
    elif num_nodes > 0:
        do_init = True
    init_kwargs.update(kwargs)
    cluster = Cluster()
    remote_nodes = []
    for _ in range(num_nodes):
        remote_nodes.append(cluster.add_node(**init_kwargs))
    if do_init:
        ray.init(redis_address=cluster.redis_address)
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

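# A hypothetical sketch of how `_ray_start_cluster` might be exposed to tests;
# the fixture name is an assumption. `request.param` lets individual tests
# override the defaults via indirect parametrization.
@pytest.fixture
def ray_start_cluster_2_nodes(request):
    param = getattr(request, "param", {})
    for cluster in _ray_start_cluster(num_nodes=2, **param):
        yield cluster
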
def test_shutdown():
    g = Cluster(initialize_head=False)
    node = g.add_node()
    node2 = g.add_node()
    g.shutdown()
    assert not any(n.any_processes_alive() for n in [node, node2])

class RayExecutor(Executor):
    def __init__(self, **kwargs):
        super(RayExecutor, self).__init__()
        mode = kwargs.get('ray_mode', 'local')
        self.resource_idx = 0
        self.cluster = None
        if mode == 'local':
            node_kwargs = {
                'num_cpus': 4,
                'object_store_memory': 10**9,
                'resources': {
                    'Node_0': 100
                }
            }
            self.cluster = Cluster(
                initialize_head=True, head_node_args=node_kwargs)
            self.num_nodes = kwargs.get('ray_num_nodes', 4)
            self.nodes = []
            self.resources = []
            # Give each worker node a unique Node_i custom resource so tasks
            # can be pinned to a specific node.
            for i in range(1, self.num_nodes + 1):
                node, resource = self._create_local_node(i, node_kwargs)
                self.nodes.append(node)
                self.resources.append(resource)
            ray.init(redis_address=self.cluster.redis_address)
        else:
            redis_address = kwargs.get('redis_address', '127.0.0.1')
            ray.init(redis_address=redis_address)
            self.resources = []
            self.nodes = ray.global_state.client_table()
            for node in self.nodes:
                for resource in node['Resources']:
                    if 'Node' in resource and resource != 'Node_0':
                        self.resources.append(resource)
            self.num_nodes = len(self.resources)

    def __del__(self):
        # Only tear down the local cluster if we started one.
        if self.cluster is not None:
            self.cluster.shutdown()
        ray.shutdown()

    def _create_local_node(self, i, node_kwargs):
        resource = 'Node_{}'.format(i)
        node_kwargs['resources'] = {resource: 100}
        node = self.cluster.add_node(**node_kwargs)
        return node, resource

    def get_next_resource(self):
        # Hand out the per-node resources round-robin.
        resource = self.resources[self.resource_idx % len(self.resources)]
        self.resource_idx += 1
        return resource

    def exec(self, dag):
        num_stages = len(dag)
        actors = []
        task_handles = []
        for i in range(num_stages):
            stage = dag.pop()
            for operator in stage:
                # Pin each operator actor to a node via its custom resource.
                actor = OperatorActor._remote(
                    args=[operator],
                    kwargs={},
                    resources={self.get_next_resource(): 1})
                actors.append(actor)
                task_handles.append(actor.run.remote())
        ray.get(task_handles)

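# A minimal sketch (hypothetical usage) of the executor above in local mode.
# `get_next_resource` cycles through the per-node custom resources, which
# `exec` uses to spread OperatorActor instances across distinct nodes.
executor = RayExecutor(ray_mode='local', ray_num_nodes=2)
assert executor.get_next_resource() == 'Node_1'
assert executor.get_next_resource() == 'Node_2'
assert executor.get_next_resource() == 'Node_1'  # wraps around
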
class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.trial_executor = RayTrialExecutor(
            queue_trials=True, refresh_period=0)
        self.cluster = Cluster(
            initialize_head=True,
            connect=True,
            head_node_args={
                "num_cpus": 1,
                "_internal_config": json.dumps({
                    "num_heartbeats_timeout": 10
                })
            })
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that a trial can be queued when resources are unavailable."""

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources(cpu_only.resources))
        self.trial_executor.start_trial(cpu_only)

        # With queue_trials=True, has_resources reports True for a GPU trial
        # even though the cluster has no GPUs, so the trial can be queued.
        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources(gpu_only.resources))

    def testHeadBlocking(self):
        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources(gpu_trial.resources))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        # trials with different resource requirements is not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources(cpu_only_trial.resources))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources(cpu_only_trial.resources))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources(cpu_only_trial2.resources))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources(cpu_only_trial3.resources))