Example #1
def driver_0(redis_address, driver_index):
    """The script for driver 0.

    This driver should create five actors that each use one GPU and some actors
    that use no GPUs. After a while, it should exit.
    """
    ray.init(redis_address=redis_address)

    # Wait for all the nodes to join the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Start some long-running tasks. Driver 2 will make sure the workers
    # running these tasks have been killed.
    for i in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, i, redis_address)

    # Create some actors that require one GPU.
    actors_one_gpu = [Actor1.remote(driver_index, i, redis_address)
                      for i in range(5)]
    # Create some actors that don't require any GPUs.
    actors_no_gpus = [Actor0.remote(driver_index, 5 + i, redis_address)
                      for i in range(5)]

    for _ in range(1000):
        ray.get([actor.check_ids.remote() for actor in actors_one_gpu])
        ray.get([actor.check_ids.remote() for actor in actors_no_gpus])

    # Start a long-running method on one actor and make sure this doesn't
    # affect anything.
    actors_no_gpus[0].long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
Example #2
    def testFailedTask(self):
        reload(test_functions)
        ray.init(num_workers=3, driver_mode=ray.SILENT_MODE)

        test_functions.throw_exception_fct1.remote()
        test_functions.throw_exception_fct1.remote()
        wait_for_errors(b"task", 2)
        self.assertEqual(len(relevant_errors(b"task")), 2)
        for task in relevant_errors(b"task"):
            self.assertIn(b"Test function 1 intentionally failed.",
                          task.get(b"message"))

        x = test_functions.throw_exception_fct2.remote()
        try:
            ray.get(x)
        except Exception as e:
            self.assertIn("Test function 2 intentionally failed.", str(e))
        else:
            # ray.get should throw an exception.
            self.assertTrue(False)

        x, y, z = test_functions.throw_exception_fct3.remote(1.0)
        for ref in [x, y, z]:
            try:
                ray.get(ref)
            except Exception as e:
                self.assertIn("Test function 3 intentionally failed.", str(e))
            else:
                # ray.get should throw an exception.
                self.assertTrue(False)
Example #3
def get(object_ids):
    """Get a single or a collection of remote objects from the object store.

    This method is identical to `ray.get` except it adds support for tuples,
    ndarrays and dictionaries.

    Args:
        object_ids: Object ID of the object to get, a list, tuple, or ndarray
            of object IDs to get, or a dict of {key: object ID}.

    Returns:
        A Python object, a list of Python objects or a dict of {key: object}.
    """
    if isinstance(object_ids, (tuple, np.ndarray)):
        return ray.get(list(object_ids))
    elif isinstance(object_ids, dict):
        keys_to_get = [
            k for k, v in object_ids.items() if isinstance(v, ray.ObjectID)
        ]
        ids_to_get = [
            v for k, v in object_ids.items() if isinstance(v, ray.ObjectID)
        ]
        values = ray.get(ids_to_get)

        result = object_ids.copy()
        for key, value in zip(keys_to_get, values):
            result[key] = value
        return result
    else:
        return ray.get(object_ids)
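
A brief usage sketch of this helper (hypothetical values; assumes a running Ray session and the `ray.ObjectID` type checked above):

import ray

ray.init()
# Non-ObjectID values in a dict are passed through unchanged.
print(get({"a": ray.put(1), "b": 2}))   # {'a': 1, 'b': 2}
# Tuples (and ndarrays) of object IDs are fetched like lists.
print(get((ray.put(3), ray.put(4))))    # [3, 4]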
Example #4
  def testCachingReusables(self):
    # Test that we can define reusable variables before the driver is connected.
    def foo_initializer():
      return 1
    def bar_initializer():
      return []
    def bar_reinitializer(bar):
      return []
    ray.reusables.foo = ray.Reusable(foo_initializer)
    ray.reusables.bar = ray.Reusable(bar_initializer, bar_reinitializer)

    @ray.remote
    def use_foo():
      return ray.reusables.foo
    @ray.remote
    def use_bar():
      ray.reusables.bar.append(1)
      return ray.reusables.bar

    ray.init(start_ray_local=True, num_workers=2)

    self.assertEqual(ray.get(use_foo.remote()), 1)
    self.assertEqual(ray.get(use_foo.remote()), 1)
    self.assertEqual(ray.get(use_bar.remote()), [1])
    self.assertEqual(ray.get(use_bar.remote()), [1])

    ray.worker.cleanup()
Example #5
  def testPutGet(self):
    ray.init(start_ray_local=True, num_workers=0)

    for i in range(100):
      value_before = i * 10 ** 6
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    for i in range(100):
      value_before = i * 10 ** 6 * 1.0
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    for i in range(100):
      value_before = "h" * i
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    for i in range(100):
      value_before = [1] * i
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    ray.worker.cleanup()
Example #6
    def testDependencies(self):
        for num_local_schedulers in [1, 4]:
            for num_workers_per_scheduler in [4]:
                num_workers = num_local_schedulers * num_workers_per_scheduler
                ray.worker._init(start_ray_local=True, num_workers=num_workers,
                                 num_local_schedulers=num_local_schedulers,
                                 num_cpus=100)

                @ray.remote
                def f(x):
                    return x

                x = 1
                for _ in range(1000):
                    x = f.remote(x)
                ray.get(x)

                @ray.remote
                def g(*xs):
                    return 1

                xs = [g.remote(1)]
                for _ in range(100):
                    xs.append(g.remote(*xs))
                    xs.append(g.remote(1))
                ray.get(xs)

                self.assertTrue(ray.services.all_processes_alive())
                ray.worker.cleanup()
Example #7
def test_batching_ability(router: DeadlineAwareRouter, now: float):
    first = unwrap(router.call.remote("SleepFirst", 1, now + 1))
    rest = [
        unwrap(router.call.remote("SleepFirst", 1, now + 1)) for _ in range(10)
    ]
    assert ray.get(first) == 1
    assert np.alltrue(np.array(ray.get(rest)) == 10)
Example #8
    def testSimple(self):
        # Define the size of one task's return argument so that the combined
        # sum of all objects' sizes is at least 1.5 times the plasma stores'
        # combined allotted memory.
        num_objects = 1000
        size = int(self.plasma_store_memory * 1.5 / (num_objects * 8))

        # Define a remote task with no dependencies, which returns a numpy
        # array of the given size.
        @ray.remote
        def foo(i, size):
            array = np.zeros(size)
            array[0] = i
            return array

        # Launch num_objects instances of the remote task.
        args = []
        for i in range(num_objects):
            args.append(foo.remote(i, size))

        # Get each value to force each task to finish. After some number of
        # gets, old values should be evicted.
        for i in range(num_objects):
            value = ray.get(args[i])
            self.assertEqual(value[0], i)
        # Get each value again to force reconstruction.
        for i in range(num_objects):
            value = ray.get(args[i])
            self.assertEqual(value[0], i)
        # Get values sequentially, in chunks.
        num_chunks = 4 * self.num_local_schedulers
        chunk = num_objects // num_chunks
        for i in range(num_chunks):
            values = ray.get(args[i * chunk:(i + 1) * chunk])
            del values
Example #9
    def _testWorkerFailed(self, num_local_schedulers):
        @ray.remote
        def f(x):
            time.sleep(0.5)
            return x

        num_initial_workers = 4
        ray.worker._init(num_workers=(num_initial_workers *
                                      num_local_schedulers),
                         num_local_schedulers=num_local_schedulers,
                         start_workers_from_local_scheduler=False,
                         start_ray_local=True,
                         num_cpus=[num_initial_workers] * num_local_schedulers,
                         redirect_output=True)
        # Submit more tasks than there are workers so that all workers and
        # cores are utilized.
        object_ids = [f.remote(i) for i
                      in range(num_initial_workers * num_local_schedulers)]
        object_ids += [f.remote(object_id) for object_id in object_ids]
        # Allow the tasks some time to begin executing.
        time.sleep(0.1)
        # Kill the workers as the tasks execute.
        for worker in (ray.services
                          .all_processes[ray.services.PROCESS_TYPE_WORKER]):
            worker.terminate()
            time.sleep(0.1)
        # Make sure that we can still get the objects after the executing tasks
        # died.
        ray.get(object_ids)
Example #10
def get(object_ids, worker=None):
    """Get a single or a collection of remote objects from the object store.

    This method is identical to `ray.get` except it adds support for tuples,
    ndarrays and dictionaries.

    Args:
        object_ids: Object ID of the object to get, a list, tuple, or ndarray
            of object IDs to get, or a dict of {key: object ID}.

    Returns:
        A Python object, a list of Python objects or a dict of {key: object}.
    """
    # There is a dependency on ray.worker which prevents importing
    # global_worker at the top of this file
    worker = ray.worker.global_worker if worker is None else worker
    if isinstance(object_ids, (tuple, np.ndarray)):
        return ray.get(list(object_ids), worker)
    elif isinstance(object_ids, dict):
        keys_to_get = [
            k for k, v in object_ids.items() if isinstance(v, ray.ObjectID)
        ]
        ids_to_get = [
            v for k, v in object_ids.items() if isinstance(v, ray.ObjectID)
        ]
        values = ray.get(ids_to_get, worker)

        result = object_ids.copy()
        for key, value in zip(keys_to_get, values):
            result[key] = value
        return result
    else:
        return ray.get(object_ids, worker)
Example #11
    def testFailImportingActor(self):
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        # Create the contents of a temporary Python file.
        temporary_python_file = """
def temporary_helper_function():
    return 1
"""

        f = tempfile.NamedTemporaryFile(suffix=".py")
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        module = __import__(module_name)

        # Define an actor that closes over this temporary module. This should
        # fail when it is unpickled.
        @ray.remote
        class Foo(object):
            def __init__(self):
                self.x = module.temporary_python_file()

            def get_val(self):
                return 1

        # There should be no errors yet.
        self.assertEqual(len(ray.error_info()), 0)

        # Create an actor.
        foo = Foo.remote()

        # Wait for the error to arrive.
        wait_for_errors(b"register_actor", 1)
        self.assertIn(b"No module named", ray.error_info()[0][b"message"])

        # Wait for the error from when the __init__ tries to run.
        wait_for_errors(b"task", 1)
        self.assertIn(
            b"failed to be imported, and so cannot execute this method",
            ray.error_info()[1][b"message"])

        # Check that if we try to get the function it throws an exception and
        # does not hang.
        with self.assertRaises(Exception):
            ray.get(foo.get_val.remote())

        # Wait for the error from the call to get_val.
        wait_for_errors(b"task", 2)
        self.assertIn(
            b"failed to be imported, and so cannot execute this method",
            ray.error_info()[2][b"message"])

        f.close()

        # Clean up the junk we added to sys.path.
        sys.path.pop(-1)
Example #12
File: linalg.py Project: amplab/ray
def tsqr_hr(a):
  """Algorithm 6 from http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-175.pdf"""
  q, r_temp = tsqr.remote(a)
  y, u, s = modified_lu.remote(q)
  y_blocked = ray.get(y)
  t, y_top = tsqr_hr_helper1.remote(u, s, y_blocked.objectids[0, 0], a.shape[1])
  r = tsqr_hr_helper2.remote(s, r_temp)
  return ray.get(y), ray.get(t), ray.get(y_top), ray.get(r)
Example #13
 def baseline():
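     # Time only the blocking ray.get calls; task submission is excluded.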
     sum_time = 0.
     for _ in range(50):
         tasks = [f.remote(n) for n in range(20)]
         start = time.time()
         ray.get(tasks)
         sum_time += time.time() - start
     return sum_time
Example #14
def tsqr_hr(a):
    q, r_temp = tsqr.remote(a)
    y, u, s = modified_lu.remote(q)
    y_blocked = ray.get(y)
    t, y_top = tsqr_hr_helper1.remote(u, s, y_blocked.objectids[0, 0],
                                      a.shape[1])
    r = tsqr_hr_helper2.remote(s, r_temp)
    return ray.get(y), ray.get(t), ray.get(y_top), ray.get(r)
Example #15
def test_fail_importing_actor(ray_start_regular):
    # Create the contents of a temporary Python file.
    temporary_python_file = """
def temporary_helper_function():
    return 1
"""

    f = tempfile.NamedTemporaryFile(suffix=".py")
    f.write(temporary_python_file.encode("ascii"))
    f.flush()
    directory = os.path.dirname(f.name)
    # Get the module name and strip ".py" from the end.
    module_name = os.path.basename(f.name)[:-3]
    sys.path.append(directory)
    module = __import__(module_name)

    # Define an actor that closes over this temporary module. This should
    # fail when it is unpickled.
    @ray.remote
    class Foo(object):
        def __init__(self):
            self.x = module.temporary_python_file()

        def get_val(self):
            return 1

    # There should be no errors yet.
    assert len(ray.error_info()) == 0

    # Create an actor.
    foo = Foo.remote()

    # Wait for the error to arrive.
    wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
    errors = relevant_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR)
    assert "No module named" in errors[0]["message"]

    # Wait for the error from when the __init__ tries to run.
    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
    assert ("failed to be imported, and so cannot execute this method" in
            errors[0]["message"])

    # Check that if we try to get the function it throws an exception and
    # does not hang.
    with pytest.raises(Exception):
        ray.get(foo.get_val.remote())

    # Wait for the error from the call to get_val.
    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
    assert ("failed to be imported, and so cannot execute this method" in
            errors[1]["message"])

    f.close()

    # Clean up the junk we added to sys.path.
    sys.path.pop(-1)
Example #16
File: ppo.py Project: adgirish/ray
 def _restore(self, checkpoint_path):
     self.saver.restore(self.local_evaluator.sess, checkpoint_path)
     extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
     self.local_evaluator.restore(extra_data[0])
     self.global_step = extra_data[1]
     self.kl_coeff = extra_data[2]
     ray.get([
         a.restore.remote(o)
             for (a, o) in zip(self.remote_evaluators, extra_data[3])])
Example #17
def worker_task(data_size, read, *parameter_servers):
    while True:
        if read:
            # Get the current value from the parameter server.
            values = ray.get([ps.pull.remote() for ps in parameter_servers])
        else:
            # Push an update to the parameter server.
            ray.get([ps.push.remote(np.zeros(data_size, dtype=np.uint8))
                     for ps in parameter_servers])
Example #18
File: dqn.py Project: adgirish/ray
 def _restore(self, checkpoint_path):
     self.saver.restore(self.local_evaluator.sess, checkpoint_path)
     extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
     self.local_evaluator.restore(extra_data[0])
     ray.get([
         e.restore.remote(d) for (d, e)
         in zip(extra_data[1], self.remote_evaluators)])
     self.optimizer.restore(extra_data[2])
     self.num_target_updates = extra_data[3]
     self.last_target_update_ts = extra_data[4]
Example #19
File: core.py Project: amplab/ray
 def assemble(self):
   """Assemble an array on this node from a distributed array of object IDs."""
   first_block = ray.get(self.objectids[(0,) * self.ndim])
   dtype = first_block.dtype
   result = np.zeros(self.shape, dtype=dtype)
   for index in np.ndindex(*self.num_blocks):
     lower = DistArray.compute_block_lower(index, self.shape)
     upper = DistArray.compute_block_upper(index, self.shape)
     result[[slice(l, u) for (l, u) in zip(lower, upper)]] = ray.get(self.objectids[index])
   return result
Example #20
  def testObjStore(self):
    node_ip_address = "127.0.0.1"
    scheduler_address = ray.services.start_ray_local(num_objstores=2, num_workers=0, worker_path=None)
    ray.connect(node_ip_address, scheduler_address, mode=ray.SCRIPT_MODE)
    objstore_addresses = [objstore_info["address"] for objstore_info in ray.scheduler_info()["objstores"]]
    w1 = ray.worker.Worker()
    w2 = ray.worker.Worker()
    ray.reusables._cached_reusables = [] # This is a hack to make the test run.
    ray.connect(node_ip_address, scheduler_address, objstore_address=objstore_addresses[0], mode=ray.SCRIPT_MODE, worker=w1)
    ray.reusables._cached_reusables = [] # This is a hack to make the test run.
    ray.connect(node_ip_address, scheduler_address, objstore_address=objstore_addresses[1], mode=ray.SCRIPT_MODE, worker=w2)

    for cls in [Foo, Bar, Baz, Qux, SubQux, Exception, CustomError, Point, NamedTupleExample]:
      ray.register_class(cls)

    # putting and getting an object shouldn't change it
    for data in RAY_TEST_OBJECTS:
      objectid = ray.put(data, w1)
      result = ray.get(objectid, w1)
      assert_equal(result, data)

    # putting an object, shipping it to another worker, and getting it shouldn't change it
    for data in RAY_TEST_OBJECTS:
      objectid = ray.put(data, w1)
      result = ray.get(objectid, w2)
      assert_equal(result, data)

    # putting an object, shipping it to another worker, and getting it shouldn't change it
    for data in RAY_TEST_OBJECTS:
      objectid = ray.put(data, w2)
      result = ray.get(objectid, w1)
      assert_equal(result, data)

    # This test fails. See https://github.com/ray-project/ray/issues/159.
    # getting multiple times shouldn't matter
    # for data in [np.zeros([10, 20]), np.random.normal(size=[45, 25]), np.zeros([10, 20], dtype=np.dtype("float64")), np.zeros([10, 20], dtype=np.dtype("float32")), np.zeros([10, 20], dtype=np.dtype("int64")), np.zeros([10, 20], dtype=np.dtype("int32"))]:
    #   objectid = worker.put(data, w1)
    #   result = worker.get(objectid, w2)
    #   result = worker.get(objectid, w2)
    #   result = worker.get(objectid, w2)
    #   assert_equal(result, data)

    # Getting a buffer after modifying it before it finishes should return updated buffer
    objectid = ray.libraylib.get_objectid(w1.handle)
    buf = ray.libraylib.allocate_buffer(w1.handle, objectid, 100)
    buf[0][0] = 1
    ray.libraylib.finish_buffer(w1.handle, objectid, buf[1], 0)
    completedbuffer = ray.libraylib.get_buffer(w1.handle, objectid)
    self.assertEqual(completedbuffer[0][0], 1)

    # We started multiple drivers manually, so we will disconnect them manually.
    ray.disconnect(worker=w1)
    ray.disconnect(worker=w2)
    ray.worker.cleanup()
Example #21
 def ping(self, num_pings):
     children_outputs = []
     for _ in range(num_pings):
         children_outputs += [
             child.ping.remote() for child in self.children
         ]
     try:
         ray.get(children_outputs)
     except Exception:
         # Replace the children if one of them died.
         self.__init__(len(self.children), self.death_probability)
Example #22
def _distributed_sgd_step(actors, ps_list, fetch_stats, write_timeline):
    # Preallocate object ids that actors will write gradient shards to
    grad_shard_oids_list = [[np.random.bytes(20) for _ in ps_list]
                            for _ in actors]
    logger.debug("Generated grad oids")

    # Preallocate object ids that param servers will write new weights to
    accum_shard_ids = [np.random.bytes(20) for _ in ps_list]
    logger.debug("Generated accum oids")

    # Kick off the fused compute grad / update weights tf run for each actor
    losses = []
    for actor, grad_shard_oids in zip(actors, grad_shard_oids_list):
        losses.append(
            actor.ps_compute_apply.remote(
                grad_shard_oids,
                accum_shard_ids,
                write_timeline=write_timeline))
    logger.debug("Launched all ps_compute_applys on all actors")

    # Issue prefetch ops
    for j, (ps, weight_shard_oid) in list(
            enumerate(zip(ps_list, accum_shard_ids)))[::-1]:
        to_fetch = []
        for grad_shard_oids in grad_shard_oids_list:
            to_fetch.append(grad_shard_oids[j])
        random.shuffle(to_fetch)
        ps.prefetch.remote(to_fetch)
    logger.debug("Launched all prefetch ops")

    # Aggregate the gradients produced by the actors. These operations
    # run concurrently with the actor methods above.
    ps_gets = []
    for j, (ps, weight_shard_oid) in list(
            enumerate(zip(ps_list, accum_shard_ids)))[::-1]:
        ps.add_spinwait.remote([gs[j] for gs in grad_shard_oids_list])
        ps_gets.append(ps.get.remote(weight_shard_oid))
    logger.debug("Launched all aggregate ops")

    if write_timeline:
        timelines = [ps.get_timeline.remote() for ps in ps_list]
        logger.debug("Launched timeline gets")
        timelines = ray.get(timelines)
        t0 = timelines[0]
        for t in timelines[1:]:
            t0.merge(t)
        t0.chrome_trace_format("ps_timeline.json")
    else:
        # Wait for at least the ps gets to finish
        ray.get(ps_gets)
    if fetch_stats:
        return {"loss": np.mean(ray.get(losses))}
    else:
        return None
Example #23
    def restore_from_obj(self, obj):
        """Restores runner state from the specified object."""

        if self.runner is None:
            print("Unable to restore - no runner")
        else:
            try:
                ray.get(self.runner.restore_from_object.remote(obj))
            except Exception:
                print("Error restoring runner:", traceback.format_exc())
                self.status = Trial.ERROR
Example #24
def test_recursive(ray_start_reconstruction):
    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
    # Define the size of one task's return argument so that the combined
    # sum of all objects' sizes is at least 1.5 times the plasma stores'
    # combined allotted memory.
    num_objects = 100
    size = int(plasma_store_memory * 1.5 / (num_objects * 8))

    # Define a root task with no dependencies, which returns a numpy array
    # of the given size.
    @ray.remote
    def no_dependency_task(size):
        array = np.zeros(size)
        return array

    # Define a task with a single dependency, which returns its one
    # argument.
    @ray.remote
    def single_dependency(i, arg):
        arg = np.copy(arg)
        arg[0] = i
        return arg

    # Launch num_objects instances of the remote task, each dependent on
    # the one before it.
    arg = no_dependency_task.remote(size)
    args = []
    for i in range(num_objects):
        arg = single_dependency.remote(i, arg)
        args.append(arg)

    # Get each value to force each task to finish. After some number of
    # gets, old values should be evicted.
    for i in range(num_objects):
        value = ray.get(args[i])
        assert value[0] == i
    # Get each value again to force reconstruction.
    for i in range(num_objects):
        value = ray.get(args[i])
        assert value[0] == i
    # Get 10 values randomly.
    random_indexes = sorted_random_indexes(num_objects, 10)
    for i in random_indexes:
        value = ray.get(args[i])
        assert value[0] == i
    # Get values sequentially, in chunks.
    num_chunks = 4 * num_nodes
    chunk = num_objects // num_chunks
    for i in range(num_chunks):
        values = ray.get(args[i * chunk:(i + 1) * chunk])
        del values

    for node in cluster.list_all_nodes():
        assert node.all_processes_alive()
Example #25
  def testReferenceCountFalse(self):
    ray.init(start_ray_local=True, num_workers=1)

    # Make sure that we aren't accidentally messing up Python's reference counts.
    @ray.remote
    def f():
      return sys.getrefcount(False)
    first_count = ray.get(f.remote())
    second_count = ray.get(f.remote())
    self.assertEqual(first_count, second_count)

    ray.worker.cleanup()
Example #26
    def test_redis_password_cluster(self, password, shutdown_only):
        @ray.remote
        def f():
            return 1

        node_args = {"redis_password": password}
        cluster = Cluster(
            initialize_head=True, connect=True, head_node_args=node_args)
        cluster.add_node(**node_args)

        object_id = f.remote()
        ray.get(object_id)
Example #27
def main(config, experiments, num_cpus, num_gpus, redis_address):
  print("config =", config.name)
  print("experiments =", experiments)
  print("num_gpus =", num_gpus)
  print("num_cpus =", num_cpus)
  print("redis_address =", redis_address)

  # Use configuration file location as the project location.
  projectDir = os.path.dirname(config.name)
  projectDir = os.path.abspath(projectDir)
  print("projectDir =", projectDir)

  # Load and parse experiment configurations
  configs = parse_config(config, experiments, globals=globals())

  # Pre-download dataset
  data_dir = os.path.join(projectDir, "data")
  datasets.CIFAR10(data_dir, download=True, train=True)

  # Initialize ray cluster
  if redis_address is not None:
    ray.init(redis_address=redis_address, include_webui=True)
    num_cpus = 1
  else:
    ray.init(num_cpus=num_cpus, num_gpus=num_gpus, local_mode=num_cpus == 1)

  # Run all experiments in parallel
  results = []
  for exp in configs:
    config = configs[exp]
    config["name"] = exp

    # Make sure local directories are relative to the project location
    path = config.get("path", None)
    if path and not os.path.isabs(path):
      config["path"] = os.path.join(projectDir, path)

    data_dir = config.get("data_dir", "data")
    if not os.path.isabs(data_dir):
      config["data_dir"] = os.path.join(projectDir, data_dir)

    # When running multiple hyperparameter searches on different experiments,
    # ray.tune will run one experiment at a time. We use "ray.remote" to run
    # each tune experiment in parallel as a "remote" function and wait until
    # all experiments complete.
    results.append(run_experiment.remote(config, MobileNetTune,
                                         num_cpus=1,
                                         num_gpus=num_gpus / num_cpus))

  # Wait for all experiments to complete
  ray.get(results)

  ray.shutdown()
Example #28
    def test_simple_class(self):
        cls = ray.remote(cyth.simple_class)
        a1 = cls.remote()
        a2 = cls.remote()

        result1 = ray.get(a1.increment.remote())
        result2 = ray.get(a2.increment.remote())
        result3 = ray.get(a2.increment.remote())

        self.assertEqual(result1, 1)
        self.assertEqual(result2, 1)
        self.assertEqual(result3, 2)
Example #29
def test_getting_and_putting(ray_start_sharded):
    for n in range(8):
        x = np.zeros(10**n)

        for _ in range(100):
            ray.put(x)

        x_id = ray.put(x)
        for _ in range(1000):
            ray.get(x_id)

    assert ray.services.remaining_processes_alive()
Example #30
def test_multiple_recursive(ray_start_reconstruction):
    plasma_store_memory, _, cluster = ray_start_reconstruction
    # Define the size of one task's return argument so that the combined
    # sum of all objects' sizes is at least twice the plasma stores'
    # combined allotted memory.
    num_objects = 100
    size = plasma_store_memory * 2 // (num_objects * 8)

    # Define a root task with no dependencies, which returns a numpy array
    # of the given size.
    @ray.remote
    def no_dependency_task(size):
        array = np.zeros(size)
        return array

    # Define a task with multiple dependencies, which returns its first
    # argument.
    @ray.remote
    def multiple_dependency(i, arg1, arg2, arg3):
        arg1 = np.copy(arg1)
        arg1[0] = i
        return arg1

    # Launch num_args instances of the root task. Then launch num_objects
    # instances of the multi-dependency remote task, each dependent on the
    # num_args tasks before it.
    num_args = 3
    args = []
    for i in range(num_args):
        arg = no_dependency_task.remote(size)
        args.append(arg)
    for i in range(num_objects):
        args.append(multiple_dependency.remote(i, *args[i:i + num_args]))

    # Get each value to force each task to finish. After some number of
    # gets, old values should be evicted.
    args = args[num_args:]
    for i in range(num_objects):
        value = ray.get(args[i])
        assert value[0] == i
    # Get each value again to force reconstruction.
    for i in range(num_objects):
        value = ray.get(args[i])
        assert value[0] == i
    # Get 10 values randomly.
    random_indexes = sorted_random_indexes(num_objects, 10)
    for i in random_indexes:
        value = ray.get(args[i])
        assert value[0] == i

    for node in cluster.list_all_nodes():
        assert node.all_processes_alive()
Example #31
def full_loss(theta):
  theta_id = ray.put(theta)
  loss_ids = [actor.loss(theta_id) for actor in actors]
  return sum(ray.get(loss_ids))
Example #32
def full_grad(theta):
  theta_id = ray.put(theta)
  grad_ids = [actor.grad(theta_id) for actor in actors]
  return sum(ray.get(grad_ids)).astype("float64") # This conversion is necessary for use with fmin_l_bfgs_b.
Example #33
 def fork(queue):
     for i in range(10):
         x = queue.enqueue.remote(0, i)
         time.sleep(0.1)
     return ray.get(x)
Example #34
 def name(self):
     return ray.get(self._buffer.name.remote())
Example #35
 def fork(pickled_queue, key, num_items):
     queue = ray.worker.pickle.loads(pickled_queue)
     x = None
     for item in range(num_items):
         x = queue.enqueue.remote(key, item)
     return ray.get(x)
Example #36
 def inc(actor_handle):
     return ray.get(actor_handle.inc.remote())
Example #37
def test_nondeterministic_task(ray_start_reconstruction):
    _, _, plasma_store_memory, num_local_schedulers = ray_start_reconstruction
    # Define the size of one task's return argument so that the combined
    # sum of all objects' sizes is at least twice the plasma stores'
    # combined allotted memory.
    num_objects = 1000
    size = plasma_store_memory * 2 // (num_objects * 8)

    # Define a nondeterministic remote task with no dependencies, which
    # returns a random numpy array of the given size. This task should
    # produce an error on the driver if it is ever reexecuted.
    @ray.remote
    def foo(i, size):
        array = np.random.rand(size)
        array[0] = i
        return array

    # Define a deterministic remote task with no dependencies, which
    # returns a numpy array of zeros of the given size.
    @ray.remote
    def bar(i, size):
        array = np.zeros(size)
        array[0] = i
        return array

    # Launch num_objects instances, half deterministic and half
    # nondeterministic.
    args = []
    for i in range(num_objects):
        if i % 2 == 0:
            args.append(foo.remote(i, size))
        else:
            args.append(bar.remote(i, size))

    # Get each value to force each task to finish. After some number of
    # gets, old values should be evicted.
    for i in range(num_objects):
        value = ray.get(args[i])
        assert value[0] == i
    # Get each value again to force reconstruction.
    for i in range(num_objects):
        value = ray.get(args[i])
        assert value[0] == i

    def error_check(errors):
        if num_local_schedulers == 1:
            # In a single-node setting, each object is evicted and
            # reconstructed exactly once, so exactly half the objects will
            # produce an error during reconstruction.
            min_errors = num_objects // 2
        else:
            # In a multinode setting, each object is evicted zero or one
            # times, so some of the nondeterministic tasks may not be
            # reexecuted.
            min_errors = 1
        return len(errors) >= min_errors

    errors = wait_for_errors(error_check)
    # Make sure all the errors have the correct type.
    assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
               for error in errors)
Example #38
def test_local_clusters():
    """
    This tests the various behaviors of connecting to local clusters:

    * Using `ray.client("local").connect()` should always create a new
      cluster.
    * Using `ray.client().connect()` should create a new cluster if it
      doesn't connect to an existing one.
    * Using `ray.client().connect()` should only connect to a cluster if it
      was created with `ray start --head`, not from a python program.

    It tests whether two calls are in the same cluster by trying to create an
    actor with the same name in the same namespace, which will error and cause
    the script to exit non-zero, raising an exception.
    """
    driver_template = """
import ray
info = ray.client({address}).namespace("").connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(name="abc", lifetime="detached").remote()
ray.get(a.ping.remote())

import time
while True:
    time.sleep(30)

"""
    blocking_local_script = driver_template.format(address="'local'", blocking=True)
    blocking_noaddr_script = driver_template.format(address="", blocking=True)

    # This should start a cluster.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client("local").connect() should start a second cluster.
    p2 = run_string_as_driver_nonblocking(blocking_local_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client("local").connect() so it should create a third one.
    p3 = run_string_as_driver_nonblocking(blocking_noaddr_script)
    # ray.client().connect() shouldn't connect to a cluster started by
    # ray.client().connect() so it should create a fourth one.
    p4 = run_string_as_driver_nonblocking(blocking_noaddr_script)

    wait_for_condition(
        lambda: len(ray._private.services.find_bootstrap_address()) == 4,
        retry_interval_ms=1000,
    )

    p1.kill()
    p2.kill()
    p3.kill()
    p4.kill()
    # Prevent flakiness since fatesharing takes some time.
    subprocess.check_output("ray stop --force", shell=True)

    # Since there's a cluster started with `ray start --head`
    # we should connect to it instead.
    subprocess.check_output("ray start --head", shell=True)
    # The assertion in the driver should cause the script to fail if we start
    # a new cluster instead of connecting.
    run_string_as_driver(
        """
import ray
ray.client().connect()
assert len(ray._private.services.find_bootstrap_address()) == 1
    """
    )
    # ray.client("local").connect() should always create a new cluster even if
    # there's one running.
    p1 = run_string_as_driver_nonblocking(blocking_local_script)
    wait_for_condition(
        lambda: len(ray._private.services.find_bootstrap_address()) == 2,
        retry_interval_ms=1000,
    )
    p1.kill()
    subprocess.check_output("ray stop --force", shell=True)
Example #39
def _kill_routers(client):
    routers = ray.get(client._controller.get_routers.remote())
    for router in routers.values():
        ray.kill(router, no_restart=False)
Example #40
def test_reconstruction_cached_dependency(ray_start_cluster,
                                          reconstruction_enabled):
    config = json.dumps({
        "num_heartbeats_timeout":
        10,
        "raylet_heartbeat_timeout_milliseconds":
        100,
        "lineage_pinning_enabled":
        1 if reconstruction_enabled else 0,
        "free_objects_period_milliseconds":
        -1,
    })
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8,
                                    _internal_config=config)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    cluster.wait_for_nodes()

    @ray.remote(max_retries=0)
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def chain(x):
        return x

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node2": 1}).remote()
    obj = chain.options(resources={"node1": 1}).remote(obj)
    ray.get(dependent_task.options(resources={"node1": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8,
                     _internal_config=config)
    assert wait_for_condition(lambda: not all(node["Alive"]
                                              for node in ray.nodes()),
                              timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    if reconstruction_enabled:
        ray.get(dependent_task.remote(obj))
    else:
        with pytest.raises(ray.exceptions.RayTaskError) as e:
            ray.get(dependent_task.remote(obj))
            with pytest.raises(ray.exceptions.UnreconstructableError):
                raise e.as_instanceof_cause()
Example #41
  return sum(ray.get(grad_ids)).astype("float64") # This conversion is necessary for use with fmin_l_bfgs_b.

if __name__ == "__main__":
  ray.init(redirect_output=True)

  # From the perspective of scipy.optimize.fmin_l_bfgs_b, full_loss is simply a
  # function which takes some parameters theta, and computes a loss. Similarly,
  # full_grad is a function which takes some parameters theta, and computes the
  # gradient of the loss. Internally, these functions use Ray to distribute the
  # computation of the loss and the gradient over the data that is represented
  # by the remote object IDs x_batches and y_batches and which is potentially
  # distributed over a cluster. However, these details are hidden from
  # scipy.optimize.fmin_l_bfgs_b, which simply uses them to run the L-BFGS
  # algorithm.

  # Load the mnist data and turn the data into remote objects.
  print("Downloading the MNIST dataset. This may take a minute.")
  mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
  num_batches = 10
  batch_size = mnist.train.num_examples // num_batches
  batches = [mnist.train.next_batch(batch_size) for _ in range(num_batches)]
  print("Putting MNIST in the object store.")
  actors = [NetActor(xs, ys) for (xs, ys) in batches]
  # Initialize the weights for the network to the vector of all zeros.
  dim = ray.get(actors[0].get_flat_size())
  theta_init = 1e-2 * np.random.normal(size=dim)

  # Use L-BFGS to minimize the loss function.
  print("Running L-BFGS.")
  result = scipy.optimize.fmin_l_bfgs_b(full_loss, theta_init, maxiter=10, fprime=full_grad, disp=True)
Example #42
def check_async_status(ray_obj):
    return ray.get(ray_obj)
Example #43
 def fork(queue, key, item):
     # ray.get here could block and cause ray to start
     # a lot of python workers.
     return ray.get(queue.enqueue.remote(key, item))
Example #44
 def step(self):
     ray.get(self.actor.f.remote())
Example #45
 def f(actor, signal):
     ray.get(signal.wait.remote())
     return ray.get(actor.method.remote())
Example #46
 def fork_many_incs(counter, num_incs):
     x = None
     for _ in range(num_incs):
         x = counter.inc.remote()
     # Only call ray.get() on the last task submitted.
     return ray.get(x)
Example #47
 def good_to_learn(self):
     return ray.get(self._buffer.good_to_learn.remote())
Example #48
 def _sample(self):
     while True:
         yield ray.get(self._buffer.sample.remote())
Example #49
 def get(self, k):
     if not ray.is_initialized():
         return self.to_flush[k]
     return ray.get(self.references[k])
Example #50
 replay_buffer = ReplayBuffer.remote(training_signal)
 
 # run async self-plays
 self_plays = [SelfPlay.remote() for _ in range(ASYNC_SELF_PLAYS)]    
 run_self_plays = [
     self_play.run.remote(
         replay_buffer,
         update_signal,
         self_plays.index(self_play),
         )
     for self_play in self_plays
     ]
 
 # start async training when signal arrives
 train = Train.remote()
 ray.get(training_signal.wait.remote())
 print('Training in progress...')
 train.run.remote(replay_buffer, evaluation_signal)
 
 # run async evaluations each time an evaluation signal arrives
 while True:      
     ray.get(evaluation_signal.wait.remote())
     print(f'Evaluation in progress...')
     # get the current update id
     update_id = ray.get(update_signal.get_update_id.remote())
     # run evaluations and get the results
     evaluations = [
         Evaluation.remote(update_id) for _ in range(ASYNC_EVALUATIONS)
         ]    
     results = ray.get(
         [evaluation.run.remote() for evaluation in evaluations])
Example #51
def start_metric_monitor_loop(monitor_handle, duration_s=5):
    while True:
        ray.get(monitor_handle.scrape.remote())
        time.sleep(duration_s)
Example #52
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    ray_remote_args: Dict[str, Any] = None,
                    _spread_resource_prefix: Optional[str] = None,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """

    read_tasks = datasource.prepare_read(parallelism, **read_args)

    def remote_read(task: ReadTask) -> Block:
        return task()

    if ray_remote_args is None:
        ray_remote_args = {}
    # Increase the read parallelism by default to maximize IO throughput. This
    # is particularly important when reading from e.g., remote storage.
    if "num_cpus" not in ray_remote_args:
        # Note that the too many workers warning triggers at 4x subscription,
        # so we go at 0.5 to avoid the warning message.
        ray_remote_args["num_cpus"] = 0.5
    remote_read = cached_remote_fn(remote_read)

    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        resource_iter = _get_spread_resources_iter(nodes,
                                                   _spread_resource_prefix,
                                                   ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        resource_iter = itertools.repeat({})

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
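        # Bind the current task and resource dict via default arguments so
        # each lambda captures its own values rather than the loop variables.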
        calls.append(
            lambda task=task, resources=next(resource_iter): remote_read.
            options(**ray_remote_args, resources=resources).remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        get_schema = cached_remote_fn(_get_schema)
        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list, 0)
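
A minimal sketch of calling this function with a hypothetical custom datasource. The `IntsDatasource` class, the import paths, and the `ReadTask(read_fn, metadata)` constructor are assumptions for illustration, not part of the code above; the `BlockMetadata` fields mirror the ones used in the function itself:

import ray
from ray.data.block import BlockMetadata               # import path assumed
from ray.data.datasource import Datasource, ReadTask   # import path assumed

class IntsDatasource(Datasource):
    """Hypothetical datasource: one block of consecutive ints per task."""

    def prepare_read(self, parallelism, n):
        tasks = []
        for i in range(parallelism):
            start, end = i * n // parallelism, (i + 1) * n // parallelism
            meta = BlockMetadata(num_rows=end - start, size_bytes=None,
                                 schema=int, input_files=None)
            # Bind start/end as defaults so each task reads its own range.
            tasks.append(ReadTask(lambda s=start, e=end: list(range(s, e)),
                                  meta))
        return tasks

ds = read_datasource(IntsDatasource(), parallelism=4, n=100)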
Example #53
 def foo(x):
     if x > 1:
         return ray.get(foo.remote(x - 1)) + x
     else:
         time.sleep(5)
         return x
Example #54
 def fork(queue, key, num_items):
     x = None
     for item in range(num_items):
         x = queue.enqueue.remote(key, item)
     return ray.get(x)
Example #55
def test_node_info(disable_aiohttp_cache, ray_start_with_dashboard):
    @ray.remote
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote(), Actor.remote()]
    actor_pids = [actor.getpid.remote() for actor in actors]
    actor_pids = set(ray.get(actor_pids))

    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
            is True)
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = ray_start_with_dashboard["node_id"]

    timeout_seconds = 10
    start_time = time.time()
    last_ex = None
    while True:
        time.sleep(1)
        try:
            response = requests.get(webui_url + "/nodes?view=hostnamelist")
            response.raise_for_status()
            hostname_list = response.json()
            assert hostname_list["result"] is True, hostname_list["msg"]
            hostname_list = hostname_list["data"]["hostNameList"]
            assert len(hostname_list) == 1

            hostname = hostname_list[0]
            response = requests.get(webui_url + f"/nodes/{node_id}")
            response.raise_for_status()
            detail = response.json()
            assert detail["result"] is True, detail["msg"]
            detail = detail["data"]["detail"]
            assert detail["hostname"] == hostname
            assert detail["raylet"]["state"] == "ALIVE"
            assert "raylet" in detail["cmdline"][0]
            assert len(detail["workers"]) >= 2
            assert len(detail["actors"]) == 2, detail["actors"]
            assert len(detail["raylet"]["viewData"]) > 0

            actor_worker_pids = set()
            for worker in detail["workers"]:
                if "ray::Actor" in worker["cmdline"][0]:
                    actor_worker_pids.add(worker["pid"])
            assert actor_worker_pids == actor_pids

            response = requests.get(webui_url + "/nodes?view=summary")
            response.raise_for_status()
            summary = response.json()
            assert summary["result"] is True, summary["msg"]
            assert len(summary["data"]["summary"]) == 1
            summary = summary["data"]["summary"][0]
            assert summary["hostname"] == hostname
            assert summary["raylet"]["state"] == "ALIVE"
            assert "raylet" in summary["cmdline"][0]
            assert "workers" not in summary
            assert "actors" not in summary
            assert "viewData" not in summary["raylet"]
            break
        except Exception as ex:
            last_ex = ex
        finally:
            if time.time() > start_time + timeout_seconds:
                ex_stack = traceback.format_exception(
                    type(last_ex), last_ex,
                    last_ex.__traceback__) if last_ex else []
                ex_stack = "".join(ex_stack)
                raise Exception(f"Timed out while testing, {ex_stack}")
Example #56
 def refresh_actor_handle_cache(self):
     self.actor_handle_cache = ray.get(
         self.actor_nursery_handle.get_all_handles.remote())
Example #57
 def _read_stream(self, f, path: str, **reader_args):
     count = self.counter.increment.remote()
     ray.get(count)
     for block in super()._read_stream(f, path, **reader_args):
         yield block
Example #58
 def wait(obj_ref):
     return ray.get(obj_ref[0])
Example #59
def _get_worker_handles(client, backend):
    controller = client._controller
    backend_dict = ray.get(controller.get_all_worker_handles.remote())

    return list(backend_dict[backend].values())
Example #60
    start = time.time()

    @ray.remote
    def train():
        os.environ["TEST_OUTPUT_JSON"] = output
        os.environ["TEST_STATE_JSON"] = state
        train_ray(
            path="/data/classification.parquet",
            num_workers=4,
            num_boost_rounds=100,
            num_files=25,
            regression=False,
            use_gpu=False,
            ray_params=ray_params,
            xgboost_params=None,
        )

    ray.get(train.remote())
    taken = time.time() - start

    result = {
        "time_taken": taken,
    }
    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/train_small.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    print("PASSED.")