Пример #1
0
def connect_ray_pdb(host=None, port=None, patch_stdstreams=False, quiet=None):
    """
    Opens a remote PDB on first available port.

    The breakpoint is advertised in the internal KV store under a
    ``RAY_PDB_<uuid>`` key while ``rdb.listen()`` runs, and the key is
    removed afterwards, even if ``listen()`` raises.

    Args:
        host: Address to bind. When None, taken from the
            ``REMOTE_PDB_HOST`` environment variable, falling back to
            ``127.0.0.1``.
        port: Port to bind. When None, taken from ``REMOTE_PDB_PORT``,
            falling back to ``0``.
        patch_stdstreams: Forwarded unchanged to ``RemotePdb``.
        quiet: Forwarded to ``RemotePdb``. When None, it is True if
            ``REMOTE_PDB_QUIET`` is set to any non-empty string.

    Returns:
        The ``RemotePdb`` instance that was created.
    """
    if host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    rdb = RemotePdb(host=host,
                    port=port,
                    patch_stdstreams=patch_stdstreams,
                    quiet=quiet)
    sockname = rdb._listen_socket.getsockname()
    pdb_address = "{}:{}".format(sockname[0], sockname[1])
    # Index 0 is this frame, so index 2 is the caller's caller.
    # NOTE(review): presumably the user code that triggered the breakpoint
    # through a wrapper -- confirm against the call sites.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info()))
    }
    breakpoint_uuid = uuid.uuid4()
    # Build the key once so the put and the delete cannot drift apart.
    kv_key = "RAY_PDB_{}".format(breakpoint_uuid)
    _internal_kv_put(kv_key, json.dumps(data), overwrite=True)
    try:
        rdb.listen()
    finally:
        # Always withdraw the advertisement; otherwise a failed or
        # interrupted listen() leaves a stale breakpoint entry behind.
        _internal_kv_del(kv_key)

    return rdb
Пример #2
0
def connect_ray_pdb(
    host=None,
    port=None,
    patch_stdstreams=False,
    quiet=None,
    breakpoint_uuid=None,
    debugger_external=False,
):
    """
    Opens a remote PDB on first available port.

    The breakpoint is advertised in the internal KV store (PDB namespace)
    under a ``RAY_PDB_<uuid>`` key while ``rdb.listen()`` runs, and the key
    is removed afterwards, even if ``listen()`` raises.

    Args:
        host: Address to bind. Must be falsy when ``debugger_external`` is
            set. When None, taken from ``REMOTE_PDB_HOST``, falling back
            to ``127.0.0.1``.
        port: Port to bind. When None, taken from ``REMOTE_PDB_PORT``,
            falling back to ``0``.
        patch_stdstreams: Forwarded unchanged to ``RemotePdb``.
        quiet: Forwarded to ``RemotePdb``. When None, it is True if
            ``REMOTE_PDB_QUIET`` is set to any non-empty string.
        breakpoint_uuid: Identifier used in the KV key. A fresh
            ``uuid4().hex`` is generated when it is falsy.
        debugger_external: When true, bind to ``0.0.0.0`` and advertise
            the worker's node IP address instead of ``localhost``.

    Returns:
        The ``RemotePdb`` instance that was created.
    """
    if debugger_external:
        assert not host, "Cannot specify both host and debugger_external"
        host = "0.0.0.0"
    elif host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    if not breakpoint_uuid:
        breakpoint_uuid = uuid.uuid4().hex
    # The advertised address differs from the bind address: external
    # debugging publishes the node IP, local debugging publishes localhost.
    if debugger_external:
        ip_address = ray.worker.global_worker.node_ip_address
    else:
        ip_address = "localhost"
    rdb = RemotePdb(
        breakpoint_uuid=breakpoint_uuid,
        host=host,
        port=port,
        ip_address=ip_address,
        patch_stdstreams=patch_stdstreams,
        quiet=quiet,
    )
    sockname = rdb._listen_socket.getsockname()
    pdb_address = "{}:{}".format(ip_address, sockname[1])
    # Index 0 is this frame, so index 2 is the caller's caller.
    # NOTE(review): presumably the user code that triggered the breakpoint
    # through a wrapper -- confirm against the call sites.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info())),
        "timestamp": time.time(),
        "job_id": ray.get_runtime_context().job_id.hex(),
    }
    # Build the key once so the put and the delete cannot drift apart.
    kv_key = "RAY_PDB_{}".format(breakpoint_uuid)
    _internal_kv_put(
        kv_key,
        json.dumps(data),
        overwrite=True,
        namespace=ray_constants.KV_NAMESPACE_PDB,
    )
    try:
        rdb.listen()
    finally:
        # Always withdraw the advertisement; otherwise a failed or
        # interrupted listen() leaves a stale breakpoint entry behind.
        _internal_kv_del(kv_key, namespace=ray_constants.KV_NAMESPACE_PDB)

    return rdb
Пример #3
0
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    """Two tasks using the spread strategy must land on both worker nodes.

    The cluster has a head node with no CPUs and two 8-CPU worker nodes.
    Internal KV keys are used as flags so the driver knows when each task
    has started.
    """
    cluster = ray_start_cluster
    # Head node: no CPUs, so tasks can only run on the workers added below.
    head_system_config = {"scheduler_spread_threshold": 1}
    cluster.add_node(num_cpus=0, _system_config=head_system_config)
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        def block_until_key_exists(key):
            # Driver-side poll: return once the flag key shows up in the KV.
            while not internal_kv._internal_kv_exists(key):
                time.sleep(0.1)

        @ray.remote
        def get_node_id():
            return ray.worker.global_worker.current_node_id

        # Pin one probe task per worker via its custom resource to learn
        # the node ids of both workers.
        worker_node_ids = set()
        for i in range(2):
            pinned_probe = get_node_id.options(resources={f"foo:{i}": 1})
            worker_node_ids.add(ray.get(pinned_probe.remote()))
        # Wait for updating driver raylet's resource view.
        time.sleep(5)

        @ray.remote(scheduling_strategy=SPREAD_SCHEDULING_STRATEGY)
        def task1():
            # Announce start, then keep running until the driver removes
            # the flag, so this task still occupies its node when task2
            # is scheduled.
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.worker.global_worker.current_node_id

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.worker.global_worker.current_node_id

        first_ref = task1.remote()
        block_until_key_exists("test_task1")
        # Wait for updating driver raylet's resource view.
        time.sleep(5)
        spread_task2 = task2.options(
            scheduling_strategy=SPREAD_SCHEDULING_STRATEGY)
        second_ref = spread_task2.remote()
        block_until_key_exists("test_task2")

        # Removing the first flag releases task1; then clean up the second.
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")

        observed_nodes = set(ray.get([first_ref, second_ref]))
        assert observed_nodes == worker_node_ids
Пример #4
0
 def run(self):
     """Install termination signal handlers, then start the autoscaler.

     Any exception raised during start-up or the run loop is reported via
     ``self._handle_failure`` with the formatted traceback and re-raised.
     """
     # Route both termination signals to the same handler.
     for signum in (signal.SIGINT, signal.SIGTERM):
         signal.signal(signum, self._signal_handler)

     try:
         kv_ready = _internal_kv_initialized()
         if kv_ready:
             # Clear any autoscaling error left over from a previous run.
             _internal_kv_del(DEBUG_AUTOSCALING_ERROR)
         self._initialize_autoscaler()
         self._run()
     except Exception:
         failure_text = traceback.format_exc()
         self._handle_failure(failure_text)
         raise
Пример #5
0
    def test_create_upload_once(self, tmp_path, random_dir, ray_start_regular):
        """A package is uploaded only while its URI is absent from the KV."""
        uri = get_uri_for_directory(random_dir)

        def uri_is_stored():
            return _internal_kv_exists(uri, namespace=KV_NAMESPACE_PACKAGE)

        # First attempt: nothing is stored yet, so an upload happens.
        first_upload = upload_package_if_needed(uri, tmp_path, random_dir)
        assert first_upload
        assert uri_is_stored()

        # Second attempt: the URI is already present, so no upload.
        second_upload = upload_package_if_needed(uri, tmp_path, random_dir)
        assert not second_upload
        assert uri_is_stored()

        # Removing the URI from the internal KV should force a re-upload.
        _internal_kv_del(uri, namespace=KV_NAMESPACE_PACKAGE)
        assert not uri_is_stored()
        third_upload = upload_package_if_needed(uri, tmp_path, random_dir)
        assert third_upload
Пример #6
0
    def test_create_upload_once(self, empty_dir, random_dir,
                                ray_start_regular):
        """A package is uploaded only while its URI is absent from the KV."""
        uri = get_uri_for_directory(random_dir)

        # First attempt: nothing is stored yet, so an upload happens.
        first_upload = upload_package_if_needed(uri, empty_dir, random_dir)
        assert first_upload
        assert _internal_kv_exists(uri)

        # Second attempt: the URI is already present, so no upload.
        second_upload = upload_package_if_needed(uri, empty_dir, random_dir)
        assert not second_upload
        assert _internal_kv_exists(uri)

        # Removing the URI from the internal KV should force a re-upload.
        _internal_kv_del(uri)
        still_present = _internal_kv_exists(uri)
        assert not still_present
        third_upload = upload_package_if_needed(uri, empty_dir, random_dir)
        assert third_upload
Пример #7
0
    def delete(self, key):
        """Remove the entry stored under ``key`` from the store.

        Args:
            key (str): Key to delete; it is passed through
                ``self._format_key`` before reaching the KV layer.

        Returns:
            Whatever ``ray_kv._internal_kv_del`` returns for the formatted
            key.

        Raises:
            TypeError: If ``key`` is not a string.
        """
        if isinstance(key, str):
            formatted_key = self._format_key(key)
            return ray_kv._internal_kv_del(formatted_key)

        raise TypeError("key must be a string, got: {}.".format(type(key)))
Пример #8
0
    def delete(self, key: str):
        """Remove the entry stored under ``key`` from the store.

        Args:
            key (str): Key to delete; it is mapped through
                ``self.get_storage_key`` before reaching the KV layer.

        Returns:
            Whatever ``ray_kv._internal_kv_del`` returns for the storage
            key in the Serve KV namespace.

        Raises:
            TypeError: If ``key`` is not a string.
        """
        if isinstance(key, str):
            storage_key = self.get_storage_key(key)
            return ray_kv._internal_kv_del(
                storage_key, namespace=ray_constants.KV_NAMESPACE_SERVE)

        raise TypeError("key must be a string, got: {}.".format(type(key)))
Пример #9
0
def test_internal_kv(ray_start_regular):
    """Exercise get/put/del/list of the internal KV, with and without a
    namespace, and check that reserved ``@namespace_`` keys are rejected.
    """
    import ray.experimental.internal_kv as kv

    kv_get = kv._internal_kv_get
    kv_put = kv._internal_kv_put
    kv_del = kv._internal_kv_del
    kv_list = kv._internal_kv_list
    ns = "n"

    # Default namespace: the first put is expected to return False and a
    # repeated put of the same key True.
    assert kv_get("k1") is None
    assert kv_put("k1", "v1") is False
    assert kv_put("k1", "v1") is True
    assert kv_get("k1") == b"v1"

    # The same key inside namespace "n" is independent of the default one.
    assert kv_get("k1", namespace=ns) is None
    assert kv_put("k1", "v1", namespace=ns) is False
    assert kv_put("k1", "v1", namespace=ns) is True
    # Third positional argument True requests overwrite of the value.
    assert kv_put("k1", "v2", True, namespace=ns) is True
    assert kv_get("k1", namespace=ns) == b"v2"

    # Deleting reports how many keys were removed.
    assert kv_del("k1") == 1
    assert kv_del("k1") == 0
    assert kv_get("k1") is None

    for new_key, new_value in (("k2", "v2"), ("k3", "v3")):
        assert kv_put(new_key, new_value, namespace=ns) is False

    listed = set(kv_list("k", namespace=ns))
    assert listed == {b"k1", b"k2", b"k3"}
    # Prefix deletion removes all three keys; an unmatched prefix removes 0.
    assert kv_del("k", del_by_prefix=True, namespace=ns) == 3
    assert kv_del("x", del_by_prefix=True, namespace=ns) == 0
    for gone_key in ("k1", "k2", "k3"):
        assert kv_get(gone_key, namespace=ns) is None

    # Keys starting with "@namespace_" must be refused by every operation.
    rejected_calls = (
        lambda: kv_put("@namespace_", "x", True),
        lambda: kv_get("@namespace_", namespace=ns),
        lambda: kv_del("@namespace_def", namespace=ns),
        lambda: kv_list("@namespace_abc", namespace=ns),
    )
    for rejected_call in rejected_calls:
        with pytest.raises(RuntimeError):
            rejected_call()
Пример #10
0
 def del_keys(self, keys: list):
     """Delete ``keys`` one at a time from the internal KV.

     Deletion stops at the first key for which
     ``internal_kv._internal_kv_del`` returns a falsy value; later keys
     are then left untouched.

     Returns:
         True if every deletion returned a truthy value (or ``keys`` is
         empty), False otherwise.
     """
     # all() short-circuits on the first falsy result, like an early return.
     return all(internal_kv._internal_kv_del(key) for key in keys)
Пример #11
0
def test_spread_scheduling_strategy(ray_start_cluster, connect_to_client):
    """Tasks and actors scheduled with "SPREAD" must cover both workers.

    The cluster has a head node with no CPUs and two 8-CPU worker nodes.
    Internal KV keys are used as flags so the driver knows when each task
    has started.
    """
    cluster = ray_start_cluster
    # Head node: no CPUs, so work can only run on the workers added below.
    head_system_config = {"scheduler_spread_threshold": 1}
    cluster.add_node(num_cpus=0, _system_config=head_system_config)
    ray.init(address=cluster.address)
    for i in range(2):
        cluster.add_node(num_cpus=8, resources={f"foo:{i}": 1})
    cluster.wait_for_nodes()

    with connect_to_client_or_not(connect_to_client):

        def block_until_key_exists(key):
            # Driver-side poll: return once the flag key shows up in the KV.
            while not internal_kv._internal_kv_exists(key):
                time.sleep(0.1)

        @ray.remote
        def get_node_id():
            return ray.get_runtime_context().node_id.hex()

        # Pin one probe task per worker via its custom resource to learn
        # the node ids of both workers.
        worker_node_ids = set()
        for i in range(2):
            pinned_probe = get_node_id.options(resources={f"foo:{i}": 1})
            worker_node_ids.add(ray.get(pinned_probe.remote()))
        # Wait for updating driver raylet's resource view.
        time.sleep(5)

        @ray.remote(scheduling_strategy="SPREAD")
        def task1():
            # Announce start, then keep running until the driver removes
            # the flag, so this task still occupies its node when task2
            # is scheduled.
            internal_kv._internal_kv_put("test_task1", "task1")
            while internal_kv._internal_kv_exists("test_task1"):
                time.sleep(0.1)
            return ray.get_runtime_context().node_id.hex()

        @ray.remote
        def task2():
            internal_kv._internal_kv_put("test_task2", "task2")
            return ray.get_runtime_context().node_id.hex()

        first_ref = task1.remote()
        block_until_key_exists("test_task1")
        # Wait for updating driver raylet's resource view.
        time.sleep(5)
        second_ref = task2.options(scheduling_strategy="SPREAD").remote()
        block_until_key_exists("test_task2")

        # Removing the first flag releases task1; then clean up the second.
        internal_kv._internal_kv_del("test_task1")
        internal_kv._internal_kv_del("test_task2")

        task_nodes = set(ray.get([first_ref, second_ref]))
        assert task_nodes == worker_node_ids

        # Wait for updating driver raylet's resource view.
        time.sleep(5)

        # Make sure actors can be spread as well.
        @ray.remote(num_cpus=1)
        class Actor:
            def ping(self):
                return ray.get_runtime_context().node_id.hex()

        # Handles are kept in a list so every actor stays referenced while
        # the later ones are created.
        actors = []
        actor_nodes = []
        for _ in range(8):
            handle = Actor.options(scheduling_strategy="SPREAD").remote()
            actors.append(handle)
            actor_nodes.append(ray.get(handle.ping.remote()))

        # Eight actors over two workers: each node id is expected 4 times.
        expected_nodes = sorted(list(worker_node_ids) * 4)
        assert sorted(actor_nodes) == expected_nodes