示例#1
0
    def _handle_failure(self, error):
        """Report a monitor-loop failure and optionally take down workers.

        Logs the exception currently being handled, tears down autoscaler
        workers when ``RAY_AUTOSCALER_FATESHARE_WORKERS`` is ``"1"``, stores
        the error message in the internal KV (when it is initialized) and
        publishes it to drivers.

        Args:
            error: The failure to embed in the message sent to drivers.
        """
        logger.exception("Error in monitor loop")
        if self.autoscaler is not None:
            fateshare = os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "")
            if fateshare == "1":
                self.autoscaler.kill_workers()
                # Take down autoscaler workers if necessary.
                self.destroy_autoscaler_workers()

        # Something went wrong, so push an error to all current and future
        # drivers.
        message = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)

        # A Redis client is only created when not bootstrapping through GCS.
        redis_client = None
        if not use_gcs_for_bootstrap():
            redis_client = ray._private.services.create_redis_client(
                self.redis_address, password=self.redis_password)

        gcs_publisher = None
        if gcs_pubsub_enabled():
            if use_gcs_for_bootstrap():
                # NOTE(review): `args` is not defined in this method;
                # presumably a module-level name -- confirm.
                publisher_address = args.gcs_address
            else:
                publisher_address = get_gcs_address_from_redis(redis_client)
            gcs_publisher = GcsPublisher(address=publisher_address)

        from ray._private.utils import publish_error_to_driver
        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR,
            message,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher)
示例#2
0
    def __init__(self, http_host, http_port, http_port_retries, gcs_address,
                 redis_address, redis_password, log_dir):
        """Store the head's settings and bind its gRPC server to a port.

        Args:
            http_host: Host for the HTTP server; "localhost" is replaced by
                "127.0.0.1".
            http_port: Port for the HTTP server.
            http_port_retries: Stored as-is on the instance.
            gcs_address: "host:port" of GCS; must not be None when
                bootstrapping through GCS.
            redis_address: "host:port" of Redis; used when not bootstrapping
                through GCS.
            redis_password: Redis password; stored only in Redis mode.
            log_dir: Stored as-is on the instance.
        """
        self.health_check_thread: GCSHealthCheckThread = None
        self._gcs_rpc_error_counter = 0
        # Public attributes are accessible for all head modules.
        # Workaround for issue: https://github.com/ray-project/ray/issues/7084
        if http_host == "localhost":
            self.http_host = "127.0.0.1"
        else:
            self.http_host = http_host
        self.http_port = http_port
        self.http_port_retries = http_port_retries

        if not use_gcs_for_bootstrap():
            self.redis_address = dashboard_utils.address_tuple(redis_address)
            self.redis_password = redis_password
        else:
            assert gcs_address is not None
            self.gcs_address = gcs_address

        self.log_dir = log_dir
        # Clients, channels and subscribers start out unset.
        self.aioredis_client = None
        self.aiogrpc_gcs_channel = None
        self.gcs_error_subscriber = None
        self.gcs_log_subscriber = None
        self.http_session = None
        self.ip = ray.util.get_node_ip_address()

        # The unpacking raises ValueError unless the bootstrap address has
        # exactly one ":"; the two parts themselves are not used here.
        if use_gcs_for_bootstrap():
            bootstrap_address = gcs_address
        else:
            bootstrap_address = redis_address
        ip, port = bootstrap_address.split(":")

        self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
        if self.ip == "127.0.0.1":
            grpc_ip = "127.0.0.1"
        else:
            grpc_ip = "0.0.0.0"
        # Port 0 in the bind address; the port actually used is stored.
        self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
            self.server, f"{grpc_ip}:0")
        logger.info("Dashboard head grpc address: %s:%s", grpc_ip,
                    self.grpc_port)
示例#3
0
    async def check(self, *args, **kwargs):
        """Connect to Ray if necessary, then run the wrapped handler ``f``.

        When Ray is not initialized, ``ray.init`` is called with the GCS
        address (GCS bootstrap) or the Redis address and password (Redis
        bootstrap) taken from ``self._dashboard_head``. If connecting fails,
        ``ray.shutdown()`` is called before the error propagates.

        Returns:
            The awaited result of ``f(self, *args, **kwargs)``, or an HTTP 500
            ``Response`` carrying the traceback text if anything raised.
        """
        try:
            if not ray.is_initialized():
                try:
                    if use_gcs_for_bootstrap():
                        address = self._dashboard_head.gcs_address
                        redis_pw = None
                    else:
                        ip, port = self._dashboard_head.redis_address
                        redis_pw = self._dashboard_head.redis_password
                        address = f"{ip}:{port}"
                    # The Redis password is a credential, so it is
                    # deliberately kept out of the log line.
                    logger.info("Connecting to ray with address=%s",
                                address)
                    ray.init(
                        address=address,
                        namespace=RAY_INTERNAL_JOBS_NAMESPACE,
                        _redis_password=redis_pw)
                except Exception as e:
                    # Undo any partial connection before re-raising.
                    ray.shutdown()
                    raise e from None

            return await f(self, *args, **kwargs)
        except Exception as e:
            logger.exception(f"Unexpected error in handler: {e}")
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code)
示例#4
0
    def __init__(self, dashboard_agent):
        """Set up CPU counts, head-node detection and the metrics agent.

        Args:
            dashboard_agent: The agent this module is attached to; its ip,
                bootstrap address and metrics export port are read here.
        """
        super().__init__(dashboard_agent)
        if not IN_KUBERNETES_POD:
            logical_cpus = psutil.cpu_count()
            physical_cpus = psutil.cpu_count(logical=False)
            self._cpu_counts = (logical_cpus, physical_cpus)
        else:
            # psutil does not compute this correctly when in a K8s pod.
            # Use ray._private.utils instead.
            pod_cpus = ray._private.utils.get_num_cpus()
            self._cpu_counts = (pod_cpus, pod_cpus)

        self._ip = dashboard_agent.ip
        # This node is treated as the head when its ip equals the host part
        # of the bootstrap (GCS or Redis) address.
        if use_gcs_for_bootstrap():
            gcs_host = dashboard_agent.gcs_address.split(":")[0]
            self._is_head_node = (self._ip == gcs_host)
        else:
            self._redis_address, _ = dashboard_agent.redis_address
            self._is_head_node = (self._ip == self._redis_address)

        self._hostname = socket.gethostname()
        self._workers = set()
        self._network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)
        if self._ip == "127.0.0.1":
            metrics_host = "127.0.0.1"
        else:
            metrics_host = ""
        self._metrics_agent = MetricsAgent(
            metrics_host, dashboard_agent.metrics_export_port)
        self._key = (f"{reporter_consts.REPORTER_PREFIX}"
                     f"{self._dashboard_agent.node_id}")
示例#5
0
文件: test_utils.py 项目: novahe/ray
def run_string_as_driver(driver_script: str, env: Dict = None):
    """Run a driver as a separate process.

    The script is fed to a fresh Python interpreter through stdin, with
    stderr merged into stdout.

    Args:
        driver_script (str): A string to run as a Python script. It is
            encoded as ASCII before being sent to the child process.
        env (dict): The environment variables for the driver. When given and
            GCS bootstrapping is enabled, the GCS-related ``RAY_*`` settings
            are added to a copy; the caller's mapping is left untouched.

    Returns:
        The script's output.

    Raises:
        subprocess.CalledProcessError: If the driver exits with a non-zero
            return code. The output is printed before raising.
    """
    if env is not None and gcs_utils.use_gcs_for_bootstrap():
        # Build a new mapping rather than updating in place so the caller's
        # dict is not modified as a side effect.
        env = {
            **env,
            "RAY_bootstrap_with_gcs": "1",
            "RAY_gcs_grpc_based_pubsub": "1",
            "RAY_gcs_storage": "memory",
        }

    proc = subprocess.Popen(
        [sys.executable, "-"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        env=env,
    )
    with proc:
        output = proc.communicate(driver_script.encode("ascii"))[0]
        if proc.returncode:
            print(ray._private.utils.decode(output))
            # stderr is redirected into stdout, so proc.stderr is None here.
            raise subprocess.CalledProcessError(proc.returncode, proc.args,
                                                output, proc.stderr)
        out = ray._private.utils.decode(output)
    return out
示例#6
0
    def __init__(self,
                 node_ip_address,
                 redis_address,
                 dashboard_agent_port,
                 gcs_address,
                 redis_password=None,
                 temp_dir=None,
                 session_dir=None,
                 runtime_env_dir=None,
                 log_dir=None,
                 metrics_export_port=None,
                 node_manager_port=None,
                 listen_port=0,
                 object_store_name=None,
                 raylet_name=None,
                 logging_params=None):
        """Initialize the DashboardAgent object.

        Stores the given settings on the instance, reads ``RAY_NODE_ID`` (and,
        off Windows, ``RAY_RAYLET_PID``) from the environment, binds the
        agent's gRPC server and opens a gRPC channel to the node manager port.
        """
        # Public attributes are accessible for all agent modules.
        self.ip = node_ip_address

        if not use_gcs_for_bootstrap():
            self.redis_address = dashboard_utils.address_tuple(redis_address)
            self.redis_password = redis_password
            self.aioredis_client = None
            self.gcs_address = None
        else:
            assert gcs_address is not None
            self.gcs_address = gcs_address

        # Directories.
        self.temp_dir = temp_dir
        self.session_dir = session_dir
        self.runtime_env_dir = runtime_env_dir
        self.log_dir = log_dir
        # Ports.
        self.dashboard_agent_port = dashboard_agent_port
        self.metrics_export_port = metrics_export_port
        self.node_manager_port = node_manager_port
        self.listen_port = listen_port
        # Remaining settings.
        self.object_store_name = object_store_name
        self.raylet_name = raylet_name
        self.logging_params = logging_params
        self.node_id = os.environ["RAY_NODE_ID"]

        # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
        # only used for fate-sharing with the raylet and we need a different
        # fate-sharing mechanism for Windows anyways.
        on_windows = sys.platform in ["win32", "cygwin"]
        if not on_windows:
            self.ppid = int(os.environ["RAY_RAYLET_PID"])
            assert self.ppid > 0
            logger.info("Parent pid is %s", self.ppid)

        self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
        if self.ip == "127.0.0.1":
            grpc_ip = "127.0.0.1"
        else:
            grpc_ip = "0.0.0.0"
        bind_address = f"{grpc_ip}:{self.dashboard_agent_port}"
        self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
            self.server, bind_address)
        logger.info("Dashboard agent grpc address: %s:%s", grpc_ip,
                    self.grpc_port)

        raylet_target = f"{self.ip}:{self.node_manager_port}"
        options = (("grpc.enable_http_proxy", 0), )
        self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel(
            raylet_target, options, asynchronous=True)
        self.http_session = None
示例#7
0
def test_raylet_tempfiles(shutdown_only):
    """Check the log and socket files created by a local Ray session.

    Runs once with ``num_cpus=0`` and once with ``num_cpus=2``; the second
    run additionally checks the number of worker log files.
    """
    if sys.platform != "win32":
        expected_socket_files = {"plasma_store", "raylet"}
    else:
        expected_socket_files = set()

    ray.init(num_cpus=0)
    node = ray.worker._global_node
    session_entries = set(os.listdir(node.get_session_dir_path()))
    assert session_entries >= {"sockets", "logs"}

    log_files_expected = {
        "log_monitor.log",
        "monitor.log",
        "raylet.out",
        "raylet.err",
        "gcs_server.out",
        "gcs_server.err",
        "dashboard.log",
        "dashboard_agent.log",
    }
    # Redis logs only exist when Ray is not bootstrapped through GCS.
    if not use_gcs_for_bootstrap():
        log_files_expected |= {
            "redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err"
        }

    def check_all_log_file_exists():
        for wanted in log_files_expected:
            present = set(os.listdir(node.get_logs_dir_path()))
            if wanted in present:
                continue
            raise RuntimeError(f"File {wanted} not found!")
        return True

    wait_for_condition(check_all_log_file_exists)
    # Get the list of log files again since the previous one
    # might have the stale information.
    log_files = set(os.listdir(node.get_logs_dir_path()))
    assert log_files_expected <= log_files
    assert log_files >= log_files_expected

    socket_files = set(os.listdir(node.get_sockets_dir_path()))
    assert socket_files == expected_socket_files
    ray.shutdown()

    ray.init(num_cpus=2)
    node = ray.worker._global_node
    session_entries = set(os.listdir(node.get_session_dir_path()))
    assert session_entries >= {"sockets", "logs"}
    time.sleep(3)  # wait workers to start
    log_files = set(os.listdir(node.get_logs_dir_path()))

    assert log_files >= log_files_expected

    # Check numbers of worker log file.
    worker_logs = [name for name in log_files if name.startswith("worker")]
    assert len(worker_logs) == 4

    socket_files = set(os.listdir(node.get_sockets_dir_path()))
    assert socket_files == expected_socket_files
示例#8
0
def make_global_state_accessor(address_info):
    """Create and connect a GlobalStateAccessor for ``address_info``.

    Args:
        address_info: Mapping that provides "gcs_address" (GCS bootstrap) or
            "redis_address" (Redis bootstrap).

    Returns:
        The connected GlobalStateAccessor.
    """
    if gcs_utils.use_gcs_for_bootstrap():
        gcs_address = address_info["gcs_address"]
        gcs_options = GcsClientOptions.from_gcs_address(gcs_address)
    else:
        redis_address = address_info["redis_address"]
        gcs_options = GcsClientOptions.from_redis_address(
            redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD
        )
    accessor = GlobalStateAccessor(gcs_options)
    accessor.connect()
    return accessor
示例#9
0
def test_heartbeats_single(ray_start_cluster_head):
    """Check the load metrics reported for a head-only cluster.

    Metrics are verified before any work is submitted, while a task is
    blocked on a signal, and while an actor method is blocked on a signal.
    """
    cluster = ray_start_cluster_head
    monitor_address = (cluster.gcs_address
                       if use_gcs_for_bootstrap() else cluster.address)
    monitor = setup_monitor(monitor_address)
    total_cpus = ray.state.cluster_resources()["CPU"]
    verify_load_metrics(monitor, ({"CPU": 0.0}, {"CPU": total_cpus}))

    @ray.remote
    def work(signal):
        # Poll once per second until the signal's wait() has completed.
        pending = signal.wait.remote()
        while len(ray.wait([pending], timeout=0)[0]) != 1:
            time.sleep(1)

    signal = SignalActor.remote()

    work_handle = work.remote(signal)
    verify_load_metrics(monitor, ({"CPU": 1.0}, {"CPU": total_cpus}))

    ray.get(signal.send.remote())
    ray.get(work_handle)

    @ray.remote(num_cpus=1)
    class Actor:
        def work(self, signal):
            # Same polling loop as the task above, run inside an actor.
            pending = signal.wait.remote()
            while len(ray.wait([pending], timeout=0)[0]) != 1:
                time.sleep(1)

    signal = SignalActor.remote()

    test_actor = Actor.remote()
    work_handle = test_actor.work.remote(signal)
    time.sleep(1)  # Time for actor to get placed and the method to start.

    verify_load_metrics(monitor, ({"CPU": 1.0}, {"CPU": total_cpus}))

    ray.get(signal.send.remote())
    ray.get(work_handle)
    del monitor
示例#10
0
def try_create_gcs_client(
    address: Optional[str], redis_password: Optional[str]
) -> Optional[GcsClient]:
    """
    Try to create a GCS client from the given address, after passing it
    through ``canonicalize_bootstrap_address``.

    With Redis bootstrapping, a ``redis_password`` of None is replaced by the
    default Redis password.
    """
    bootstrap_address = canonicalize_bootstrap_address(address)
    if use_gcs_for_bootstrap():
        return GcsClient(address=bootstrap_address)

    password = (ray.ray_constants.REDIS_DEFAULT_PASSWORD
                if redis_password is None else redis_password)
    return GcsClient.connect_to_gcs_by_redis_address(bootstrap_address, password)
示例#11
0
def make_gcs_client(address_info):
    """Create a GcsClient from ``address_info``.

    With GCS bootstrapping the client is built from ``"gcs_address"``;
    otherwise it is built from a Redis connection to ``"redis_address"``,
    which must have the form "host:port".
    """
    if use_gcs_for_bootstrap():
        return ray._private.gcs_utils.GcsClient(
            address=address_info["gcs_address"])

    host_and_port = address_info["redis_address"].split(":")
    assert len(host_and_port) == 2
    host, port = host_and_port
    redis_conn = redis.StrictRedis(
        host=host,
        port=int(port),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)
    return ray._private.gcs_utils.GcsClient.create_from_redis(redis_conn)
示例#12
0
文件: proxier.py 项目: ijrsvt/ray
def serve_proxier(
    connection_str: str,
    address: Optional[str],
    *,
    redis_password: Optional[str] = None,
    session_dir: Optional[str] = None,
    runtime_env_agent_port: int = 0,
):
    """Start the proxying gRPC server and return a handle to it.

    Args:
        connection_str: Address the gRPC server is bound to.
        address: Bootstrap address of the cluster; may be None.
        redis_password: Redis password, used with Redis bootstrapping.
        session_dir: Forwarded to the ProxyManager.
        runtime_env_agent_port: Forwarded to the ProxyManager.

    Returns:
        A ClientServerHandle holding the three servicers and the started
        gRPC server.
    """
    # Initialize internal KV to be used to upload and download working_dir
    # before calling ray.init within the RayletServicers.
    # NOTE(edoakes): redis_address and redis_password should only be None in
    # tests.
    bootstrap_with_gcs = use_gcs_for_bootstrap()
    if address is not None:
        gcs_cli = None
        if bootstrap_with_gcs:
            gcs_cli = GcsClient(address=address)
        elif redis_password is not None:
            gcs_cli = GcsClient.connect_to_gcs_by_redis_address(
                address, redis_password)
        # No client is created in Redis mode without a password.
        if gcs_cli is not None:
            ray.experimental.internal_kv._initialize_internal_kv(gcs_cli)

    executor = futures.ThreadPoolExecutor(
        max_workers=CLIENT_SERVER_MAX_THREADS)
    server = grpc.server(executor, options=GRPC_OPTIONS)
    proxy_manager = ProxyManager(
        address,
        session_dir=session_dir,
        redis_password=redis_password,
        runtime_env_agent_port=runtime_env_agent_port,
    )
    task_servicer = RayletServicerProxy(None, proxy_manager)
    data_servicer = DataServicerProxy(proxy_manager)
    logs_servicer = LogstreamServicerProxy(proxy_manager)

    # Register each servicer with the server, in the same order as before.
    registrations = (
        (ray_client_pb2_grpc.add_RayletDriverServicer_to_server,
         task_servicer),
        (ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server,
         data_servicer),
        (ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server,
         logs_servicer),
    )
    for register, servicer in registrations:
        register(servicer, server)

    add_port_to_grpc_server(server, connection_str)
    server.start()
    return ClientServerHandle(
        task_servicer=task_servicer,
        data_servicer=data_servicer,
        logs_servicer=logs_servicer,
        grpc_server=server,
    )
示例#13
0
    def __init__(self, redis_address, redis_password, gcs_address, temp_dir):
        """Initialize global state from the bootstrap address and store it.

        Args:
            redis_address: Redis address; used and stored with Redis
                bootstrapping.
            redis_password: Redis password; used and stored with Redis
                bootstrapping.
            gcs_address: GCS address; used and stored with GCS bootstrapping.
            temp_dir: Stored as-is on the instance.
        """
        options_cls = ray._raylet.GcsClientOptions
        if not use_gcs_for_bootstrap():
            gcs_client_options = options_cls.from_redis_address(
                redis_address, redis_password)
            self.redis_address = redis_address
            self.redis_password = redis_password
        else:
            gcs_client_options = options_cls.from_gcs_address(gcs_address)
            self.gcs_address = gcs_address

        ray.state.state._initialize_global_state(gcs_client_options)
        self.temp_dir = temp_dir
        self.default_service_discovery_flush_period = 5
        super().__init__()
def get_ray_status_output(address):
    """Fetch the autoscaler debug status and split it into sections.

    Connects a GCS client to ``address`` (directly with GCS bootstrapping,
    through Redis otherwise), initializes the internal KV with it, and reads
    the autoscaling status and error entries.

    Args:
        address: Bootstrap address of the cluster.

    Returns:
        A dict with keys "demand" (the text after "Demands:") and "usage"
        (the text between "Usage:" and "Demands:"), each stripped of
        surrounding newlines and then spaces.

    Raises:
        IndexError: If the formatted status lacks a "Demands:" or "Usage:"
            marker.
    """
    if gcs_utils.use_gcs_for_bootstrap():
        gcs_client = gcs_utils.GcsClient(address=address)
    else:
        redis_client = ray._private.services.create_redis_client(address, "")
        gcs_client = gcs_utils.GcsClient.create_from_redis(redis_client)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)

    # Format the report once and reuse it for both sections instead of
    # rebuilding and re-splitting it per dictionary entry.
    sections = debug_status(status, error).split("Demands:")
    return {
        "demand": sections[1].strip("\n").strip(" "),
        "usage": sections[0].split("Usage:")[1].strip("\n").strip(" "),
    }
示例#15
0
 async def get_gcs_address(self):
     """Return the GCS address, looking it up through Redis if needed.

     With GCS bootstrapping the stored address is returned directly.
     Otherwise an aioredis client is created and stored on the instance, and
     the address is fetched through it. The process exits with status -1 if
     the Redis connection fails.
     """
     if use_gcs_for_bootstrap():
         return self.gcs_address

     # Create an aioredis client for all modules.
     try:
         self.aioredis_client = (
             await dashboard_utils.get_aioredis_client(
                 self.redis_address, self.redis_password,
                 dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                 dashboard_consts.RETRY_REDIS_CONNECTION_TIMES))
     except (socket.gaierror, ConnectionError):
         logger.error(
             "Dashboard head exiting: "
             "Failed to connect to redis at %s", self.redis_address)
         sys.exit(-1)
     return await get_gcs_address_with_retry(self.aioredis_client)
示例#16
0
文件: log_monitor.py 项目: ijrsvt/ray
 def __init__(self, logs_dir, redis_address, gcs_address, redis_password=None):
     """Initialize the log monitor object.

     Args:
         logs_dir: Stored as-is on the instance.
         redis_address: Redis address; used only with Redis bootstrapping.
         gcs_address: GCS address; with Redis bootstrapping it is replaced
             by the address looked up through Redis.
         redis_password: Password for the Redis connection.
     """
     self.ip = services.get_node_ip_address()
     self.logs_dir = logs_dir
     if not gcs_utils.use_gcs_for_bootstrap():
         redis_conn = ray._private.services.create_redis_client(
             redis_address, password=redis_password
         )
         self.redis_client = redis_conn
         gcs_address = gcs_utils.get_gcs_address_from_redis(redis_conn)
     else:
         self.redis_client = None

     # A publisher is only created when GCS pubsub is enabled.
     self.publisher = None
     if gcs_pubsub.gcs_pubsub_enabled():
         self.publisher = gcs_pubsub.GcsPublisher(address=gcs_address)

     # File bookkeeping starts out empty.
     self.log_filenames = set()
     self.open_file_infos = []
     self.closed_file_infos = []
     self.can_open_more_files = True
示例#17
0
def assert_no_thrashing(address):
    """Assert that consumed bytes are at least the restored bytes.

    Both numbers are parsed from the stats-only memory summary of the
    cluster at ``address``: the "Restored" line contributes its second
    space-separated token, the "consumed" line its second-to-last.
    """
    state = ray.state.GlobalState()
    if not use_gcs_for_bootstrap():
        options = GcsClientOptions.from_redis_address(
            address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    else:
        options = GcsClientOptions.from_gcs_address(address)
    state._initialize_global_state(options)

    summary = memory_summary(address=address, stats_only=True)
    restored_bytes = consumed_bytes = 0
    for row in summary.split("\n"):
        if "Restored" in row:
            restored_bytes = int(row.split(" ")[1])
        if "consumed" in row:
            consumed_bytes = int(row.split(" ")[-2])

    assert (consumed_bytes >= restored_bytes
            ), f"consumed: {consumed_bytes}, restored: {restored_bytes}"
示例#18
0
def list_state_cli_group(ctx):
    """Look up the API server URL in GCS and store it on ``ctx.obj``.

    Args:
        ctx: Context object; ``ctx.obj["api_server_url"]`` is set to the
            "http://"-prefixed dashboard address.

    Raises:
        ValueError: If the dashboard address cannot be read from GCS.
    """
    bootstrap_address = services.canonicalize_bootstrap_address(None)
    gcs_client = GcsClient(address=bootstrap_address, nums_reconnect_retry=0)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    api_server_url = ray._private.utils.internal_kv_get_with_retry(
        gcs_client,
        ray_constants.DASHBOARD_ADDRESS,
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        num_retries=20,
    )
    if api_server_url is None:
        reason = (
            "Couldn't obtain the API server address from GCS. It is likely that "
            "the GCS server is down. Check gcs_server.[out | err] to see if it is "
            "still alive.")
        raise ValueError(reason)

    assert use_gcs_for_bootstrap()
    ctx.ensure_object(dict)
    # The stored value is bytes, so decode it before building the URL.
    decoded_url = api_server_url.decode()
    ctx.obj["api_server_url"] = f"http://{decoded_url}"
示例#19
0
def test_driver_lives_parallel(ray_start_regular):
    """Terminate, kill and reap the node's core processes in bulk.

    The raylet, log monitor and monitor are always targeted; the GCS server
    is added when Ray is not bootstrapped through GCS.
    """
    all_processes = ray.worker._global_node.all_processes

    process_infos = []
    for process_type in (ray_constants.PROCESS_TYPE_RAYLET,
                         ray_constants.PROCESS_TYPE_LOG_MONITOR,
                         ray_constants.PROCESS_TYPE_MONITOR):
        process_infos.extend(all_processes[process_type])
    if not use_gcs_for_bootstrap():
        process_infos.extend(
            all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER])

    # Kill all the components in parallel.
    for info in process_infos:
        info.process.terminate()

    time.sleep(0.1)
    for info in process_infos:
        info.process.kill()

    for info in process_infos:
        info.process.wait()
示例#20
0
def memory_summary(address=None,
                   redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                   group_by="NODE_ADDRESS",
                   sort_by="OBJECT_SIZE",
                   units="B",
                   line_wrap=True,
                   stats_only=False,
                   num_entries=None):
    """Return a memory report for the cluster at ``address``.

    Args:
        address: Bootstrap address; passed through
            ``canonicalize_bootstrap_address`` first.
        redis_password: Redis password, used with Redis bootstrapping.
        group_by, sort_by, units, line_wrap, num_entries: Forwarded to the
            dashboard's ``memory_summary``.
        stats_only: If true, only the object store stats are returned.

    Returns:
        The store stats alone when ``stats_only`` is set; otherwise the
        dashboard memory summary followed by the store stats.
    """
    # Aliased so the imported helper does not shadow this function's name.
    from ray.dashboard.memory_utils import (
        memory_summary as dashboard_memory_summary)

    address = services.canonicalize_bootstrap_address(address)

    state = GlobalState()
    if not use_gcs_for_bootstrap():
        options = GcsClientOptions.from_redis_address(address, redis_password)
    else:
        options = GcsClientOptions.from_gcs_address(address)
    state._initialize_global_state(options)

    if not stats_only:
        object_report = dashboard_memory_summary(
            state, group_by, sort_by, line_wrap, units, num_entries)
        return object_report + get_store_stats(state)
    return get_store_stats(state)
示例#21
0
    def node(self) -> ray.node.Node:
        """Return the cached ``ray.node.Node``, creating it on first use.

        A new node object is built in connect-only, non-head mode from
        ``self.address``; with Redis bootstrapping the stored Redis password
        is applied when it is set.
        """
        if not self._node:
            if use_gcs_for_bootstrap():
                ray_params = RayParams(gcs_address=self.address)
            else:
                ray_params = RayParams(redis_address=self.address)
                if self._redis_password:
                    ray_params.redis_password = self._redis_password

            self._node = ray.node.Node(
                ray_params,
                head=False,
                shutdown_at_exit=False,
                spawn_reaper=False,
                connect_only=True)

        return self._node
示例#22
0
        async def decorator(self, *args, **kwargs):
            """Connect to Ray (and optionally Serve), then run handler ``f``.

            When Ray is not initialized, ``ray.init`` is called with the GCS
            address (GCS bootstrap) or the Redis address and password (Redis
            bootstrap) taken from ``self._dashboard_head``. If connecting
            fails, ``ray.shutdown()`` is called before the error propagates.

            Returns:
                The awaited result of ``f(self, *args, **kwargs)``, or an
                HTTP 500 ``Response`` carrying the traceback text if anything
                raised.
            """
            try:
                if not ray.is_initialized():
                    try:
                        if use_gcs_for_bootstrap():
                            address = self._dashboard_head.gcs_address
                            redis_pw = None
                        else:
                            ip, port = self._dashboard_head.redis_address
                            redis_pw = self._dashboard_head.redis_password
                            address = f"{ip}:{port}"
                        # The Redis password is a credential, so it is
                        # deliberately kept out of the log line.
                        logger.info("Connecting to ray with address=%s",
                                    address)
                        ray.init(
                            address=address,
                            namespace=RAY_INTERNAL_DASHBOARD_NAMESPACE,
                            _redis_password=redis_pw,
                        )
                    except Exception as e:
                        # Undo any partial connection before re-raising.
                        ray.shutdown()
                        raise e from None

                if connect_to_serve:
                    # TODO(edoakes): this should probably run in the `serve`
                    # namespace.
                    serve.start(detached=True)
                return await f(self, *args, **kwargs)
            except Exception as e:
                logger.exception(f"Unexpected error in handler: {e}")
                return Response(
                    text=traceback.format_exc(),
                    status=aiohttp.web.HTTPInternalServerError.status_code,
                )
示例#23
0
 def get_file_discovery_content(self):
     """Return the content for Prometheus service discovery.

     The result is a JSON string with one entry labelled ``job: ray`` whose
     targets are the metrics export addresses of all alive nodes, plus the
     autoscaler metrics address when one is stored.
     """
     metrics_export_addresses = []
     for node in ray.nodes():
         if node["alive"] is True:
             metrics_export_addresses.append("{}:{}".format(
                 node["NodeManagerAddress"], node["MetricsExportPort"]))

     # The autoscaler address lives in GCS KV or Redis, depending on the
     # bootstrap mode.
     if use_gcs_for_bootstrap():
         gcs_client = GcsClient(address=self.gcs_address)
         autoscaler_addr = gcs_client.internal_kv_get(
             b"AutoscalerMetricsAddress", None)
     else:
         redis_client = services.create_redis_client(
             self.redis_address, self.redis_password)
         autoscaler_addr = redis_client.get("AutoscalerMetricsAddress")
     if autoscaler_addr:
         metrics_export_addresses.append(autoscaler_addr.decode("utf-8"))

     discovery_entry = {
         "labels": {
             "job": "ray"
         },
         "targets": metrics_export_addresses
     }
     return json.dumps([discovery_entry])
示例#24
0
    def add_node(self, wait=True, **node_args):
        """Start a node and add it to the local Ray Cluster.

        The first node added becomes the head node and initializes the
        cluster's global state; every later node is a worker started with
        this cluster's Redis and GCS addresses.

        Unless overridden through ``node_args``, nodes are started with:
            num_cpus=1,
            num_gpus=0,
            object_store_memory=150 * 1024 * 1024  # 150 MiB
            min_worker_port=0,
            max_worker_port=0,
            dashboard_port=None

        Args:
            wait (bool): Whether to wait until the node is alive.
            node_args: Keyword arguments used to build the node's
                ``RayParams``. Overrides defaults.

        Returns:
            Node object of the added Ray node.
        """
        ray_params = ray._private.parameter.RayParams(**node_args)
        ray_params.update_if_absent(
            num_cpus=1,
            num_gpus=0,
            object_store_memory=150 * 1024 * 1024,  # 150 MiB
            min_worker_port=0,
            max_worker_port=0,
            dashboard_port=None,
        )
        with disable_client_hook():
            if self.head_node is not None:
                # Worker node: reuse the cluster's bootstrap addresses.
                ray_params.update_if_absent(redis_address=self.redis_address)
                ray_params.update_if_absent(gcs_address=self.gcs_address)
                # We only need one log monitor per physical node.
                ray_params.update_if_absent(include_log_monitor=False)
                # Let grpc pick a port.
                ray_params.update_if_absent(node_manager_port=0)

                node = ray.node.Node(
                    ray_params,
                    head=False,
                    shutdown_at_exit=self._shutdown_at_exit,
                    spawn_reaper=self._shutdown_at_exit,
                )
                self.worker_nodes.add(node)
            else:
                node = ray.node.Node(
                    ray_params,
                    head=True,
                    shutdown_at_exit=self._shutdown_at_exit,
                    spawn_reaper=self._shutdown_at_exit,
                )
                self.head_node = node
                self.redis_address = self.head_node.redis_address
                self.redis_password = node_args.get(
                    "redis_password", ray_constants.REDIS_DEFAULT_PASSWORD
                )
                self.webui_url = self.head_node.webui_url
                # Init global state accessor when creating head node.
                if not use_gcs_for_bootstrap():
                    gcs_options = GcsClientOptions.from_redis_address(
                        self.redis_address, self.redis_password
                    )
                else:
                    gcs_options = GcsClientOptions.from_gcs_address(node.gcs_address)
                self.global_state._initialize_global_state(gcs_options)

            if wait:
                # Wait for the node to appear in the client table. We do this
                # so that the nodes appears in the client table in the order
                # that the corresponding calls to add_node were made. We do
                # this because in the tests we assume that the driver is
                # connected to the first node that is added.
                self._wait_for_node(node)

        return node
示例#25
0
        try:
            dashboard_url = ray.experimental.internal_kv._internal_kv_get(
                ray_constants.DASHBOARD_ADDRESS,
                namespace=ray_constants.KV_NAMESPACE_DASHBOARD)
            if dashboard_url:
                new_port = int(dashboard_url.split(b":")[-1])
                assert new_port > int(port)
                break
        except AssertionError as e:
            logger.info("Retry because of %s", e)
        finally:
            if time.time() > start_time + timeout_seconds:
                raise Exception("Timed out while testing.")


@pytest.mark.skipif(use_gcs_for_bootstrap(), reason="Not working right now.")
def test_gcs_check_alive(fast_gcs_failure_detection, ray_start_with_dashboard):
    """Kill the GCS server once the dashboard is confirmed to be up.

    The dashboard must be reachable and its process must be in a running or
    sleeping state before the GCS server process is killed.
    """
    webui_url = ray_start_with_dashboard["webui_url"]
    assert wait_until_server_available(webui_url) is True

    all_processes = ray.worker._global_node.all_processes
    dashboard_info = all_processes[ray_constants.PROCESS_TYPE_DASHBOARD][0]
    dashboard_proc = psutil.Process(dashboard_info.process.pid)
    gcs_server_info = all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER][0]
    gcs_server_proc = psutil.Process(gcs_server_info.process.pid)

    dashboard_status = dashboard_proc.status()
    assert dashboard_status in [
        psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING, psutil.STATUS_DISK_SLEEP
    ]

    gcs_server_proc.kill()
示例#26
0
    async def run(self):
        """Start the dashboard agent and run its modules until termination.

        Steps, in order:
          1. On non-Windows platforms, start a background task that exits
             the process once the parent process is gone.
          2. Unless GCS is used for bootstrap, connect an aioredis client.
          3. Create the shared aiohttp client session and start the gRPC
             server.
          4. Create the GCS client and load the agent modules.
          5. Start the HTTP server (with CORS enabled on all routes) and
             log the registered routes.
          6. Publish the agent's HTTP/gRPC ports to the internal KV store
             and register the agent with the raylet.
          7. Run all modules, wait for the gRPC server to terminate, then
             clean up the HTTP runner.

        Exits the process (``sys.exit``) if the parent check fails or the
        Redis connection cannot be established.
        """
        async def _check_parent():
            """Check if raylet is dead and fate-share if it is."""
            try:
                curr_proc = psutil.Process()
                while True:
                    parent = curr_proc.parent()
                    if (parent is None or parent.pid == 1
                            or self.ppid != parent.pid):
                        logger.error("Raylet is dead, exiting.")
                        sys.exit(0)
                    await asyncio.sleep(
                        dashboard_consts.
                        DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
            except Exception:
                # Keep the traceback in the log: the process is about to
                # exit, so this is the only record of what went wrong.
                logger.exception("Failed to check parent PID, exiting.")
                sys.exit(1)

        # The parent check is not performed on Windows-like platforms.
        fate_share_with_parent = sys.platform not in ["win32", "cygwin"]
        if fate_share_with_parent:
            check_parent_task = create_task(_check_parent())

        if not use_gcs_for_bootstrap():
            # Create an aioredis client for all modules.
            try:
                self.aioredis_client = \
                    await dashboard_utils.get_aioredis_client(
                        self.redis_address, self.redis_password,
                        dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                        dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
            except (socket.gaierror, ConnectionRefusedError):
                logger.error(
                    "Dashboard agent exiting: "
                    "Failed to connect to redis at %s", self.redis_address)
                sys.exit(-1)

        # Create a http session for all modules.
        # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
        if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
            self.http_session = aiohttp.ClientSession(
                loop=asyncio.get_event_loop())
        else:
            self.http_session = aiohttp.ClientSession()

        # Start a grpc asyncio server.
        await self.server.start()

        # Without GCS bootstrap, the GCS address is looked up in Redis.
        if not use_gcs_for_bootstrap():
            gcs_address = await self.aioredis_client.get(
                dashboard_consts.GCS_SERVER_ADDRESS)
            self.gcs_client = GcsClient(address=gcs_address.decode())
        else:
            self.gcs_client = GcsClient(address=self.gcs_address)
        modules = self._load_modules()

        # Http server should be initialized after all modules loaded.
        app = aiohttp.web.Application()
        app.add_routes(routes=routes.bound_routes())

        # Enable CORS on all routes.
        cors = aiohttp_cors.setup(app,
                                  defaults={
                                      "*":
                                      aiohttp_cors.ResourceOptions(
                                          allow_credentials=True,
                                          expose_headers="*",
                                          allow_methods="*",
                                          allow_headers=("Content-Type",
                                                         "X-Header"),
                                      )
                                  })
        for route in list(app.router.routes()):
            cors.add(route)

        runner = aiohttp.web.AppRunner(app)
        await runner.setup()
        # Bind to loopback only when the agent itself is on loopback;
        # otherwise listen on all interfaces.
        site = aiohttp.web.TCPSite(
            runner, "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0",
            self.listen_port)
        await site.start()
        # Read back the address actually bound by the listening socket.
        http_host, http_port, *_ = site._server.sockets[0].getsockname()
        logger.info("Dashboard agent http address: %s:%s", http_host,
                    http_port)

        # Dump registered http routes.
        dump_routes = [
            r for r in app.router.routes() if r.method != hdrs.METH_HEAD
        ]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

        # Write the dashboard agent ports (http, grpc) to the internal KV
        # store, keyed by this node's id.
        # TODO: Use async version if performance is an issue
        internal_kv._internal_kv_put(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]),
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel)

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(agent_pid=os.getpid(),
                                                   agent_port=self.grpc_port,
                                                   agent_ip_address=self.ip))

        tasks = [m.run(self.server) for m in modules]
        if fate_share_with_parent:
            tasks.append(check_parent_task)
        await asyncio.gather(*tasks)

        # Wait for finish signal.
        await self.server.wait_for_termination()
        # Release the HTTP server resources once the gRPC server is done.
        await runner.cleanup()
示例#27
0
        loop.run_until_complete(agent.run())
    except Exception as e:
        # All these env vars should be available because
        # they are provided by the parent raylet.
        restart_count = os.environ["RESTART_COUNT"]
        max_restart_count = os.environ["MAX_RESTART_COUNT"]
        raylet_pid = os.environ["RAY_RAYLET_PID"]
        node_ip = args.node_ip_address
        if restart_count >= max_restart_count:
            # Agent is failed to be started many times.
            # Push an error to all drivers, so that users can know the
            # impact of the issue.
            redis_client = None
            gcs_publisher = None
            if gcs_pubsub_enabled():
                if use_gcs_for_bootstrap():
                    gcs_publisher = GcsPublisher(args.gcs_address)
                else:
                    redis_client = ray._private.services.create_redis_client(
                        args.redis_address, password=args.redis_password)
                    gcs_publisher = GcsPublisher(
                        address=get_gcs_address_from_redis(redis_client))
            else:
                redis_client = ray._private.services.create_redis_client(
                    args.redis_address, password=args.redis_password)

            traceback_str = ray._private.utils.format_error_message(
                traceback.format_exc())
            message = (
                f"(ip={node_ip}) "
                f"The agent on node {platform.uname()[1]} failed to "
示例#28
0
class TestRedisPassword:
    """Tests for starting Ray with a Redis password."""

    @pytest.mark.skipif(use_gcs_for_bootstrap(),
                        reason="Not valid for gcs bootstrap")
    def test_redis_password(self, password, shutdown_only):
        """Check Redis access with no password, a literal one, and the real one.

        After ``ray.init`` with ``password``, a task must run, a client
        without a password must be rejected, and a client using
        ``password`` must be able to ping the server.
        """
        @ray.remote
        def return_one():
            return 1

        init_info = ray.init(_redis_password=password)
        redis_ip, redis_port = init_info["redis_address"].split(":")

        # A task must be able to run on the freshly started cluster.
        ray.get(return_one.remote())

        # A client that supplies no password must be refused.
        anonymous_client = redis.StrictRedis(host=redis_ip,
                                             port=redis_port,
                                             password=None)
        with pytest.raises(redis.exceptions.AuthenticationError):
            anonymous_client.ping()

        # Mimic the way ray.scripts.start() makes this call.
        try:
            ray._private.services.wait_for_redis_to_start(
                redis_ip, redis_port, password="******")
        except Exception as ex:
            # A generic Exception is caught on purpose, in case the raised
            # exception type changes later. Two shapes are tolerated:
            #   * an error caused by redis.AuthenticationError whose text
            #     contains 'invalid password' -- that string is expected to
            #     be stable because redis-py simply wraps the exact error
            #     from the Redis library:
            #     https://github.com/andymccurdy/redis-py/blob/master/
            #     redis/connection.py#L132
            #   * a redis.ResponseError, which redis-py apparently sometimes
            #     raises for a bad password even though it is not derived
            #     from redis.ConnectionError as redis.AuthenticationError is.
            # Anything else is propagated.
            cause = ex.__cause__
            is_password_rejection = (
                (isinstance(cause, redis.AuthenticationError)
                 and "invalid password" in str(cause))
                or (isinstance(ex, redis.ResponseError)
                    and "WRONGPASS invalid username-password pair" in str(ex)))
            if not is_password_rejection:
                raise

        # With the provided password the connection must succeed.
        authenticated_client = redis.StrictRedis(host=redis_ip,
                                                 port=redis_port,
                                                 password=password)
        assert authenticated_client.ping()

    def test_redis_password_cluster(self, password, shutdown_only):
        """Run a task on a two-node cluster started with a Redis password."""
        @ray.remote
        def return_one():
            return 1

        # The same arguments are used for the head node and the added node.
        password_args = {"redis_password": password}
        cluster = Cluster(initialize_head=True,
                          connect=True,
                          head_node_args=password_args)
        cluster.add_node(**password_args)

        ray.get(return_one.remote())
示例#29
0
文件: log_monitor.py 项目: vakker/ray
    # Build the log monitor from the parsed command-line arguments.
    log_monitor = LogMonitor(
        args.logs_dir,
        args.redis_address,
        args.gcs_address,
        redis_password=args.redis_password,
    )

    try:
        log_monitor.run()
    except Exception:
        # Something went wrong, so push an error to all drivers.
        # Only open a Redis connection when Redis is the bootstrap store;
        # under GCS bootstrap the publisher is built from the GCS address
        # and no Redis client is needed.
        bootstrap_with_gcs = gcs_utils.use_gcs_for_bootstrap()
        if bootstrap_with_gcs:
            redis_client = None
        else:
            redis_client = ray._private.services.create_redis_client(
                args.redis_address, password=args.redis_password)
        gcs_publisher = None
        if gcs_pubsub_enabled():
            if bootstrap_with_gcs:
                gcs_publisher = GcsPublisher(address=args.gcs_address)
            else:
                gcs_publisher = GcsPublisher(
                    address=gcs_utils.get_gcs_address_from_redis(redis_client))
        # Must be called inside the handler so format_exc() sees the
        # exception currently being handled.
        traceback_str = ray._private.utils.format_error_message(
            traceback.format_exc())
        message = (f"The log monitor on node {platform.node()} "
                   f"failed with the following error:\n{traceback_str}")
        ray._private.utils.publish_error_to_driver(
            ray_constants.LOG_MONITOR_DIED_ERROR,
            message,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher,
        )
        logger.error(message)
示例#30
0
    # Create a second remote function to guarantee that when we call
    # get_path2.remote(), the second function to run will have been run on
    # the worker.
    @ray.remote
    def get_path2():
        return sys.path

    assert "fake_directory" not in ray.get(get_path2.remote())


@pytest.mark.skipif(
    "RAY_PROFILING" not in os.environ,
    reason="Only tested in client/profiling build.")
@pytest.mark.skipif(
    client_test_enabled() and use_gcs_for_bootstrap(),
    reason=("wait_for_function will miss in this mode. To be fixed after using"
            " gcs to bootstrap all component."))
def test_profiling_api(ray_start_2_cpus):
    @ray.remote
    def f(delay):
        with profiling.profile(
                "custom_event", extra_data={"name": "custom name"}):
            time.sleep(delay)
            pass

    @ray.remote
    def g(input_list):
        # The argument input_list should be a list containing one object ref.
        ray.wait([input_list[0]])