Python get_and_run_node_killer 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: ray._private.test_utils

메소드/함수: get_and_run_node_killer

hotexamples.com에서의 예제들: 4

Python get_and_run_node_killer - 4개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한 ray._private.test_utils.get_and_run_node_killer의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

def main():
    """Connect to the running Ray cluster and deploy a node killer.

    Chaos testing currently covers random node failures only. The kill
    interval and the ``no_start`` flag come from the parsed script
    arguments; the killer is requested with ``lifetime="detached"`` in the
    ``release_test_namespace`` namespace.
    """
    cli_args, _unparsed = parse_script_args()
    ray.init(address="auto")

    kill_interval = cli_args.node_kill_interval
    killer_options = dict(
        namespace="release_test_namespace",
        lifetime="detached",
        no_start=cli_args.no_start,
    )
    get_and_run_node_killer(kill_interval, **killer_options)

    print("Successfully deployed a node killer.")

예제 #2

파일 보기

def _ray_start_chaos_cluster(request):
    """Start an autoscaling cluster, optionally with a node killer, and yield it.

    Generator meant to back a fixture. ``request.param`` (when present) is
    read as a dict with the following keys:

    * ``kill_interval`` (optional): forwarded to ``get_and_run_node_killer``.
      When missing or ``None`` no node killer is started and the post-yield
      kill checks are skipped.
    * ``_system_config`` (optional): extra system config handed to
      ``cluster.start``. The heartbeat / task-retry settings set below
      override any matching keys.
    * ``head_resources`` and ``worker_node_types`` (required): forwarded to
      ``AutoscalingCluster``.

    Every remaining key is forwarded to ``AutoscalingCluster`` as a keyword
    argument.

    Yields:
        The started ``AutoscalingCluster``.

    Raises:
        KeyError: if ``head_resources`` or ``worker_node_types`` is missing.
        AssertionError: if the cluster does not start with exactly one node,
            if a node killer was started but killed nothing, or if a node
            that the killer did not kill is reported as not alive.
    """
    # Work on copies: popping from / updating the caller's dicts would mutate
    # the parametrization object in place and break any reuse of it.
    param = dict(getattr(request, "param", {}))
    kill_interval = param.pop("kill_interval", None)
    config = dict(param.pop("_system_config", {}))
    config.update(
        {
            "num_heartbeats_timeout": 10,
            "raylet_heartbeat_period_milliseconds": 100,
            "task_retry_delay_ms": 100,
        }
    )
    # Config of workers that are re-started.
    head_resources = param.pop("head_resources")
    worker_node_types = param.pop("worker_node_types")
    cluster = AutoscalingCluster(
        head_resources,
        worker_node_types,
        idle_timeout_minutes=10,  # Don't take down nodes.
        **param,
    )
    cluster.start(_system_config=config)
    try:
        ray.init("auto")
        nodes = ray.nodes()
        assert len(nodes) == 1

        if kill_interval is not None:
            node_killer = get_and_run_node_killer(kill_interval)

        yield cluster

        if kill_interval is not None:
            ray.get(node_killer.stop_run.remote())
            killed = ray.get(node_killer.get_total_killed_nodes.remote())
            assert len(killed) > 0
            died = {node["NodeID"] for node in ray.nodes() if not node["Alive"]}
            assert died.issubset(killed), (
                f"Raylets {died - killed} that we did not kill crashed"
            )
    finally:
        # Always tear down, even when one of the assertions above fails;
        # otherwise the cluster would leak into whatever runs next.
        ray.shutdown()
        cluster.shutdown()

예제 #3

파일 보기

파일: conftest.py 프로젝트: amzn/amazon-ray

def ray_start_chaos_cluster(request):
    """Start an autoscaling cluster with a node killer and yield the killer.

    Generator meant to back a fixture. Two ``RAY_*`` heartbeat environment
    variables are set for the duration of the fixture and removed afterwards.

    ``request.param`` (when present) is read as a dict with:

    * ``kill_interval`` (optional, default ``2``): forwarded to
      ``get_and_run_node_killer``.
    * ``head_resources`` and ``worker_node_types`` (required): forwarded to
      ``AutoscalingCluster``.

    Yields:
        The node killer handle returned by ``get_and_run_node_killer``.

    Raises:
        KeyError: if ``head_resources`` or ``worker_node_types`` is missing.
        AssertionError: if the cluster does not start with exactly one node,
            or if the killer's reported total is not greater than zero.
    """
    heartbeat_env = {
        "RAY_num_heartbeats_timeout": "5",
        "RAY_raylet_heartbeat_period_milliseconds": "100",
    }
    os.environ.update(heartbeat_env)
    try:
        param = getattr(request, "param", {})
        kill_interval = param.get("kill_interval", 2)
        # Config of workers that are re-started.
        head_resources = param["head_resources"]
        worker_node_types = param["worker_node_types"]

        cluster = AutoscalingCluster(head_resources, worker_node_types)
        cluster.start()
        try:
            ray.init("auto")
            nodes = ray.nodes()
            assert len(nodes) == 1
            node_killer = get_and_run_node_killer(kill_interval)
            yield node_killer
            assert ray.get(node_killer.get_total_killed_nodes.remote()) > 0
        finally:
            # Tear the cluster down even when an assertion above fails.
            ray.shutdown()
            cluster.shutdown()
    finally:
        # Never leak the heartbeat overrides into later tests; pop() so a
        # variable that was already removed elsewhere does not mask the
        # original error with a KeyError.
        for name in heartbeat_env:
            os.environ.pop(name, None)

예제 #4

파일 보기

파일: test_chaos_basic.py 프로젝트: mvindiola1/ray

def main():
    """Test task/actor/placement group basic chaos test.

    Currently, it only tests the node failure scenario.
    Node failures are implemented by an actor that keeps calling
    Raylet's KillRaylet RPC.

    Ideally, we should set up the infra to cause machine failures/
    network partitions/etc., but we don't do that for now.

    In the short term, we will only test gRPC network delay +
    node failures.

    Currently, the test runs 3 steps. Each step records the
    peak memory usage to observe the memory usage while there
    are node failures.

    Step 1: Warm up the cluster. It is needed to pre-start workers
        if necessary.

    Step 2: Start the test without a failure.

    Step 3: Start the test with constant node failures.

    The result is written as JSON to the path stored in the
    ``TEST_OUTPUT_JSON`` environment variable; the memory figures in it
    are the ones read after Step 3.

    Raises:
        ValueError: if ``args.workload`` is not ``"tasks"``, ``"actors"``
            or ``"pg"``.
        KeyError: if ``TEST_OUTPUT_JSON`` is not set.
    """
    args, _ = parse_script_args()
    logging.info("Received arguments: %s", args)
    ray.init(address="auto")
    total_num_cpus = ray.cluster_resources()["CPU"]
    monitor_actor = monitor_memory_usage()

    workloads = {
        "tasks": run_task_workload,
        "actors": run_actor_workload,
        "pg": run_placement_group_workload,
    }
    try:
        workload = workloads[args.workload]
    except KeyError:
        # An explicit error instead of `assert False`, which is stripped
        # under `python -O` and would leave `workload` as None.
        raise ValueError(f"Unsupported workload: {args.workload!r}") from None

    # Step 1
    print("Warm up... Prestarting workers if necessary.")
    workload(total_num_cpus, args.smoke)

    # Step 2
    print("Running without failures")
    start = time.time()
    workload(total_num_cpus, args.smoke)
    print(f"Runtime when there are no failures: {time.time() - start}")
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage without failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")

    # Step 3
    print("Running with failures")
    node_killer = get_and_run_node_killer(
        node_kill_interval_s=args.node_kill_interval)
    start = time.time()
    workload(total_num_cpus, args.smoke)
    print(f"Runtime when there are many failures: {time.time() - start}")
    print(f"Total node failures: "
          f"{ray.get(node_killer.get_total_killed_nodes.remote())}")
    # Fire-and-forget: the result of stop_run is not awaited here.
    node_killer.stop_run.remote()
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage with failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")

    # Report the result.
    ray.get(monitor_actor.stop_run.remote())
    result = {
        "success": 1,
        "_peak_memory": round(used_gb, 2),
        "_peak_process_memory": usage,
    }
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        json.dump(result, f)