Example #1
def test(num_tasks):
    ray.init(address="auto")

    test_utils.wait_for_condition(no_resource_leaks)
    monitor_actor = test_utils.monitor_memory_usage()
    start_time = time.time()
    test_max_running_tasks(num_tasks)
    end_time = time.time()
    ray.get(monitor_actor.stop_run.remote())
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")
    del monitor_actor
    test_utils.wait_for_condition(no_resource_leaks)

    rate = num_tasks / (end_time - start_time - sleep_time)
    print(f"Success! Started {num_tasks} tasks in {end_time - start_time}s. "
          f"({rate} tasks/s)")

    if "TEST_OUTPUT_JSON" in os.environ:
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        results = {
            "tasks_per_second": rate,
            "num_tasks": num_tasks,
            "time": end_time - start_time,
            "success": "1",
            "_peak_memory": round(used_gb, 2),
            "_peak_process_memory": usage,
        }
        json.dump(results, out_file)
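The helper being timed above, test_max_running_tasks, is not part of the snippet, and neither is the module-level sleep_time that the rate calculation subtracts. A minimal, hypothetical sketch of both is given below, assuming the test simply starts num_tasks lightweight sleeping tasks and waits for all of them; the real implementation may differ.

import time

import ray

# Assumed module-level constant: every task sleeps this long, and the rate
# calculation above subtracts it from the wall-clock time.
sleep_time = 30


# Hypothetical sketch of the helper being timed; fractional CPUs let several
# tasks run concurrently on each core.
@ray.remote(num_cpus=0.25)
def sleep_task():
    time.sleep(sleep_time)
    return True


def test_max_running_tasks(num_tasks):
    # Start all tasks up front, then block until every one has finished.
    refs = [sleep_task.remote() for _ in range(num_tasks)]
    ray.get(refs)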
Example #2
def main():
    """The test simulates the workload with many threaded actors.

    Test is doing 4 things for 1 hour.

    - It first creates actors as many as num_cpus with max_concurrency=10
    - Each actor computes pi and put the result to the queue.
    - Driver keeps getting result & metadata from the actor.
    - Every X seconds, it kills all actors and restarts them.
    """
    ray.init(address="auto")
    args, unknown = parse_script_args()
    num_cpus = ray.cluster_resources()["CPU"]
    num_nodes = sum(1 for n in ray.nodes() if n["Alive"])
    print(f"Total number of actors: {num_cpus}, nodes: {num_nodes}")
    monitor_actor = monitor_memory_usage()

    start = time.time()
    while time.time() - start < args.test_runtime:
        # Step 1: Create actors and start computation loop.
        print("Create actors.")
        actors = start_actors(num_cpus, num_nodes)

        # Step 2: Get the pi result from actors.
        compute_start = time.time()
        print("Start computation.")
        while time.time() - compute_start < args.kill_interval_s:
            # Get the metadata.
            ray.get([actor.get_metadata.remote() for actor in actors])
            # Get the result.
            pb = ProgressBar("Computing Pi", num_cpus)
            results = [actor.get_pi.remote() for actor in actors]
            pb.fetch_until_complete(results)
            pb.close()

        # Step 3: Kill actors.
        print("Kill all actors.")
        for actor in actors:
            ray.kill(actor)

    # Report the result.
    print("PASSED.")
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage with failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")
    ray.get(monitor_actor.stop_run.remote())

    result = {"success": 1}
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps(result))
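The driver loop above depends on start_actors and on an actor exposing get_pi and get_metadata, none of which appear in the snippet. Below is a hedged sketch of what they could look like, assuming a threaded actor (max_concurrency=10) that estimates pi by Monte Carlo sampling; the class name, metadata contents, and scheduling details are assumptions, not the actual test code.

import os
import random

import ray


@ray.remote(num_cpus=1, max_concurrency=10)
class PiActor:
    """Threaded actor: up to 10 get_pi/get_metadata calls run concurrently."""

    def get_pi(self, num_samples=100_000):
        # Monte Carlo estimate: fraction of random points inside the unit
        # quarter circle, multiplied by 4.
        inside = sum(
            1
            for _ in range(num_samples)
            if random.random() ** 2 + random.random() ** 2 <= 1.0
        )
        return 4.0 * inside / num_samples

    def get_metadata(self):
        # The real test likely returns richer metadata; a placeholder is
        # enough for this sketch.
        return {"pid": os.getpid()}


def start_actors(num_actors, num_nodes):
    # One actor per CPU; num_nodes matches the driver's call signature but is
    # not needed in this simplified sketch.
    return [PiActor.remote() for _ in range(int(num_actors))]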
Example #3
def main():
    args, unknown = parse_script_args()
    logging.info("Received arguments: {}".format(args))

    # Create test spec
    test_spec = TestSpec(
        num_workers=args.num_workers,
        worker_obj_store_size_in_gb=args.worker_obj_store_size_in_gb,
        error_rate=args.error_rate,
        trigger_object_spill=args.trigger_object_spill,
    )
    logging.info("Created test spec: {}".format(test_spec))

    # Create the data save path if it doesn't exist.
    data_save_path = args.data_save_path
    if not os.path.exists(data_save_path):
        os.makedirs(data_save_path, mode=0o777, exist_ok=True)
    os.chmod(data_save_path, mode=0o777)

    # Lazily construct Xarrays
    xarray_filename_pairs = lazy_create_xarray_filename_pairs(test_spec)

    # Connect to the Ray cluster
    ray.init(address="auto")
    monitor_actor = monitor_memory_usage()

    # Save all the Xarrays to disk; this will trigger
    # Dask computations on Ray.
    logging.info("Saving {} xarrays..".format(len(xarray_filename_pairs)))
    SaveRoutines.save_all_xarrays(
        xarray_filename_pairs=xarray_filename_pairs,
        dirpath=data_save_path,
        batch_size=test_spec.batch_size,
        ray_scheduler=ray_dask_get,
    )
    ray.get(monitor_actor.stop_run.remote())
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")
    print(ray.internal.internal_api.memory_summary(stats_only=True))
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(
            json.dumps({
                "success": 1,
                "_peak_memory": round(used_gb, 2),
                "_peak_process_memory": usage
            }))
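The snippet relies on lazy_create_xarray_filename_pairs and SaveRoutines.save_all_xarrays, which are not shown. The sketch below illustrates the underlying mechanism under stated assumptions: a chunked, dask-backed xarray.Dataset is built lazily and only computed when it is written to disk, with dask configured to execute its graph through the Dask-on-Ray scheduler (ray_dask_get). The helper name, shapes, and output path are illustrative only.

import dask
import dask.array as da
import xarray as xr

import ray
from ray.util.dask import ray_dask_get

ray.init()  # a local Ray instance is enough for this sketch


def make_lazy_dataset(shape=(4096, 4096), chunks=(512, 512)):
    # Chunked random data; nothing is computed until the dataset is saved.
    data = da.random.random(shape, chunks=chunks)
    return xr.Dataset({"field": (("x", "y"), data)})


ds = make_lazy_dataset()
with dask.config.set(scheduler=ray_dask_get):
    # Writing the dataset triggers the Dask graph, which now runs as Ray
    # tasks (requires a NetCDF backend such as netCDF4 or scipy).
    ds.to_netcdf("/tmp/example_0.nc")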
Example #4
    for _ in tqdm.trange(len(actors)):
        ready, not_ready = ray.wait(not_ready)
        assert ray.get(*ready) == "pong"

    for pg in tqdm.tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)


def no_resource_leaks():
    return test_utils.no_resource_leaks_excluding_node_resources()


ray.init(address="auto")

test_utils.wait_for_condition(no_resource_leaks)
monitor_actor = test_utils.monitor_memory_usage()
start_time = time.time()
test_many_placement_groups()
end_time = time.time()
ray.get(monitor_actor.stop_run.remote())
used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
print(f"Peak memory usage: {round(used_gb, 2)}GB")
print(f"Peak memory usage per processes:\n {usage}")
del monitor_actor
test_utils.wait_for_condition(no_resource_leaks)

rate = MAX_PLACEMENT_GROUPS / (end_time - start_time)
print(f"Success! Started {MAX_PLACEMENT_GROUPS} pgs in "
      f"{end_time - start_time}s. ({rate} pgs/s)")

if "TEST_OUTPUT_JSON" in os.environ:
Example #5
def main():
    """Test task/actor/placement group basic chaos test.

    Currently, it only tests node failures scenario.
    Node failures are implemented by an actor that keeps calling
    Raylet's KillRaylet RPC.

    Ideally, we should setup the infra to cause machine failures/
    network partitions/etc., but we don't do that for now.

    In the short term, we will only test gRPC network delay +
    node failures.

    Currently, the test runs 3 steps. Each steps records the
    peak memory usage to observe the memory usage while there
    are node failures.

    Step 1: Warm up the cluster. It is needed to pre-start workers
        if necessary.

    Step 2: Start the test without a failure.

    Step 3: Start the test with constant node failures.
    """
    args, unknown = parse_script_args()
    logging.info("Received arguments: {}".format(args))
    ray.init(address="auto")
    total_num_cpus = ray.cluster_resources()["CPU"]
    total_nodes = sum(1 for n in ray.nodes() if n["Alive"])
    monitor_actor = monitor_memory_usage()

    workload = None
    if args.workload == "tasks":
        workload = run_task_workload
    elif args.workload == "actors":
        workload = run_actor_workload
    elif args.workload == "pg":
        workload = run_placement_group_workload
    else:
        assert False, f"Unknown workload: {args.workload}"

    # Step 1
    print("Warm up... Prestarting workers if necessary.")
    start = time.time()
    workload(total_num_cpus, args.smoke)

    # Step 2
    print("Running without failures")
    start = time.time()
    workload(total_num_cpus, args.smoke)
    print(f"Runtime when there are no failures: {time.time() - start}")
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage without failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")

    # Step 3
    print("Running with failures")
    start = time.time()
    node_killer = ray.get_actor("node_killer",
                                namespace="release_test_namespace")
    node_killer.run.remote()
    workload(total_num_cpus, args.smoke)
    print(f"Runtime when there are many failures: {time.time() - start}")
    print(f"Total node failures: "
          f"{ray.get(node_killer.get_total_killed_nodes.remote())}")
    node_killer.stop_run.remote()
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage with failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")

    # Report the result.
    ray.get(monitor_actor.stop_run.remote())
    print("Total number of killed nodes: "
          f"{ray.get(node_killer.get_total_killed_nodes.remote())}")
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(
            json.dumps({
                "success": 1,
                "_peak_memory": round(used_gb, 2),
                "_peak_process_memory": usage
            }))
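The chaos test dispatches to one of three workloads (run_task_workload, run_actor_workload, run_placement_group_workload), none of which are included in the snippet. A hedged sketch of a task-based workload is shown below, assuming it simply saturates the cluster with short retriable tasks and waits for all of them; the task body, retry policy, and smoke-test scaling are assumptions, not the real implementation.

import time

import ray


# max_retries=-1 lets a task be retried indefinitely when the node running it
# is killed by the node_killer actor.
@ray.remote(num_cpus=1, max_retries=-1)
def short_task(sleep_s):
    time.sleep(sleep_s)
    return "ok"


def run_task_workload(total_num_cpus, smoke):
    multiplier = 1 if smoke else 25
    refs = [
        short_task.remote(0.5) for _ in range(int(total_num_cpus) * multiplier)
    ]
    # ray.get blocks until every task has succeeded; failed tasks are retried
    # transparently thanks to max_retries=-1.
    ray.get(refs)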
Example #6
import ray
from ray.tune import run_experiments
from ray._private.test_utils import monitor_memory_usage

num_redis_shards = 5
redis_max_memory = 10**8
object_store_memory = 10**9
num_nodes = 3

message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory <
        ray._private.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.

ray.init(address="auto")
monitor_actor = monitor_memory_usage()

# Run the workload.

run_experiments(
    {
        "ppo": {
            "run": "PPO",
            "env": "CartPole-v0",
            "num_samples": 10000,
            "config": {
                "framework": "torch",
                "num_workers": 7,
                "num_gpus": 0,
                "num_sgd_iter": 1,
            },