def test_raylet_tempfiles():
    ray.init(redirect_worker_output=False)
    top_levels = set(os.listdir(tempfile_services.get_temp_root()))
    assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
    log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
    assert log_files == {
        "log_monitor.out", "log_monitor.err", "plasma_store.out",
        "plasma_store.err", "webui.out", "webui.err", "monitor.out",
        "monitor.err", "raylet_monitor.out", "raylet_monitor.err",
        "redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err",
        "raylet.out", "raylet.err"
    }  # With raylet logs.
    socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
    assert socket_files == {"plasma_store", "raylet"}
    ray.shutdown()

    ray.init(redirect_worker_output=True, num_cpus=0)
    top_levels = set(os.listdir(tempfile_services.get_temp_root()))
    assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
    log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
    assert log_files == {
        "log_monitor.out", "log_monitor.err", "plasma_store.out",
        "plasma_store.err", "webui.out", "webui.err", "monitor.out",
        "monitor.err", "raylet_monitor.out", "raylet_monitor.err",
        "redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err",
        "raylet.out", "raylet.err"
    }  # With raylet logs.
    socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
    assert socket_files == {"plasma_store", "raylet"}
    ray.shutdown()

    ray.init(redirect_worker_output=True, num_cpus=2)
    top_levels = set(os.listdir(tempfile_services.get_temp_root()))
    assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
    time.sleep(3)  # Wait for the workers to start.
    log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
    assert log_files.issuperset({
        "log_monitor.out", "log_monitor.err", "plasma_store.out",
        "plasma_store.err", "webui.out", "webui.err", "monitor.out",
        "monitor.err", "raylet_monitor.out", "raylet_monitor.err",
        "redis-shard_0.out", "redis-shard_0.err", "redis.out", "redis.err",
        "raylet.out", "raylet.err"
    })  # With raylet logs.
    # Check the number of worker log files.
    assert sum(
        1 for filename in log_files if filename.startswith("worker")) == 4
    socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
    assert socket_files == {"plasma_store", "raylet"}
    ray.shutdown()
def test_raylet_tempfiles():
    ray.init(redirect_worker_output=False)
    top_levels = set(os.listdir(tempfile_services.get_temp_root()))
    assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
    log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
    assert log_files == {
        "log_monitor.out", "log_monitor.err", "plasma_store.out",
        "plasma_store.err", "webui.out", "webui.err", "monitor.out",
        "monitor.err", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
        "redis.err"
    }  # Without raylet logs.
    socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
    assert socket_files == {"plasma_store", "raylet"}
    ray.shutdown()

    ray.init(redirect_worker_output=True, num_cpus=0)
    top_levels = set(os.listdir(tempfile_services.get_temp_root()))
    assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
    log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
    assert log_files == {
        "log_monitor.out", "log_monitor.err", "plasma_store.out",
        "plasma_store.err", "webui.out", "webui.err", "monitor.out",
        "monitor.err", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
        "redis.err", "raylet.out", "raylet.err"
    }  # With raylet logs.
    socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
    assert socket_files == {"plasma_store", "raylet"}
    ray.shutdown()

    ray.init(redirect_worker_output=True, num_cpus=2)
    top_levels = set(os.listdir(tempfile_services.get_temp_root()))
    assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
    time.sleep(3)  # Wait for the workers to start.
    log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
    assert log_files.issuperset({
        "log_monitor.out", "log_monitor.err", "plasma_store.out",
        "plasma_store.err", "webui.out", "webui.err", "monitor.out",
        "monitor.err", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
        "redis.err", "raylet.out", "raylet.err"
    })  # With raylet logs.
    # Check the number of worker log files.
    assert sum(
        1 for filename in log_files if filename.startswith("worker")) == 4
    socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
    assert socket_files == {"plasma_store", "raylet"}
    ray.shutdown()
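# The fixed time.sleep(3) in the test above is only a heuristic wait for the
# workers to come up. A polling helper along the following lines (hypothetical,
# not part of the test suite) would make the worker-log check less
# timing-sensitive; it reuses tempfile_services.get_logs_dir_path() from the
# test.
def wait_for_worker_logs(expected_count, timeout=10):
    """Poll the logs directory until expected_count worker log files appear."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        log_files = os.listdir(tempfile_services.get_logs_dir_path())
        if sum(1 for f in log_files if f.startswith("worker")) >= expected_count:
            return True
        time.sleep(0.1)
    return False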
def start_worker(node_ip_address,
                 object_store_name,
                 local_scheduler_name,
                 redis_address,
                 worker_path,
                 stdout_file=None,
                 stderr_file=None):
    """This method starts a worker process.

    Args:
        node_ip_address (str): The IP address of the node that this worker
            is running on.
        object_store_name (str): The name of the object store.
        local_scheduler_name (str): The name of the local scheduler.
        redis_address (str): The address that the Redis server is listening
            on.
        worker_path (str): The path of the source code which the worker
            process will run.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.

    Returns:
        The process that was started.
    """
    command = [
        sys.executable, "-u", worker_path,
        "--node-ip-address=" + node_ip_address,
        "--object-store-name=" + object_store_name,
        "--redis-address=" + str(redis_address),
        "--temp-dir=" + get_temp_root()
    ]
    p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
    record_log_files_in_redis(redis_address, node_ip_address,
                              [stdout_file, stderr_file])
    return p
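# Usage sketch (illustrative only): the socket names, Redis address, worker
# path, and log file locations below are hypothetical placeholders, not values
# taken from the Ray source. It only shows the calling convention of
# start_worker with stdout/stderr redirected to log files.
worker_stdout = open("/tmp/ray/logs/worker-example.out", "a")
worker_stderr = open("/tmp/ray/logs/worker-example.err", "a")
worker_process = start_worker(
    node_ip_address="127.0.0.1",
    object_store_name="/tmp/ray/sockets/plasma_store",
    local_scheduler_name="/tmp/ray/sockets/raylet",
    redis_address="127.0.0.1:6379",
    worker_path="path/to/default_worker.py",
    stdout_file=worker_stdout,  # Pass None to leave stdout unredirected.
    stderr_file=worker_stderr)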
def start_raylet(redis_address,
                 node_ip_address,
                 raylet_name,
                 plasma_store_name,
                 worker_path,
                 num_cpus=None,
                 num_gpus=None,
                 resources=None,
                 object_manager_port=None,
                 node_manager_port=None,
                 redis_password=None,
                 use_valgrind=False,
                 use_profiler=False,
                 stdout_file=None,
                 stderr_file=None,
                 config=None):
    """Start a raylet, which is a combined local scheduler and object manager.

    Args:
        redis_address (str): The address of the primary Redis server.
        node_ip_address (str): The IP address of this node.
        raylet_name (str): The name of the raylet socket to create.
        plasma_store_name (str): The name of the plasma store socket to
            connect to.
        worker_path (str): The path of the Python file that new worker
            processes will execute.
        num_cpus: The CPUs allocated for this raylet.
        num_gpus: The GPUs allocated for this raylet.
        resources: The custom resources allocated for this raylet.
        object_manager_port: The port to use for the object manager. If this
            is None, then the object manager will choose its own port.
        node_manager_port: The port to use for the node manager. If this is
            None, then the node manager will choose its own port.
        redis_password: The password to use when connecting to Redis.
        use_valgrind (bool): True if the raylet should be started inside of
            valgrind. If this is True, use_profiler must be False.
        use_profiler (bool): True if the raylet should be started inside a
            profiler. If this is True, use_valgrind must be False.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        config (dict|None): Optional Raylet configuration that will override
            defaults in RayConfig.

    Returns:
        The process that was started.
    """
    config = config or {}
    config_str = ",".join(["{},{}".format(*kv) for kv in config.items()])

    if use_valgrind and use_profiler:
        raise Exception("Cannot use valgrind and profiler at the same time.")

    num_initial_workers = (num_cpus if num_cpus is not None else
                           multiprocessing.cpu_count())

    static_resources = check_and_update_resources(num_cpus, num_gpus,
                                                  resources)

    # Limit the number of workers that can be started in parallel by the
    # raylet. However, make sure it is at least 1.
    maximum_startup_concurrency = max(
        1, min(multiprocessing.cpu_count(), static_resources["CPU"]))

    # Format the resource argument in a form like 'CPU,1.0,GPU,0,Custom,3'.
    resource_argument = ",".join(
        ["{},{}".format(*kv) for kv in static_resources.items()])

    gcs_ip_address, gcs_port = redis_address.split(":")

    # Create the command that the Raylet will use to start workers.
    start_worker_command = ("{} {} "
                            "--node-ip-address={} "
                            "--object-store-name={} "
                            "--raylet-name={} "
                            "--redis-address={} "
                            "--temp-dir={}".format(
                                sys.executable, worker_path, node_ip_address,
                                plasma_store_name, raylet_name, redis_address,
                                get_temp_root()))
    if redis_password:
        start_worker_command += " --redis-password {}".format(redis_password)

    # If the object manager port is None, then use 0 to cause the object
    # manager to choose its own port.
    if object_manager_port is None:
        object_manager_port = 0
    # If the node manager port is None, then use 0 to cause the node manager
    # to choose its own port.
    if node_manager_port is None:
        node_manager_port = 0

    command = [
        RAYLET_EXECUTABLE,
        raylet_name,
        plasma_store_name,
        str(object_manager_port),
        str(node_manager_port),
        node_ip_address,
        gcs_ip_address,
        gcs_port,
        str(num_initial_workers),
        str(maximum_startup_concurrency),
        resource_argument,
        config_str,
        start_worker_command,
        "",  # Worker command for Java, not needed for Python.
        redis_password or "",
        get_temp_root(),
    ]

    if use_valgrind:
        p = subprocess.Popen(
            [
                "valgrind", "--track-origins=yes", "--leak-check=full",
                "--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
                "--error-exitcode=1"
            ] + command,
            stdout=stdout_file,
            stderr=stderr_file)
    elif use_profiler:
        p = subprocess.Popen(
            ["valgrind", "--tool=callgrind"] + command,
            stdout=stdout_file,
            stderr=stderr_file)
    elif "RAYLET_PERFTOOLS_PATH" in os.environ:
        modified_env = os.environ.copy()
        modified_env["LD_PRELOAD"] = os.environ["RAYLET_PERFTOOLS_PATH"]
        modified_env["CPUPROFILE"] = os.environ["RAYLET_PERFTOOLS_LOGFILE"]
        p = subprocess.Popen(
            command,
            stdout=stdout_file,
            stderr=stderr_file,
            env=modified_env)
    else:
        p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)

    record_log_files_in_redis(
        redis_address,
        node_ip_address, [stdout_file, stderr_file],
        password=redis_password)
    return p
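# Usage sketch (illustrative only): the addresses, socket names, worker path,
# and config key below are hypothetical placeholders. Passing None for
# stdout_file/stderr_file leaves the raylet's output unredirected, and None
# ports let the object manager and node manager pick their own ports.
raylet_process = start_raylet(
    redis_address="127.0.0.1:6379",
    node_ip_address="127.0.0.1",
    raylet_name="/tmp/ray/sockets/raylet",
    plasma_store_name="/tmp/ray/sockets/plasma_store",
    worker_path="path/to/default_worker.py",
    num_cpus=2,
    num_gpus=0,
    object_manager_port=None,
    node_manager_port=None,
    config={"example_config_key": 100},  # Hypothetical override key.
    stdout_file=None,
    stderr_file=None)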
def start_local_scheduler(plasma_store_name,
                          plasma_manager_name=None,
                          worker_path=None,
                          plasma_address=None,
                          node_ip_address="127.0.0.1",
                          redis_address=None,
                          use_valgrind=False,
                          use_profiler=False,
                          stdout_file=None,
                          stderr_file=None,
                          static_resources=None,
                          num_workers=0):
    """Start a local scheduler process.

    Args:
        plasma_store_name (str): The name of the plasma store socket to
            connect to.
        plasma_manager_name (str): The name of the plasma manager to connect
            to. This does not need to be provided, but if it is, then the
            Redis address must be provided as well.
        worker_path (str): The path of the worker script to use when the
            local scheduler starts up new workers.
        plasma_address (str): The address of the plasma manager to connect
            to. This is only used by the global scheduler to figure out which
            plasma managers are connected to which local schedulers.
        node_ip_address (str): The address of the node that this local
            scheduler is running on.
        redis_address (str): The address of the Redis instance to connect to.
            If this is not provided, then the local scheduler will not
            connect to Redis.
        use_valgrind (bool): True if the local scheduler should be started
            inside of valgrind. If this is True, use_profiler must be False.
        use_profiler (bool): True if the local scheduler should be started
            inside a profiler. If this is True, use_valgrind must be False.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        static_resources: A dictionary specifying the local scheduler's
            resource capacities. This maps resource names (strings) to
            integers or floats.
        num_workers (int): The number of workers that the local scheduler
            should start.

    Returns:
        A tuple of the name of the local scheduler socket and the process
        that was started.
    """
    if (plasma_manager_name is None) != (redis_address is None):
        raise Exception("If one of the plasma_manager_name and the "
                        "redis_address is provided, then both must be "
                        "provided.")
    if use_valgrind and use_profiler:
        raise Exception("Cannot use valgrind and profiler at the same time.")
    local_scheduler_executable = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "../core/src/local_scheduler/local_scheduler")
    local_scheduler_name = get_local_scheduler_socket_name()
    command = [
        local_scheduler_executable, "-s", local_scheduler_name, "-p",
        plasma_store_name, "-h", node_ip_address, "-n", str(num_workers)
    ]
    if plasma_manager_name is not None:
        command += ["-m", plasma_manager_name]
    if worker_path is not None:
        assert plasma_store_name is not None
        assert plasma_manager_name is not None
        assert redis_address is not None
        start_worker_command = ("{} {} "
                                "--node-ip-address={} "
                                "--object-store-name={} "
                                "--object-store-manager-name={} "
                                "--local-scheduler-name={} "
                                "--redis-address={} "
                                "--temp-dir={}".format(
                                    sys.executable, worker_path,
                                    node_ip_address, plasma_store_name,
                                    plasma_manager_name, local_scheduler_name,
                                    redis_address, get_temp_root()))
        command += ["-w", start_worker_command]
    if redis_address is not None:
        command += ["-r", redis_address]
    if plasma_address is not None:
        command += ["-a", plasma_address]
    if static_resources is not None:
        resource_argument = ""
        for resource_name, resource_quantity in static_resources.items():
            assert (isinstance(resource_quantity, int)
                    or isinstance(resource_quantity, float))
        resource_argument = ",".join([
            resource_name + "," + str(resource_quantity)
            for resource_name, resource_quantity in static_resources.items()
        ])
    else:
        resource_argument = "CPU,{}".format(multiprocessing.cpu_count())
    command += ["-c", resource_argument]

    if use_valgrind:
        pid = subprocess.Popen(
            [
                "valgrind", "--track-origins=yes", "--leak-check=full",
                "--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
                "--error-exitcode=1"
            ] + command,
            stdout=stdout_file,
            stderr=stderr_file)
        time.sleep(1.0)
    elif use_profiler:
        pid = subprocess.Popen(
            ["valgrind", "--tool=callgrind"] + command,
            stdout=stdout_file,
            stderr=stderr_file)
        time.sleep(1.0)
    else:
        pid = subprocess.Popen(
            command, stdout=stdout_file, stderr=stderr_file)
        time.sleep(0.1)
    return local_scheduler_name, pid