def testDontScaleBelowTarget(self):
    # Checks that an idle worker is only removed once the load-derived
    # target drops, using a 0..2 worker cluster with a 0.5 utilization target.
    cluster_config = SMALL_CLUSTER.copy()
    cluster_config.update({
        "min_workers": 0,
        "max_workers": 2,
        "target_utilization_fraction": 0.5,
    })
    config_path = self.write_config(cluster_config)
    self.provider = MockProvider()
    load_metrics = LoadMetrics()
    autoscaler = StandardAutoscaler(
        config_path, load_metrics, max_failures=0, update_interval_s=0)

    def node_count():
        # Number of nodes the mock provider reports as not terminated.
        return len(self.provider.non_terminated_nodes({}))

    # Nothing is launched before any load has been reported.
    assert node_count() == 0
    autoscaler.update()
    assert autoscaler.num_launches_pending.value == 0
    assert node_count() == 0

    # Report the head as fully used:
    # 1.0 nodes used => target nodes = 2 => target workers = 1
    head_ip = services.get_node_ip_address()
    load_metrics.update(head_ip, {"CPU": 2}, {"CPU": 0})  # head
    autoscaler.update()
    self.waitForNodes(1)

    # The new node is idle and has never been used, but the target is
    # still 2, so the worker count must hold steady.
    idle_worker_ip = "172.0.0.0"
    load_metrics.update(idle_worker_ip, {"CPU": 0}, {"CPU": 0})
    load_metrics.last_used_time_by_ip[idle_worker_ip] = 0
    autoscaler.update()
    assert node_count() == 1

    # Reduce load on head => target nodes = 1 => target workers = 0
    load_metrics.update(head_ip, {"CPU": 2}, {"CPU": 1})
    autoscaler.update()
    assert node_count() == 0
def __init__(self, logs_dir, redis_address, redis_password=None):
    """Set up the log monitor's Redis connection and bookkeeping state.

    Args:
        logs_dir: Stored unchanged on ``self.logs_dir``.
        redis_address: Address handed to ``create_redis_client``.
        redis_password: Optional password forwarded to the Redis client.
    """
    # Resolve the external pieces first, in the same order as before:
    # node IP lookup, then the Redis client.
    node_ip = services.get_node_ip_address()
    client = ray.services.create_redis_client(
        redis_address, password=redis_password)

    self.ip = node_ip
    self.logs_dir = logs_dir
    self.redis_client = client

    # File-tracking state starts out empty.
    self.log_filenames = set()
    self.open_file_infos = []
    self.closed_file_infos = []
    self.can_open_more_files = True
def add_node(self, **override_kwargs):
    """Start one more node in the local Ray cluster and register it.

    The first node added becomes the head node; every later node joins
    the head's Redis address as a worker. Unless overridden, nodes are
    started with:
        cleanup=True,
        num_cpus=1,
        object_store_memory=100 * (2**20) # 100 MB

    Args:
        override_kwargs: Keyword arguments merged over the defaults and
            forwarded to ``RayParams``.

    Returns:
        Node object of the added Ray node.
    """
    node_kwargs = dict(
        num_cpus=1,
        object_store_memory=100 * (2**20),  # 100 MB
    )
    node_kwargs.update(override_kwargs)
    ray_params = RayParams(
        node_ip_address=services.get_node_ip_address(), **node_kwargs)

    starting_head = self.head_node is None
    if starting_head:
        ray_params.update(include_webui=False)
        address_info = services.start_ray_head(ray_params, cleanup=True)
        self.redis_address = address_info["redis_address"]
    else:
        ray_params.update(redis_address=self.redis_address)
        address_info = services.start_ray_node(ray_params, cleanup=True)

    # TODO(rliaw): Find a more stable way than modifying global state.
    # Snapshot the processes recorded for this start, then reset the global
    # registry so the next node begins with empty lists.
    process_dict_copy = services.all_processes.copy()
    for key in services.all_processes:
        services.all_processes[key] = []
    node = Node(address_info, process_dict_copy)

    if starting_head:
        self.head_node = node
    else:
        self.worker_nodes[node] = address_info

    logger.info("Starting Node with raylet socket {}".format(
        address_info["raylet_socket_name"]))

    return node
def testScaleUpBasedOnLoad(self):
    # Walks a 1..10 worker cluster (0.5 utilization target) through
    # scale-up under load, a steady phase, and a two-step scale-down.
    cluster_config = SMALL_CLUSTER.copy()
    cluster_config.update({
        "min_workers": 1,
        "max_workers": 10,
        "target_utilization_fraction": 0.5,
    })
    config_path = self.write_config(cluster_config)
    self.provider = MockProvider()
    load_metrics = LoadMetrics()
    autoscaler = StandardAutoscaler(
        config_path, load_metrics, max_failures=0, update_interval_s=0)

    def node_count():
        # Number of nodes the mock provider reports as not terminated.
        return len(self.provider.non_terminated_nodes({}))

    def assert_settled_at(expected_nodes):
        # No launches in flight and exactly ``expected_nodes`` nodes alive.
        assert autoscaler.num_launches_pending.value == 0
        assert node_count() == expected_nodes

    # The minimum of one worker is brought up first.
    assert node_count() == 0
    autoscaler.update()
    self.waitForNodes(1)
    autoscaler.update()
    assert_settled_at(1)

    # Scales up as nodes are reported as used
    head_ip = services.get_node_ip_address()
    load_metrics.update(head_ip, {"CPU": 2}, {"CPU": 0})  # head
    load_metrics.update("172.0.0.0", {"CPU": 2}, {"CPU": 0})  # worker 1
    autoscaler.update()
    self.waitForNodes(3)
    load_metrics.update("172.0.0.1", {"CPU": 2}, {"CPU": 0})
    autoscaler.update()
    self.waitForNodes(5)

    # Holds steady when load is removed
    for worker_ip in ("172.0.0.0", "172.0.0.1"):
        load_metrics.update(worker_ip, {"CPU": 2}, {"CPU": 2})
    autoscaler.update()
    assert_settled_at(5)

    # Scales down as nodes become unused
    for worker_ip in ("172.0.0.0", "172.0.0.1"):
        load_metrics.last_used_time_by_ip[worker_ip] = 0
    autoscaler.update()
    assert_settled_at(3)

    for worker_ip in ("172.0.0.2", "172.0.0.3"):
        load_metrics.last_used_time_by_ip[worker_ip] = 0
    autoscaler.update()
    assert_settled_at(1)
def get_node_ip(self):
    """Return the current node's IP address.

    Delegates to ``get_node_ip_address`` and does not read instance state.
    """
    node_ip = get_node_ip_address()
    return node_ip
def __init__(self):
    """Create empty per-IP tables and record this node's IP address."""
    # Timestamps, keyed by node IP.
    self.last_used_time_by_ip = {}
    self.last_heartbeat_time_by_ip = {}

    # Resource tables, keyed by node IP.
    self.static_resources_by_ip = {}
    self.dynamic_resources_by_ip = {}

    node_ip = services.get_node_ip_address()
    self.local_ip = node_ip
def _train(
    dtrain,
    nthread,
    evenly_data_distribution,
    params: Dict,
    *args,
    evals=(),
    **kwargs,
):
    """Distribute the data over remote actors and run training on them.

    Args:
        dtrain: ``(X, y)`` pair of training data; both parts must have the
            same length.
        nthread: Forwarded to ``create_actors``.
        evenly_data_distribution: Forwarded to ``_split_data_across_actors``;
            its negation is passed as ``bind_ip`` to ``unwrap_partitions``.
        params: Training parameters handed to every actor's ``train``.
        *args: Extra positional arguments for every actor's ``train``.
        evals: Iterable of ``((X, y), method)`` pairs. An entry whose data
            *is* ``dtrain`` is not shipped separately; its method is passed
            along with the training data instead. The iterable is never
            modified.
        **kwargs: Extra keyword arguments for every actor's ``train``.

    Returns:
        The result of the first actor's ``train`` call.
    """
    s = time.time()

    X, y = dtrain
    assert len(X) == len(y)

    X_row_parts = unwrap_partitions(X, axis=0, bind_ip=not evenly_data_distribution)
    y_row_parts = unwrap_partitions(y, axis=0, bind_ip=not evenly_data_distribution)
    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    # Create remote actors
    actors = create_actors(nthread=nthread)

    # Separate the training set from the other evaluation sets by identity.
    # A private list is built because ``evals`` defaults to a tuple (which has
    # no ``remove``), because a caller's list must not be mutated, and because
    # equality-based removal would compare the data objects themselves.
    add_as_eval_method = None
    other_evals = []
    for eval_data, method in evals or ():
        if eval_data is dtrain:
            add_as_eval_method = method
        else:
            other_evals.append((eval_data, method))

    for (eval_X, eval_y), eval_method in other_evals:
        # Split data across workers
        _split_data_across_actors(
            actors,
            lambda actor, *X_y: actor.add_eval_data.remote(
                *X_y, eval_method=eval_method
            ),
            unwrap_partitions(eval_X, axis=0, bind_ip=not evenly_data_distribution),
            unwrap_partitions(eval_y, axis=0, bind_ip=not evenly_data_distribution),
            evenly_data_distribution=evenly_data_distribution,
        )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method
        ),
        X_row_parts,
        y_row_parts,
        evenly_data_distribution=evenly_data_distribution,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for _, actor in actors.items()
        ]

        # All results should be the same because of Rabit tracking. So we just
        # return the first one.
        result = ray.get(fut[0])
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_shard_ports, object_manager_port,
          object_store_memory, num_workers, num_cpus, num_gpus, resources,
          head, no_ui, block, plasma_directory, huge_pages, autoscaling_config,
          use_raylet, no_redirect_worker_output, no_redirect_output,
          logging_level, logging_format):
    # Starts Ray processes on this machine, either as the cluster head
    # (``head`` truthy) or as a node joining the cluster behind
    # ``redis_address``. The arguments correspond to the command-line flags
    # named in the error messages below.
    #
    # Raises Exception when ``resources`` is not valid JSON or when an
    # argument does not fit the selected mode, and fails an assertion when
    # "CPU"/"GPU" are given through ``resources``. If ``block`` is truthy the
    # call never returns.

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    # Configure root logging from the requested level name and format.
    level = logging.getLevelName(logging_level.upper())
    logging.basicConfig(level=level, format=logging_format)
    logger = logging.getLogger(__name__)

    # The environment variable only applies when use_raylet was left unset.
    if use_raylet is None and os.environ.get("RAY_USE_XRAY") == "1":
        # This environment variable is used in our testing setup.
        logger.info("Detected environment variable 'RAY_USE_XRAY'.")
        use_raylet = True

    # ``resources`` arrives as a JSON string and is replaced by the parsed
    # object. Any parse failure is reported with a usage hint; the bound
    # exception ``e`` is not used.
    try:
        resources = json.loads(resources)
    except Exception as e:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    # CPU and GPU counts have dedicated arguments and are merged in here.
    assert "CPU" not in resources, "Use the --num-cpus argument."
    assert "GPU" not in resources, "Use the --num-gpus argument."
    if num_cpus is not None:
        resources["CPU"] = num_cpus
    if num_gpus is not None:
        resources["GPU"] = num_gpus

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            # The ports arrive as one comma-separated string.
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        logger.info(
            "Using IP address {} for this node.".format(node_ip_address))

        address_info = {}
        # Use the provided object manager port if there is one.
        if object_manager_port is not None:
            address_info["object_manager_ports"] = [object_manager_port]
        # An empty dict is passed on as None rather than as {}.
        if address_info == {}:
            address_info = None

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            object_store_memory=object_store_memory,
            num_workers=num_workers,
            cleanup=False,
            redirect_worker_output=not no_redirect_worker_output,
            redirect_output=not no_redirect_output,
            resources=resources,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            redis_protected_mode=False,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            autoscaling_config=autoscaling_config,
            use_raylet=use_raylet)
        logger.info(address_info)
        # Tell the user how to attach more nodes and drivers to this head.
        logger.info(
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --redis-address {}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(redis_address=\"{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop".format(address_info["redis_address"],
                               address_info["redis_address"]))
    else:
        # Start Ray on a non-head node.
        # Head-only arguments are rejected one by one.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        # Expects exactly one ":" in the address; anything else makes the
        # unpacking fail with ValueError.
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        logger.info(
            "Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_client)
        # Unlike the head branch, the object manager port is wrapped in a list
        # unconditionally, so an unset port is passed on as [None].
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            object_store_memory=object_store_memory,
            cleanup=False,
            redirect_worker_output=not no_redirect_worker_output,
            redirect_output=not no_redirect_output,
            resources=resources,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            use_raylet=use_raylet)
        logger.info(address_info)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    " ray stop")

    # Keep this process alive indefinitely when blocking was requested.
    if block:
        import time
        while True:
            time.sleep(30)
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head, include_webui,
          block, plasma_directory, huge_pages, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
          java_worker_options, load_code_from_local, internal_config):
    # Starts Ray processes on this machine, either as the cluster head
    # (``head`` truthy) or as a node joining the cluster behind
    # ``redis_address``. The arguments correspond to the command-line flags
    # named in the error messages below.
    #
    # Settings shared by both modes are collected in a RayParams object;
    # mode-specific ones are added in the matching branch before a
    # ``ray.node.Node`` is created. Raises Exception (ValueError for
    # ``include_java``) when an argument does not fit the selected mode or
    # ``resources`` is not valid JSON. If ``block`` is truthy the call never
    # returns.

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    # ``resources`` arrives as a JSON string and is replaced by the parsed
    # object; any parse failure is reported with a usage hint.
    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    # NOTE(review): a truthy ``no_redirect_*`` flag becomes True here and a
    # falsy one becomes None. Confirm that True is really the value that
    # means "do not redirect" for RayParams; the names suggest the opposite.
    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_java=include_java,
        include_webui=include_webui,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        _internal_config=internal_config)

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            # The ports arrive as one comma-separated string.
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        # Head-only settings. ``include_java=False`` is supplied through
        # update_if_absent; presumably it only applies when the value passed
        # to RayParams above was unset — confirm against RayParams.
        ray_params.update_if_absent(
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            autoscaling_config=autoscaling_config,
            include_java=False,
        )

        node = ray.node.Node(ray_params, head=True, shutdown_at_exit=False)
        redis_address = node.redis_address

        # Tell the user how to attach more nodes and drivers to this head.
        # The password fragments are only inserted when a password was given.
        logger.info(
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --redis-address {}{}{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(redis_address=\"{}{}{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop".format(
                redis_address, " --redis-password "
                if redis_password else "", redis_password
                if redis_password else "", redis_address,
                "\", redis_password=\""
                if redis_password else "", redis_password
                if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        # Head-only arguments are rejected one by one.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if include_webui:
            raise Exception("If --head is not passed in, the --include-webui "
                            "flag is not relevant.")
        if include_java is not None:
            raise ValueError("--include-java should only be set for the head "
                             "node.")

        # Expects exactly one ":" in the address; anything else makes the
        # unpacking fail with ValueError.
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_ip_address, int(redis_port), password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(ray_params, head=False, shutdown_at_exit=False)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    " ray stop")

    # Keep this process alive indefinitely when blocking was requested.
    if block:
        import time
        while True:
            time.sleep(30)
def with_head_node_ip(cmds):
    """Prefix each command with an export of ``RAY_HEAD_IP``.

    The IP is looked up once through ``services.get_node_ip_address`` and
    reused for every command.

    Args:
        cmds: Iterable of commands to wrap.

    Returns:
        A new list with one wrapped command string per input command.
    """
    head_ip = services.get_node_ip_address()
    return [
        "export RAY_HEAD_IP={}; {}".format(head_ip, cmd) for cmd in cmds
    ]
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_shard_ports, object_manager_port,
          object_store_memory, num_workers, num_cpus, num_gpus, resources,
          head, no_ui, block, plasma_directory, huge_pages,
          autoscaling_config):
    # Starts Ray processes on this machine, either as the cluster head
    # (``head`` truthy) or as a node joining the cluster behind
    # ``redis_address``. The arguments correspond to the command-line flags
    # named in the error messages below. Progress is reported with print().
    #
    # Raises Exception when ``resources`` is not valid JSON or when an
    # argument does not fit the selected mode, and fails an assertion when
    # "CPU"/"GPU" are given through ``resources``. If ``block`` is truthy the
    # call never returns.

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    # ``resources`` arrives as a JSON string and is replaced by the parsed
    # object. Any parse failure is reported with a usage hint; the bound
    # exception ``e`` is not used.
    try:
        resources = json.loads(resources)
    except Exception as e:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    # CPU and GPU counts have dedicated arguments and are merged in here.
    assert "CPU" not in resources, "Use the --num-cpus argument."
    assert "GPU" not in resources, "Use the --num-gpus argument."
    if num_cpus is not None:
        resources["CPU"] = num_cpus
    if num_gpus is not None:
        resources["GPU"] = num_gpus

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            # The ports arrive as one comma-separated string.
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        address_info = {}
        # Use the provided object manager port if there is one.
        if object_manager_port is not None:
            address_info["object_manager_ports"] = [object_manager_port]
        # An empty dict is passed on as None rather than as {}.
        if address_info == {}:
            address_info = None

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            object_store_memory=object_store_memory,
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            autoscaling_config=autoscaling_config)
        print(address_info)
        # Tell the user how to attach more nodes and drivers to this head.
        print("\nStarted Ray on this node. You can add additional nodes to "
              "the cluster by calling\n\n"
              " ray start --redis-address {}\n\n"
              "from the node you wish to add. You can connect a driver to the "
              "cluster from Python by running\n\n"
              " import ray\n"
              " ray.init(redis_address=\"{}\")\n\n"
              "If you have trouble connecting from a different machine, check "
              "that your firewall is configured properly. If you wish to "
              "terminate the processes that have been started, run\n\n"
              " ray stop".format(address_info["redis_address"],
                                 address_info["redis_address"]))
    else:
        # Start Ray on a non-head node.
        # Head-only arguments are rejected one by one.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        # Expects exactly one ":" in the address; anything else makes the
        # unpacking fail with ValueError.
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_client)
        # Unlike the head branch, the object manager port is wrapped in a list
        # unconditionally, so an unset port is passed on as [None].
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            object_store_memory=object_store_memory,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)
        print("\nStarted Ray on this node. If you wish to terminate the "
              "processes that have been started, run\n\n"
              " ray stop")

    # Keep this process alive indefinitely when blocking was requested.
    if block:
        import time
        while True:
            time.sleep(30)
def __init__(self,

             # --- general
             name,
             log_verbose,
             log_export_freq,
             checkpoint_freq,
             export_hands_freq,
             eval_agent_export_freq,

             # --- env
             game_cls,
             env_bldr_cls,
             start_chips,

             # --- Evaluation
             eval_modes_of_algo,
             eval_stack_sizes,
             module_args,

             # --- Computing
             path_data=None,
             local_crayon_server_docker_address="localhost",
             device_inference="cpu",
             DISTRIBUTED=False,
             CLUSTER=False,
             DEBUGGING=False,
             TESTING=False,

             # --- Only relevant if running distributed
             redis_head_adr=None,  # (str) address of the ray redis server
             ):
    """Store the run configuration and create the data directories.

    Args:
        name (str): All logs, data, and checkpoints appear under this name.
        log_verbose (bool): Whether to log in detail to Tensorboard.
        log_export_freq: Every X iterations, logs are pushed to the Crayon
            Docker container.
        checkpoint_freq (int): Every X iterations, a recoverable copy of the
            training state is made.
        export_hands_freq: Stored on the profile as-is; presumably the
            hand-export interval in iterations (confirm with the consumer).
        eval_agent_export_freq (int): Every X iterations, an EvalAgent
            instance of the algorithm is exported.
        game_cls (PokerEnv subclass): Class (not instance) to train in. Only
            its name and ``DEFAULT_STACK_SIZE`` are read here.
        env_bldr_cls (EnvBuilder subclass): Class (not instance) wrapping the
            environment. Only its name is stored.
        start_chips (int): Standard stack size for all players. Here it only
            selects which default ``eval_stack_sizes`` is used.
        eval_modes_of_algo (tuple): Algorithm-specific EvalAgent eval modes.
        eval_stack_sizes (tuple): Tuple of lists of ints. If None, a single
            default entry is derived: the game's default stack size per seat
            when ``start_chips`` is None, otherwise a deep copy of the env
            args' ``starting_stack_sizes_list``.
        module_args (dict): Argument objects of the individual modules, keyed
            by string. Must contain the key "env".
        path_data: Directory for generated data (e.g. checkpoints). If None,
            a "poker_ai_data" folder under the home directory ("C:\\" on
            Windows) is used.
        local_crayon_server_docker_address: Address of the Crayon docker
            container (default: localhost).
        device_inference (str): "cpu" or "cuda"; converted to a
            ``torch.device`` used for batched NN inference.
        DISTRIBUTED (bool): Whether ray should be used at all. Forced on when
            ``CLUSTER`` is set.
        CLUSTER (bool): If True, runs on many machines; if False, runs on
            local CPUs/GPUs.
        DEBUGGING (bool): Whether to use assert statements for debugging.
        TESTING (bool): If True, data is stored in a "testing" subfolder of
            the data path.
        redis_head_adr (str): Only used if ``CLUSTER`` is True. Address of
            the ray head; defaults to this node's IP on port 6379.
    """
    # The env module args are required; everything else is optional.
    assert "env" in module_args

    # --- general bookkeeping
    self.name = name
    self.log_verbose = log_verbose
    self.log_export_freq = log_export_freq
    self.checkpoint_freq = checkpoint_freq
    self.export_hands_freq = export_hands_freq
    self.eval_agent_export_freq = eval_agent_export_freq
    self.module_args = module_args

    # ``redis_head_adr`` is only set on the instance when running on a
    # cluster; ray is imported lazily and only if the address must be derived.
    if CLUSTER and redis_head_adr:
        self.redis_head_adr = redis_head_adr
    elif CLUSTER:
        from ray import services
        self.redis_head_adr = services.get_node_ip_address() + ":6379"

    self.local_crayon_server_docker_address = local_crayon_server_docker_address

    # --- run-mode flags
    self.DISTRIBUTED = DISTRIBUTED or CLUSTER
    self.CLUSTER = CLUSTER
    self.DEBUGGING = DEBUGGING
    self.TESTING = TESTING
    self.HAVE_GPU = torch.cuda.is_available()

    env_args = self.module_args["env"]
    self.n_seats = env_args.n_seats

    # --- evaluation
    self.eval_modes_of_algo = eval_modes_of_algo
    if eval_stack_sizes is not None:
        assert isinstance(eval_stack_sizes, tuple)
        assert isinstance(eval_stack_sizes[0], list)
        self.eval_stack_sizes = list(eval_stack_sizes)
    elif start_chips is None:
        default_stacks = [game_cls.DEFAULT_STACK_SIZE for _ in range(self.n_seats)]
        self.eval_stack_sizes = [default_stacks]
    else:
        self.eval_stack_sizes = [copy.deepcopy(self.module_args["env"].starting_stack_sizes_list)]

    self.game_cls_str = game_cls.__name__
    self.env_builder_cls_str = env_bldr_cls.__name__

    assert isinstance(device_inference, str), "Please pass a string (either 'cpu' or 'cuda')!"
    self.device_inference = torch.device(device_inference)

    # --- paths
    if path_data is not None:
        data_root = path_data
    else:
        home_dir = "C:\\" if os.name == 'nt' else os.path.expanduser('~/')
        data_root = os.path.join(home_dir, "poker_ai_data")
    if self.TESTING:
        data_root = os.path.join(data_root, "testing")
    self._data_path = data_root

    self.path_agent_export_storage = ospj(self._data_path, "eval_agent")
    self.path_log_storage = ospj(self._data_path, "logs")
    self.path_checkpoint = ospj(self._data_path, "checkpoint")
    self.path_export_hands = ospj(self._data_path, "export_hands")
    self.path_trainingprofiles = ospj(self._data_path, "TrainingProfiles")

    # Create every directory that does not exist yet, root first.
    required_dirs = (
        self._data_path,
        self.path_agent_export_storage,
        self.path_log_storage,
        self.path_checkpoint,
        self.path_export_hands,
        self.path_trainingprofiles,
    )
    for directory in required_dirs:
        if not (os.path.exists(directory) or os.path.isfile(directory)):
            os.makedirs(directory)
def ip(self):
    """Return the address reported by ``ray.services.get_node_ip_address``.

    ray is imported inside the call rather than at module level.
    """
    import ray.services as rservices

    node_ip = rservices.get_node_ip_address()
    return node_ip
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, memory, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head,
          include_dashboard, dashboard_host, dashboard_port, block,
          plasma_directory, autoscaling_config, no_redirect_worker_output,
          no_redirect_output, plasma_store_socket_name, raylet_socket_name,
          temp_dir, java_worker_options, load_code_from_local,
          code_search_path, system_config, lru_evict,
          enable_object_reconstruction, metrics_export_port, log_style,
          log_color, verbose):
    """Start Ray processes manually on the local machine."""
    # Overall flow:
    #   1. validate the flag combination and parse `resources` as JSON,
    #   2. collect the settings into a RayParams object,
    #   3. start a head node (`head` truthy) or a node that joins the
    #      cluster at `address`,
    #   4. print follow-up instructions,
    #   5. if `block` is truthy, poll the started processes and exit with
    #      status 1 as soon as any of them has died.
    cli_logger.configure(log_style, log_color, verbose)

    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can be only assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    # redis_address_ip / redis_address_port are only bound when an address
    # was given; the non-head branch rejects a missing address before it
    # reads them.
    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)

    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        # The hint must itself be valid JSON, including the opening brace.
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='{\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        # NOTE(review): presumably only reached when abort() returns (old
        # logging style) -- confirm against cli_logger.
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    # The no_redirect_* flags map to True when set and to None otherwise.
    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
    if head:
        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold(" ray start --address='{}'{}"), redis_address,
                " --redis-password='******'" if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold(" ray stop"))

        cli_logger.old_info(
            logger,
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --address='{}'{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(address='auto'{})\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop".format(
                redis_address, " --redis-password='******'"
                if redis_password else "",
                ", _redis_password='******'" if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if not (port is None):
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"),
                             cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"),
                             cf.bold("--head"))

            raise ValueError(
                "If --head is not passed in, the --include-dashboard "
                "flag is not relevant.")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))

        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold(" ray stop"))

        cli_logger.old_info(
            logger, "\nStarted Ray on this node. If you wish to terminate the "
            "processes that have been started, run\n\n"
            " ray stop")

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
                "Running subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        # Poll once per second; report every dead process, then exit.
        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                cli_logger.error("Some Ray subprocesses exited unexpectedly:")
                cli_logger.old_error(logger,
                                     "Ray processes died unexpectedly:")

                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})
                        cli_logger.old_error(
                            logger, "\t{} died with exit code {}".format(
                                process_type, process.returncode))

                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                cli_logger.old_error(
                    logger, "Killing remaining processes and exiting...")
                sys.exit(1)
def __init__(self, local_dir, remote_dir, sync_client):
    """Record this node's IP address and delegate to the parent initializer.

    Args:
        local_dir: Forwarded unchanged to the parent ``__init__``.
        remote_dir: Forwarded unchanged to the parent ``__init__``.
        sync_client: Forwarded unchanged to the parent ``__init__``.
    """
    current_node_ip = services.get_node_ip_address()
    # Both attributes are set before the parent initializer runs; worker_ip
    # starts out unset (None).
    self.local_ip, self.worker_ip = current_node_ip, None
    super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
# Note that we redirect stdout and stderr to /dev/null because otherwise # attempts to print may cause exceptions if a process is started inside of an # SSH connection and the SSH connection dies. TODO(rkn): This is a temporary # fix. We should actually redirect stdout and stderr to Redis in some way. if args.head: # Start Ray on the head node. if args.redis_address is not None: raise Exception( "If --head is passed in, a Redis server will be started, so a Redis address should not be provided." ) # Get the node IP address if one is not provided. if args.node_ip_address is None: node_ip_address = services.get_node_ip_address() else: node_ip_address = args.node_ip_address print("Using IP address {} for this node.".format(node_ip_address)) if args.redis_port is not None: address_info = { "redis_address": "{}:{}".format(node_ip_address, args.redis_port) } else: address_info = None address_info = services.start_ray_head(address_info=address_info, node_ip_address=node_ip_address, num_workers=args.num_workers,
def start(node_ip_address, redis_address, address, redis_port, port,
          num_redis_shards, redis_max_clients, redis_password,
          redis_shard_ports, object_manager_port, node_manager_port,
          min_worker_port, max_worker_port, memory, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head, include_webui,
          webui_host, block, plasma_directory, huge_pages, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
          java_worker_options, load_code_from_local, internal_config):
    """Start Ray processes manually on the local machine."""
    # Overall flow:
    #   1. reject or warn about deprecated flags and parse `resources`,
    #   2. collect the settings into a RayParams object,
    #   3. start a head node (`head` truthy) or a node that joins the
    #      cluster at `address`,
    #   4. if `block` is truthy, poll the started processes and exit with
    #      status 1 as soon as any of them has died.
    if redis_address is not None:
        raise DeprecationWarning("The --redis-address argument is "
                                 "deprecated. Please use --address instead.")
    if redis_port is not None:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning("The --redis-port argument will be deprecated soon. "
                       "Please use --port instead.")
        if port is not None and port != redis_port:
            raise ValueError("Cannot specify both --port and --redis-port "
                             "as port is a rename of deprecated redis-port")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    # redis_address_ip / redis_address_port are only bound when an address
    # was given; the non-head branch rejects a missing address before it
    # reads them.
    if redis_address is not None or address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(
             address, redis_address)

    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        " --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    # The no_redirect_* flags map to True when set and to None otherwise.
    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_java=include_java,
        include_webui=include_webui,
        webui_host=webui_host,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        _internal_config=internal_config)
    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        ray_params.update_if_absent(
            redis_port=port or redis_port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            autoscaling_config=autoscaling_config,
            include_java=False,
        )

        node = ray.node.Node(ray_params,
                             head=True,
                             shutdown_at_exit=block,
                             spawn_reaper=block)
        redis_address = node.redis_address

        logger.info(
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --address='{}'{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(address='auto'{})\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop".format(
                redis_address, " --redis-password='******'"
                if redis_password else "",
                ", redis_password='******'" if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if not (redis_port is None and port is None):
            raise Exception(
                "If --head is not passed in, --port and --redis-port are not "
                "allowed.")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if include_webui:
            raise Exception("If --head is not passed in, the --include-webui "
                            "flag is not relevant.")
        if include_java is not None:
            raise ValueError("--include-java should only be set for the head "
                             "node.")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_address_ip,
                                         redis_address_port,
                                         password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address,
                                                    password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(ray_params,
                             head=False,
                             shutdown_at_exit=block,
                             spawn_reaper=block)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    " ray stop")

    if block:
        # Poll once per second; report every dead process, then exit.
        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                logger.error("Ray processes died unexpectedly:")
                for process_type, process in deceased:
                    logger.error("\t{} died with exit code {}".format(
                        process_type, process.returncode))
                # shutdown_at_exit will handle cleanup.
                logger.error("Killing remaining processes and exiting...")
                sys.exit(1)
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, object_manager_port, num_workers, num_cpus,
          num_gpus, num_custom_resource, head, no_ui, block, plasma_directory,
          huge_pages):
    # Starts the Ray processes for a head node (`head` truthy) or for a node
    # that joins the cluster reachable at `redis_address`, then optionally
    # blocks forever.
    #
    # Note that we redirect stdout and stderr to /dev/null because otherwise
    # attempts to print may cause exceptions if a process is started inside of
    # an SSH connection and the SSH connection dies. TODO(rkn): This is a
    # temporary fix. We should actually redirect stdout and stderr to Redis in
    # some way.

    # Resolve hostnames to numerical IP addresses up front.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    if head:
        # A head node starts its own Redis server, so no address may be given.
        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Fall back to the detected address when none was supplied.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        # Pass an address_info dict only when an object manager port was
        # requested; otherwise pass None.
        if object_manager_port is None:
            requested_address_info = None
        else:
            requested_address_info = {
                "object_manager_ports": [object_manager_port]
            }

        address_info = services.start_ray_head(
            address_info=requested_address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            num_custom_resource=num_custom_resource,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)

        started_redis_address = address_info["redis_address"]
        head_banner = (
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --redis-address {}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(redis_address=\"{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop")
        print(head_banner.format(started_redis_address,
                                 started_redis_address))
    else:
        # Start Ray on a non-head node. Head-only flags are rejected in the
        # order listed; the first violated rule raises.
        non_head_rules = (
            (redis_port is not None,
             "If --head is not passed in, --redis-port is not "
             "allowed"),
            (redis_address is None,
             "If --head is not passed in, --redis-address must "
             "be provided."),
            (num_redis_shards is not None,
             "If --head is not passed in, --num-redis-shards "
             "must not be provided."),
            (redis_max_clients is not None,
             "If --head is not passed in, --redis-max-clients "
             "must not be provided."),
            (no_ui,
             "If --head is not passed in, the --no-ui flag is "
             "not relevant."),
        )
        for violated, message in non_head_rules:
            if violated:
                raise Exception(message)

        redis_host, redis_port_text = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_host, int(redis_port_text))

        # Fall back to the address detected relative to the Redis server.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_address)

        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            num_custom_resource=num_custom_resource,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)
        print("\nStarted Ray on this node. If you wish to terminate the "
              "processes that have been started, run\n\n"
              " ray stop")

    if block:
        # Keep this process alive indefinitely, waking every 30 seconds.
        import time
        while True:
            time.sleep(30)
def f():
    """Sleep for 10 ms, then return ``get_node_ip_address()``."""
    time.sleep(0.01)
    node_ip = get_node_ip_address()
    return node_ip
def _train(
    dtrain,
    num_actors,
    params: Dict,
    *args,
    evals=(),
    **kwargs,
):
    """
    Run distributed training of XGBoost model on Ray backend.

    During work it evenly distributes `dtrain` between workers according
    to IP addresses partitions (in case of not even distribution of `dtrain`
    by nodes, part of partitions will be re-distributed between nodes),
    runs xgb.train on each worker for subset of `dtrain` and reduces training
    results of each worker using Rabit Context.

    Parameters
    ----------
    dtrain : modin.experimental.DMatrix
        Data to be trained against.
    num_actors : int or None
        Number of actors for training. If the value is not an int, the
        "default_train" setting of `_get_num_actors` is used instead. The
        number is capped at the number of row partitions in `dtrain`.
    params : dict
        Booster params.
    *args : iterable
        Other parameters for `xgboost.train`.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
        The sequence is not modified.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    dict
        A dictionary with trained booster and dict of
        evaluation results
        as {"booster": xgboost.Booster, "history": dict}.
    """
    s = time.time()

    X_row_parts, y_row_parts = dtrain

    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    num_actors = _get_num_actors(
        num_actors if isinstance(num_actors, int) else "default_train")

    # More actors than row partitions would leave some actors without data.
    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    actors, pg = create_actors(num_actors)

    # Entries that refer to the training data itself are not shipped as
    # separate eval sets; their name is handed to set_train_data instead.
    # A new list is built so that the default tuple (and any tuple passed by
    # a caller) works and the caller's sequence is left untouched.
    add_as_eval_method = None
    remaining_evals = []
    for eval_data, method in evals:
        if eval_data is dtrain:
            add_as_eval_method = method
        else:
            remaining_evals.append((eval_data, method))

    for ((eval_X, eval_y), eval_method) in remaining_evals:
        # Split data across workers
        _split_data_across_actors(
            actors,
            lambda actor, *X_y: actor.add_eval_data.remote(
                *X_y, eval_method=eval_method),
            eval_X,
            y_parts=eval_y,
        )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method),
        X_row_parts,
        y_parts=y_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for actor in actors
        ]
        # All results should be the same because of Rabit tracking. So we just
        # return the first one.
        result = ray.get(fut[0])
        remove_placement_group(pg)
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result