def _enrich_object_store_memory(self, sc, object_store_memory):
    if is_local(sc):
        assert not object_store_memory, \
            "object_store_memory should not be set in Spark local mode"
        return resourceToBytes(self._get_ray_plasma_memory_local())
    else:
        return resourceToBytes(str(object_store_memory)) if object_store_memory else None

def _enrich_object_store_memory(self, sc, object_store_memory):
    if is_local(sc):
        # Fall back to the locally computed plasma store size when unset.
        if self.object_store_memory is None:
            self.object_store_memory = self._get_ray_plasma_memory_local()
        return resourceToBytes(self.object_store_memory)
    else:
        return resourceToBytes(str(object_store_memory)) if object_store_memory else None

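# Both variants above delegate unit parsing to resourceToBytes, which is defined
# elsewhere in this codebase. Below is a minimal sketch of the assumed semantics
# ("10", "512m", "2g" -> bytes); the exact set of supported suffixes is an
# assumption, not the real implementation.
import re


def resourceToBytes(resource_str):
    units = {"b": 1, "k": 1 << 10, "m": 1 << 20, "g": 1 << 30}
    matched = re.match(r"([0-9]+)([bkmg])?$", resource_str.strip().lower())
    if not matched:
        raise ValueError("Unrecognized resource string: {}".format(resource_str))
    value, suffix = matched.groups()
    return int(value) * units[suffix or "b"]
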
def clean_fn(self):
    import ray
    # Disconnect the driver from Ray before killing the raylet processes.
    ray.shutdown()
    if not is_local(self.sc):
        # Cluster mode: run the shutdown closure once per executor.
        self.ray_rdd.map(gen_shutdown_per_node(self.pgids, self.node_ips)).collect()
    else:
        # Local mode: everything runs on the driver node.
        gen_shutdown_per_node(self.pgids, self.node_ips)([])

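# clean_fn relies on gen_shutdown_per_node, a helper defined elsewhere in this
# codebase. A minimal sketch of the assumed contract follows: given the process
# group ids (and optionally the node ip recorded for each pgid), it returns a
# closure that can be mapped over an RDD (one call per executor) or invoked
# directly on the driver, and that kills the matching raylet process groups.
# The ip lookup, signal choice, and error handling below are illustrative
# assumptions, not the real implementation.
import os
import signal
import socket


def gen_shutdown_per_node(pgids, node_ips=None):
    if not isinstance(pgids, list):
        pgids = [pgids]

    def _shutdown_per_node(_):
        if node_ips:
            # Only kill the process groups that were started on this node.
            current_ip = socket.gethostbyname(socket.gethostname())
            effective_pgids = [pgid for pgid, ip in zip(pgids, node_ips) if ip == current_ip]
        else:
            effective_pgids = pgids
        for pgid in effective_pgids:
            print("Stopping by pgid {}".format(pgid))
            try:
                # Kill the whole process group hosting the raylet and its workers.
                os.killpg(pgid, signal.SIGTERM)
            except Exception:
                print("WARNING: cannot kill pgid: {}".format(pgid))

    return _shutdown_per_node
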
def __init__(self, sc, redis_port=None, password="******", object_store_memory=None,
             verbose=False, env=None, local_ray_node_num=2, waiting_time_sec=8,
             extra_params=None):
    """
    RayContext inits a Ray cluster on top of the configuration of the given SparkContext.
    For Spark cluster mode: the number of raylets is equal to the number of executors.
    For Spark local mode: the number of raylets is controlled by local_ray_node_num,
    and the CPU cores for each raylet equal spark_cores / local_ray_node_num.
    :param sc: the SparkContext on top of which to launch Ray.
    :param redis_port: redis port for the "head" node.
           The value would be randomly picked if not specified.
    :param password: [optional] password for the redis.
    :param object_store_memory: memory size for the object_store.
    :param verbose: True for more logs.
    :param env: the environment variable dict for running Ray.
    :param local_ray_node_num: number of raylets to be created.
    :param waiting_time_sec: waiting time for the raylets before connecting to redis.
    :param extra_params: key-value dictionary of extra options to launch Ray,
           i.e. extra_params={"temp-dir": "/tmp/ray2/"}
    """
    self.sc = sc
    self.stopped = False
    self.is_local = is_local(sc)
    self.local_ray_node_num = local_ray_node_num
    self.ray_node_cpu_cores = self._get_ray_node_cpu_cores()
    self.num_ray_nodes = self._get_num_ray_nodes()
    self.python_loc = os.environ['PYSPARK_PYTHON']
    self.ray_processesMonitor = None
    self.verbose = verbose
    self.redis_password = password
    self.object_store_memory = object_store_memory
    self.redis_port = self._new_port() if not redis_port else redis_port
    self.ray_service = RayServiceFuncGenerator(
        python_loc=self.python_loc,
        redis_port=self.redis_port,
        ray_node_cpu_cores=self.ray_node_cpu_cores,
        mkl_cores=self._get_mkl_cores(),
        password=password,
        object_store_memory=self._enrich_object_store_memory(sc, object_store_memory),
        verbose=verbose,
        env=env,
        # Keyword spelling kept to match RayServiceFuncGenerator's signature.
        waitting_time_sec=waiting_time_sec,
        extra_params=extra_params)
    self._gather_cluster_ips()
    from bigdl.util.common import init_executor_gateway
    print("Start to launch the JVM guarding process")
    init_executor_gateway(sc)
    print("JVM guarding process has been successfully launched")

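# A minimal usage sketch for RayContext. Everything outside this excerpt is an
# assumption: the Spark setup shown here, and the init()/stop() methods used to
# launch and tear down the Ray cluster (they are not shown above).
import os
from pyspark import SparkContext

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python"  # __init__ reads this variable

sc = SparkContext(master="local[4]", appName="ray-on-spark")
ray_ctx = RayContext(sc=sc,
                     object_store_memory="2g",  # parsed via resourceToBytes
                     local_ray_node_num=2,      # two raylets in local mode
                     verbose=True)
ray_ctx.init()  # assumed launcher method, not shown in this excerpt
# ... run Ray tasks/actors here ...
ray_ctx.stop()  # assumed teardown method, not shown in this excerpt
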
def clean_fn(self):
    if self.raycontext.stopped:
        return
    import ray
    ray.shutdown()
    if not self.sc:
        print("WARNING: SparkContext has been stopped before cleaning the Ray resources")
    if self.sc and (not is_local(self.sc)):
        self.ray_rdd.map(gen_shutdown_per_node(self.pgids, self.node_ips)).collect()
    else:
        gen_shutdown_per_node(self.pgids, self.node_ips)([])

def __init__(self, process_infos, sc, ray_rdd, verbose=False):
    self.sc = sc
    self.verbose = verbose
    self.ray_rdd = ray_rdd
    self.master = []
    self.slaves = []
    self.pgids = []
    self.node_ips = []
    self.process_infos = process_infos
    for process_info in process_infos:
        self.pgids.append(process_info.pgid)
        self.node_ips.append(process_info.node_ip)
        if process_info.master_addr:
            self.master.append(process_info)
        else:
            self.slaves.append(process_info)
    ProcessMonitor.register_shutdown_hook(extra_close_fn=self.clean_fn)
    assert len(self.master) == 1, \
        "Expected exactly 1 master, but got {}".format(len(self.master))
    self.master = self.master[0]
    if not is_local(self.sc):
        self.print_ray_remote_err_out()