def test_local(self):
    """Start Ray on a 4-core local SparkContext and ask four actors for their hostname.

    The Ray context and the SparkContext are shut down in ``finally`` blocks so
    that a failing assertion or a failing ``ray.get`` does not leave them
    running for whatever executes after this test.
    """
    @ray.remote
    class TestRay:
        def hostname(self):
            import socket
            return socket.gethostname()

    sc = init_spark_on_local(cores=4)
    try:
        ray_ctx = RayContext(sc=sc, object_store_memory="1g")
        address_info = ray_ctx.init()
        # Only stop Ray once init() has returned successfully.
        try:
            assert "object_store_address" in address_info
            actors = [TestRay.remote() for _ in range(4)]
            print(ray.get([actor.hostname.remote() for actor in actors]))
        finally:
            ray_ctx.stop()
    finally:
        sc.stop()
options, args = parser.parse_args(sys.argv)

# Load the input CSV with pandas and stage it on disk as JSON through Spark,
# written with one partition per core.
df = pd.read_csv(options.file_path)
sc = init_spark_on_local(cores="*")
sqlContext = SQLContext(sc)
num_nodes, num_cores = get_node_and_core_number()
df_spark = sqlContext.createDataFrame(df)
df_spark.printSchema()
(
    df_spark.repartition(num_cores)
    .write.format('json')
    .mode("overwrite")
    .save("/tmp/ray-pandas-example")
)

# Bring up Ray on top of the SparkContext.
ray_ctx = RayContext(sc=sc, object_store_memory="5g")
ray_ctx.init(object_store_memory="5g")

# Read the staged JSON back as a data shard.
data_shard = zoo.xshard.pandas.read_json("/tmp/ray-pandas-example", ray_ctx)

# Collect the shard contents and show the head of the first element.
data = data_shard.collect()
print("collected data :")
print(data[0].head())

# Report the partition count, repartition to 2, and fetch the partitions again.
partitions = data_shard.get_partitions()
print("get %d partitions" % len(partitions))
data_shard.repartition(2)
new_partitions = data_shard.get_partitions()
parser.add_argument(
    '-n', '--num_workers',
    type=int,
    default=2,
    help='The number of workers to be launched.',
)
opt = parser.parse_args()

# A hadoop conf selects yarn mode; otherwise run on local Spark with all cores.
if opt.hadoop_conf:
    assert opt.conda_name is not None, "conda_name must be specified for yarn mode"
    sc = init_spark_on_yarn(
        hadoop_conf=opt.hadoop_conf,
        conda_name=opt.conda_name,
        num_executors=opt.num_workers,
        executor_cores=opt.executor_cores,
    )
else:
    sc = init_spark_on_local(cores="*")

ray_ctx = RayContext(sc=sc)
ray_ctx.init()

import pandas as pd

df = pd.read_csv(opt.path)

# Input columns and the regression target.
feature_cols = [
    "FIPS",
    "Lower 95% Confidence Interval",
    "Upper 95% Confidence Interval",
    "Average Annual Count",
    "Recent 5-Year Trend",
]
target_col = "Age-Adjusted Incidence Rate"

# 80/20 train/validation split with a fixed seed.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=2)

config = {
    'random_state': 2,
    'min_child_weight': 3,
    'n_jobs': 2,
}
estimator = AutoXGBoost().regressor(
    feature_cols=feature_cols,
    target_col=target_col,
    config=config,
)
def init_orca_context(cluster_mode="local", cores=2, memory="2g", num_nodes=1,
                      init_ray_on_spark=False, **kwargs):
    """
    Creates or gets a SparkContext for different Spark cluster modes (and launch Ray services
    across the cluster if necessary).

    :param cluster_mode: The mode for the Spark cluster. One of "local", "yarn-client",
           "standalone" and "spark-submit". Default to be "local". The value is
           compared case-insensitively.

           For "spark-submit", you are supposed to use spark-submit to submit the application.
           In this case, please set the Spark configurations through command line options or
           the properties file. You need to use "spark-submit" for yarn-cluster mode.
           To make things easier, you are recommended to use the launching scripts under
           `analytics-zoo/scripts`.

           For other cluster modes, you are recommended to install and run analytics-zoo through
           pip, which is more convenient.
    :param cores: The number of cores to be used on each node. Default to be 2.
    :param memory: The memory allocated for each node. Default to be '2g'.
    :param num_nodes: The number of nodes to be used in the cluster. Default to be 1.
           For Spark local, num_nodes should always be 1 and you don't need to change it.
    :param init_ray_on_spark: Whether to launch Ray services across the cluster.
           Default to be False and in this case the Ray cluster would be launched lazily when
           Ray is involved in Project Orca.
    :param kwargs: The extra keyword arguments used for creating SparkContext and
           launching Ray if any. Only the keys recognised for the selected cluster mode
           (and for RayContext) are forwarded.

    :return: An instance of SparkContext.
    :raises ValueError: If cluster_mode is "yarn-cluster" or is not one of the supported modes.
    :raises AssertionError: If num_nodes != 1 in local mode, if no hadoop conf directory can
            be found for yarn-client mode, or if the detected python is not inside a conda
            environment for yarn-client mode.
    """
    def _forwarded(keys):
        # Keep only the options named in `keys` that the caller actually supplied.
        return {key: kwargs[key] for key in keys if key in kwargs}

    cluster_mode = cluster_mode.lower()
    spark_args = _forwarded(["conf", "spark_log_level", "redirect_spark_log"])

    if cluster_mode == "spark-submit":
        from zoo import init_nncontext
        sc = init_nncontext(**spark_args)
    elif cluster_mode == "local":
        assert num_nodes == 1, "For Spark local mode, num_nodes should be 1"
        os.environ["SPARK_DRIVER_MEMORY"] = memory
        spark_args.update(_forwarded(["python_location"]))
        from zoo import init_spark_on_local
        sc = init_spark_on_local(cores, **spark_args)
    elif cluster_mode.startswith("yarn"):  # yarn or yarn-client
        if cluster_mode == "yarn-cluster":
            raise ValueError(
                'For yarn-cluster mode, please set cluster_mode to "spark-submit" '
                'and submit the application via spark-submit instead')
        # The HADOOP_CONF_DIR environment variable takes precedence over the
        # hadoop_conf keyword argument.
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        if not hadoop_conf:
            assert "hadoop_conf" in kwargs,\
                "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
            hadoop_conf = kwargs["hadoop_conf"]
        from zoo.util.utils import detect_python_location
        # Expected layout: /path/to/conda/envs/conda_name/bin/python
        python_location = detect_python_location()
        assert "envs" in python_location, "You must use a conda environment for yarn-client mode"
        spark_args.update(_forwarded([
            "driver_cores", "driver_memory", "extra_executor_memory_for_ray",
            "extra_python_lib", "penv_archive", "additional_archive",
            "hadoop_user_name", "spark_yarn_archive", "jars"]))
        from zoo import init_spark_on_yarn
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                # Third path component from the end is the conda env name.
                                conda_name=python_location.split("/")[-3],
                                num_executors=num_nodes,
                                executor_cores=cores,
                                executor_memory=memory,
                                **spark_args)
    elif cluster_mode == "standalone":
        spark_args.update(_forwarded([
            "driver_cores", "driver_memory", "extra_executor_memory_for_ray",
            "extra_python_lib", "jars", "master", "enable_numa_binding"]))
        from zoo import init_spark_standalone
        sc = init_spark_standalone(num_executors=num_nodes,
                                   executor_cores=cores,
                                   executor_memory=memory,
                                   **spark_args)
    else:
        # Use a "{}" placeholder: the previous "%s" was never substituted by
        # str.format, so the offending mode was missing from the message.
        raise ValueError(
            "cluster_mode can only be local, yarn-client, standalone or spark-submit, "
            "but got: {}".format(cluster_mode))

    ray_args = _forwarded([
        "redis_port", "password", "object_store_memory", "verbose", "env",
        "extra_params", "num_ray_nodes", "ray_node_cpu_cores"])
    from zoo.ray import RayContext
    # The RayContext is always constructed; Ray itself is only started here
    # when init_ray_on_spark is set.
    ray_ctx = RayContext(sc, **ray_args)
    if init_ray_on_spark:
        # driver_cores defaults to 0 when the caller does not supply it.
        ray_ctx.init(driver_cores=kwargs.get("driver_cores", 0))
    return sc
num_executor=slave_num, executor_cores=28, executor_memory="10g", driver_memory="2g", driver_cores=4, extra_executor_memory_for_ray="30g", spark_conf={"hello": "world"}) ray_ctx = RayContext(sc=sc, object_store_memory="25g", extra_params={"temp-dir": "/tmp/hello/"}, env={ "http_proxy": "http://child-prc.intel.com:913", "http_proxys": "http://child-prc.intel.com:913" }) ray_ctx.init(object_store_memory="2g", num_cores=0, labels="", extra_params={}) @ray.remote class TestRay(): def hostname(self): import socket return socket.gethostname() def check_cv2(self): # conda install -c conda-forge opencv==3.4.2 import cv2 return cv2.__version__ def ip(self): import ray.services as rservices