Example #1
    def test_local(self):
        @ray.remote
        class TestRay:
            def hostname(self):
                import socket
                return socket.gethostname()

        # Start Spark locally with 4 cores and launch Ray on top of it.
        sc = init_spark_on_local(cores=4)
        ray_ctx = RayContext(sc=sc, object_store_memory="1g")
        address_info = ray_ctx.init()
        assert "object_store_address" in address_info
        # Create four remote actors and collect the hostname each one runs on.
        actors = [TestRay.remote() for _ in range(4)]
        print(ray.get([actor.hostname.remote() for actor in actors]))
        ray_ctx.stop()
        sc.stop()
Example #2
    (options, args) = parser.parse_args(sys.argv)

    # Prepare csv files
    df = pd.read_csv(options.file_path)
    sc = init_spark_on_local(cores="*")
    sqlContext = SQLContext(sc)
    num_nodes, num_cores = get_node_and_core_number()
    df_spark = sqlContext.createDataFrame(df)
    df_spark.printSchema()
    df_spark.repartition(num_cores).write.\
        format('json').mode("overwrite").save("/tmp/ray-pandas-example")

    # init ray context
    ray_ctx = RayContext(sc=sc, object_store_memory="5g")
    ray_ctx.init(object_store_memory="5g")

    # read data
    data_shard = zoo.xshard.pandas.read_json("/tmp/ray-pandas-example",
                                             ray_ctx)

    # collect data
    data = data_shard.collect()
    print("collected data :")
    print(data[0].head())

    # repartition
    partitions = data_shard.get_partitions()
    print("get %d partitions" % len(partitions))
    data_shard.repartition(2)
    new_partitions = data_shard.get_partitions()
Example #3
    parser.add_argument('-n',
                        '--num_workers',
                        type=int,
                        default=2,
                        help='The number of workers to be launched.')
    opt = parser.parse_args()
    if opt.hadoop_conf:
        assert opt.conda_name is not None, "conda_name must be specified for yarn mode"
        sc = init_spark_on_yarn(hadoop_conf=opt.hadoop_conf,
                                conda_name=opt.conda_name,
                                num_executors=opt.num_workers,
                                executor_cores=opt.executor_cores)
    else:
        sc = init_spark_on_local(cores="*")
    # Launch Ray on top of the Spark cluster.
    ray_ctx = RayContext(sc=sc)
    ray_ctx.init()

    import pandas as pd
    df = pd.read_csv(opt.path)
    feature_cols = [
        "FIPS", "Lower 95% Confidence Interval",
        "Upper 95% Confidence Interval", "Average Annual Count",
        "Recent 5-Year Trend"
    ]
    target_col = "Age-Adjusted Incidence Rate"
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=2)

    config = {'random_state': 2, 'min_child_weight': 3, 'n_jobs': 2}
    estimator = AutoXGBoost().regressor(feature_cols=feature_cols,
                                        target_col=target_col,
                                        config=config)
Example #4
def init_orca_context(cluster_mode="local",
                      cores=2,
                      memory="2g",
                      num_nodes=1,
                      init_ray_on_spark=False,
                      **kwargs):
    """
    Creates or gets a SparkContext for different Spark cluster modes (and launches Ray services
    across the cluster if necessary).

    :param cluster_mode: The mode of the Spark cluster. One of "local", "yarn-client",
           "standalone" and "spark-submit". Defaults to "local".

           For "spark-submit", you are expected to submit the application with spark-submit.
           In this case, please set the Spark configurations through command-line options or
           the properties file. You need to use "spark-submit" for yarn-cluster mode.
           To make things easier, it is recommended to use the launching scripts under
           `analytics-zoo/scripts`.

           For the other cluster modes, it is recommended to install and run analytics-zoo
           through pip, which is more convenient.
    :param cores: The number of cores to be used on each node. Defaults to 2.
    :param memory: The memory allocated for each node. Defaults to '2g'.
    :param num_nodes: The number of nodes to be used in the cluster. Defaults to 1.
           For Spark local mode, num_nodes should always be 1 and you don't need to change it.
    :param init_ray_on_spark: Whether to launch Ray services across the cluster.
           Defaults to False, in which case the Ray cluster will be launched lazily when
           Ray is first used in Project Orca.
    :param kwargs: Extra keyword arguments for creating the SparkContext and
           launching Ray, if any.

    :return: An instance of SparkContext.
    """
    cluster_mode = cluster_mode.lower()
    spark_args = {}
    for key in ["conf", "spark_log_level", "redirect_spark_log"]:
        if key in kwargs:
            spark_args[key] = kwargs[key]
    if cluster_mode == "spark-submit":
        from zoo import init_nncontext
        sc = init_nncontext(**spark_args)
    elif cluster_mode == "local":
        assert num_nodes == 1, "For Spark local mode, num_nodes should be 1"
        os.environ["SPARK_DRIVER_MEMORY"] = memory
        if "python_location" in kwargs:
            spark_args["python_location"] = kwargs["python_location"]
        from zoo import init_spark_on_local
        sc = init_spark_on_local(cores, **spark_args)
    elif cluster_mode.startswith("yarn"):  # yarn or yarn-client
        if cluster_mode == "yarn-cluster":
            raise ValueError(
                'For yarn-cluster mode, please set cluster_mode to "spark-submit" '
                'and submit the application via spark-submit instead')
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        if not hadoop_conf:
            assert "hadoop_conf" in kwargs,\
                "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
            hadoop_conf = kwargs["hadoop_conf"]
        from zoo.util.utils import detect_python_location
        # e.g. /path/to/conda/envs/conda_name/bin/python
        python_location = detect_python_location()
        assert "envs" in python_location, "You must use a conda environment for yarn-client mode"
        for key in [
                "driver_cores", "driver_memory",
                "extra_executor_memory_for_ray", "extra_python_lib",
                "penv_archive", "additional_archive", "hadoop_user_name",
                "spark_yarn_archive", "jars"
        ]:
            if key in kwargs:
                spark_args[key] = kwargs[key]
        from zoo import init_spark_on_yarn
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                conda_name=python_location.split("/")[-3],
                                num_executors=num_nodes,
                                executor_cores=cores,
                                executor_memory=memory,
                                **spark_args)
    elif cluster_mode == "standalone":
        for key in [
                "driver_cores", "driver_memory",
                "extra_executor_memory_for_ray", "extra_python_lib", "jars",
                "master", "enable_numa_binding"
        ]:
            if key in kwargs:
                spark_args[key] = kwargs[key]
        from zoo import init_spark_standalone
        sc = init_spark_standalone(num_executors=num_nodes,
                                   executor_cores=cores,
                                   executor_memory=memory,
                                   **spark_args)
    else:
        raise ValueError(
            "cluster_mode can only be local, yarn-client, standalone or spark-submit, "
            "but got: %s".format(cluster_mode))
    ray_args = {}
    for key in [
            "redis_port", "password", "object_store_memory", "verbose", "env",
            "extra_params", "num_ray_nodes", "ray_node_cpu_cores"
    ]:
        if key in kwargs:
            ray_args[key] = kwargs[key]
    from zoo.ray import RayContext
    ray_ctx = RayContext(sc, **ray_args)
    if init_ray_on_spark:
        driver_cores = 0  # This is the default value.
        if "driver_cores" in kwargs:
            driver_cores = kwargs["driver_cores"]
        ray_ctx.init(driver_cores=driver_cores)
    return sc
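
A minimal usage sketch of init_orca_context for reference; the cluster mode, resource
sizes and the extra keyword arguments (hadoop_conf, object_store_memory) shown here are
illustrative assumptions rather than values taken from the examples above:

# Illustrative only: start Spark on YARN with 2 executors and launch Ray eagerly.
# hadoop_conf and object_store_memory are forwarded through **kwargs as documented above.
sc = init_orca_context(cluster_mode="yarn-client",
                       cores=4,
                       memory="10g",
                       num_nodes=2,
                       init_ray_on_spark=True,
                       hadoop_conf="/path/to/hadoop/conf",
                       object_store_memory="4g")
# ... run Spark / Ray workloads with sc ...
sc.stop()
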
Example #5
                        num_executor=slave_num,
                        executor_cores=28,
                        executor_memory="10g",
                        driver_memory="2g",
                        driver_cores=4,
                        extra_executor_memory_for_ray="30g",
                        spark_conf={"hello": "world"})

ray_ctx = RayContext(sc=sc,
                     object_store_memory="25g",
                     extra_params={"temp-dir": "/tmp/hello/"},
                     env={
                         "http_proxy": "http://child-prc.intel.com:913",
                         "http_proxys": "http://child-prc.intel.com:913"
                     })
ray_ctx.init(object_store_memory="2g", num_cores=0, labels="", extra_params={})


@ray.remote
class TestRay():
    def hostname(self):
        import socket
        return socket.gethostname()

    def check_cv2(self):
        # conda install -c conda-forge opencv==3.4.2
        import cv2
        return cv2.__version__

    def ip(self):
        import ray.services as rservices