예제 #1
0
    parser.add_argument('-n',
                        '--num_workers',
                        type=int,
                        default=2,
                        help='The number of workers to be launched.')
    parser.add_argument('-m',
                        '--mode',
                        type=str,
                        default='gridrandom',
                        choices=['gridrandom', 'skopt'],
                        help='The search algorithm to use.')
    opt = parser.parse_args()
    if opt.hadoop_conf:
        assert opt.conda_name is not None, "conda_name must be specified for yarn mode"
        sc = init_spark_on_yarn(hadoop_conf=opt.hadoop_conf,
                                conda_name=opt.conda_name,
                                num_executors=opt.num_workers,
                                executor_cores=opt.executor_cores)
    else:
        sc = init_spark_on_local(cores="*")
    ray_ctx = RayContext(sc=sc)
    ray_ctx.init()

    input_cols = [
        "Year",
        "Month",
        "DayofMonth",
        "DayofWeek",
        "CRSDepTime",
        "CRSArrTime",
        "UniqueCarrier",
        "FlightNum",
예제 #2
0
        weights = ray.get(ps.pull.remote(keys))
        net.set_weights(keys, weights)
        # Compute an update and push it to the parameter server.
        xs, ys = mnist.train.next_batch(batch_size)
        gradients = net.compute_update(xs, ys)
        ps.push.remote(keys, gradients)


if __name__ == "__main__":
    args = parser.parse_args()
    if args.hadoop_conf:
        sc = init_spark_on_yarn(
            hadoop_conf=args.hadoop_conf,
            conda_name=args.conda_name,
            num_executor=args.num_workers,
            executor_cores=args.executor_cores,
            executor_memory=args.executor_memory,
            driver_memory=args.driver_memory,
            driver_cores=args.driver_cores,
            extra_executor_memory_for_ray=args.extra_executor_memory_for_ray)
        ray_ctx = RayContext(sc=sc,
                             object_store_memory=args.object_store_memory)
    else:
        sc = init_spark_on_local(cores=args.driver_cores)
        ray_ctx = RayContext(sc=sc,
                             object_store_memory=args.object_store_memory)
    ray_ctx.init()

    # Create a parameter server with some random weights.
    net = model.SimpleCNN()
    all_keys, all_values = net.get_weights()
예제 #3
0
                        default=10,
                        type=int,
                        help="The number of rollouts to do per batch.")
    parser.add_argument("--iterations",
                        default=-1,
                        type=int,
                        help="The number of model updates to perform. By "
                        "default, training will not terminate.")

    args = parser.parse_args()
    if args.hadoop_conf:
        slave_num = 2
        sc = init_spark_on_yarn(hadoop_conf=args.hadoop_conf,
                                conda_name="ray36",
                                num_executor=slave_num,
                                executor_cores=28,
                                executor_memory="10g",
                                driver_memory="2g",
                                driver_cores=4,
                                extra_executor_memory_for_ray="30g")
        ray_ctx = RayContext(sc=sc, object_store_memory="25g")
    else:
        sc = init_spark_on_local(cores=4)
        ray_ctx = RayContext(sc=sc)
    ray_ctx.init()

    batch_size = args.batch_size
    # Run the reinforcement learning.
    running_reward = None
    batch_num = 1
    model = {}
    # "Xavier" initialization.
예제 #4
0
def init_orca_context(cluster_mode="local", cores=2, memory="2g", num_nodes=1,
                      init_ray_on_spark=False, **kwargs):
    """
    Creates or gets a SparkContext for different Spark cluster modes (and launch Ray services
    across the cluster if necessary).

    stop_orca_context is registered with atexit on every call, before cluster_mode is checked.

    :param cluster_mode: The mode for the Spark cluster. One of "local", "yarn-client",
           "k8s-client", "standalone" and "spark-submit". Default to be "local".
           The value is lower-cased before it is compared.

           For "spark-submit", you are supposed to use spark-submit to submit the application.
           In this case, please set the Spark configurations through command line options or
           the properties file. You need to use "spark-submit" for yarn-cluster or k8s-cluster mode.
           To make things easier, you are recommended to use the launch scripts we provide:
           https://github.com/intel-analytics/analytics-zoo/tree/master/scripts.

           For other cluster modes, you are recommended to install and run analytics-zoo through
           pip, which is more convenient.
    :param cores: The number of cores to be used on each node. Default to be 2.
    :param memory: The memory allocated for each node. Default to be '2g'.
    :param num_nodes: The number of nodes to be used in the cluster. Default to be 1.
           For Spark local, num_nodes should always be 1 and you don't need to change it.
    :param init_ray_on_spark: Whether to launch Ray services across the cluster.
           Default to be False and in this case the Ray cluster would be launched lazily when
           Ray is involved in Project Orca.
    :param kwargs: The extra keyword arguments used for creating SparkContext and
           launching Ray if any. Only the keys recognized for the selected cluster mode (and
           the Ray related keys) are forwarded; any other key is silently ignored.

    :return: An instance of SparkContext.
    :raises ValueError: If cluster_mode is "yarn-cluster" or "k8s-cluster" (use "spark-submit"
            instead), or if it is not one of the supported modes.
    :raises AssertionError: If num_nodes is not 1 in local mode, if no hadoop conf directory
            can be found or the interpreter path does not contain "envs" in yarn mode, or if
            master/container_image is missing in k8s mode.
    """
    print("Initializing orca context")
    import atexit
    atexit.register(stop_orca_context)
    cluster_mode = cluster_mode.lower()

    def _select(*keys):
        # Pick the listed options out of kwargs; keys the caller did not pass are not
        # forwarded at all.
        return {key: kwargs[key] for key in keys if key in kwargs}

    # Options accepted by every cluster mode.
    spark_args = _select("conf", "spark_log_level", "redirect_spark_log")
    if cluster_mode == "spark-submit":
        from zoo import init_nncontext
        sc = init_nncontext(**spark_args)
    elif cluster_mode == "local":
        assert num_nodes == 1, "For Spark local mode, num_nodes should be 1"
        os.environ["SPARK_DRIVER_MEMORY"] = memory
        spark_args.update(_select("python_location"))
        from zoo import init_spark_on_local
        sc = init_spark_on_local(cores, **spark_args)
    elif cluster_mode.startswith("yarn"):  # yarn or yarn-client
        if cluster_mode == "yarn-cluster":
            raise ValueError('For yarn-cluster mode, please set cluster_mode to "spark-submit" '
                             'and submit the application via spark-submit instead')
        # The environment variable takes precedence over the hadoop_conf keyword argument.
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        if not hadoop_conf:
            assert "hadoop_conf" in kwargs,\
                "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
            hadoop_conf = kwargs["hadoop_conf"]
        from zoo.util.utils import detect_python_location
        python_location = detect_python_location()  # /path/to/conda/envs/conda_name/bin/python
        assert "envs" in python_location, "You must use a conda environment for yarn-client mode"
        spark_args.update(_select("driver_cores", "driver_memory",
                                  "extra_executor_memory_for_ray", "extra_python_lib",
                                  "penv_archive", "additional_archive", "hadoop_user_name",
                                  "spark_yarn_archive", "jars"))
        from zoo import init_spark_on_yarn
        # The conda environment name is the third path component from the end of the
        # interpreter path (see the layout noted above).
        sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                conda_name=python_location.split("/")[-3],
                                num_executors=num_nodes, executor_cores=cores,
                                executor_memory=memory, **spark_args)
    elif cluster_mode.startswith("k8s"):  # k8s or k8s-client
        if cluster_mode == "k8s-cluster":
            raise ValueError('For k8s-cluster mode, please set cluster_mode to "spark-submit" '
                             'and submit the application via spark-submit instead')
        assert "master" in kwargs, "Please specify master for k8s-client mode"
        assert "container_image" in kwargs, "Please specify container_image for k8s-client mode"
        spark_args.update(_select("driver_cores", "driver_memory",
                                  "extra_executor_memory_for_ray", "extra_python_lib", "jars",
                                  "python_location"))
        from zoo import init_spark_on_k8s
        sc = init_spark_on_k8s(master=kwargs["master"],
                               container_image=kwargs["container_image"],
                               num_executors=num_nodes, executor_cores=cores,
                               executor_memory=memory, **spark_args)
    elif cluster_mode == "standalone":
        spark_args.update(_select("driver_cores", "driver_memory",
                                  "extra_executor_memory_for_ray", "extra_python_lib", "jars",
                                  "master", "python_location", "enable_numa_binding"))
        from zoo import init_spark_standalone
        sc = init_spark_standalone(num_executors=num_nodes, executor_cores=cores,
                                   executor_memory=memory, **spark_args)
    else:
        # Use a "{}" placeholder: the previous "%s" was never substituted by str.format, so
        # the offending value did not appear in the message.
        raise ValueError("cluster_mode can only be local, yarn-client, k8s-client, standalone "
                         "or spark-submit, but got: {}".format(cluster_mode))
    ray_args = _select("redis_port", "password", "object_store_memory", "verbose", "env",
                       "extra_params", "num_ray_nodes", "ray_node_cpu_cores")
    from zoo.ray import RayContext
    # The RayContext is always constructed; Ray is only started here when init_ray_on_spark
    # is set.
    ray_ctx = RayContext(sc, **ray_args)
    if init_ray_on_spark:
        # 0 is the default value for driver_cores when the caller does not pass one.
        ray_ctx.init(driver_cores=kwargs.get("driver_cores", 0))
    return sc
np.random.seed(1337)  # for reproducibility


@ray.remote
class TestRay():
    """Remote class whose only job is to report the host it is running on."""

    def hostname(self):
        """Return the result of socket.gethostname() in the process executing this call."""
        from socket import gethostname
        return gethostname()


# Number of executors requested from YARN; the same count of TestRay actors is created below.
node_num = 4
sc = init_spark_on_yarn(
    hadoop_conf="/opt/work/hadoop-2.7.2/etc/hadoop/",
    conda_name="rayexample",
    num_executor=node_num,
    executor_cores=28,
    executor_memory="10g",
    driver_memory="2g",
    driver_cores=4,
    extra_executor_memory_for_ray="30g")

# Start and stop Ray twice on the same SparkContext, with a different object store size on
# each round, printing the host name reported by every actor.
for store_memory in ("2g", "1g"):
    ray_ctx = RayContext(sc=sc, object_store_memory=store_memory)
    ray_ctx.init()
    actors = [TestRay.remote() for _ in range(node_num)]
    hostname_refs = [actor.hostname.remote() for actor in actors]
    print(ray.get(hostname_refs))
    ray_ctx.stop()
예제 #6
0
# See the License for the specific language governing permissions and
# limitations under the License.
#

import ray

from zoo import init_spark_on_yarn
from zoo.ray import RayContext

# Number of Spark executors requested from YARN.
slave_num = 2

# Create the SparkContext on YARN; the cluster paths and sizes are hard-coded for this script.
sc = init_spark_on_yarn(hadoop_conf="/opt/work/almaren-yarn-config/",
                        conda_name="ray_train",
                        num_executors=slave_num,
                        executor_cores=28,
                        executor_memory="10g",
                        driver_memory="2g",
                        driver_cores=4,
                        extra_executor_memory_for_ray="30g",
                        conf={"hello": "world"})

# NOTE(review): env is presumably exported as environment variables to the Ray processes;
# confirm against RayContext.
ray_ctx = RayContext(sc=sc,
                     object_store_memory="25g",
                     extra_params={"temp-dir": "/tmp/hello/"},
                     env={
                         "http_proxy": "http://child-prc.intel.com:913",
                         # "https_proxy" is the conventional name for the HTTPS proxy
                         # variable; the earlier key "http_proxys" is not a name that
                         # proxy-aware tools look up.
                         "https_proxy": "http://child-prc.intel.com:913"
                     })
ray_ctx.init()

예제 #7
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import ray

from zoo import init_spark_on_yarn
from zoo.ray.util.raycontext import RayContext

# Number of Spark executors requested from YARN.
slave_num = 2

# Create the SparkContext on YARN; the cluster paths and sizes are hard-coded for this script.
sc = init_spark_on_yarn(hadoop_conf="/opt/work/almaren-yarn-config/",
                        conda_name="ray36-dev",
                        num_executor=slave_num,
                        executor_cores=28,
                        executor_memory="10g",
                        driver_memory="2g",
                        driver_cores=4,
                        extra_executor_memory_for_ray="30g")

# NOTE(review): env is presumably exported as environment variables to the Ray processes;
# confirm against RayContext.
ray_ctx = RayContext(sc=sc,
                     object_store_memory="25g",
                     env={
                         "http_proxy": "http://child-prc.intel.com:913",
                         # "https_proxy" is the conventional name for the HTTPS proxy
                         # variable; the earlier key "http_proxys" is not a name that
                         # proxy-aware tools look up.
                         "https_proxy": "http://child-prc.intel.com:913"
                     })
ray_ctx.init()


@ray.remote
class TestRay():
예제 #8
0
        help="Enables GPU training")
    parser.add_argument(
        "--tune", action="store_true", default=False, help="Tune training")

    args, _ = parser.parse_known_args()
    import ray

    #ray.init(redis_address=args.redis_address)
    if args.hadoop_conf:
        slave_num = args.num_replicas
        print("Slave num : " + str(slave_num))
        sc = init_spark_on_yarn(
            hadoop_conf=args.hadoop_conf,
            conda_name="rayexample",
            num_executor=slave_num,
            executor_cores=88,#88
            executor_memory="10g",
            driver_memory="5g",
            driver_cores=4,
            extra_executor_memory_for_ray="10g")
        print("Init spark success!")
        ray_ctx = RayContext(sc=sc,object_store_memory="10g")
        print("RayContext success!")
        ray_ctx.init()
    else:
        # sc = init_spark_on_local(cores=22)
        # ray_ctx = RayContext(sc=sc)
        ray.init()

    print("ray init")
    t_s = time.time()