Example #1
    def train_model(self,
                    num_epochs=300,
                    num_workers=1,
                    early_stop=False,
                    tenacity=10):
        if num_workers == 1:
            return self.train_model_local(num_epochs=num_epochs,
                                          early_stop=early_stop,
                                          tenacity=tenacity)
        else:
            from bigdl.chronos.model.tcmf.local_model_distributed_trainer import\
                train_yseq_hvd
            import ray

            # Check whether there is already an active ray context.
            from bigdl.orca.ray import RayContext
            ray_ctx = RayContext.get()
            Ymat_id = ray.put(self.Ymat)
            covariates_id = ray.put(self.covariates)
            Ycov_id = ray.put(self.Ycov)
            trainer_config_keys = [
                "vbsize", "hbsize", "end_index", "val_len", "lr", "num_inputs",
                "num_channels", "kernel_size", "dropout"
            ]
            trainer_config = {k: self.__dict__[k] for k in trainer_config_keys}
            model, val_loss = train_yseq_hvd(epochs=num_epochs,
                                             workers_per_node=num_workers //
                                             ray_ctx.num_ray_nodes,
                                             Ymat_id=Ymat_id,
                                             covariates_id=covariates_id,
                                             Ycov_id=Ycov_id,
                                             **trainer_config)
            self.seq = model
            return val_loss
    def test_parquet_images_training(self):
        from bigdl.orca.learn.tf2 import Estimator
        temp_dir = tempfile.mkdtemp()
        try:
            ParquetDataset.write("file://" + temp_dir, images_generator(),
                                 images_schema)
            path = "file://" + temp_dir
            output_types = {
                "id": tf.string,
                "image": tf.string,
                "label": tf.float32
            }
            output_shapes = {"id": (), "image": (), "label": ()}

            def data_creator(config, batch_size):
                dataset = read_parquet("tf_dataset",
                                       path=path,
                                       output_types=output_types,
                                       output_shapes=output_shapes)
                dataset = dataset.shuffle(10)
                dataset = dataset.map(lambda data_dict:
                                      (data_dict["image"], data_dict["label"]))
                dataset = dataset.map(parse_data_train)
                dataset = dataset.batch(batch_size)
                return dataset

            ray_ctx = RayContext.get()
            trainer = Estimator.from_keras(model_creator=model_creator)
            trainer.fit(data=data_creator, epochs=1, batch_size=2)
        finally:
            shutil.rmtree(temp_dir)
Example #3
 def get_default_num_workers():
     from bigdl.orca.ray import RayContext
     try:
         ray_ctx = RayContext.get(initialize=False)
         num_workers = ray_ctx.num_ray_nodes
     except Exception:
         num_workers = 1
     return num_workers
Example #4
    def predict(self, x=None, horizon=24, mc=False,
                future_covariates=None,
                future_dti=None,
                num_workers=None):
        """
        Predict horizon time-points ahead the input x in fit_eval
        :param x: We don't support input x currently.
        :param horizon: horizon length to predict
        :param mc:
        :param future_covariates: covariates corresponding to future horizon steps data to predict.
        :param future_dti: dti corresponding to future horizon steps data to predict.
        :param num_workers: the number of workers to use. Note that there has to be an activate
            RayContext if num_workers > 1.
        :return:
        """
        if x is not None:
            raise ValueError("We don't support input x directly.")
        if self.model is None:
            raise Exception("Needs to call fit_eval or restore first before calling predict")
        self._check_covariates_dti(covariates=future_covariates, dti=future_dti, ts_len=horizon,
                                   method_name="predict")
        if num_workers is None:
            num_workers = TCMF.get_default_num_workers()
        if num_workers > 1:
            import ray
            from bigdl.orca.ray import RayContext
            try:
                RayContext.get(initialize=False)
            except Exception:
                try:
                    # detect whether ray has been started.
                    ray.put(None)
                except Exception:
                    raise RuntimeError(f"There must be an active ray context while running with "
                                       f"{num_workers} workers. You can either start and init a "
                                       f"RayContext by init_orca_context(..., init_ray_on_spark="
                                       f"True) or start Ray with ray.init()")

        out = self.model.predict_horizon(
            future=horizon,
            bsize=90,
            num_workers=num_workers,
            future_covariates=future_covariates,
            future_dti=future_dti,
        )
        return out[:, -horizon::]
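A minimal usage sketch for the distributed path described in the docstring above, assuming a fitted TCMF instance named model and a local cluster started through init_orca_context (both hypothetical here):

from bigdl.orca import init_orca_context, stop_orca_context

sc = init_orca_context(cluster_mode="local", cores=4,
                       init_ray_on_spark=True)  # start Ray so that num_workers > 1 is allowed
try:
    # `model` is assumed to be a fitted TCMF model (via fit_eval or restore).
    forecast = model.predict(horizon=24, num_workers=2)
finally:
    stop_orca_context()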
Example #5
    def test_local(self):
        @ray.remote
        class TestRay:
            def hostname(self):
                import socket
                return socket.gethostname()

        sc = init_spark_on_local(cores=8)
        ray_ctx = RayContext(sc=sc,
                             object_store_memory="1g",
                             ray_node_cpu_cores=4)
        address_info = ray_ctx.init()
        assert "object_store_address" in address_info
        actors = [TestRay.remote() for i in range(0, 4)]
        print(ray.get([actor.hostname.remote() for actor in actors]))
        ray_ctx.stop()
        sc.stop()
Example #6
    def fit(
        self,
        input_df,
        validation_df=None,
        metric="mse",
        recipe=SmokeRecipe(),
        mc=False,
        resources_per_trial={"cpu": 2},
        upload_dir=None,
    ):
        """
        Trains the model for time sequence prediction.
        If future sequence length > 1, use seq2seq model, else use vanilla LSTM model.
        :param input_df: The input time series data frame. Example:
            datetime    value  "extra feature 1"  "extra feature 2"
            2019-01-01  1.9    1                  2
            2019-01-02  2.3    0                  2
        :param validation_df: validation data
        :param metric: String. Metric used for training and validation. Available values are
                       "mean_squared_error" or "r_square".
        :param recipe: a Recipe object. Various recipes cover different search spaces and stopping
                      criteria. Default is SmokeRecipe().
        :param resources_per_trial: Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``
        :param upload_dir: Optional URI to sync training results and checkpoints. We only support
            hdfs URIs for now. It defaults to
            "hdfs:///user/{hadoop_user_name}/ray_checkpoints/{predictor_name}",
            where hadoop_user_name is specified in init_orca_context or init_spark_on_yarn
            (which defaults to "root"), and predictor_name is the name used in predictor
            instantiation.
        :return: a pipeline constructed with the best model and configs.
        """
        self._check_df(input_df)
        if validation_df is not None:
            self._check_df(validation_df)

        ray_ctx = RayContext.get()
        is_local = ray_ctx.is_local
        # BasePredictor._check_fit_metric(metric)
        if not is_local:
            if not upload_dir:
                hadoop_user_name = os.getenv("HADOOP_USER_NAME")
                upload_dir = os.path.join(os.sep, "user", hadoop_user_name,
                                          "ray_checkpoints", self.name)
            cmd = "hadoop fs -mkdir -p {}".format(upload_dir)
            process(cmd)
        else:
            upload_dir = None

        self.pipeline = self._hp_search(
            input_df,
            validation_df=validation_df,
            metric=metric,
            recipe=recipe,
            mc=mc,
            resources_per_trial=resources_per_trial,
            remote_dir=upload_dir)
        return self.pipeline
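A minimal sketch of an input_df matching the layout described in the docstring above (pandas assumed; the extra feature columns and values are illustrative):

import pandas as pd

input_df = pd.DataFrame({
    "datetime": pd.date_range("2019-01-01", periods=3, freq="D"),
    "value": [1.9, 2.3, 2.1],
    "extra feature 1": [1, 0, 1],
    "extra feature 2": [2, 2, 3],
})
# pipeline = predictor.fit(input_df, metric="mse")  # `predictor` is a hypothetical instance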
 def get_default_remote_dir(name):
     from bigdl.orca.ray import RayContext
     from bigdl.orca.automl.search.utils import process
     ray_ctx = RayContext.get()
     if ray_ctx.is_local:
         return None
     else:
         default_remote_dir = f"hdfs:///tmp/{name}"
         process(command=f"hadoop fs -mkdir -p {default_remote_dir}")
         return default_remote_dir
    def test_horovod_learning_rate_schedule(self):
        import horovod
        major, minor, patch = horovod.__version__.split(".")

        larger_major = int(major) > 0
        larger_minor = int(major) == 0 and int(minor) > 19
        larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2

        if larger_major or larger_minor or larger_patch:
            ray_ctx = RayContext.get()
            batch_size = 32
            workers_per_node = 4
            global_batch_size = batch_size * workers_per_node
            config = {"lr": 0.8}
            trainer = Estimator.from_keras(model_creator=simple_model,
                                           compile_args_creator=compile_args,
                                           verbose=True,
                                           config=config,
                                           backend="horovod",
                                           workers_per_node=workers_per_node)
            import horovod.tensorflow.keras as hvd
            callbacks = [
                hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                         initial_lr=0.4,
                                                         verbose=True),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=5,
                                                           end_epoch=10,
                                                           multiplier=1.,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=10,
                                                           end_epoch=15,
                                                           multiplier=1e-1,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=15,
                                                           end_epoch=20,
                                                           multiplier=1e-2,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=20,
                                                           multiplier=1e-3,
                                                           initial_lr=0.4),
                LRChecker()
            ]
            for i in range(30):
                trainer.fit(create_train_datasets,
                            epochs=1,
                            batch_size=global_batch_size,
                            callbacks=callbacks)
        else:
            # skip the test on older horovod versions
            pass
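The major/minor/patch comparison above gates the test on horovod >= 0.19.2. An equivalent, more compact guard is sketched below with packaging.version, the same utility used elsewhere in this section to compare ray versions (assuming the packaging module is available):

from packaging import version
import horovod

# Equivalent to the manual tuple comparison above.
run_schedule_test = version.parse(horovod.__version__) >= version.parse("0.19.2")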
Example #9
 def test_gluon(self):
     current_ray_ctx = RayContext.get()
     address_info = current_ray_ctx.address_info
     assert "object_store_address" in address_info
     config = create_config(log_interval=2, optimizer="adam",
                            optimizer_params={'learning_rate': 0.02})
     estimator = Estimator.from_mxnet(config=config,
                                      model_creator=get_model,
                                      loss_creator=get_loss,
                                      eval_metrics_creator=get_metrics,
                                      validation_metrics_creator=get_metrics,
                                      num_workers=2)
     estimator.fit(get_train_data_iter, validation_data=get_test_data_iter, epochs=2)
     estimator.shutdown()
Example #10
def init_ray_context_fixture():
    from bigdl.dllib.nncontext import init_spark_on_local
    from bigdl.orca.ray import RayContext
    sc = init_spark_on_local(cores=4, spark_log_level="INFO")
    ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()
    yield
    ray_ctx.stop()
    sc.stop()
Example #11
def stop_orca_context():
    """
    Stop the SparkContext (and stop Ray services across the cluster if necessary).
    """
    from pyspark import SparkContext
    # If users successfully call stop_orca_context after the program finishes,
    # namely when there is no active SparkContext, the registered exit function
    # should do nothing.
    if SparkContext._active_spark_context is not None:
        print("Stopping orca context")
        from bigdl.orca.ray import RayContext
        ray_ctx = RayContext.get(initialize=False)
        if ray_ctx.initialized:
            ray_ctx.stop()
        sc = SparkContext.getOrCreate()
        if sc.getConf().get("spark.master").startswith("spark://"):
            from bigdl.dllib.nncontext import stop_spark_standalone
            stop_spark_standalone()
        sc.stop()
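Because init_orca_context (shown in Example #16 below) registers stop_orca_context with atexit, and the SparkContext guard above turns a repeated invocation into a no-op, calling it explicitly at the end of a script is safe. A minimal sketch, assuming the bigdl.orca entry points:

from bigdl.orca import init_orca_context, stop_orca_context

sc = init_orca_context(cluster_mode="local", cores=2, init_ray_on_spark=True)
# ... run Spark / Ray workloads ...
stop_orca_context()  # stops Ray if it was started, then stops the SparkContext;
                     # the atexit-registered call later finds no active SparkContext and does nothing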
Example #12
    def to_spark_xshards(self):
        from bigdl.orca.data import SparkXShards
        ray_ctx = RayContext.get()
        sc = ray_ctx.sc
        address = ray_ctx.redis_address
        password = ray_ctx.redis_password
        num_parts = self.num_partitions()
        partition2store = self.partition2store_name
        rdd = self.rdd.mapPartitionsWithIndex(lambda idx, _: get_from_ray(
            idx, address, password, partition2store))

        # the reason why we trigger computation here is to ensure we get the data
        # from ray before the RayXShards goes out of scope and the data gets garbage collected
        from pyspark.storagelevel import StorageLevel
        rdd = rdd.cache()
        result_rdd = rdd.map(
            lambda x: x)  # SparkXShards will uncache the rdd on gc
        spark_xshards = SparkXShards(result_rdd)
        return spark_xshards
Example #13
    def _from_spark_xshards_ray_api(spark_xshards):
        ray_ctx = RayContext.get()
        address = ray_ctx.redis_address
        password = ray_ctx.redis_password
        driver_ip = ray._private.services.get_node_ip_address()
        uuid_str = str(uuid.uuid4())
        resources = ray.cluster_resources()
        nodes = []
        for key, value in resources.items():
            if key.startswith("node:"):
                # if running in cluster, filter out driver ip
                if key != f"node:{driver_ip}":
                    nodes.append(key)
        # for the case of local mode and single node spark standalone
        if not nodes:
            nodes.append(f"node:{driver_ip}")

        partition_stores = {}
        for node in nodes:
            name = f"partition:{uuid_str}:{node}"
            if version.parse(ray.__version__) >= version.parse("1.4.0"):
                store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)\
                    .options(name=name, lifetime="detached").remote()
            else:
                store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore) \
                    .options(name=name).remote()
            partition_stores[name] = store

        # actor creation is async; this is to make sure they have all been started
        ray.get([v.get_partitions.remote() for v in partition_stores.values()])
        partition_store_names = list(partition_stores.keys())
        result_rdd = spark_xshards.rdd.mapPartitionsWithIndex(
            lambda idx, part: write_to_ray(idx, part, address, password,
                                           partition_store_names)).cache()
        result = result_rdd.collect()

        id2ip = {}
        id2store_name = {}
        for idx, ip, local_store_name in result:
            id2ip[idx] = ip
            id2store_name[idx] = local_store_name

        return RayXShards(uuid_str, result_rdd, partition_stores)
    def test_auto_shard_tf(self):
        # file 1 contains all 0s, file 2 contains all 1s
        # If shard by files, then each model will
        # see the same records in the same batch.
        # If shard by records, then each batch
        # will have different records.
        # The loss func is constructed such that
        # the former case will return 0, and the latter
        # case will return non-zero.

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(model_creator=auto_shard_model_creator,
                                       verbose=True,
                                       backend="tf2",
                                       workers_per_node=2)
        stats = trainer.fit(create_auto_shard_datasets,
                            epochs=1,
                            batch_size=4,
                            steps_per_epoch=2)
        assert stats["train_loss"] == 0.0
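The comment above contrasts sharding by files with sharding by records. Independent of how create_auto_shard_datasets is implemented, plain tf.data lets the policy be pinned explicitly; a sketch (file pattern and batch size are illustrative):

import tensorflow as tf

def make_dataset(file_pattern, batch_size):
    # Shard by individual records (DATA) rather than by whole files (FILE),
    # so each worker sees a mix of records from both files.
    dataset = tf.data.TextLineDataset(tf.io.gfile.glob(file_pattern))
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = \
        tf.data.experimental.AutoShardPolicy.DATA
    return dataset.with_options(options).batch(batch_size)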
Example #15
 def from_partition_refs(ip2part_id, part_id2ref, old_rdd):
     ray_ctx = RayContext.get()
     uuid_str = str(uuid.uuid4())
     id2store_name = {}
     partition_stores = {}
     part_id2ip = {}
     result = []
     for node, part_ids in ip2part_id.items():
         name = f"partition:{uuid_str}:{node}"
         store = ray.remote(num_cpus=0, resources={f"node:{node}": 1e-4})(LocalStore) \
             .options(name=name).remote()
         partition_stores[name] = store
         for idx in part_ids:
             result.append(
                 store.upload_partition.remote(idx, part_id2ref[idx]))
             id2store_name[idx] = name
             part_id2ip[idx] = node
     ray.get(result)
     new_id_ip_store_rdd = old_rdd.mapPartitionsWithIndex(lambda idx, _: [(
         idx, part_id2ip[idx], id2store_name[idx])]).cache()
     return RayXShards(uuid_str, new_id_ip_store_rdd, partition_stores)
Example #16
def init_orca_context(cluster_mode=None,
                      cores=2,
                      memory="2g",
                      num_nodes=1,
                      init_ray_on_spark=False,
                      **kwargs):
    """
    Creates or gets a SparkContext for different Spark cluster modes (and launch Ray services
    across the cluster if necessary).

    :param cluster_mode: The mode for the Spark cluster. One of "local", "yarn-client",
           "yarn-cluster", "k8s-client" and "standalone". Default to be None and in this case 
           there is supposed to be an existing SparkContext in your application.
           
           For "yarn-client" and "yarn-cluster", you are supposed to use conda environment 
           and set the environment variable HADOOP_CONF_DIR.

           For "k8s-client", you are supposed to additionally specify the arguments master 
           and container_image.
           For "k8s-cluster", you are supposed to use spark-submit to submit the application 
           and use the default cluster_mode instead.
           In this case, please set the Spark configurations through command line options or
           the properties file. 
           To make things easier, you are recommended to use the launch scripts we provide:
           https://github.com/intel-analytics/BigDL/tree/branch-2.0/scripts.

           For other cluster modes, you are recommended to install and run bigdl through
           pip, which is more convenient.
    :param cores: The number of cores to be used on each node. Default to be 2.
    :param memory: The memory allocated for each node. Default to be '2g'.
    :param num_nodes: The number of nodes to be used in the cluster. Default to be 1.
           For Spark local, num_nodes should always be 1 and you don't need to change it.
    :param init_ray_on_spark: Whether to launch Ray services across the cluster.
           Default to be False and in this case the Ray cluster would be launched lazily when
           Ray is involved in Project Orca.
    :param kwargs: The extra keyword arguments used for creating SparkContext and
           launching Ray if any. 

    :return: An instance of SparkContext.
    """
    print("Initializing orca context")
    import atexit
    atexit.register(stop_orca_context)
    from pyspark import SparkContext
    import warnings
    spark_args = {}
    for key in ["conf", "spark_log_level", "redirect_spark_log"]:
        if key in kwargs:
            spark_args[key] = kwargs[key]
    if cluster_mode is not None:
        cluster_mode = cluster_mode.lower()
    activate_sc = SparkContext._active_spark_context is not None
    if activate_sc:
        if cluster_mode is not None and cluster_mode != "spark-submit":
            warnings.warn(
                "Use an existing SparkContext, " +
                "cluster_mode is determined by the existing SparkContext",
                Warning)
        from bigdl.dllib.nncontext import init_nncontext
        sc = init_nncontext(conf=None,
                            spark_log_level="WARN",
                            redirect_spark_log=True)
    else:
        cluster_mode = "local" if cluster_mode is None else cluster_mode
        if cluster_mode == "local":
            if num_nodes > 1:
                warnings.warn(
                    "For Spark local mode, num_nodes should be 1, but got " +
                    repr(num_nodes) + ", ignored", Warning)
            os.environ["SPARK_DRIVER_MEMORY"] = memory
            if "python_location" in kwargs:
                spark_args["python_location"] = kwargs["python_location"]
            from bigdl.dllib.nncontext import init_spark_on_local
            sc = init_spark_on_local(cores, **spark_args)
        elif cluster_mode == "spark-submit":
            from bigdl.dllib.nncontext import init_nncontext
            sc = init_nncontext(**spark_args)
        elif cluster_mode.startswith(
                "yarn"):  # yarn, yarn-client or yarn-cluster
            hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
            if not hadoop_conf:
                assert "hadoop_conf" in kwargs,\
                    "Directory path to hadoop conf not found for yarn-client mode. Please either " \
                    "specify argument hadoop_conf or set the environment variable HADOOP_CONF_DIR"
                hadoop_conf = kwargs["hadoop_conf"]
            from bigdl.dllib.utils.utils import detect_conda_env_name
            conda_env_name = detect_conda_env_name()
            for key in [
                    "driver_cores", "driver_memory",
                    "extra_executor_memory_for_ray", "extra_python_lib",
                    "penv_archive", "additional_archive", "hadoop_user_name",
                    "spark_yarn_archive", "jars"
            ]:
                if key in kwargs:
                    spark_args[key] = kwargs[key]
            from bigdl.dllib.nncontext import init_spark_on_yarn, init_spark_on_yarn_cluster
            if cluster_mode == "yarn-cluster":
                sc = init_spark_on_yarn_cluster(hadoop_conf=hadoop_conf,
                                                conda_name=conda_env_name,
                                                num_executors=num_nodes,
                                                executor_cores=cores,
                                                executor_memory=memory,
                                                **spark_args)
            else:
                sc = init_spark_on_yarn(hadoop_conf=hadoop_conf,
                                        conda_name=conda_env_name,
                                        num_executors=num_nodes,
                                        executor_cores=cores,
                                        executor_memory=memory,
                                        **spark_args)
        elif cluster_mode.startswith("k8s"):  # k8s or k8s-client
            if cluster_mode == "k8s-cluster":
                raise ValueError(
                    'For k8s-cluster mode, '
                    'please submit the application via spark-submit '
                    'and use the default cluster_mode instead')
            assert "master" in kwargs, "Please specify master for k8s-client mode"
            assert "container_image" in kwargs, "Please specify container_image for k8s-client mode"
            for key in [
                    "driver_cores", "driver_memory",
                    "extra_executor_memory_for_ray", "extra_python_lib",
                    "jars", "python_location"
            ]:
                if key in kwargs:
                    spark_args[key] = kwargs[key]
            from bigdl.dllib.nncontext import init_spark_on_k8s
            sc = init_spark_on_k8s(master=kwargs["master"],
                                   container_image=kwargs["container_image"],
                                   num_executors=num_nodes,
                                   executor_cores=cores,
                                   executor_memory=memory,
                                   **spark_args)
        elif cluster_mode == "standalone":
            for key in [
                    "driver_cores", "driver_memory",
                    "extra_executor_memory_for_ray", "extra_python_lib",
                    "jars", "master", "python_location", "enable_numa_binding"
            ]:
                if key in kwargs:
                    spark_args[key] = kwargs[key]
            from bigdl.dllib.nncontext import init_spark_standalone
            sc = init_spark_standalone(num_executors=num_nodes,
                                       executor_cores=cores,
                                       executor_memory=memory,
                                       **spark_args)
        else:
            raise ValueError(
                "cluster_mode can only be local, yarn-client, yarn-cluster, "
                "k8s-client or standalone, "
                "but got: {}".format(cluster_mode))
    ray_args = {}
    for key in [
            "redis_port", "password", "object_store_memory", "verbose", "env",
            "extra_params", "num_ray_nodes", "ray_node_cpu_cores",
            "include_webui"
    ]:
        if key in kwargs:
            ray_args[key] = kwargs[key]
    from bigdl.orca.ray import RayContext
    ray_ctx = RayContext(sc, **ray_args)
    if init_ray_on_spark:
        driver_cores = 0  # This is the default value.
        ray_ctx.init(driver_cores=driver_cores)
    return sc
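A minimal sketch of calling init_orca_context as defined above (import path assumed to be bigdl.orca; the k8s master URL and container image are placeholders, and master/container_image are required for k8s-client per the asserts in the function body):

from bigdl.orca import init_orca_context

# Local mode; Ray is launched lazily unless init_ray_on_spark=True.
sc = init_orca_context(cluster_mode="local", cores=4, memory="4g")

# k8s-client mode (replace the placeholders with real values):
# sc = init_orca_context(cluster_mode="k8s-client", cores=4, memory="4g", num_nodes=2,
#                        master="k8s://https://<api-server-host>:<port>",
#                        container_image="<your-bigdl-container-image>")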
Example #17
 def get_ray_context():
     from bigdl.orca.ray import RayContext
     return RayContext.get()
Example #18
parser.add_argument("--slave_num",
                    type=int,
                    default=2,
                    help="The number of slave nodes to be used in the cluster."
                    "You can change it depending on your own cluster setting.")
parser.add_argument(
    "--cores",
    type=int,
    default=8,
    help="The number of cpu cores you want to use on each node. "
    "You can change it depending on your own cluster setting.")
parser.add_argument(
    "--memory",
    type=str,
    default="10g",
    help="The size of slave(executor)'s memory you want to use."
    "You can change it depending on your own cluster setting.")

if __name__ == "__main__":

    args = parser.parse_args()
    num_nodes = 1 if args.cluster_mode == "local" else args.slave_num
    init_orca_context(cluster_mode=args.cluster_mode,
                      cores=args.cores,
                      num_nodes=num_nodes,
                      memory=args.memory)

    runner = HorovodRayRunner(RayContext.get())
    runner.run(func=run_horovod)
    stop_orca_context()
 def test_local(self):
     node_num = 4
     sc = init_spark_on_local(cores=node_num)
     ray_ctx = RayContext(sc=sc, object_store_memory="1g")
     ray_ctx.init()
     actors = [TestRay.remote() for i in range(0, node_num)]
     print(ray.get([actor.hostname.remote() for actor in actors]))
     ray_ctx.stop()
     time.sleep(3)
     # repeat
     print("-------------------first repeat begin!------------------")
     ray_ctx = RayContext(sc=sc, object_store_memory="1g")
     ray_ctx.init()
     actors = [TestRay.remote() for i in range(0, node_num)]
     print(ray.get([actor.hostname.remote() for actor in actors]))
     ray_ctx.stop()
     sc.stop()
     time.sleep(3)
     for process_info in ray_ctx.ray_processesMonitor.process_infos:
         for pid in process_info.pids:
             assert not psutil.pid_exists(pid)
Example #20
    def __init__(self,
                 config,
                 model_creator,
                 loss_creator=None,
                 eval_metrics_creator=None,
                 validation_metrics_creator=None,
                 num_workers=None,
                 num_servers=None,
                 runner_cores=None):
        ray_ctx = RayContext.get()
        if not num_workers:
            num_workers = ray_ctx.num_ray_nodes
        self.config = {} if config is None else config
        assert isinstance(config, dict), "config must be a dict"
        for param in ["optimizer", "optimizer_params", "log_interval"]:
            assert param in config, param + " must be specified in config"
        self.model_creator = model_creator
        self.loss_creator = loss_creator
        self.validation_metrics_creator = validation_metrics_creator
        self.eval_metrics_creator = eval_metrics_creator
        self.num_workers = num_workers
        self.num_servers = num_servers if num_servers else self.num_workers

        # Generate actor classes.
        # Add dummy custom resources _mxnet_worker and _mxnet_server to distinguish workers from
        # servers if runner_cores is specified, so that we can place one worker and one server
        # on a node for better performance.
        Worker = ray.remote(num_cpus=runner_cores, resources={"_mxnet_worker": 1})(MXNetRunner) \
            if runner_cores else ray.remote(MXNetRunner)
        Server = ray.remote(num_cpus=runner_cores, resources={"_mxnet_server": 1})(MXNetRunner) \
            if runner_cores else ray.remote(MXNetRunner)

        # Start runners: workers followed by servers
        self.workers = [Worker.remote() for i in range(self.num_workers)]
        self.servers = [Server.remote() for i in range(self.num_servers)]
        self.runners = self.workers + self.servers

        env = {
            "DMLC_PS_ROOT_URI": str(get_host_ip()),
            "DMLC_PS_ROOT_PORT": str(find_free_port()),
            "DMLC_NUM_SERVER": str(self.num_servers),
            "DMLC_NUM_WORKER": str(self.num_workers),
        }
        envs = []
        for i in range(self.num_workers):
            current_env = env.copy()
            current_env['DMLC_ROLE'] = 'worker'
            envs.append(current_env)
        for i in range(self.num_servers):
            current_env = env.copy()
            current_env['DMLC_ROLE'] = 'server'
            envs.append(current_env)

        env['DMLC_ROLE'] = 'scheduler'
        modified_env = os.environ.copy()
        modified_env.update(env)
        # Need to include the system env to run bash
        # TODO: Need to kill this process manually?
        subprocess.Popen("python -c 'import mxnet'",
                         shell=True,
                         env=modified_env)

        ray.get([
            runner.setup_distributed.remote(envs[i], self.config,
                                            self.model_creator,
                                            self.loss_creator,
                                            self.validation_metrics_creator,
                                            self.eval_metrics_creator)
            for i, runner in enumerate(self.runners)
        ])
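The asserts near the top of __init__ require optimizer, optimizer_params and log_interval in config, matching the create_config call in Example #9. A minimal config sketch (values illustrative):

config = {
    "optimizer": "adam",
    "optimizer_params": {"learning_rate": 0.02},
    "log_interval": 2,
}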
Example #21
@ray.remote
class TestRay:
    def hostname(self):
        import socket
        return socket.gethostname()


node_num = 4
sc = init_spark_on_yarn(hadoop_conf="/opt/work/hadoop-2.7.2/etc/hadoop/",
                        conda_name="rayexample",
                        num_executors=node_num,
                        executor_cores=28,
                        executor_memory="10g",
                        driver_memory="2g",
                        driver_cores=4,
                        extra_executor_memory_for_ray="30g")
ray_ctx = RayContext(sc=sc, object_store_memory="2g")
ray_ctx.init()
actors = [TestRay.remote() for i in range(0, node_num)]
print(ray.get([actor.hostname.remote() for actor in actors]))
ray_ctx.stop()
# repeat
ray_ctx = RayContext(sc=sc, object_store_memory="1g")
ray_ctx.init()
actors = [TestRay.remote() for i in range(0, node_num)]
print(ray.get([actor.hostname.remote() for actor in actors]))
ray_ctx.stop()

sc.stop()
time.sleep(3)
Example #22
    def __init__(self,
                 *,
                 model_creator,
                 optimizer_creator,
                 loss_creator=None,
                 metrics=None,
                 scheduler_creator=None,
                 training_operator_cls=TrainingOperator,
                 initialization_hook=None,
                 config=None,
                 scheduler_step_freq="batch",
                 use_tqdm=False,
                 backend="torch_distributed",
                 workers_per_node=1,
                 sync_stats=True,
                 log_level=logging.INFO):
        if config is not None and "batch_size" in config:
            raise Exception(
                "Please do not specify batch_size in config. Input batch_size in the"
                " fit/evaluate/predict function of the estimator instead.")

        # todo remove ray_ctx to run on workers
        ray_ctx = RayContext.get()
        if not (isinstance(model_creator, types.FunctionType) and isinstance(
                optimizer_creator,
                types.FunctionType)):  # Torch model is also callable.
            raise ValueError(
                "Must provide a function for both model_creator and optimizer_creator"
            )

        self.model_creator = model_creator
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.scheduler_creator = scheduler_creator
        self.training_operator_cls = training_operator_cls
        self.scheduler_step_freq = scheduler_step_freq
        self.use_tqdm = use_tqdm
        self.sync_stats = sync_stats

        if not training_operator_cls and not loss_creator:
            raise ValueError("If a loss_creator is not provided, you must "
                             "provide a custom training operator.")

        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config
        worker_config = self.config.copy()
        params = dict(model_creator=self.model_creator,
                      optimizer_creator=self.optimizer_creator,
                      loss_creator=self.loss_creator,
                      scheduler_creator=self.scheduler_creator,
                      training_operator_cls=self.training_operator_cls,
                      scheduler_step_freq=self.scheduler_step_freq,
                      use_tqdm=self.use_tqdm,
                      config=worker_config,
                      metrics=metrics,
                      sync_stats=sync_stats,
                      log_level=log_level)

        if backend == "torch_distributed":
            cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
            num_nodes = ray_ctx.num_ray_nodes * workers_per_node
            RemoteRunner = ray.remote(
                num_cpus=cores_per_node)(PytorchRayWorker)
            self.remote_workers = [
                RemoteRunner.remote(**params) for i in range(num_nodes)
            ]
            ray.get([
                worker.setup.remote(cores_per_node)
                for i, worker in enumerate(self.remote_workers)
            ])

            head_worker = self.remote_workers[0]
            address = ray.get(head_worker.setup_address.remote())

            logger.info(f"initializing pytorch process group on {address}")

            ray.get([
                worker.setup_torch_distribute.remote(address, i, num_nodes)
                for i, worker in enumerate(self.remote_workers)
            ])

        elif backend == "horovod":
            from bigdl.orca.learn.horovod.horovod_ray_runner import HorovodRayRunner
            self.horovod_runner = HorovodRayRunner(
                ray_ctx,
                worker_cls=PytorchRayWorker,
                worker_param=params,
                workers_per_node=workers_per_node)
            self.remote_workers = self.horovod_runner.remote_workers
            cores_per_node = self.horovod_runner.cores_per_node
            ray.get([
                worker.setup.remote(cores_per_node)
                for i, worker in enumerate(self.remote_workers)
            ])

            ray.get([
                worker.setup_horovod.remote()
                for i, worker in enumerate(self.remote_workers)
            ])
        else:
            raise Exception(
                "Only \"torch_distributed\" and \"horovod\" are supported "
                "values of backend, but got {}".format(backend))
        self.num_workers = len(self.remote_workers)
    def impl_test_fit_and_evaluate(self, backend):
        import tensorflow as tf
        ray_ctx = RayContext.get()
        batch_size = 32
        global_batch_size = batch_size * ray_ctx.num_ray_nodes

        if backend == "horovod":
            trainer = Estimator.from_keras(model_creator=simple_model,
                                           compile_args_creator=compile_args,
                                           verbose=True,
                                           config=None,
                                           backend=backend)
        else:

            trainer = Estimator.from_keras(model_creator=model_creator,
                                           verbose=True,
                                           config=None,
                                           backend=backend,
                                           workers_per_node=2)

        # model baseline performance
        start_stats = trainer.evaluate(create_test_dataset,
                                       batch_size=global_batch_size,
                                       num_steps=NUM_TEST_SAMPLES //
                                       global_batch_size)
        print(start_stats)

        def scheduler(epoch):
            if epoch < 2:
                return 0.001
            else:
                return 0.001 * tf.math.exp(0.1 * (2 - epoch))

        scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler,
                                                             verbose=1)
        # train for 2 epochs
        trainer.fit(create_train_datasets,
                    epochs=2,
                    batch_size=global_batch_size,
                    steps_per_epoch=10,
                    callbacks=[scheduler])
        trainer.fit(create_train_datasets,
                    epochs=2,
                    batch_size=global_batch_size,
                    steps_per_epoch=10,
                    callbacks=[scheduler])

        # model performance after training (should improve)
        end_stats = trainer.evaluate(create_test_dataset,
                                     batch_size=global_batch_size,
                                     num_steps=NUM_TEST_SAMPLES //
                                     global_batch_size)
        print(end_stats)

        # sanity check that training worked
        dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
        dmse = (end_stats["validation_mean_squared_error"] -
                start_stats["validation_mean_squared_error"])
        print(f"dLoss: {dloss}, dMSE: {dmse}")

        assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"
Example #24
slave_num = 2

sc = init_spark_on_yarn(hadoop_conf="/opt/work/almaren-yarn-config/",
                        conda_name="ray_train",
                        num_executors=slave_num,
                        executor_cores=28,
                        executor_memory="10g",
                        driver_memory="2g",
                        driver_cores=4,
                        extra_executor_memory_for_ray="30g",
                        conf={"hello": "world"})

ray_ctx = RayContext(sc=sc,
                     object_store_memory="25g",
                     extra_params={"temp-dir": "/tmp/hello/"},
                     env={
                         "http_proxy": "http://child-prc.intel.com:913",
                         "http_proxys": "http://child-prc.intel.com:913"
                     })
ray_ctx.init()


@ray.remote
class TestRay():
    def hostname(self):
        import socket
        return socket.gethostname()

    def check_cv2(self):
        # conda install -c conda-forge opencv==3.4.2
        import cv2
Example #25
    def __init__(self,
                 model_creator,
                 compile_args_creator=None,
                 config=None,
                 verbose=False,
                 backend="tf2",
                 workers_per_node=1,
                 cpu_binding=False):
        self.model_creator = model_creator
        self.compile_args_creator = compile_args_creator
        self.config = {} if config is None else config
        self.verbose = verbose

        ray_ctx = RayContext.get()
        if "batch_size" in self.config:
            raise Exception(
                "Please do not specify batch_size in config. Input batch_size in the"
                " fit/evaluate function of the estimator instead.")

        if "inter_op_parallelism" not in self.config:
            self.config["inter_op_parallelism"] = 1

        if "intra_op_parallelism" not in self.config:
            self.config[
                "intra_op_parallelism"] = ray_ctx.ray_node_cpu_cores // workers_per_node

        if backend == "horovod":
            assert compile_args_creator is not None, "compile_args_creator should not be None," \
                                                     " when backend is set to horovod"

        params = {
            "model_creator": model_creator,
            "compile_args_creator": compile_args_creator,
            "config": self.config,
            "verbose": self.verbose,
        }

        if backend == "tf2":
            cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
            num_nodes = ray_ctx.num_ray_nodes * workers_per_node

            self.cluster = RayDLCluster(num_workers=num_nodes,
                                        worker_cores=cores_per_node,
                                        worker_cls=TFRunner,
                                        worker_param=params,
                                        cpu_binding=cpu_binding)
            self.remote_workers = self.cluster.get_workers()
            ips = ray.get([
                worker.get_node_ip.remote() for worker in self.remote_workers
            ])
            ports = ray.get([
                worker.find_free_port.remote()
                for worker in self.remote_workers
            ])

            urls = [
                "{ip}:{port}".format(ip=ips[i], port=ports[i])
                for i in range(len(self.remote_workers))
            ]
            ray.get([worker.setup.remote() for worker in self.remote_workers])
            # Get setup tasks in order to throw errors on failure
            ray.get([
                worker.setup_distributed.remote(urls, i,
                                                len(self.remote_workers))
                for i, worker in enumerate(self.remote_workers)
            ])
        elif backend == "horovod":
            # it is necessary to call horovod_runner.run first to set up the horovod environment
            from bigdl.orca.learn.horovod.horovod_ray_runner import HorovodRayRunner
            horovod_runner = HorovodRayRunner(
                ray_ctx,
                worker_cls=TFRunner,
                worker_param=params,
                workers_per_node=workers_per_node)
            horovod_runner.run(lambda: print("worker initialized"))
            self.remote_workers = horovod_runner.remote_workers
            ray.get([worker.setup.remote() for worker in self.remote_workers])
            ray.get([
                worker.setup_horovod.remote()
                for i, worker in enumerate(self.remote_workers)
            ])
        else:
            raise Exception("Only \"tf2\" and \"horovod\" are legal "
                            "values of backend, but got {}".format(backend))

        self.num_workers = len(self.remote_workers)