def fit(
    self,
    input_df,
    validation_df=None,
    metric="mse",
    recipe=SmokeRecipe(),
    mc=False,
    resources_per_trial={"cpu": 2},
    upload_dir=None,
):
    """
    Trains the model for time sequence prediction.
    If future sequence length > 1, use seq2seq model, else use vanilla LSTM model.

    :param input_df: The input time series data frame, Example:
        datetime    value   "extra feature 1"   "extra feature 2"
        2019-01-01  1.9     1                   2
        2019-01-02  2.3     0                   2
    :param validation_df: validation data. Checked with ``self._check_df`` when given.
    :param metric: String. Metric used for train and validation; forwarded to
        ``self._hp_search``. The default is "mse".
    :param recipe: a Recipe object. Various recipes covers different search space and
        stopping criteria. Default is SmokeRecipe().
    :param mc: forwarded unchanged to ``self._hp_search``.
    :param resources_per_trial: Machine resources to allocate per trial,
        e.g. ``{"cpu": 64, "gpu": 8}``
    :param upload_dir: Optional URI to sync training results and checkpoints.
        Only used when the RayContext is not local; in local mode it is ignored and
        ``None`` is passed on. When omitted on a cluster it is built as
        ``/user/{HADOOP_USER_NAME}/ray_checkpoints/{self.name}``, where
        HADOOP_USER_NAME is read from the environment and falls back to "root".
    :return: a pipeline constructed with the best model and configs
        (also stored on ``self.pipeline``).
    """
    self._check_df(input_df)
    if validation_df is not None:
        self._check_df(validation_df)

    is_local = RayContext.get().is_local
    # BasePredictor._check_fit_metric(metric)
    if is_local:
        upload_dir = None
    else:
        if not upload_dir:
            # os.getenv returns None when the variable is unset, which would make
            # os.path.join raise TypeError; fall back to the documented default.
            hadoop_user_name = os.getenv("HADOOP_USER_NAME", "root")
            upload_dir = os.path.join(os.sep, "user", hadoop_user_name,
                                      "ray_checkpoints", self.name)
        # Make sure the target directory exists before the search starts.
        process("hadoop fs -mkdir -p {}".format(upload_dir))

    self.pipeline = self._hp_search(
        input_df,
        validation_df=validation_df,
        metric=metric,
        recipe=recipe,
        mc=mc,
        resources_per_trial=resources_per_trial,
        remote_dir=upload_dir)
    return self.pipeline
def __init__(self,
             *,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator=None,
             scheduler_creator=None,
             training_operator_cls=TrainingOperator,
             initialization_hook=None,
             config=None,
             scheduler_step_freq="batch"):
    """
    Store the creator callables, start the Ray workers and run their setup.

    :param model_creator: callable (validated here); passed to the workers.
    :param data_creator: callable (validated here); passed to the workers.
    :param optimizer_creator: callable (validated here); passed to the workers.
    :param loss_creator: optional; may only be omitted when
        ``training_operator_cls`` is truthy.
    :param scheduler_creator: optional; passed to the workers.
    :param training_operator_cls: passed to the workers.
    :param initialization_hook: stored on the instance.
    :param config: stored on ``self.config``; ``None`` becomes an empty dict.
    :param scheduler_step_freq: passed to the workers.
    :raises ValueError: if one of the three mandatory creators is not callable, or
        if neither ``training_operator_cls`` nor ``loss_creator`` is provided.
    """
    mandatory_creators = (model_creator, optimizer_creator, data_creator)
    if not all(callable(creator) for creator in mandatory_creators):
        raise ValueError(
            "Must provide a callable model_creator, optimizer_creator, "
            "and data_creator.")

    self.model_creator = model_creator
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.data_creator = data_creator
    self.scheduler_creator = scheduler_creator
    self.training_operator_cls = training_operator_cls
    self.scheduler_step_freq = scheduler_step_freq

    if not (training_operator_cls or loss_creator):
        raise ValueError("If a loss_creator is not provided, you must "
                         "provide a custom training operator.")

    self.initialization_hook = initialization_hook
    if config is None:
        self.config = {}
    else:
        self.config = config

    # Bundle of arguments handed to every worker through the base class.
    self.param = {
        "model_creator": self.model_creator,
        "data_creator": self.data_creator,
        "optimizer_creator": self.optimizer_creator,
        "loss_creator": self.loss_creator,
        "scheduler_creator": self.scheduler_creator,
        "training_operator_cls": self.training_operator_cls,
        "scheduler_step_freq": self.scheduler_step_freq,
    }
    super().__init__(RayContext.get(), worker_cls=TorchWorker, worker_param=self.param)

    def init_func():
        import torch
        torch.set_num_threads(self.cores_per_node)
        print("Worker initialized")

    self.run(init_func)

    setup_tasks = [worker.setup.remote(None, None, None)
                   for worker in self.remote_workers]
    # Get setup tasks in order to throw errors on failure
    ray.get(setup_tasks)
def get_default_remote_dir(name):
    """
    Return the default remote directory for ``name``.

    In local mode there is no remote directory and ``None`` is returned. Otherwise
    ``hdfs:///tmp/{name}`` is created with ``hadoop fs -mkdir -p`` and returned.

    :param name: last path component of the remote directory.
    :return: the remote directory string, or None when the RayContext is local.
    """
    from zoo.ray import RayContext
    from zoo.orca.automl.search.utils import process

    if RayContext.get().is_local:
        return None

    default_remote_dir = f"hdfs:///tmp/{name}"
    process(command=f"hadoop fs -mkdir -p {default_remote_dir}")
    return default_remote_dir
def to_spark_rdd(self):
    """
    Build a Spark RDD whose partitions are fetched from Ray via ``get_from_ray``.

    The RDD has as many partitions as the meta store reports; each partition is
    produced by calling ``get_from_ray`` with its index.

    :return: the resulting RDD.
    """
    ctx = RayContext.get()
    sc = ctx.sc
    redis_address = ctx.redis_address
    redis_password = ctx.redis_password
    partition_count = ray.get(self.meta_store.num_partitions.remote())
    store_name = f"meta_store:{self.uuid}"

    # Only plain local values are referenced below, so `self` is not captured by
    # the function shipped to Spark.
    def fetch_partition(idx, _):
        return get_from_ray(idx, redis_address, redis_password, store_name)

    # Placeholder elements; their values are ignored by fetch_partition.
    placeholders = [0] * partition_count * 10
    return sc.parallelize(placeholders, partition_count) \
        .mapPartitionsWithIndex(fetch_partition)
def predict(self, x=None, horizon=24, mc=False, num_workers=None):
    """
    Predict horizon time-points ahead the input x in fit_eval

    :param x: We don't support input x currently.
    :param horizon: horizon length to predict
    :param mc: not used by this method.
    :param num_workers: the number of workers to use. Defaults to
        ``TCMF.get_default_num_workers()``. Note that there has to be an active
        RayContext (or an already started Ray) if num_workers > 1.
    :return: the last ``horizon`` columns of the model's ``predict_horizon`` output.
    :raises ValueError: if ``x`` is given.
    :raises Exception: if no model has been fitted or restored yet.
    :raises RuntimeError: if num_workers > 1 and neither a RayContext nor a running
        Ray instance can be found.
    """
    if x is not None:
        raise ValueError("We don't support input x directly.")
    if self.model is None:
        raise Exception(
            "Needs to call fit_eval or restore first before calling predict"
        )
    if num_workers is None:
        num_workers = TCMF.get_default_num_workers()
    if num_workers > 1:
        import ray
        from zoo.ray import RayContext
        # `except Exception` (rather than a bare except) so that KeyboardInterrupt
        # and SystemExit are never swallowed by these probes.
        try:
            RayContext.get(initialize=False)
        except Exception:
            try:
                # detect whether ray has been started.
                ray.put(None)
            except Exception:
                raise RuntimeError(
                    f"There must be an activate ray context while running with "
                    f"{num_workers} workers. You can either start and init a "
                    f"RayContext by init_orca_context(..., init_ray_on_spark="
                    f"True) or start Ray with ray.init()")
    out = self.model.predict_horizon(
        future=horizon,
        bsize=90,
        normalize=False,
        num_workers=num_workers,
    )
    return out[:, -horizon::]
def test_gluon(self):
    """Fit an Estimator with two workers for two epochs, then shut it down."""
    address_info = RayContext.get().address_info
    assert "object_store_address" in address_info

    optimizer_params = {'learning_rate': 0.02}
    config = create_config(log_interval=2, optimizer="adam",
                           optimizer_params=optimizer_params)
    estimator = Estimator(
        config,
        get_model,
        get_loss,
        eval_metrics_creator=get_metrics,
        validation_metrics_creator=get_metrics,
        num_workers=2,
    )
    estimator.fit(get_train_data_iter, validation_data=get_test_data_iter, epochs=2)
    estimator.shutdown()
def from_partition_refs(parts_refs, part_ids, part_id2ip):
    """
    Register partition references in a new named MetaStore actor and wrap it in a RayRdd.

    :param parts_refs: partition references, paired positionally with ``part_ids``.
    :param part_ids: partition ids.
    :param part_id2ip: passed through to ``RayRdd``.
    :return: a RayRdd built from a fresh uuid, the meta store and ``part_id2ip``.
    """
    # Return value unused; the call is retained from the original code.
    RayContext.get()
    rdd_uuid = str(uuid.uuid4())
    meta_store = MetaStore.options(name=f"meta_store:{rdd_uuid}").remote()
    pending = [
        meta_store.set_partition_ref.remote(part_id, [part_ref])
        for part_id, part_ref in zip(part_ids, parts_refs)
    ]
    # Wait until every reference has been recorded.
    ray.get(pending)
    return RayRdd(rdd_uuid, meta_store, part_id2ip)
def impl_test_fit_and_evaluate(self, backend):
    """
    Train a Keras estimator on the given backend and check that loss and MSE drop.

    :param backend: backend name passed to ``Estimator.from_keras``; "horovod"
        selects the ``simple_model`` / ``compile_args`` creators.
    """
    import tensorflow as tf

    ray_ctx = RayContext.get()
    batch_size = 32
    global_batch_size = batch_size * ray_ctx.num_ray_nodes
    config = {"batch_size": global_batch_size}

    shared_kwargs = dict(verbose=True, config=config, backend=backend)
    if backend == "horovod":
        trainer = Estimator.from_keras(model_creator=simple_model,
                                       compile_args_creator=compile_args,
                                       **shared_kwargs)
    else:
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       workers_per_node=2,
                                       **shared_kwargs)

    eval_steps = NUM_TEST_SAMPLES // global_batch_size

    # model baseline performance
    start_stats = trainer.evaluate(create_test_dataset, steps=eval_steps)
    print(start_stats)

    def lr_schedule(epoch):
        # Constant rate for the first two epochs, exponential decay afterwards.
        if epoch < 2:
            return 0.001
        return 0.001 * tf.math.exp(0.1 * (2 - epoch))

    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule, verbose=1)

    # two consecutive fits of 2 epochs each
    for _ in range(2):
        trainer.fit(create_train_datasets, epochs=2, steps_per_epoch=10,
                    callbacks=[lr_callback])

    # model performance after training (should improve)
    end_stats = trainer.evaluate(create_test_dataset, steps=eval_steps)
    print(end_stats)

    # sanity check that training worked
    dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
    dmse = (end_stats["validation_mean_squared_error"] -
            start_stats["validation_mean_squared_error"])
    print(f"dLoss: {dloss}, dMSE: {dmse}")

    assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"
def to_spark_xshards(self):
    """
    Build a SparkXShards whose partitions are fetched from Ray via ``get_from_ray``.

    :return: a SparkXShards wrapping the resulting RDD.
    """
    from zoo.orca.data import SparkXShards

    ctx = RayContext.get()
    sc = ctx.sc
    redis_address = ctx.redis_address
    redis_password = ctx.redis_password
    partition_count = self.num_partitions()
    store_by_partition = self.partition2store_name

    # Only plain local values are referenced below, so `self` is not captured by
    # the function shipped to Spark.
    def load_partition(idx, _):
        return get_from_ray(idx, redis_address, redis_password, store_by_partition)

    # Placeholder elements; their values are ignored by load_partition.
    seed_rdd = sc.parallelize([0] * partition_count * 10, partition_count)
    return SparkXShards(seed_rdd.mapPartitionsWithIndex(load_partition))
def stop_orca_context():
    """
    Stop the SparkContext (and stop Ray services across the cluster if necessary).

    Does nothing when there is no active SparkContext, so calling it a second time
    (or after the context has already been stopped) does not create a fresh
    SparkContext just to stop it again.
    """
    from pyspark import SparkContext

    # SparkContext.getOrCreate() below would otherwise start a brand-new context
    # when none is active.
    if SparkContext._active_spark_context is None:
        return

    from zoo.ray import RayContext
    ray_ctx = RayContext.get(initialize=False)
    if ray_ctx.initialized:
        ray_ctx.stop()

    sc = SparkContext.getOrCreate()
    if sc.getConf().get("spark.master").startswith("spark://"):
        # Standalone master: shut it down through the zoo helper as well.
        from zoo import stop_spark_standalone
        stop_spark_standalone()
    sc.stop()
def test_horovod_learning_rate_schedule(self):
    """
    Run 30 single-epoch fits with horovod warm-up/schedule callbacks and LRChecker.

    The body only runs for horovod >= 0.19.2; older versions are skipped silently.
    """
    import horovod

    major, minor, patch = horovod.__version__.split(".")
    # Components are converted lazily, exactly when they are needed for the
    # comparison, so a non-numeric patch part is only parsed for 0.19.x.
    major_num = int(major)
    if major_num > 0:
        supported = True
    elif major_num == 0:
        minor_num = int(minor)
        supported = minor_num > 19 or (minor_num == 19 and int(patch) >= 2)
    else:
        supported = False

    if not supported:
        # skip tests in horovod lower version
        return

    ray_ctx = RayContext.get()
    batch_size = 32
    workers_per_node = 4
    global_batch_size = batch_size * workers_per_node
    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=simple_model,
                                   compile_args_creator=compile_args,
                                   verbose=True,
                                   config=config,
                                   backend="horovod",
                                   workers_per_node=workers_per_node)

    import horovod.tensorflow.keras as hvd

    schedule_windows = [
        dict(start_epoch=5, end_epoch=10, multiplier=1.),
        dict(start_epoch=10, end_epoch=15, multiplier=1e-1),
        dict(start_epoch=15, end_epoch=20, multiplier=1e-2),
        dict(start_epoch=20, multiplier=1e-3),
    ]
    callbacks = [
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=0.4,
                                                 verbose=True)
    ]
    callbacks.extend(
        hvd.callbacks.LearningRateScheduleCallback(initial_lr=0.4, **window)
        for window in schedule_windows
    )
    callbacks.append(LRChecker())

    for _ in range(30):
        trainer.fit(create_train_datasets, epochs=1, batch_size=global_batch_size,
                    callbacks=callbacks)
def init_ray_context_fixture():
    """
    Generator fixture: start local Spark (4 cores) and a RayContext, yield, then
    stop Ray followed by Spark.
    """
    from zoo import init_spark_on_local
    from zoo.ray import RayContext

    spark_context = init_spark_on_local(cores=4, spark_log_level="INFO")
    ray_context = RayContext(sc=spark_context, object_store_memory="1g")
    ray_context.init()

    yield

    # Teardown: Ray first, then the SparkContext it was created on.
    ray_context.stop()
    spark_context.stop()
def fit(
    self,
    input_df,
    validation_df=None,
    metric="mse",
    recipe=SmokeRecipe(),
    mc=False,
    resources_per_trial={"cpu": 2},
):
    """
    Trains the model for time sequence prediction.
    If future sequence length > 1, use seq2seq model, else use vanilla LSTM model.

    :param input_df: The input time series data frame, Example:
        datetime    value   "extra feature 1"   "extra feature 2"
        2019-01-01  1.9     1                   2
        2019-01-02  2.3     0                   2
    :param validation_df: validation data. Checked with ``self._check_df`` when given.
    :param metric: String. Metric used for train and validation; forwarded to
        ``self._hp_search``. The default is "mse".
    :param recipe: a Recipe object. Various recipes covers different search space and
        stopping criteria. Default is SmokeRecipe().
    :param mc: forwarded unchanged to ``self._hp_search``.
    :param resources_per_trial: Machine resources to allocate per trial,
        e.g. ``{"cpu": 64, "gpu": 8}``
    :return: a pipeline constructed with the best model and configs
        (also stored on ``self.pipeline``).
    """
    self._check_df(input_df)
    if validation_df is not None:
        self._check_df(validation_df)

    ray_ctx = RayContext.get()
    # BasePredictor._check_fit_metric(metric)
    if ray_ctx.is_local:
        remote_dir = None
    else:
        remote_dir = os.path.join(os.sep, "ray_results", self.name)
        # Create the directory only if the parent listing does not contain it yet.
        existing_entries = get_remote_list(os.path.dirname(remote_dir))
        if self.name not in existing_entries:
            process("hadoop fs -mkdir -p {}".format(remote_dir))

    search_kwargs = dict(
        validation_df=validation_df,
        metric=metric,
        recipe=recipe,
        mc=mc,
        resources_per_trial=resources_per_trial,
        remote_dir=remote_dir,
    )
    self.pipeline = self._hp_search(input_df, **search_kwargs)
    return self.pipeline
def to_ray(self):
    """
    Put data of this SparkXShards to Ray cluster object store.

    Each Spark partition is written as one plasma object under an ObjectID that
    is generated up front on the driver.

    :return: a new RayXShards which contains data of this SparkXShards.
    """
    from zoo.ray import RayContext
    store_address = RayContext.get().address_info["object_store_address"]

    # Create plasma ObjectIDs beforehand instead of creating a random one every time to avoid
    # memory leak in case errors occur when putting data into plasma and Spark would retry.
    # ObjectIDs in plasma is a byte string of length 20 containing characters and numbers.
    # The random generation of ObjectIDs is often good enough to ensure unique IDs.
    import pyarrow.plasma as plasma
    object_ids = [plasma.ObjectID.from_random()
                  for _ in range(self.rdd.getNumPartitions())]

    def write_partition(index, iterator):
        import pyarrow.plasma as plasma
        from zoo.util.utils import get_node_ip
        records = list(iterator)
        client = plasma.connect(store_address)
        target_id = object_ids[index]
        # If the ObjectID exists in plasma, we assume a task trial
        # succeeds and the data is already in the object store.
        if not client.contains(target_id):
            stored_id = client.put(records, target_id)
            assert stored_id == target_id, \
                "Errors occurred when putting data into plasma object store"
        client.disconnect()
        yield target_id, get_node_ip()

    id_ip_pairs = self.rdd.mapPartitionsWithIndex(write_partition).collect()
    self.uncache()

    # Sort the data according to the node_ips.
    ordered_pairs = sorted(id_ip_pairs, key=lambda pair: pair[1])
    partitions = []
    for object_id, node_ip in ordered_pairs:
        partitions.append(RayPartition(object_id=object_id,
                                       node_ip=node_ip,
                                       object_store_address=store_address))
    return RayXShards(partitions)
def orca_data_fixture():
    """
    Generator fixture: start local Spark (4 cores) and a RayContext, yield, then
    stop Ray followed by Spark.

    When both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set in the
    environment they are passed to the RayContext through ``env``; otherwise the
    RayContext is created without ``env``. The RayContext is stored in the
    module-level ``ray_ctx``.
    """
    from zoo import init_spark_on_local
    from zoo.ray import RayContext
    global ray_ctx
    sc = init_spark_on_local(cores=4, spark_log_level="INFO")
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    # Build exactly one RayContext. Previously the env-configured context was
    # immediately replaced by a second one without env, so the credentials were
    # never passed on.
    if access_key_id is not None and secret_access_key is not None:
        ray_ctx = RayContext(sc=sc,
                             object_store_memory="1g",
                             env={"AWS_ACCESS_KEY_ID": access_key_id,
                                  "AWS_SECRET_ACCESS_KEY": secret_access_key})
    else:
        ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()
    yield
    ray_ctx.stop()
    sc.stop()
def rayonspark_fixture():
    """
    Generator fixture: start local Spark (8 cores) and a RayContext, yield, then
    stop Ray followed by Spark.

    Both contexts are published through the module-level names ``sc`` and
    ``ray_ctx``.
    """
    global sc
    global ray_ctx
    from zoo import init_spark_on_local
    from zoo.ray import RayContext

    sc = init_spark_on_local(cores=8, spark_log_level="INFO")
    ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()

    yield

    # Teardown: Ray first, then Spark.
    ray_ctx.stop()
    sc.stop()
def test_auto_shard_tf(self):
    """Check that the tf2 backend shards the dataset by files (train loss is 0)."""
    # file 1 contains all 0s, file 2 contains all 1s
    # If shard by files, then each model will
    # see the same records in the same batch.
    # If shard by records, then each batch
    # will have different records.
    # The loss func is constructed such that
    # the former case will return 0, and the latter
    # case will return non-zero.
    ray_ctx = RayContext.get()
    estimator_kwargs = dict(
        model_creator=auto_shard_model_creator,
        verbose=True,
        backend="tf2",
        workers_per_node=2,
    )
    trainer = Estimator.from_keras(**estimator_kwargs)
    stats = trainer.fit(create_auto_shard_datasets,
                        epochs=1,
                        batch_size=4,
                        steps_per_epoch=2)
    train_loss = stats["train_loss"]
    assert train_loss == 0.0
def _from_spark_xshards_ray_api(spark_xshards):
    """
    Copy the partitions of a SparkXShards into per-node LocalStore actors.

    One named LocalStore actor is created for every ``node:*`` resource except
    the driver's (the driver node is used when it is the only one). Each Spark
    partition is then written through ``write_to_ray``.

    :param spark_xshards: object whose ``rdd`` attribute holds the data to move.
    :return: a RayXShards built from the uuid, the partition-to-store and
        partition-to-ip mappings, and the store actors.
    """
    ray_ctx = RayContext.get()
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    driver_ip = ray._private.services.get_node_ip_address()
    uuid_str = str(uuid.uuid4())
    driver_node = f"node:{driver_ip}"

    # if running in cluster, filter out driver ip
    nodes = [key for key in ray.cluster_resources()
             if key.startswith("node:") and key != driver_node]
    # for the case of local mode and single node spark standalone
    if not nodes:
        nodes.append(driver_node)

    partition_stores = {}
    for node in nodes:
        name = f"partition:{uuid_str}:{node}"
        # The tiny custom-resource request pins the actor to that node.
        remote_cls = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)
        partition_stores[name] = remote_cls.options(name=name).remote()

    # actor creation is aync, this is to make sure they all have been started
    ray.get([store.get_partitions.remote() for store in partition_stores.values()])
    partition_store_names = list(partition_stores.keys())

    def write_partition(idx, part):
        return write_to_ray(idx, part, address, password, partition_store_names)

    result = spark_xshards.rdd.mapPartitionsWithIndex(write_partition).collect()

    id2ip = {}
    id2store_name = {}
    num_empty_partitions = 0
    for idx, ip, local_store_name, is_empty in result:
        id2ip[idx] = ip
        id2store_name[idx] = local_store_name
        if is_empty:
            num_empty_partitions += 1

    if num_empty_partitions > 0:
        logger.warning(
            f"Found {num_empty_partitions} empty partitions in your SparkXShards."
        )
    return RayXShards(uuid_str, dict(id2store_name), dict(id2ip), partition_stores)
def test_gluon(self):
    """Train an MXNetTrainer with two workers for two epochs."""
    address_info = RayContext.get().address_info
    assert "object_store_address" in address_info

    optimizer_params = {'learning_rate': 0.02}
    config = create_trainer_config(batch_size=32,
                                   log_interval=2,
                                   optimizer="adam",
                                   optimizer_params=optimizer_params)
    trainer = MXNetTrainer(
        config,
        get_train_data_iter,
        get_model,
        get_loss,
        eval_metrics_creator=get_metrics,
        validation_metrics_creator=get_metrics,
        num_workers=2,
        test_data=get_test_data_iter,
    )
    trainer.train(nb_epoch=2)
def from_partition_refs(ip2part_id, part_id2ref):
    """
    Upload partition references into per-node LocalStore actors and wrap them
    in a RayXShards.

    :param ip2part_id: mapping from node ip to the partition ids on that node.
    :param part_id2ref: mapping from partition id to its reference.
    :return: a RayXShards built from a fresh uuid, the partition-to-store and
        partition-to-ip mappings, and the store actors.
    """
    # Return value unused; the call is retained from the original code.
    RayContext.get()
    shards_uuid = str(uuid.uuid4())
    store_name_by_part = {}
    ip_by_part = {}
    stores_by_name = {}
    pending_uploads = []

    for node_ip, node_part_ids in ip2part_id.items():
        store_name = f"partition:{shards_uuid}:{node_ip}"
        # The tiny custom-resource request pins the actor to that node.
        remote_cls = ray.remote(num_cpus=0,
                                resources={f"node:{node_ip}": 1e-4})(LocalStore)
        store = remote_cls.options(name=store_name).remote()
        stores_by_name[store_name] = store
        for part_id in node_part_ids:
            pending_uploads.append(
                store.upload_partition.remote(part_id, part_id2ref[part_id]))
            store_name_by_part[part_id] = store_name
            ip_by_part[part_id] = node_ip

    # Wait for every upload to finish.
    ray.get(pending_uploads)
    return RayXShards(shards_uuid, store_name_by_part, ip_by_part, stores_by_name)
def orca_data_fixture():
    """
    Generator fixture: enable orca eager mode, start local Spark (4 cores) and a
    RayContext, yield, then stop Ray followed by Spark.

    AWS credentials are passed to the RayContext through ``env`` only when both
    AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set in the environment.
    """
    from zoo import init_spark_on_local
    from zoo.ray import RayContext

    ZooContext._orca_eager_mode = True
    sc = init_spark_on_local(cores=4, spark_log_level="INFO")
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    ray_kwargs = {"sc": sc, "object_store_memory": "1g"}
    have_credentials = not (access_key_id is None or secret_access_key is None)
    if have_credentials:
        ray_kwargs["env"] = {
            "AWS_ACCESS_KEY_ID": access_key_id,
            "AWS_SECRET_ACCESS_KEY": secret_access_key,
        }
    ray_ctx = RayContext(**ray_kwargs)
    ray_ctx.init()

    yield

    # Teardown: Ray first, then Spark.
    ray_ctx.stop()
    sc.stop()
def to_spark_xshards(self):
    """
    Build a SparkXShards whose partitions are fetched from Ray via ``get_from_ray``.

    The fetched RDD is marked for caching and an identity ``map`` of it is handed
    to SparkXShards.

    :return: a SparkXShards wrapping the mapped RDD.
    """
    from zoo.orca.data import SparkXShards

    ctx = RayContext.get()
    sc = ctx.sc
    redis_address = ctx.redis_address
    redis_password = ctx.redis_password
    partition_count = self.num_partitions()
    store_by_partition = self.partition2store_name

    # Only plain local values are referenced below, so `self` is not captured by
    # the function shipped to Spark.
    def load_partition(idx, _):
        return get_from_ray(idx, redis_address, redis_password, store_by_partition)

    fetched_rdd = sc.parallelize([0] * partition_count * 10, partition_count) \
        .mapPartitionsWithIndex(load_partition)

    # Stated intent of the original code: get the data from ray before the
    # RayXShards goes out of scope and the data get garbage collected.
    # NOTE(review): cache() is lazy and no Spark action is invoked in this method;
    # confirm that SparkXShards triggers the computation.
    cached_rdd = fetched_rdd.cache()
    # sparkxshards will uncache the rdd when gc
    return SparkXShards(cached_rdd.map(lambda x: x))
def stop_orca_context():
    """
    Stop the SparkContext (and stop Ray services across the cluster if necessary).

    Returns immediately when no SparkContext is active.
    """
    from pyspark import SparkContext

    # If users successfully call stop_orca_context after the program finishes,
    # namely when there is no active SparkContext, the registered exit function
    # should do nothing.
    if SparkContext._active_spark_context is None:
        return

    print("Stopping orca context")
    from zoo.ray import RayContext
    ray_context = RayContext.get(initialize=False)
    if ray_context.initialized:
        ray_context.stop()

    spark_context = SparkContext.getOrCreate()
    master_url = spark_context.getConf().get("spark.master")
    if master_url.startswith("spark://"):
        # Standalone master: shut it down through the zoo helper as well.
        from zoo import stop_spark_standalone
        stop_spark_standalone()
    spark_context.stop()
def _from_spark_xshards_ray_api(spark_xshards):
    """
    Copy the partitions of a SparkXShards into per-node LocalStore actors.

    One named LocalStore actor is created for every ``node:*`` resource except
    the driver's (the driver node is used when it is the only one). On
    ray >= 1.4.0 the actors are created with ``lifetime="detached"``. Each Spark
    partition is then written through ``write_to_ray``; the resulting RDD is
    cached and collected once.

    :param spark_xshards: object whose ``rdd`` attribute holds the data to move.
    :return: a RayXShards built from the uuid, the cached result RDD and the
        store actors.
    """
    ray_ctx = RayContext.get()
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    driver_ip = ray._private.services.get_node_ip_address()
    uuid_str = str(uuid.uuid4())
    driver_node = f"node:{driver_ip}"

    # if running in cluster, filter out driver ip
    nodes = [key for key in ray.cluster_resources()
             if key.startswith("node:") and key != driver_node]
    # for the case of local mode and single node spark standalone
    if not nodes:
        nodes.append(driver_node)

    # The version check does not depend on the node, so do it once.
    actor_options = {}
    if version.parse(ray.__version__) >= version.parse("1.4.0"):
        actor_options["lifetime"] = "detached"

    partition_stores = {}
    for node in nodes:
        name = f"partition:{uuid_str}:{node}"
        # The tiny custom-resource request pins the actor to that node.
        remote_cls = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)
        partition_stores[name] = remote_cls.options(name=name, **actor_options).remote()

    # actor creation is aync, this is to make sure they all have been started
    ray.get([store.get_partitions.remote() for store in partition_stores.values()])
    partition_store_names = list(partition_stores.keys())

    result_rdd = spark_xshards.rdd.mapPartitionsWithIndex(
        lambda idx, part: write_to_ray(idx, part, address, password,
                                       partition_store_names)).cache()
    # collect() is the Spark action that actually runs write_to_ray on every
    # partition; the collected rows themselves are not needed here.
    result_rdd.collect()

    return RayXShards(uuid_str, result_rdd, partition_stores)
def test_local(self):
    """Start Ray on a local SparkContext, query four actors, then shut down."""

    @ray.remote
    class TestRay:
        def hostname(self):
            import socket
            return socket.gethostname()

    spark_context = init_spark_on_local(cores=4)
    ray_context = RayContext(sc=spark_context, object_store_memory="1g")
    ray_context.init()

    actors = []
    for _ in range(0, 4):
        actors.append(TestRay.remote())
    hostname_refs = [actor.hostname.remote() for actor in actors]
    print(ray.get(hostname_refs))

    ray_context.stop()
    spark_context.stop()
def to_ray(self):
    """
    Put the partitions of this object's RDD into the plasma object store and
    return them as a RayXShards.

    Each Spark partition is stored with ``client.put`` after the executor-side
    random generator has been re-seeded with a per-call random string plus the
    partition index.

    :return: a RayXShards built from one RayPartition per Spark partition,
        ordered by node ip.
    """
    import random
    import string
    from zoo.ray import RayContext

    store_address = RayContext.get().address_info["object_store_address"]

    # Generate a random string here to make sure that when this method is called twice, the
    # seeds to generate plasma ObjectID are different.
    alphabet = string.ascii_letters + string.digits
    seed_prefix = ''.join(random.choice(alphabet) for _ in range(32))

    # TODO: Handle failure when doing this?
    # TODO: delete the data in the plasma?
    def write_partition(index, iterator):
        import pyarrow.plasma as plasma
        from zoo.orca.data.utils import get_node_ip
        # mapPartition would set the same random seed for each partition?
        # Here use the partition index to override the random seed so that there won't be
        # identical object_ids in plasma.
        random.seed(seed_prefix + str(index))
        records = list(iterator)
        client = plasma.connect(store_address)
        stored_id = client.put(records)
        yield stored_id, get_node_ip()

    id_ip_pairs = self.rdd.mapPartitionsWithIndex(write_partition).collect()
    self.uncache()

    # Sort the data according to the node_ips.
    ordered_pairs = sorted(id_ip_pairs, key=lambda pair: pair[1])
    partitions = []
    for stored_id, node_ip in ordered_pairs:
        partitions.append(RayPartition(shard_list=stored_id,
                                       node_ip=node_ip,
                                       object_store_address=store_address))
    return RayXShards(partitions)
def impl_test_auto_shard(self, backend):
    """
    Check that the given backend shards the dataset by files (train loss is 0).

    :param backend: backend name passed to ``Estimator``.
    """
    # file 1 contains all 0s, file 2 contains all 1s
    # If shard by files, then each model will
    # see the same records in the same batch.
    # If shard by records, then each batch
    # will have different records.
    # The loss func is constructed such that
    # the former case will return 0, and the latter
    # case will return non-zero.
    ray_ctx = RayContext.get()
    estimator_kwargs = dict(
        model_creator=create_auto_shard_model,
        compile_args_creator=create_auto_shard_compile_args,
        verbose=True,
        config={},
        backend=backend,
        workers_per_node=2,
    )
    trainer = Estimator(**estimator_kwargs)
    stats = trainer.fit(create_auto_shard_datasets, epochs=1, steps_per_epoch=2)
    train_loss = stats["train_loss"]
    assert train_loss == 0.0
def __init__(self, config, model_creator, loss_creator=None,
             eval_metrics_creator=None, validation_metrics_creator=None,
             num_workers=None, num_servers=None, runner_cores=None):
    """
    Start MXNetRunner actors (workers and servers), launch the scheduler process
    and run the distributed setup on every runner.

    :param config: dict that must contain "optimizer", "optimizer_params" and
        "log_interval".
    :param model_creator: passed to every runner's ``setup_distributed``.
    :param loss_creator: passed to every runner's ``setup_distributed``.
    :param eval_metrics_creator: passed to every runner's ``setup_distributed``.
    :param validation_metrics_creator: passed to every runner's
        ``setup_distributed``.
    :param num_workers: number of worker runners; defaults to the number of ray
        nodes of the current RayContext.
    :param num_servers: number of server runners; defaults to ``num_workers``.
    :param runner_cores: when truthy, cpus requested per runner; workers and
        servers then also request a distinguishing custom resource.
    """
    ray_ctx = RayContext.get()
    if not num_workers:
        num_workers = ray_ctx.num_ray_nodes

    if config is None:
        self.config = {}
    else:
        self.config = config
    assert isinstance(config, dict), "config must be a dict"
    for param in ("optimizer", "optimizer_params", "log_interval"):
        assert param in config, param + " must be specified in config"

    self.model_creator = model_creator
    self.loss_creator = loss_creator
    self.validation_metrics_creator = validation_metrics_creator
    self.eval_metrics_creator = eval_metrics_creator
    self.num_workers = num_workers
    self.num_servers = num_servers or self.num_workers

    # Generate actor class
    # Add a dummy custom resource: _mxnet_worker and _mxnet_server to diff worker from server
    # if runner_cores is specified so that we can place one worker and one server on a node
    # for better performance.
    if runner_cores:
        Worker = ray.remote(num_cpus=runner_cores,
                            resources={"_mxnet_worker": 1})(MXNetRunner)
        Server = ray.remote(num_cpus=runner_cores,
                            resources={"_mxnet_server": 1})(MXNetRunner)
    else:
        Worker = ray.remote(MXNetRunner)
        Server = ray.remote(MXNetRunner)

    # Start runners: workers followed by servers
    self.workers = [Worker.remote() for _ in range(self.num_workers)]
    self.servers = [Server.remote() for _ in range(self.num_servers)]
    self.runners = self.workers + self.servers

    env = {
        "DMLC_PS_ROOT_URI": str(get_host_ip()),
        "DMLC_PS_ROOT_PORT": str(find_free_port()),
        "DMLC_NUM_SERVER": str(self.num_servers),
        "DMLC_NUM_WORKER": str(self.num_workers),
    }
    # One environment per runner, in the same order as self.runners.
    worker_envs = [dict(env, DMLC_ROLE='worker') for _ in range(self.num_workers)]
    server_envs = [dict(env, DMLC_ROLE='server') for _ in range(self.num_servers)]
    envs = worker_envs + server_envs

    env['DMLC_ROLE'] = 'scheduler'
    modified_env = os.environ.copy()
    modified_env.update(env)
    # Need to contain system env to run bash
    # TODO: Need to kill this process manually?
    subprocess.Popen("python -c 'import mxnet'", shell=True, env=modified_env)

    setup_tasks = [
        runner.setup_distributed.remote(runner_env, self.config,
                                        self.model_creator, self.loss_creator,
                                        self.validation_metrics_creator,
                                        self.eval_metrics_creator)
        for runner_env, runner in zip(envs, self.runners)
    ]
    ray.get(setup_tasks)
help="The file path to be read") (options, args) = parser.parse_args(sys.argv) # Prepare csv files df = pd.read_csv(options.file_path) sc = init_spark_on_local(cores="*") sqlContext = SQLContext(sc) num_nodes, num_cores = get_node_and_core_number() df_spark = sqlContext.createDataFrame(df) df_spark.printSchema() df_spark.repartition(num_cores).write.\ format('json').mode("overwrite").save("/tmp/ray-pandas-example") # init ray context ray_ctx = RayContext(sc=sc, object_store_memory="5g") ray_ctx.init(object_store_memory="5g") # read data data_shard = zoo.xshard.pandas.read_json("/tmp/ray-pandas-example", ray_ctx) # collect data data = data_shard.collect() print("collected data :") print(data[0].head()) # repartition partitions = data_shard.get_partitions() print("get %d partitions" % len(partitions)) data_shard.repartition(2)
help='The number of executor cores you want to use.') parser.add_argument('-n', '--num_workers', type=int, default=2, help='The number of workers to be launched.') opt = parser.parse_args() if opt.hadoop_conf: assert opt.conda_name is not None, "conda_name must be specified for yarn mode" sc = init_spark_on_yarn(hadoop_conf=opt.hadoop_conf, conda_name=opt.conda_name, num_executors=opt.num_workers, executor_cores=opt.executor_cores) else: sc = init_spark_on_local(cores="*") ray_ctx = RayContext(sc=sc) ray_ctx.init() import pandas as pd df = pd.read_csv(opt.path) feature_cols = [ "FIPS", "Lower 95% Confidence Interval", "Upper 95% Confidence Interval", "Average Annual Count", "Recent 5-Year Trend" ] target_col = "Age-Adjusted Incidence Rate" train_df, val_df = train_test_split(df, test_size=0.2, random_state=2) config = {'random_state': 2, 'min_child_weight': 3, 'n_jobs': 2} estimator = AutoXGBoost().regressor(feature_cols=feature_cols, target_col=target_col,