def train_model(self, num_epochs=300, num_workers=1, early_stop=False, tenacity=10):
    if num_workers == 1:
        return self.train_model_local(num_epochs=num_epochs, early_stop=early_stop,
                                      tenacity=tenacity)
    else:
        from zoo.automl.model.tcmf.local_model_distributed_trainer import train_yseq
        import ray
        # check whether there has been an active ray context yet.
        from zoo.ray import RayContext
        RayContext.get()
        Ymat_id = ray.put(self.Ymat)
        covariates_id = ray.put(self.covariates)
        Ycov_id = ray.put(self.Ycov)
        trainer_config_keys = ["vbsize", "hbsize", "end_index", "val_len", "lr",
                               "num_inputs", "num_channels", "kernel_size", "dropout"]
        trainer_config = {k: self.__dict__[k] for k in trainer_config_keys}
        model, val_loss = train_yseq(epochs=num_epochs,
                                     num_workers=num_workers,
                                     Ymat_id=Ymat_id,
                                     covariates_id=covariates_id,
                                     Ycov_id=Ycov_id,
                                     **trainer_config)
        self.seq = model
        return val_loss
def predict(self, x=None, horizon=24, mc=False, future_covariates=None, future_dti=None,
            num_workers=None):
    """
    Predict horizon time-points ahead of the input x in fit_eval.

    :param x: We don't support input x currently.
    :param horizon: horizon length to predict
    :param mc:
    :param future_covariates: covariates corresponding to future horizon steps data to predict.
    :param future_dti: dti corresponding to future horizon steps data to predict.
    :param num_workers: the number of workers to use. Note that there has to be an active
        RayContext if num_workers > 1.
    :return:
    """
    if x is not None:
        raise ValueError("We don't support input x directly.")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before calling predict")
    self._check_covariates_dti(covariates=future_covariates, dti=future_dti,
                               ts_len=horizon, method_name="predict")
    if num_workers is None:
        num_workers = TCMF.get_default_num_workers()
    if num_workers > 1:
        import ray
        from zoo.ray import RayContext
        try:
            RayContext.get(initialize=False)
        except:
            try:
                # detect whether ray has been started.
                ray.put(None)
            except:
                raise RuntimeError(f"There must be an active ray context while running with "
                                   f"{num_workers} workers. You can either start and init a "
                                   f"RayContext by init_orca_context(..., init_ray_on_spark="
                                   f"True) or start Ray with ray.init()")
    out = self.model.predict_horizon(
        future=horizon,
        bsize=90,
        num_workers=num_workers,
        future_covariates=future_covariates,
        future_dti=future_dti,
    )
    return out[:, -horizon::]
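# A hedged usage sketch of predict() above (not from the source): `tcmf` stands for a
# hypothetical, already-fitted TCMF model instance. With num_workers > 1 an active
# RayContext is required, e.g. one created via init_orca_context(..., init_ray_on_spark=True).
#
#     horizon_preds = tcmf.predict(horizon=24, num_workers=4)
#     # horizon_preds holds the last 24 predicted time steps for every series in Ymat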
def __init__(self,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator=None,
             scheduler_creator=None,
             training_operator_cls=None,
             initialization_hook=None,
             config=None,
             num_workers=None,
             use_fp16=False,
             use_tqdm=False,
             scheduler_step_freq="batch"):
    # Lift TorchTrainer to an Actor so that its local worker would be
    # created on the cluster as well.
    RemoteTrainer = ray.remote(TorchTrainer)
    # check whether there has been an active RayContext and get it.
    ray_ctx = RayContext.get()
    if not num_workers:
        num_workers = ray_ctx.num_ray_nodes
    self.trainer = RemoteTrainer.remote(model_creator=model_creator,
                                        data_creator=data_creator,
                                        optimizer_creator=optimizer_creator,
                                        loss_creator=loss_creator,
                                        scheduler_creator=scheduler_creator,
                                        training_operator_cls=training_operator_cls,
                                        initialization_hook=initialization_hook,
                                        config=config,
                                        num_workers=num_workers,
                                        backend="gloo",
                                        use_fp16=use_fp16,
                                        use_tqdm=use_tqdm,
                                        scheduler_step_freq=scheduler_step_freq)
def test_parquet_images_training(self):
    from zoo.orca.learn.tf2 import Estimator
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        output_shapes = {"id": (), "image": (), "label": ()}

        def data_creator(config, batch_size):
            dataset = read_parquet("tf_dataset", input_path=path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
            dataset = dataset.shuffle(10)
            dataset = dataset.map(lambda data_dict: (data_dict["image"], data_dict["label"]))
            dataset = dataset.map(parse_data_train)
            dataset = dataset.batch(batch_size)
            return dataset

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(model_creator=model_creator)
        trainer.fit(data=data_creator, epochs=1, batch_size=2)
    finally:
        shutil.rmtree(temp_dir)
def _from_spark_rdd_ray_api(rdd):
    ray_ctx = RayContext.get()
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    driver_ip = ray.services.get_node_ip_address()
    uuid_str = str(uuid.uuid4())
    meta_store = MetaStore.options(name=f"meta_store:{uuid_str}").remote()
    resources = ray.cluster_resources()
    nodes = []
    for key, value in resources.items():
        if key.startswith("node:"):
            # if running in cluster, filter out driver ip
            if not (not ray_ctx.is_local and key == f"node:{driver_ip}"):
                nodes.append(key)
    partition_stores = {}
    for node in nodes:
        name = f"partition:{uuid_str}:{node}"
        store = ray.remote(num_cpus=0, resources={node: 1e-4})(PartitionUploader)\
            .options(name=name).remote(meta_store)
        partition_stores[name] = store
    partition_store_names = list(partition_stores.keys())
    id2ip = rdd.mapPartitionsWithIndex(lambda idx, part: write_to_ray(
        idx, part, address, password, partition_store_names)).collect()
    return RayRdd(uuid_str, meta_store, dict(id2ip))
def _from_spark_xshards_ray_api(spark_xshards):
    ray_ctx = RayContext.get()
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    driver_ip = ray._private.services.get_node_ip_address()
    uuid_str = str(uuid.uuid4())
    resources = ray.cluster_resources()
    nodes = []
    for key, value in resources.items():
        if key.startswith("node:"):
            # if running in cluster, filter out driver ip
            if ray_ctx.is_local or key != f"node:{driver_ip}":
                nodes.append(key)
    partition_stores = {}
    for node in nodes:
        name = f"partition:{uuid_str}:{node}"
        store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)\
            .options(name=name).remote()
        partition_stores[name] = store
    # actor creation is async, this is to make sure they all have been started
    ray.get([v.get_partitions.remote() for v in partition_stores.values()])
    partition_store_names = list(partition_stores.keys())
    result = spark_xshards.rdd.mapPartitionsWithIndex(lambda idx, part: write_to_ray(
        idx, part, address, password, partition_store_names)).collect()
    id2ip = {idx: ip for idx, ip, _ in result}
    id2store_name = {idx: store for idx, _, store in result}
    return RayXShards(uuid_str, dict(id2store_name), dict(id2ip), partition_stores)
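# A self-contained sketch (assumptions: a local Ray installation; all names are
# illustrative) of the actor-placement trick used above: Ray exposes an automatic custom
# resource "node:<ip>" on every node, so requesting a tiny fraction of it (1e-4) pins a
# zero-CPU actor to that node without reserving any cores there.
import ray


@ray.remote(num_cpus=0)
class PinnedStore:
    def ping(self):
        return "ready"


ray.init(ignore_reinit_error=True)
# pick any node resource reported by the cluster and pin one store actor to it
node_key = next(k for k in ray.cluster_resources() if k.startswith("node:"))
store = PinnedStore.options(name="pinned_store_demo", resources={node_key: 1e-4}).remote()
print(ray.get(store.ping.remote()))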
def fit(self,
        input_df,
        validation_df=None,
        metric="mse",
        recipe=SmokeRecipe(),
        mc=False,
        resources_per_trial={"cpu": 2},
        upload_dir=None):
    """
    Trains the model for time sequence prediction. If future sequence length > 1, use
    seq2seq model, else use vanilla LSTM model.

    :param input_df: The input time series data frame. Example:

        datetime    value   "extra feature 1"   "extra feature 2"
        2019-01-01  1.9     1                   2
        2019-01-02  2.3     0                   2

    :param validation_df: validation data
    :param metric: String. Metric used for train and validation.
        Available values are "mean_squared_error" or "r_square".
    :param recipe: a Recipe object. Various recipes cover different search spaces and
        stopping criteria. Default is SmokeRecipe().
    :param resources_per_trial: Machine resources to allocate per trial,
        e.g. ``{"cpu": 64, "gpu": 8}``
    :param upload_dir: Optional URI to sync training results and checkpoints. We only
        support hdfs URI for now. It defaults to
        "hdfs:///user/{hadoop_user_name}/ray_checkpoints/{predictor_name}", where
        hadoop_user_name is specified in init_orca_context or init_spark_on_yarn and
        defaults to "root", and predictor_name is the name used in predictor instantiation.
    :return: a pipeline constructed with the best model and configs.
    """
    self._check_df(input_df)
    if validation_df is not None:
        self._check_df(validation_df)
    ray_ctx = RayContext.get()
    is_local = ray_ctx.is_local
    # BasePredictor._check_fit_metric(metric)
    if not is_local:
        if not upload_dir:
            hadoop_user_name = os.getenv("HADOOP_USER_NAME")
            upload_dir = os.path.join(os.sep, "user", hadoop_user_name,
                                      "ray_checkpoints", self.name)
        cmd = "hadoop fs -mkdir -p {}".format(upload_dir)
        process(cmd)
    else:
        upload_dir = None
    self.metric = metric
    self.pipeline = self._hp_search(input_df,
                                    validation_df=validation_df,
                                    metric=metric,
                                    recipe=recipe,
                                    mc=mc,
                                    resources_per_trial=resources_per_trial,
                                    remote_dir=upload_dir)
    return self.pipeline
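# A hedged usage sketch of fit() above (illustrative only): `predictor` stands for a
# predictor object created elsewhere that exposes this fit(); the frame layout follows the
# docstring, with a datetime column, a value column and optional extra-feature columns.
import pandas as pd

input_df = pd.DataFrame({
    "datetime": pd.date_range("2019-01-01", periods=100, freq="D"),
    "value": [float(i) for i in range(100)],
    "extra feature 1": [1] * 100,
    "extra feature 2": [2] * 100,
})
# pipeline = predictor.fit(input_df, metric="mse")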
def get_default_num_workers():
    from zoo.ray import RayContext
    try:
        ray_ctx = RayContext.get(initialize=False)
        num_workers = ray_ctx.num_ray_nodes
    except:
        num_workers = 1
    return num_workers
def impl_test_fit_and_evaluate(self, backend):
    import tensorflow as tf
    ray_ctx = RayContext.get()
    batch_size = 32
    global_batch_size = batch_size * ray_ctx.num_ray_nodes
    config = {"batch_size": global_batch_size}
    if backend == "horovod":
        trainer = Estimator.from_keras(model_creator=simple_model,
                                       compile_args_creator=compile_args,
                                       verbose=True,
                                       config=config,
                                       backend=backend)
    else:
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       backend=backend,
                                       workers_per_node=2)
    # model baseline performance
    start_stats = trainer.evaluate(create_test_dataset,
                                   steps=NUM_TEST_SAMPLES // global_batch_size)
    print(start_stats)

    def scheduler(epoch):
        if epoch < 2:
            return 0.001
        else:
            return 0.001 * tf.math.exp(0.1 * (2 - epoch))

    scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)
    # train for 2 epochs
    trainer.fit(create_train_datasets, epochs=2, steps_per_epoch=10, callbacks=[scheduler])
    trainer.fit(create_train_datasets, epochs=2, steps_per_epoch=10, callbacks=[scheduler])
    # model performance after training (should improve)
    end_stats = trainer.evaluate(create_test_dataset,
                                 steps=NUM_TEST_SAMPLES // global_batch_size)
    print(end_stats)
    # sanity check that training worked
    dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
    dmse = (end_stats["validation_mean_squared_error"] -
            start_stats["validation_mean_squared_error"])
    print(f"dLoss: {dloss}, dMSE: {dmse}")
    assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"
def __init__(self,
             *,
             model_creator,
             data_creator,
             optimizer_creator,
             loss_creator=None,
             scheduler_creator=None,
             training_operator_cls=TrainingOperator,
             initialization_hook=None,
             config=None,
             scheduler_step_freq="batch"):
    if not (callable(model_creator) and callable(optimizer_creator)
            and callable(data_creator)):
        raise ValueError(
            "Must provide a callable model_creator, optimizer_creator, "
            "and data_creator.")
    self.model_creator = model_creator
    self.optimizer_creator = optimizer_creator
    self.loss_creator = loss_creator
    self.data_creator = data_creator
    self.scheduler_creator = scheduler_creator
    self.training_operator_cls = training_operator_cls
    self.scheduler_step_freq = scheduler_step_freq
    if not training_operator_cls and not loss_creator:
        raise ValueError("If a loss_creator is not provided, you must "
                         "provide a custom training operator.")
    self.initialization_hook = initialization_hook
    self.config = {} if config is None else config
    self.param = dict(model_creator=self.model_creator,
                      data_creator=self.data_creator,
                      optimizer_creator=self.optimizer_creator,
                      loss_creator=self.loss_creator,
                      scheduler_creator=self.scheduler_creator,
                      training_operator_cls=self.training_operator_cls,
                      scheduler_step_freq=self.scheduler_step_freq)
    super().__init__(RayContext.get(), worker_cls=TorchWorker, worker_param=self.param)

    def init_func():
        import torch
        torch.set_num_threads(self.cores_per_node)
        print("Worker initialized")

    self.run(init_func)
    remote_setups = [worker.setup.remote(None, None, None)
                     for i, worker in enumerate(self.remote_workers)]
    # Get setup tasks in order to throw errors on failure
    ray.get(remote_setups)
def get_default_remote_dir(name):
    from zoo.ray import RayContext
    from zoo.orca.automl.search.utils import process
    ray_ctx = RayContext.get()
    if ray_ctx.is_local:
        return None
    else:
        default_remote_dir = f"hdfs:///tmp/{name}"
        process(command=f"hadoop fs -mkdir -p {default_remote_dir}")
        return default_remote_dir
def to_spark_rdd(self):
    ray_ctx = RayContext.get()
    sc = ray_ctx.sc
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    num_parts = ray.get(self.meta_store.num_partitions.remote())
    meta_store_name = f"meta_store:{self.uuid}"
    rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
        .mapPartitionsWithIndex(
            lambda idx, _: get_from_ray(idx, address, password, meta_store_name))
    return rdd
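# Why parallelize [0] * num_parts * 10 into num_parts partitions: Spark only runs the
# mapPartitionsWithIndex function on partitions that actually exist, and over-provisioning
# the dummy elements (presumably) guards against empty partitions, so get_from_ray is
# invoked exactly once per Ray partition index. A standalone PySpark sketch of the idea
# (local mode assumed; names are illustrative):
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
num_parts = 4
dummy_rdd = sc.parallelize([0] * num_parts * 10, num_parts)
# every partition index 0 .. num_parts - 1 appears exactly once
indices = dummy_rdd.mapPartitionsWithIndex(lambda idx, _: [idx]).collect()
assert sorted(indices) == list(range(num_parts))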
def from_partition_refs(parts_refs, part_ids, part_id2ip):
    ray_ctx = RayContext.get()
    uuid_str = str(uuid.uuid4())
    meta_store = MetaStore.options(name=f"meta_store:{uuid_str}").remote()
    results = []
    for part_id, part_ref in zip(part_ids, parts_refs):
        result = meta_store.set_partition_ref.remote(part_id, [part_ref])
        results.append(result)
    ray.get(results)
    return RayRdd(uuid_str, meta_store, part_id2ip)
def test_gluon(self):
    current_ray_ctx = RayContext.get()
    address_info = current_ray_ctx.address_info
    assert "object_store_address" in address_info
    config = create_config(log_interval=2, optimizer="adam",
                           optimizer_params={'learning_rate': 0.02})
    estimator = Estimator(config,
                          get_model,
                          get_loss,
                          eval_metrics_creator=get_metrics,
                          validation_metrics_creator=get_metrics,
                          num_workers=2)
    estimator.fit(get_train_data_iter, validation_data=get_test_data_iter, epochs=2)
    estimator.shutdown()
def to_spark_xshards(self):
    from zoo.orca.data import SparkXShards
    ray_ctx = RayContext.get()
    sc = ray_ctx.sc
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    num_parts = self.num_partitions()
    partition2store = self.partition2store_name
    rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
        .mapPartitionsWithIndex(
            lambda idx, _: get_from_ray(idx, address, password, partition2store))
    spark_xshards = SparkXShards(rdd)
    return spark_xshards
def test_horovod_learning_rate_schedule(self):
    import horovod
    major, minor, patch = horovod.__version__.split(".")
    larger_major = int(major) > 0
    larger_minor = int(major) == 0 and int(minor) > 19
    larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2
    if larger_major or larger_minor or larger_patch:
        ray_ctx = RayContext.get()
        batch_size = 32
        workers_per_node = 4
        global_batch_size = batch_size * workers_per_node
        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=simple_model,
                                       compile_args_creator=compile_args,
                                       verbose=True,
                                       config=config,
                                       backend="horovod",
                                       workers_per_node=workers_per_node)
        import horovod.tensorflow.keras as hvd
        callbacks = [
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=0.4,
                                                     verbose=True),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=5, end_epoch=10,
                                                       multiplier=1., initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=10, end_epoch=15,
                                                       multiplier=1e-1, initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=15, end_epoch=20,
                                                       multiplier=1e-2, initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=20, multiplier=1e-3,
                                                       initial_lr=0.4),
            LRChecker()
        ]
        for i in range(30):
            trainer.fit(create_train_datasets, epochs=1,
                        batch_size=global_batch_size, callbacks=callbacks)
    else:
        # skip the test for lower horovod versions
        pass
def stop_orca_context():
    """
    Stop the SparkContext (and stop Ray services across the cluster if necessary).
    """
    from pyspark import SparkContext
    from zoo.ray import RayContext
    ray_ctx = RayContext.get(initialize=False)
    if ray_ctx.initialized:
        ray_ctx.stop()
    sc = SparkContext.getOrCreate()
    if sc.getConf().get("spark.master").startswith("spark://"):
        from zoo import stop_spark_standalone
        stop_spark_standalone()
    sc.stop()
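# A minimal lifecycle sketch (assumes an Analytics Zoo installation, local mode):
# init_orca_context starts the SparkContext (and Ray on Spark when init_ray_on_spark=True),
# and stop_orca_context above tears both down again.
from zoo.orca import init_orca_context, stop_orca_context

sc = init_orca_context(cluster_mode="local", cores=2, init_ray_on_spark=True)
# ... run Spark / Ray workloads here ...
stop_orca_context()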
def fit(self,
        input_df,
        validation_df=None,
        metric="mse",
        recipe=SmokeRecipe(),
        mc=False,
        resources_per_trial={"cpu": 2}):
    """
    Trains the model for time sequence prediction. If future sequence length > 1, use
    seq2seq model, else use vanilla LSTM model.

    :param input_df: The input time series data frame. Example:

        datetime    value   "extra feature 1"   "extra feature 2"
        2019-01-01  1.9     1                   2
        2019-01-02  2.3     0                   2

    :param validation_df: validation data
    :param metric: String. Metric used for train and validation.
        Available values are "mean_squared_error" or "r_square".
    :param recipe: a Recipe object. Various recipes cover different search spaces and
        stopping criteria. Default is SmokeRecipe().
    :param resources_per_trial: Machine resources to allocate per trial,
        e.g. ``{"cpu": 64, "gpu": 8}``
    :return: a pipeline constructed with the best model and configs.
    """
    self._check_df(input_df)
    if validation_df is not None:
        self._check_df(validation_df)
    ray_ctx = RayContext.get()
    is_local = ray_ctx.is_local
    # BasePredictor._check_fit_metric(metric)
    if not is_local:
        remote_dir = os.path.join(os.sep, "ray_results", self.name)
        if self.name not in get_remote_list(os.path.dirname(remote_dir)):
            cmd = "hadoop fs -mkdir -p {}".format(remote_dir)
            process(cmd)
    else:
        remote_dir = None
    self.pipeline = self._hp_search(input_df,
                                    validation_df=validation_df,
                                    metric=metric,
                                    recipe=recipe,
                                    mc=mc,
                                    resources_per_trial=resources_per_trial,
                                    remote_dir=remote_dir)
    return self.pipeline
def to_ray(self):
    """
    Put the data of this SparkXShards into the Ray cluster object store.

    :return: a new RayXShards which contains the data of this SparkXShards.
    """
    from zoo.ray import RayContext
    ray_ctx = RayContext.get()
    object_store_address = ray_ctx.address_info["object_store_address"]

    def put_to_plasma(ids):
        def f(index, iterator):
            import pyarrow.plasma as plasma
            from zoo.util.utils import get_node_ip
            res = list(iterator)
            client = plasma.connect(object_store_address)
            target_id = ids[index]
            # If the ObjectID exists in plasma, we assume a task trial
            # succeeds and the data is already in the object store.
            if not client.contains(target_id):
                object_id = client.put(res, target_id)
                assert object_id == target_id, \
                    "Errors occurred when putting data into plasma object store"
            client.disconnect()
            yield target_id, get_node_ip()
        return f

    # Create plasma ObjectIDs beforehand instead of creating a random one every time to avoid
    # memory leak in case errors occur when putting data into plasma and Spark would retry.
    # ObjectIDs in plasma are byte strings of length 20 containing characters and numbers.
    # The random generation of ObjectIDs is often good enough to ensure unique IDs.
    import pyarrow.plasma as plasma
    object_ids = [plasma.ObjectID.from_random() for i in range(self.rdd.getNumPartitions())]
    object_id_node_ips = self.rdd.mapPartitionsWithIndex(put_to_plasma(object_ids)).collect()
    self.uncache()
    # Sort the data according to the node_ips.
    object_id_node_ips.sort(key=lambda x: x[1])
    partitions = [RayPartition(object_id=id_ip[0],
                               node_ip=id_ip[1],
                               object_store_address=object_store_address)
                  for id_ip in object_id_node_ips]
    return RayXShards(partitions)
def test_auto_shard_tf(self):
    # file 1 contains all 0s, file 2 contains all 1s
    # If shard by files, then each model will
    # see the same records in the same batch.
    # If shard by records, then each batch
    # will have different records.
    # The loss func is constructed such that
    # the former case will return 0, and the latter
    # case will return non-zero.
    ray_ctx = RayContext.get()
    trainer = Estimator.from_keras(model_creator=auto_shard_model_creator,
                                   verbose=True,
                                   backend="tf2",
                                   workers_per_node=2)
    stats = trainer.fit(create_auto_shard_datasets, epochs=1, batch_size=4,
                        steps_per_epoch=2)
    assert stats["train_loss"] == 0.0
def _from_spark_xshards_ray_api(spark_xshards):
    ray_ctx = RayContext.get()
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    driver_ip = ray._private.services.get_node_ip_address()
    uuid_str = str(uuid.uuid4())
    resources = ray.cluster_resources()
    nodes = []
    for key, value in resources.items():
        if key.startswith("node:"):
            # if running in cluster, filter out driver ip
            if key != f"node:{driver_ip}":
                nodes.append(key)
    # for the case of local mode and single node spark standalone
    if not nodes:
        nodes.append(f"node:{driver_ip}")
    partition_stores = {}
    for node in nodes:
        name = f"partition:{uuid_str}:{node}"
        store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)\
            .options(name=name).remote()
        partition_stores[name] = store
    # actor creation is async, this is to make sure they all have been started
    ray.get([v.get_partitions.remote() for v in partition_stores.values()])
    partition_store_names = list(partition_stores.keys())
    result = spark_xshards.rdd.mapPartitionsWithIndex(
        lambda idx, part: write_to_ray(idx, part, address, password,
                                       partition_store_names)).collect()
    num_empty_partitions = 0
    id2ip = {}
    id2store_name = {}
    for idx, ip, local_store_name, is_empty in result:
        id2ip[idx] = ip
        id2store_name[idx] = local_store_name
        if is_empty:
            num_empty_partitions += 1
    if num_empty_partitions > 0:
        logger.warning(f"Found {num_empty_partitions} empty partitions in your SparkXShards.")
    return RayXShards(uuid_str, dict(id2store_name), dict(id2ip), partition_stores)
def test_gluon(self):
    current_ray_ctx = RayContext.get()
    address_info = current_ray_ctx.address_info
    assert "object_store_address" in address_info
    config = create_trainer_config(batch_size=32, log_interval=2, optimizer="adam",
                                   optimizer_params={'learning_rate': 0.02})
    trainer = MXNetTrainer(config,
                           get_train_data_iter,
                           get_model,
                           get_loss,
                           eval_metrics_creator=get_metrics,
                           validation_metrics_creator=get_metrics,
                           num_workers=2,
                           test_data=get_test_data_iter)
    trainer.train(nb_epoch=2)
def from_partition_refs(ip2part_id, part_id2ref):
    ray_ctx = RayContext.get()
    uuid_str = str(uuid.uuid4())
    id2store_name = {}
    partition_stores = {}
    part_id2ip = {}
    result = []
    for node, part_ids in ip2part_id.items():
        name = f"partition:{uuid_str}:{node}"
        store = ray.remote(num_cpus=0, resources={f"node:{node}": 1e-4})(LocalStore) \
            .options(name=name).remote()
        partition_stores[name] = store
        for idx in part_ids:
            result.append(store.upload_partition.remote(idx, part_id2ref[idx]))
            id2store_name[idx] = name
            part_id2ip[idx] = node
    ray.get(result)
    return RayXShards(uuid_str, id2store_name, part_id2ip, partition_stores)
def _from_spark_xshards_ray_api(spark_xshards):
    ray_ctx = RayContext.get()
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    driver_ip = ray._private.services.get_node_ip_address()
    uuid_str = str(uuid.uuid4())
    resources = ray.cluster_resources()
    nodes = []
    for key, value in resources.items():
        if key.startswith("node:"):
            # if running in cluster, filter out driver ip
            if key != f"node:{driver_ip}":
                nodes.append(key)
    # for the case of local mode and single node spark standalone
    if not nodes:
        nodes.append(f"node:{driver_ip}")
    partition_stores = {}
    for node in nodes:
        name = f"partition:{uuid_str}:{node}"
        if version.parse(ray.__version__) >= version.parse("1.4.0"):
            store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)\
                .options(name=name, lifetime="detached").remote()
        else:
            store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore) \
                .options(name=name).remote()
        partition_stores[name] = store
    # actor creation is async, this is to make sure they all have been started
    ray.get([v.get_partitions.remote() for v in partition_stores.values()])
    partition_store_names = list(partition_stores.keys())
    result_rdd = spark_xshards.rdd.mapPartitionsWithIndex(
        lambda idx, part: write_to_ray(idx, part, address, password,
                                       partition_store_names)).cache()
    result = result_rdd.collect()
    id2ip = {}
    id2store_name = {}
    for idx, ip, local_store_name in result:
        id2ip[idx] = ip
        id2store_name[idx] = local_store_name
    return RayXShards(uuid_str, result_rdd, partition_stores)
def stop_orca_context():
    """
    Stop the SparkContext (and stop Ray services across the cluster if necessary).
    """
    from pyspark import SparkContext
    # If users successfully call stop_orca_context after the program finishes,
    # namely when there is no active SparkContext, the registered exit function
    # should do nothing.
    if SparkContext._active_spark_context is not None:
        print("Stopping orca context")
        from zoo.ray import RayContext
        ray_ctx = RayContext.get(initialize=False)
        if ray_ctx.initialized:
            ray_ctx.stop()
        sc = SparkContext.getOrCreate()
        if sc.getConf().get("spark.master").startswith("spark://"):
            from zoo import stop_spark_standalone
            stop_spark_standalone()
        sc.stop()
def to_spark_xshards(self):
    from zoo.orca.data import SparkXShards
    ray_ctx = RayContext.get()
    sc = ray_ctx.sc
    address = ray_ctx.redis_address
    password = ray_ctx.redis_password
    num_parts = self.num_partitions()
    partition2store = self.partition2store_name
    rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
        .mapPartitionsWithIndex(
            lambda idx, _: get_from_ray(idx, address, password, partition2store))
    # the reason why we trigger computation here is to ensure we get the data
    # from ray before the RayXShards goes out of scope and the data gets garbage collected
    from pyspark.storagelevel import StorageLevel
    rdd = rdd.cache()
    result_rdd = rdd.map(lambda x: x)  # SparkXShards will uncache the rdd when gc'ed
    spark_xshards = SparkXShards(result_rdd)
    return spark_xshards
def to_ray(self):
    import random
    import string
    from zoo.ray import RayContext
    ray_ctx = RayContext.get()
    object_store_address = ray_ctx.address_info["object_store_address"]

    # TODO: Handle failure when doing this?
    # TODO: delete the data in the plasma?
    def put_to_plasma(seed):
        def f(index, iterator):
            import pyarrow.plasma as plasma
            from zoo.orca.data.utils import get_node_ip
            # mapPartition would set the same random seed for each partition?
            # Here use the partition index to override the random seed so that there won't be
            # identical object_ids in plasma.
            random.seed(seed + str(index))
            res = list(iterator)
            client = plasma.connect(object_store_address)
            object_id = client.put(res)
            yield object_id, get_node_ip()
        return f

    # Generate a random string here to make sure that when this method is called twice, the
    # seeds to generate plasma ObjectID are different.
    random_str = ''.join([random.choice(string.ascii_letters + string.digits)
                          for i in range(32)])
    object_id_node_ips = self.rdd.mapPartitionsWithIndex(put_to_plasma(random_str)).collect()
    self.uncache()
    # Sort the data according to the node_ips.
    object_id_node_ips.sort(key=lambda x: x[1])
    partitions = [RayPartition(shard_list=id_ip[0],
                               node_ip=id_ip[1],
                               object_store_address=object_store_address)
                  for id_ip in object_id_node_ips]
    return RayXShards(partitions)
def impl_test_auto_shard(self, backend):
    # file 1 contains all 0s, file 2 contains all 1s
    # If shard by files, then each model will
    # see the same records in the same batch.
    # If shard by records, then each batch
    # will have different records.
    # The loss func is constructed such that
    # the former case will return 0, and the latter
    # case will return non-zero.
    ray_ctx = RayContext.get()
    trainer = Estimator(model_creator=create_auto_shard_model,
                        compile_args_creator=create_auto_shard_compile_args,
                        verbose=True,
                        config={},
                        backend=backend,
                        workers_per_node=2)
    stats = trainer.fit(create_auto_shard_datasets, epochs=1, steps_per_epoch=2)
    assert stats["train_loss"] == 0.0
def __init__(self, config, model_creator, loss_creator=None,
             eval_metrics_creator=None, validation_metrics_creator=None,
             num_workers=None, num_servers=None, runner_cores=None):
    ray_ctx = RayContext.get()
    if not num_workers:
        num_workers = ray_ctx.num_ray_nodes
    self.config = {} if config is None else config
    assert isinstance(config, dict), "config must be a dict"
    for param in ["optimizer", "optimizer_params", "log_interval"]:
        assert param in config, param + " must be specified in config"
    self.model_creator = model_creator
    self.loss_creator = loss_creator
    self.validation_metrics_creator = validation_metrics_creator
    self.eval_metrics_creator = eval_metrics_creator
    self.num_workers = num_workers
    self.num_servers = num_servers if num_servers else self.num_workers
    # Generate actor class
    # Add dummy custom resources _mxnet_worker and _mxnet_server to distinguish workers from
    # servers if runner_cores is specified, so that we can place one worker and one server
    # on a node for better performance.
    Worker = ray.remote(num_cpus=runner_cores, resources={"_mxnet_worker": 1})(MXNetRunner) \
        if runner_cores else ray.remote(MXNetRunner)
    Server = ray.remote(num_cpus=runner_cores, resources={"_mxnet_server": 1})(MXNetRunner) \
        if runner_cores else ray.remote(MXNetRunner)
    # Start runners: workers followed by servers
    self.workers = [Worker.remote() for i in range(self.num_workers)]
    self.servers = [Server.remote() for i in range(self.num_servers)]
    self.runners = self.workers + self.servers
    env = {
        "DMLC_PS_ROOT_URI": str(get_host_ip()),
        "DMLC_PS_ROOT_PORT": str(find_free_port()),
        "DMLC_NUM_SERVER": str(self.num_servers),
        "DMLC_NUM_WORKER": str(self.num_workers),
    }
    envs = []
    for i in range(self.num_workers):
        current_env = env.copy()
        current_env['DMLC_ROLE'] = 'worker'
        envs.append(current_env)
    for i in range(self.num_servers):
        current_env = env.copy()
        current_env['DMLC_ROLE'] = 'server'
        envs.append(current_env)
    env['DMLC_ROLE'] = 'scheduler'
    modified_env = os.environ.copy()
    modified_env.update(env)
    # Need to contain system env to run bash
    # TODO: Need to kill this process manually?
    subprocess.Popen("python -c 'import mxnet'", shell=True, env=modified_env)
    ray.get([runner.setup_distributed.remote(envs[i], self.config, self.model_creator,
                                             self.loss_creator,
                                             self.validation_metrics_creator,
                                             self.eval_metrics_creator)
             for i, runner in enumerate(self.runners)])
def __init__(self, model_creator, compile_args_creator=None, config=None,
             verbose=False, backend="tf2", workers_per_node=1):
    self.model_creator = model_creator
    self.compile_args_creator = compile_args_creator
    self.config = {} if config is None else config
    self.verbose = verbose
    ray_ctx = RayContext.get()
    if "batch_size" in self.config:
        raise Exception("Please do not specify batch_size in config. Input batch_size in"
                        " the fit/evaluate function of the estimator instead.")
    if "inter_op_parallelism" not in self.config:
        self.config["inter_op_parallelism"] = 1
    if "intra_op_parallelism" not in self.config:
        self.config["intra_op_parallelism"] = ray_ctx.ray_node_cpu_cores // workers_per_node
    if backend == "horovod":
        assert compile_args_creator is not None, "compile_args_creator should not be None," \
                                                 " when backend is set to horovod"
    params = {
        "model_creator": model_creator,
        "compile_args_creator": compile_args_creator,
        "config": self.config,
        "verbose": self.verbose,
    }
    if backend == "tf2":
        cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
        num_nodes = ray_ctx.num_ray_nodes * workers_per_node
        worker_class = ray.remote(num_cpus=cores_per_node)(TFRunner)
        self.remote_workers = [worker_class.remote(**params) for i in range(0, num_nodes)]
        ips = ray.get([worker.get_node_ip.remote() for worker in self.remote_workers])
        ports = ray.get([worker.find_free_port.remote() for worker in self.remote_workers])
        urls = ["{ip}:{port}".format(ip=ips[i], port=ports[i])
                for i in range(len(self.remote_workers))]
        # Get setup tasks in order to throw errors on failure
        ray.get([worker.setup_distributed.remote(urls, i, len(self.remote_workers))
                 for i, worker in enumerate(self.remote_workers)])
    elif backend == "horovod":
        # it is necessary to call self.run first to set horovod environment
        from zoo.orca.learn.horovod.horovod_ray_runner import HorovodRayRunner
        horovod_runner = HorovodRayRunner(ray_ctx,
                                          worker_cls=TFRunner,
                                          worker_param=params,
                                          workers_per_node=workers_per_node)
        horovod_runner.run(lambda: print("worker initialized"))
        self.remote_workers = horovod_runner.remote_workers
        ray.get([worker.setup_horovod.remote()
                 for i, worker in enumerate(self.remote_workers)])
    else:
        raise Exception("Only \"tf2\" and \"horovod\" are legal "
                        "values of backend, but got {}".format(backend))
    self.num_workers = len(self.remote_workers)
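# Worked example of the resource split in the "tf2" branch above (values are hypothetical):
# with ray_node_cpu_cores = 8, num_ray_nodes = 2 and workers_per_node = 2, each TFRunner
# actor is created with num_cpus = 8 // 2 = 4, there are 2 * 2 = 4 remote workers in total,
# and intra_op_parallelism likewise defaults to 4 unless overridden in config.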