def process_spark_xshards(spark_xshards, num_workers):
    from zoo.orca.data.ray_xshards import RayXShards

    data = spark_xshards
    # Repartition so that each remote worker is fed exactly one partition.
    if data.num_partitions() != num_workers:
        data = data.repartition(num_workers)
    ray_xshards = RayXShards.from_spark_xshards(data)
    return ray_xshards
def _predict_spark_xshards(self, xshards, params):
    ray_xshards = RayXShards.from_spark_xshards(xshards)

    def transform_func(worker, shards_ref):
        params["data_creator"] = make_data_creator(shards_ref)
        return worker.predict.remote(**params)

    # Run predict on each remote worker against its co-located shards.
    pred_shards = ray_xshards.transform_shards_with_actors(
        self.remote_workers, transform_func)
    spark_xshards = pred_shards.to_spark_xshards()
    return spark_xshards
def process_spark_xshards(spark_xshards, num_workers):
    from zoo.orca.data.ray_xshards import RayXShards

    data = spark_xshards
    if data.num_partitions() != num_workers:
        data = data.repartition(num_workers)
    # todo: currently we need this information to pad the short partitions
    # so that every model runs exactly the same number of steps in one epoch
    max_length = data.rdd.map(data_length) \
        .mapPartitions(lambda iterator: [sum(iterator)]).max()
    ray_xshards = RayXShards.from_spark_xshards(data)
    return max_length, ray_xshards
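# Hedged sketch of the data_length helper used above, which is not shown in
# this listing. Given that each shard element is a dict of
# {"x": feature, "y": label} where feature is a numpy array or a tuple of
# numpy arrays (see the evaluate() docstring below), a plausible
# implementation counts samples along the first axis. This is an inferred
# assumption, not the library's confirmed code.
import numpy as np

def data_length(data):
    x = data["x"]
    if isinstance(x, np.ndarray):
        return x.shape[0]
    # For a tuple of input arrays, all inputs share the same sample count.
    return x[0].shape[0]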
def get_ray_xshards():
    from zoo.orca.data import XShards
    from zoo.orca.data.ray_xshards import RayXShards
    import numpy as np

    ndarray_dict = {"x": np.random.randn(10, 4), "y": np.random.randn(10, 4)}
    spark_xshards = XShards.partition(ndarray_dict)
    ray_xshards = RayXShards.from_spark_xshards(spark_xshards)
    return ray_xshards, ndarray_dict
def _predict_spark_xshards(self, xshards, param):
    ray_xshards = RayXShards.from_spark_xshards(xshards)

    def transform_func(worker, shards_ref):
        # Wrap the shard refs in a creator with the (config, batch_size)
        # signature the workers expect.
        data_creator = lambda config, batch_size: shards_ref
        return worker.predict.remote(data_creator, **param)

    pred_shards = ray_xshards.transform_shards_with_actors(
        self.remote_workers, transform_func)
    spark_xshards = pred_shards.to_spark_xshards()
    return spark_xshards
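# Hedged sketch of make_data_creator, which other variants in this listing
# call but which is not shown here. Judging from the inline
# `lambda config, batch_size: shards_ref` in the variant directly above, it
# presumably just closes over the shard refs with that same signature. This
# is an inference from the surrounding code, not the library's confirmed
# implementation.
def make_data_creator(refs):
    def data_creator(config, batch_size):
        return refs
    return data_creator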
def evaluate(self, data, batch_size=32, num_steps=None, verbose=1,
             sample_weight=None, callbacks=None, data_config=None,
             feature_cols=None, label_cols=None):
    """
    Evaluates the model on the validation data set.

    :param data: evaluate data. It can be XShards, Spark DataFrame or a creator
           function which returns an Iter or DataLoader.
           If data is XShards, each partition can be a Pandas DataFrame or a
           dictionary of {'x': feature, 'y': label}, where feature(label) is a
           numpy array or a tuple of numpy arrays.
    :param batch_size: Batch size used for evaluation. Default: 32.
    :param num_steps: Total number of steps (batches of samples) before declaring
           the evaluation round finished. Ignored with the default value of `None`.
    :param verbose: Prints output of one model if True.
    :param sample_weight: Optional Numpy array of weights for the training samples,
           used for weighting the loss function. You can either pass a flat (1D)
           Numpy array with the same length as the input samples
           (1:1 mapping between weights and samples), or in the case of temporal
           data, you can pass a 2D array with shape (samples, sequence_length),
           to apply a different weight to every timestep of every sample.
    :param callbacks: List of Keras compatible callbacks to apply during evaluation.
    :param data_config: An optional dictionary that can be passed to the data
           creator function.
    :param feature_cols: Feature column name(s) of data. Only used when data is
           a Spark DataFrame or an XShards of Pandas DataFrame. Default: None.
    :param label_cols: Label column name(s) of data. Only used when data is
           a Spark DataFrame or an XShards of Pandas DataFrame. Default: None.

    :return: validation result
    """
    logger.info("Starting validation step.")
    params = dict(
        batch_size=batch_size,
        verbose=verbose,
        sample_weight=sample_weight,
        steps=num_steps,
        callbacks=callbacks,
        data_config=data_config,
    )
    from zoo.orca.data import SparkXShards

    data, _ = maybe_dataframe_to_xshards(data,
                                         validation_data=None,
                                         feature_cols=feature_cols,
                                         label_cols=label_cols,
                                         mode="evaluate")
    if isinstance(data, SparkXShards):
        # An XShards of Pandas DataFrames is first converted to the
        # {'x': feature, 'y': label} ndarray layout.
        if data._get_class_name() == 'pandas.core.frame.DataFrame':
            data = process_xshards_of_pandas_dataframe(data, feature_cols,
                                                       label_cols)
        if data.num_partitions() != self.num_workers:
            data = data.repartition(self.num_workers)
        ray_xshards = RayXShards.from_spark_xshards(data)

        def transform_func(worker, partition_refs):
            params["data_creator"] = make_data_creator(partition_refs)
            return worker.validate.remote(**params)

        worker_stats = ray_xshards.reduce_partitions_for_actors(
            self.remote_workers, transform_func)
    else:
        # data is a creator function; it should return an Iter or DataLoader.
        params["data_creator"] = data
        params_list = [params] * self.num_workers
        worker_stats = ray.get([w.validate.remote(**params_list[i])
                                for i, w in enumerate(self.remote_workers)])
        worker_stats = list(itertools.chain.from_iterable(worker_stats))
    stats = worker_stats[0].copy()
    return stats
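# A minimal usage sketch for evaluate() above. The estimator `est` is
# hypothetical; this assumes an Orca estimator with remote workers has
# already been created and trained. XShards.partition is used as in
# get_ray_xshards above to build an in-memory XShards with the
# {'x': feature, 'y': label} layout the docstring describes.
def example_evaluate_usage(est):
    import numpy as np
    from zoo.orca.data import XShards

    shards = XShards.partition({"x": np.random.randn(100, 4),
                                "y": np.random.randn(100, 1)})
    stats = est.evaluate(shards, batch_size=32, num_steps=None)
    return stats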
def process_spark_xshards(spark_xshards, num_workers):
    from zoo.orca.data.ray_xshards import RayXShards

    data = spark_xshards
    ray_xshards = RayXShards.from_spark_xshards(data)
    return ray_xshards