def evaluate(self,
             data_creator,
             verbose=1,
             sample_weight=None,
             steps=None,
             callbacks=None,
             data_config=None,
             feature_cols=None,
             label_cols=None):
    """Run validation on the remote workers and return one worker's stats.

    ``data_creator`` may be a Spark DataFrame (converted to SparkXShards
    using ``feature_cols`` and ``label_cols``), a SparkXShards, or any
    other object, which is forwarded unchanged to every worker as the
    ``data_creator`` argument of ``validate``.

    :return: a copy of the first entry of the collected worker stats.
    """
    logger.info("Starting validation step.")
    validate_kwargs = {
        "verbose": verbose,
        "sample_weight": sample_weight,
        "steps": steps,
        "callbacks": callbacks,
        "data_config": data_config,
    }
    from zoo.orca.data import SparkXShards
    from pyspark.sql import DataFrame

    if isinstance(data_creator, DataFrame):
        assert feature_cols is not None,\
            "feature_col must be provided if data_creator is a spark dataframe"
        assert label_cols is not None,\
            "label_cols must be provided if data_creator is a spark dataframe"
        df_schema = data_creator.schema

        def row_to_numpy(row):
            return convert_row_to_numpy(row, df_schema, feature_cols,
                                        label_cols)

        def rows_to_shards(rows):
            return arrays2dict(rows, feature_cols, label_cols)

        converted_rdd = data_creator.rdd.map(row_to_numpy)
        data_creator = SparkXShards(converted_rdd.mapPartitions(rows_to_shards))

    if not isinstance(data_creator, SparkXShards):
        # data_creator functions; should return Iter or DataLoader
        validate_kwargs["data_creator"] = data_creator
        per_worker_kwargs = [validate_kwargs] * self.num_workers
        pending = [
            worker.validate.remote(**per_worker_kwargs[idx])
            for idx, worker in enumerate(self.remote_workers)
        ]
        worker_stats = ray.get(pending)
        # Each worker's result is itself iterable; flatten them into one list.
        worker_stats = list(itertools.chain.from_iterable(worker_stats))
    else:
        shards = data_creator
        # Match the partition count to the number of workers.
        if shards.num_partitions() != self.num_workers:
            shards = shards.repartition(self.num_workers)
        ray_xshards = RayXShards.from_spark_xshards(shards)

        def transform_func(worker, shards_ref):
            validate_kwargs["data_creator"] = shards_ref_to_creator(shards_ref)
            return worker.validate.remote(**validate_kwargs)

        stats_shards = ray_xshards.transform_shards_with_actors(
            self.remote_workers, transform_func, gang_scheduling=True)
        worker_stats = stats_shards.collect()

    return worker_stats[0].copy()
def update_predict_xshards(xshard, pred_xshards):
    """Merge every shard of ``pred_xshards`` into the matching shard of ``xshard``.

    The two underlying RDDs are zipped and each pair is combined with
    ``dict.update``, so keys from the prediction shard overwrite keys of the
    same name in the input shard. Returns a new SparkXShards.
    """
    def merge_pair(pair):
        base, extra = pair
        base.update(extra)
        return base

    paired_rdd = xshard.rdd.zip(pred_xshards.rdd)
    return SparkXShards(paired_rdd.map(merge_pair))
def load(self, model_path, minPartitions=None):
    """
    Load pickled shards from ``model_path`` and store them on ``self.internal``.

    :param model_path: path passed to ``SparkXShards.load_pickle``
    :param minPartitions: forwarded to ``SparkXShards.load_pickle``
    :return: None; the loaded object is kept in ``self.internal``
    """
    restored = SparkXShards.load_pickle(model_path,
                                        minPartitions=minPartitions)
    self.internal = restored
def get_pred_xshards(key):
    """Build SparkXShards whose value under ``key`` is a list of two arrays.

    Rows are arrays holding one number from 0..109 repeated 50 times. They
    are grouped with ``chunks(..., 5)``, stacked, then split column-wise
    into the first 24 columns and the remaining ones.
    """
    def to_row(value):
        return np.array([value] * 50)

    def to_chunks(rows):
        return chunks(rows, 5)

    def stack_rows(rows):
        return {key: np.stack(rows)}

    def split_columns(shard):
        stacked = shard[key]
        return {key: [stacked[:, :24], stacked[:, 24:]]}

    # NOTE: ``self`` is a free variable taken from the enclosing scope.
    row_rdd = self.sc.range(0, 110).map(to_row)
    shard_rdd = row_rdd.mapPartitions(to_chunks).map(stack_rows).map(
        split_columns)
    return SparkXShards(shard_rdd)
def test_openvino_predict_xshards(self):
    """Predict on two shards (4 and 2 samples) and check shapes and results."""
    batches = [np.array([self.input] * 4), np.array([self.input] * 2)]
    sc = init_nncontext()
    shards = SparkXShards(sc.parallelize(batches, numSlices=2))

    def pre_processing(images):
        return {"x": images}

    shards = shards.transform_shard(pre_processing)
    result = self.est.predict(shards)
    result_c = result.collect()
    assert isinstance(result, SparkXShards)

    expected_rows = (4, 2)
    # Check all shapes first, then the contents, in the original order.
    for idx, rows in enumerate(expected_rows):
        assert result_c[idx]["prediction"].shape == (rows, 1000)
    for idx, rows in enumerate(expected_rows):
        assert self.check_result(result_c[idx]["prediction"], rows)
def test_transform_with_repartition(self):
    """Check that shards keep at most one element per partition.

    Covers pandas DataFrame shards (through transform and repartition) as
    well as shards of plain lists and of numpy arrays.
    """
    def assert_one_element_per_partition(xshards):
        # glom() gathers the elements of each partition into one list.
        for partition in xshards.rdd.glom().collect():
            assert len(partition) <= 1

    def negative(df, column_name):
        df[column_name] = df[column_name] * (-1)
        return df

    # shards of pandas dataframe
    csv_dir = os.path.join(self.resource_path, "orca/data/csv")
    data_shard = zoo.orca.data.pandas.read_csv(csv_dir)
    assert_one_element_per_partition(data_shard)

    negated = data_shard.transform_shard(negative, "sale_price")
    assert_one_element_per_partition(negated.repartition(4))
    single_partition = negated.repartition(1)
    assert_one_element_per_partition(single_partition)
    assert_one_element_per_partition(
        single_partition.transform_shard(negative, "sale_price"))

    # shards of list, then shards of numpy array
    list_data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12],
                 [13, 14, 15, 16]]
    array_data = [
        np.array([1, 2, 3, 4]),
        np.array([5, 6, 7, 8]),
        np.array([9, 10, 11, 12]),
        np.array([13, 14, 15, 16])
    ]
    for data in (list_data, array_data):
        sc = init_nncontext()
        data_shard = SparkXShards(sc.parallelize(data))
        for num_partitions in (6, 1):
            assert_one_element_per_partition(
                data_shard.repartition(num_partitions))
def test_spark_xshards(self):
    """Fit and evaluate the estimator on randomly generated SparkXShards."""
    from zoo import init_nncontext
    from zoo.orca.data import SparkXShards

    estimator = get_estimator(workers_per_node=1)
    sc = init_nncontext()

    features = np.random.rand(4000, 1, 50).astype(np.float32)
    x_rdd = sc.parallelize(features)
    # torch 1.7.1+ requires target size same as output size, which is (batch, 1)
    labels = np.random.randint(0, 2, size=(4000, 1, 1)).astype(np.float32)
    y_rdd = sc.parallelize(labels)

    def to_shard(x_y):
        return {'x': x_y[0], 'y': x_y[1]}

    train_rdd, val_rdd = x_rdd.zip(y_rdd).map(to_shard).randomSplit(
        [0.9, 0.1])
    train_xshards = SparkXShards(train_rdd)
    val_xshards = SparkXShards(val_rdd)

    print(estimator.fit(train_xshards, batch_size=256, epochs=2))
    print(estimator.evaluate(val_xshards, batch_size=128))
    estimator.shutdown()
def _dataframe_to_xshards(data, feature_cols, label_cols=None):
    """Convert a Spark DataFrame into SparkXShards.

    Every row is converted with ``convert_row_to_numpy`` and each partition
    is then packed by ``arrays2dict``, which also receives
    ``OrcaContext._shard_size``.
    """
    from zoo.orca import OrcaContext

    df_schema = data.schema
    shard_size = OrcaContext._shard_size

    def row_to_numpy(row):
        return convert_row_to_numpy(row, df_schema, feature_cols, label_cols)

    def partition_to_shards(rows):
        return arrays2dict(rows, feature_cols, label_cols, shard_size)

    converted_rdd = data.rdd.map(row_to_numpy)
    return SparkXShards(converted_rdd.mapPartitions(partition_to_shards))
def test_openvino_predict_xshards(self):
    """Predict on random 5-D shards and check the prediction shapes."""
    leading_dims = (1, 2)
    batches = [np.random.random([dim, 4, 3, 224, 224]) for dim in leading_dims]
    sc = init_nncontext()
    shards = SparkXShards(sc.parallelize(batches, numSlices=2))

    def pre_processing(images):
        return {"x": images}

    shards = shards.transform_shard(pre_processing)
    result = self.est.predict(shards)
    result_c = result.collect()
    assert isinstance(result, SparkXShards)
    for idx, dim in enumerate(leading_dims):
        assert result_c[idx]["prediction"].shape == (dim, 4, 1000)
def test_zip(self):
    """Exercise SparkXShards.zip: the success path and three error cases."""
    def negative(df, column_name, minus_val):
        negated = df[column_name] * (-1)
        df[column_name] = negated - minus_val
        return df

    def assert_zip_fails(left, right, expected_message):
        with self.assertRaises(Exception) as context:
            left.zip(right)
        self.assertTrue(expected_message in str(context.exception))

    json_dir = os.path.join(self.resource_path, "orca/data/json")
    data_shard = zoo.orca.data.pandas.read_json(json_dir,
                                                orient='columns',
                                                lines=True)
    data_shard = data_shard.repartition(2)
    data_shard.cache()
    transformed_shard = data_shard.transform_shard(negative, "value", 2)
    zipped_shard = data_shard.zip(transformed_shard)
    assert not transformed_shard.is_cached(
    ), "transformed_shard should be uncached."
    first_pair = zipped_shard.collect()[0]
    # x + (-x - 2) must equal -2 for the first row.
    assert first_pair[0]["value"].values[0] + first_pair[1]["value"].values[0] == -2, \
        "value should be -2"

    # Zipping with something that is not a SparkXShards.
    assert_zip_fails(data_shard, [1, 2, 3], 'other should be a SparkXShards')

    # Zipping shards with a different number of partitions.
    transformed_shard = transformed_shard.repartition(
        data_shard.num_partitions() - 1)
    assert_zip_fails(
        data_shard, transformed_shard,
        'The two SparkXShards should have the same number of partitions')

    # Zipping shards whose partitions hold different numbers of elements.
    sc = init_nncontext()
    dict_rdd = sc.parallelize([{"x": 1, "y": 2}, {"x": 2, "y": 3}])
    dict_shard = SparkXShards(dict_rdd).repartition(1)
    assert_zip_fails(
        transformed_shard, dict_shard,
        'The two SparkXShards should have the same number of elements in '
        'each partition')
def test_spark_xshards(self):
    """Fit and evaluate a torch_distributed estimator on random SparkXShards."""
    from zoo import init_nncontext
    from zoo.orca.data import SparkXShards

    estimator = Estimator.from_torch(model=get_model,
                                     optimizer=get_optimizer,
                                     loss=nn.BCELoss(),
                                     config={"lr": 1e-1},
                                     backend="torch_distributed")
    sc = init_nncontext()

    x_values = np.random.rand(4000, 1, 50).astype(np.float32)
    x_rdd = sc.parallelize(x_values)
    y_values = np.random.randint(0, 2, size=(4000, 1)).astype(np.float32)
    y_rdd = sc.parallelize(y_values)

    def pair_to_shard(x_y):
        return {'x': x_y[0], 'y': x_y[1]}

    splits = x_rdd.zip(y_rdd).map(pair_to_shard).randomSplit([0.9, 0.1])
    train_rdd, val_rdd = splits
    train_xshards = SparkXShards(train_rdd)
    val_xshards = SparkXShards(val_rdd)

    train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
    print(train_stats)
    val_stats = estimator.evaluate(val_xshards, batch_size=128)
    print(val_stats)
    estimator.shutdown()
def to_spark_xshards(self):
    """Build a SparkXShards whose partitions are fetched with ``get_from_ray``.

    A placeholder RDD with one partition per shard partition is created;
    each partition is then replaced by whatever ``get_from_ray`` returns for
    its index.
    """
    from zoo.orca.data import SparkXShards

    ray_ctx = RayContext.get()
    spark_context = ray_ctx.sc
    # Copy everything the Spark closure needs into locals so that ``self``
    # is not captured by the function shipped to the executors.
    redis_address = ray_ctx.redis_address
    redis_password = ray_ctx.redis_password
    partition_count = self.num_partitions()
    store_names = self.partition2store_name

    def fetch_partition(idx, _):
        return get_from_ray(idx, redis_address, redis_password, store_names)

    placeholder_rdd = spark_context.parallelize([0] * partition_count * 10,
                                                partition_count)
    return SparkXShards(placeholder_rdd.mapPartitionsWithIndex(fetch_partition))
def convert_predict_to_xshard(prediction_rdd):
    """Pack an RDD of per-sample predictions into SparkXShards.

    Every non-empty partition becomes one shard ``{'prediction': value}``:

    * if the first element of the partition is a list (presumably one entry
      per model output), ``value`` is a list holding one stacked array per
      list position;
    * otherwise ``value`` is a single array stacking all elements.

    Empty partitions produce no shard instead of raising ``IndexError``.

    :param prediction_rdd: RDD whose elements are the per-sample predictions.
    :return: SparkXShards wrapping the packed predictions.
    """
    def transform_predict(partition):
        predictions = list(partition)
        if not predictions:
            # Spark partitions may be empty; there is nothing to inspect.
            return []
        if isinstance(predictions[0], list):
            # Transpose "samples x outputs" with zip. ``np.array(...).T``
            # reverses *every* axis, which scrambles outputs that are
            # themselves arrays.
            columns = [np.array(column) for column in zip(*predictions)]
            return [{'prediction': columns}]
        return [{'prediction': np.array(predictions)}]

    return SparkXShards(prediction_rdd.mapPartitions(transform_predict))
def test_convert_predict_rdd_to_xshard(self):
    """Predictions equal to the features must come back aligned with them."""
    def repeat_value(value):
        return np.array([value] * 50)

    def to_chunks(rows):
        return chunks(rows, 5)

    def to_shard(rows):
        return {"x": np.stack(rows)}

    feature_rdd = self.sc.range(0, 110).map(repeat_value)
    shards = SparkXShards(feature_rdd.mapPartitions(to_chunks).map(to_shard))

    pred_rdd = self.sc.range(0, 110).map(repeat_value)
    result_shards = convert_predict_rdd_to_xshard(shards, pred_rdd)

    predicted_parts = [
        shard["prediction"] for shard in result_shards.collect()
    ]
    feature_parts = [shard["x"] for shard in result_shards.collect()]
    result = np.concatenate(predicted_parts)
    expected_result = np.concatenate(feature_parts)
    assert np.array_equal(result, expected_result)
def test_xshards_predict(self):
    """An identity model must return predictions equal to the features."""
    def repeat_value(value):
        return np.array([value] * 50)

    def to_chunks(rows):
        return chunks(rows, 5)

    def to_shard(rows):
        return {"x": np.stack(rows)}

    sc = init_nncontext()
    feature_rdd = sc.range(0, 110).map(repeat_value)
    shards = SparkXShards(feature_rdd.mapPartitions(to_chunks).map(to_shard))

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)

    predicted_parts = [
        shard["prediction"] for shard in result_shards.collect()
    ]
    feature_parts = [shard["x"] for shard in result_shards.collect()]
    result = np.concatenate(predicted_parts)
    expected_result = np.concatenate(feature_parts)
    assert np.array_equal(result, expected_result)
def convert_predict_rdd_to_xshard(data, prediction_rdd):
    """Attach per-sample predictions to the shards they were computed from.

    ``prediction_rdd`` holds one element per sample, in the same order and
    partitioning as the samples inside ``data``. Predictions are regrouped
    to the size of each shard (``get_size(shard["x"])``) and stored under
    the ``"prediction"`` key of that shard.

    :param data: SparkXShards whose shards are dicts with an ``"x"`` entry.
    :param prediction_rdd: RDD of per-sample predictions.
    :return: SparkXShards of the input shards with ``"prediction"`` added.
    """
    import numpy as np
    from zoo.orca.data import SparkXShards

    def group_index(shards):
        # Emit the shard size once per sample so that, after zipping with
        # the predictions, every prediction knows how large its group is.
        for shard in shards:
            size = get_size(shard["x"])
            for _ in range(size):
                yield size

    def transform_predict(predictions):
        if isinstance(predictions[0], list):
            # Transpose "samples x outputs" with zip. ``np.array(...).T``
            # reverses every axis, which scrambles outputs that are
            # themselves arrays.
            return [np.array(column) for column in zip(*predictions)]
        return np.array(predictions)

    def group(sized_predictions):
        this_index = 0
        buffer = []
        this_count = None
        for count, pred in sized_predictions:
            if this_index == 0:
                # First prediction of a new shard: remember the shard size.
                this_count = count
            if this_index < this_count:
                buffer.append(pred)
                this_index += 1
            if this_index == this_count:
                # transform_predict copies into new arrays, so the buffer
                # can be reused for the next shard.
                yield transform_predict(buffer)
                buffer.clear()
                this_index = 0

    def add_pred(shard_pred):
        shard, pred = shard_pred
        shard["prediction"] = pred
        return shard

    indexed_rdd = data.rdd.mapPartitions(group_index)
    grouped_pred = indexed_rdd.zip(prediction_rdd).mapPartitions(group)
    result_rdd = data.rdd.zip(grouped_pred).map(add_pred)
    return SparkXShards(result_rdd)
def to_spark_xshards(self):
    """Build a cached-RDD-backed SparkXShards from data fetched via ``get_from_ray``."""
    from zoo.orca.data import SparkXShards

    ray_ctx = RayContext.get()
    spark_context = ray_ctx.sc
    # Copy what the Spark closure needs into locals so ``self`` is not
    # captured by the function shipped to the executors.
    redis_address = ray_ctx.redis_address
    redis_password = ray_ctx.redis_password
    partition_count = self.num_partitions()
    store_names = self.partition2store_name

    def fetch_partition(idx, _):
        return get_from_ray(idx, redis_address, redis_password, store_names)

    placeholder_rdd = spark_context.parallelize([0] * partition_count * 10,
                                                partition_count)
    fetched_rdd = placeholder_rdd.mapPartitionsWithIndex(fetch_partition)

    # The intent is to pull the data out of ray before this RayXShards goes
    # out of scope and its data is garbage collected.
    # NOTE(review): cache() and map() are lazy on their own; presumably the
    # SparkXShards constructor triggers the computation - confirm.
    from pyspark.storagelevel import StorageLevel
    cached_rdd = fetched_rdd.cache()
    # Wrap an identity-mapped RDD rather than the cached RDD itself;
    # presumably because SparkXShards uncaches its own rdd when collected.
    return SparkXShards(cached_rdd.map(lambda x: x))
def predict(self,
            data_creator,
            batch_size=None,
            verbose=1,
            steps=None,
            callbacks=None,
            data_config=None,
            feature_cols=None):
    """Run prediction on the remote workers and return SparkXShards.

    ``data_creator`` must be a SparkXShards or a Spark DataFrame; a
    DataFrame is first converted to SparkXShards using ``feature_cols``.

    :raises ValueError: if ``data_creator`` is neither of the two.
    """
    logger.info("Starting predict step.")
    predict_kwargs = {
        "verbose": verbose,
        "batch_size": batch_size,
        "steps": steps,
        "callbacks": callbacks,
        "data_config": data_config,
    }
    from zoo.orca.data import SparkXShards
    from pyspark.sql import DataFrame

    if isinstance(data_creator, DataFrame):
        assert feature_cols is not None,\
            "feature_col must be provided if data_creator is a spark dataframe"
        df_schema = data_creator.schema

        def row_to_numpy(row):
            return convert_row_to_numpy(row, df_schema, feature_cols, None)

        def rows_to_shards(rows):
            return arrays2dict(rows, feature_cols, None)

        converted_rdd = data_creator.rdd.map(row_to_numpy)
        data_creator = SparkXShards(converted_rdd.mapPartitions(rows_to_shards))

    if not isinstance(data_creator, SparkXShards):
        raise ValueError("Only xshards is supported for predict")

    ray_xshards = RayXShards.from_spark_xshards(data_creator)

    def transform_func(worker, shards_ref):
        predict_kwargs["data_creator"] = shards_ref_to_creator(shards_ref)
        return worker.predict.remote(**predict_kwargs)

    stats_shards = ray_xshards.transform_shards_with_actors(
        self.remote_workers, transform_func, gang_scheduling=False)
    return stats_shards.to_spark_xshards()
def _read_as_xshards(path):
    """Read the dataset at ``path`` and merge each partition into one dict shard.

    Every key of the schema maps to the list of that field's values across
    the partition's records; fields whose feature type is not
    ``FeatureType.IMAGE`` are additionally stacked with ``np.stack``.
    """
    rdd, schema = ParquetDataset._read_as_dict_rdd(path)

    def merge_partition(records_iter):
        records = list(records_iter)
        merged = {name: [] for name in schema.keys()}
        for record in records:
            for name in schema.keys():
                merged[name].append(record[name])
        for name, feature in schema.items():
            if feature.feature_type != FeatureType.IMAGE:
                merged[name] = np.stack(merged[name])
        return [merged]

    return SparkXShards(rdd.mapPartitions(merge_partition))
def read_parquet(file_path, columns=None, schema=None, **options):
    """
    Load parquet data into SparkXShards, one pandas DataFrame per partition.

    :param file_path: Parquet file path, a list of multiple parquet file paths, or a directory
           containing parquet files. Local file system, HDFS, and AWS S3 are supported.
    :param columns: list of column names, default=None.
           If not None, only these columns are selected.
    :param schema: pyspark.sql.types.StructType for the input schema or
           a DDL-formatted string (For example col0 INT, col1 DOUBLE).
    :param options: other options forwarded to the Spark reader.
    :return: An instance of SparkXShards.
    """
    sc = init_nncontext()
    from pyspark.sql import SQLContext
    spark = SQLContext.getOrCreate(sc).sparkSession
    df = spark.read.load(file_path, "parquet", schema=schema, **options)

    if columns:
        df = df.select(*columns)

    # Capture only the column names; the DataFrame itself must not end up
    # in the closure that is shipped to the executors.
    column_names = df.columns

    def partition_to_pandas(rows):
        import pandas as pd
        return [pd.DataFrame(list(rows), columns=column_names)]

    pd_rdd = df.rdd.mapPartitions(partition_to_pandas)
    try:
        return SparkXShards(pd_rdd)
    except Exception:
        print("An error occurred when reading parquet files")
        raise
def read_parquet(file_path, columns=None, **kwargs):
    """
    Load parquet data into SparkXShards, one pandas DataFrame per partition.

    :param file_path: Parquet file path, a list of multiple parquet file paths, or a directory
           containing parquet files. Local file system, HDFS, and AWS S3 are supported.
    :param columns: list of column names, default=None.
           If not None, only these columns are selected.
    :param kwargs: accepted for compatibility; not used by this function.
    :return: An instance of SparkXShards.
    """
    sc = init_nncontext()
    from pyspark.sql import SQLContext
    sql_context = SQLContext.getOrCreate(sc)
    df = sql_context.sparkSession.read.parquet(file_path)
    df = df.select(*columns) if columns else df

    def build_frames(rows, names):
        import pandas as pd
        records = list(rows)
        return [pd.DataFrame(records, columns=names)]

    names = df.columns
    pd_rdd = df.rdd.mapPartitions(lambda rows: build_frames(rows, names))

    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as err:
        print("An error occurred when reading parquet files")
        raise err
    else:
        return data_shards
def predict(self, data, feature_cols=None, batch_size=4):
    """
    Predict input data with the OpenVINO model held by this estimator.

    The model and weight bytes are broadcast to the Spark executors; every
    partition loads its own network and runs one inference request per
    element (batch) of the partition.

    :param batch_size: Int. Set batch Size, default is 4.
    :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of numpy
           arrays are supported. If data is XShards, each partition is a dictionary of
           {'x': feature}, where feature(label) is a numpy array or a list of numpy arrays.
    :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
           DataFrame. Default: None.
    :return: predicted result.
             If the input data is XShards, the predict result is a XShards, each partition
             of the XShards is a dictionary of {'prediction': result}, where the result is a
             numpy array or a list of numpy arrays.
             If the input data is numpy arrays or list of numpy arrays, the predict result is
             a numpy array or a list of numpy arrays.
             If the input data is a Spark DataFrame, the value returned by
             convert_predict_rdd_to_dataframe is returned.
    :raises ValueError: if data is of an unsupported type.
    """
    sc = init_nncontext()
    # Broadcast the serialized model once instead of shipping it with every task.
    model_bytes_broadcast = sc.broadcast(self.model_bytes)
    weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

    def partition_inference(partition):
        # Runs on the executors: load the network and infer every element
        # of the partition, returning one result per element.
        model_bytes = model_bytes_broadcast.value
        weight_bytes = weight_bytes_broadcast.value
        partition = list(partition)
        data_num = len(partition)
        ie = IECore()
        config = {'CPU_THREADS_NUM': str(self.core_num)}
        ie.set_config(config, 'CPU')
        net = ie.read_network(model=model_bytes,
                              weights=weight_bytes,
                              init_from_buffer=True)
        net.batch_size = batch_size
        # One inference request per element of the partition.
        local_model = ie.load_network(network=net,
                                      device_name="CPU",
                                      num_requests=data_num)
        # Names of the input and output blobs, taken from the first request.
        inputs = list(iter(local_model.requests[0].input_blobs))
        outputs = list(iter(local_model.requests[0].output_blobs))
        assert len(
            outputs) != 0, "The number of model outputs should not be 0."

        def add_elem(d):
            # Pad a batch shorter than batch_size by repeating its last
            # element; also return the original length so the padding can
            # be cut off the output again.
            d_len = len(d)
            if d_len < batch_size:
                rep_time = [1] * (d_len - 1)
                rep_time.append(batch_size - d_len + 1)
                return np.repeat(d, rep_time, axis=0), d_len
            else:
                return d, d_len

        results = []
        for idx, batch_data in enumerate(partition):
            infer_request = local_model.requests[idx]
            input_dict = dict()
            elem_num = 0
            if isinstance(batch_data, list):
                # Multi-input model: the i-th list entry feeds the i-th input blob.
                for i, input in enumerate(inputs):
                    input_dict[input], elem_num = add_elem(batch_data[i])
            else:
                input_dict[inputs[0]], elem_num = add_elem(batch_data)
            infer_request.infer(input_dict)
            # Keep only the first elem_num rows, dropping the padded ones.
            if len(outputs) == 1:
                results.append(infer_request.output_blobs[
                    outputs[0]].buffer[:elem_num])
            else:
                results.append(
                    list(
                        map(
                            lambda output: infer_request.output_blobs[
                                output].buffer[:elem_num], outputs)))

        return results

    def predict_transform(dict_data, batch_size):
        # Validate one shard and extract its "x" entry.
        # NOTE(review): the messages mention "the second dim" while the
        # checks use shape[0] - confirm which wording is intended.
        assert isinstance(dict_data, dict), "each shard should be an dict"
        assert "x" in dict_data, "key x should in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[0] <= batch_size, \
                "The batch size of input data (the second dim) should be less than the model " \
                "batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                     "a ndarray, but get " + \
                                                     elem.__class__.__name__
                assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                    "second dim) should be less than the " \
                                                    "model batch size, otherwise some inputs " \
                                                    "will be ignored."
        else:
            raise ValueError(
                "x in each shard should be a ndarray or a list of ndarray."
            )
        return feature_data

    if isinstance(data, DataFrame):
        from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
        xshards, _ = dataframe_to_xshards(data,
                                          validation_data=None,
                                          feature_cols=feature_cols,
                                          label_cols=None,
                                          mode="predict")
        transformed_data = xshards.transform_shard(predict_transform,
                                                   batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(
            lambda iter: partition_inference(iter))
        # Flatten the per-batch results before they are joined with the DataFrame.
        return convert_predict_rdd_to_dataframe(
            data, result_rdd.flatMap(lambda data: data))
    elif isinstance(data, SparkXShards):
        transformed_data = data.transform_shard(predict_transform,
                                                batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(
            lambda iter: partition_inference(iter))

        def update_result_shard(data):
            # Store the prediction in the shard it was computed from.
            shard, y = data
            shard["prediction"] = y
            return shard

        return SparkXShards(
            data.rdd.zip(result_rdd).map(update_result_shard))
    elif isinstance(data, (np.ndarray, list)):
        if isinstance(data, np.ndarray):
            # Split into ceil(len / batch_size) batches.
            split_num = math.ceil(len(data) / batch_size)
            arrays = np.array_split(data, split_num)
            num_slices = min(split_num, self.node_num)
            data_rdd = sc.parallelize(arrays, numSlices=num_slices)
        elif isinstance(data, list):
            # Split every array of the (possibly nested) list the same way
            # and rebuild one structure shaped like `data` per batch.
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_rdd = []
            split_num = math.ceil(flattened[0].shape[0] / batch_size)
            num_slices = min(split_num, self.node_num)
            for i in range(split_num):
                data_to_be_rdd.append([])
            for x in flattened:
                assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                  "ndarrays, but get " + \
                                                  x.__class__.__name__
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension" \
                    ", got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, split_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_rdd[idx].append(x_part)

            data_to_be_rdd = [
                nest.pack_sequence_as(data, shard)
                for shard in data_to_be_rdd
            ]
            data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)

        print("Partition number: ", data_rdd.getNumPartitions())
        result_rdd = data_rdd.mapPartitions(
            lambda iter: partition_inference(iter))
        result_arr_list = result_rdd.collect()
        # Concatenate the per-batch results along the first axis; for
        # multi-output models do so separately for every output.
        result_arr = None
        if isinstance(result_arr_list[0], list):
            result_arr = [
                np.concatenate([r[i] for r in result_arr_list], axis=0)
                for i in range(len(result_arr_list[0]))
            ]
        elif isinstance(result_arr_list[0], np.ndarray):
            result_arr = np.concatenate(result_arr_list, axis=0)
        return result_arr
    else:
        raise ValueError(
            "Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
            "ays are supported as input data, but get " +
            data.__class__.__name__)
def read_file_spark(file_path, file_type, **kwargs):
    """Read csv/json files into SparkXShards of pandas DataFrames.

    With the "pandas" backend (``ZooContext.orca_pandas_read_backend``) the
    resolved file paths are distributed over Spark and each file is read
    with pandas; otherwise Spark reads the files and every partition is
    converted to one pandas DataFrame.

    :param file_path: a path string or a list of path strings. For a list,
           the URL scheme ("hdfs", "s3", ...) is taken from the first entry;
           all entries are assumed to share it.
    :param file_type: "csv" or "json".
    :param kwargs: forwarded to the underlying pandas / Spark reader.
    :return: SparkXShards holding pandas DataFrames.
    :raises Exception: if no file is found, or if the Spark backend is asked
            for an unsupported file type.
    """
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()

    # Resolve the paths first. The list case has to be handled before any
    # string operation: calling ``split`` on a list raises AttributeError.
    if isinstance(file_path, list):
        file_paths = []
        for path in file_path:
            file_paths.extend(extract_one_path(path, file_type, os.environ))
        scheme_source = file_path[0] if file_path else ""
    else:
        file_paths = extract_one_path(file_path, file_type, os.environ)
        scheme_source = file_path

    if not file_paths:
        raise Exception(
            "The file path is invalid/empty or does not include csv/json files"
        )

    prefix = scheme_source.split("://")[0]

    if ZooContext.orca_pandas_read_backend == "pandas":
        # Use at most one partition per file and never more than the
        # total number of cores.
        num_files = len(file_paths)
        total_cores = node_num * core_num
        num_partitions = num_files if num_files < total_cores else total_cores
        rdd = sc.parallelize(file_paths, num_partitions)

        if prefix == "hdfs":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_hdfs_file_list(iter, file_type, **kwargs))
        elif prefix == "s3":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_s3_file_list(iter, file_type, **kwargs))
        else:
            def loadFile(iterator):
                for x in iterator:
                    df = read_pd_file(x, file_type, **kwargs)
                    yield df

            pd_rdd = rdd.mapPartitions(loadFile)
    else:
        from pyspark.sql import SQLContext
        sqlContext = SQLContext.getOrCreate(sc)
        spark = sqlContext.sparkSession
        # TODO: add S3 confidentials
        if file_type == "json":
            df = spark.read.json(file_paths, **kwargs)
        elif file_type == "csv":
            df = spark.read.csv(file_paths, **kwargs)
        else:
            raise Exception("Unsupported file type")
        # Make sure there is at least one partition per node.
        if df.rdd.getNumPartitions() < node_num:
            df = df.repartition(node_num)

        def to_pandas(columns):
            def f(iter):
                import pandas as pd
                data = list(iter)
                yield pd.DataFrame(data, columns=columns)

            return f

        pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns))

    data_shards = SparkXShards(pd_rdd)
    return data_shards
def fit(
        self,
        data_creator,
        epochs=1,
        verbose=1,
        callbacks=None,
        validation_data_creator=None,
        class_weight=None,
        steps_per_epoch=None,
        validation_steps=None,
        validation_freq=1,
        data_config=None,
        feature_cols=None,
        label_cols=None,
):
    """Run ``step`` on every remote worker and return one worker's stats.

    ``data_creator`` may be a Spark DataFrame (converted to SparkXShards
    using ``feature_cols`` and ``label_cols``), a SparkXShards, or any
    other object, which is forwarded unchanged to every worker together
    with ``validation_data_creator``.

    :return: a copy of the first entry of the collected worker stats.
    """
    step_kwargs = {
        "epochs": epochs,
        "verbose": verbose,
        "callbacks": callbacks,
        "class_weight": class_weight,
        "steps_per_epoch": steps_per_epoch,
        "validation_steps": validation_steps,
        "validation_freq": validation_freq,
        "data_config": data_config,
    }

    from zoo.orca.data import SparkXShards
    from pyspark.sql import DataFrame

    if isinstance(data_creator, DataFrame):
        assert feature_cols is not None,\
            "feature_col must be provided if data_creator is a spark dataframe"
        assert label_cols is not None,\
            "label_cols must be provided if data_creator is a spark dataframe"
        df_schema = data_creator.schema

        def row_to_numpy(row):
            return convert_row_to_numpy(row, df_schema, feature_cols,
                                        label_cols)

        def rows_to_shards(rows):
            return arrays2dict(rows, feature_cols, label_cols)

        converted_rdd = data_creator.rdd.map(row_to_numpy)
        data_creator = SparkXShards(converted_rdd.mapPartitions(rows_to_shards))

    if not isinstance(data_creator, SparkXShards):
        step_kwargs["data_creator"] = data_creator
        step_kwargs["validation_data_creator"] = validation_data_creator
        per_worker_kwargs = [step_kwargs] * self.num_workers
        pending = [
            self.remote_workers[idx].step.remote(**per_worker_kwargs[idx])
            for idx in range(self.num_workers)
        ]
        worker_stats = ray.get(pending)
        # Each worker's result is itself iterable; flatten them into one list.
        worker_stats = list(itertools.chain.from_iterable(worker_stats))
        return worker_stats[0].copy()

    _, ray_xshards = process_spark_xshards(data_creator, self.num_workers)

    if validation_data_creator is not None:
        _, val_ray_xshards = process_spark_xshards(validation_data_creator,
                                                   self.num_workers)

        def zip_func(worker, this_shards_ref, that_shards_ref):
            step_kwargs["data_creator"] = shards_ref_to_creator(
                this_shards_ref)
            step_kwargs["validation_data_creator"] =\
                shards_ref_to_creator(that_shards_ref)
            return worker.step.remote(**step_kwargs)

        stats_shards = ray_xshards.zip_shards_with_actors(
            val_ray_xshards,
            self.remote_workers,
            zip_func,
            gang_scheduling=True)
    else:

        def transform_func(worker, shards_ref):
            step_kwargs["data_creator"] = shards_ref_to_creator(shards_ref)
            return worker.step.remote(**step_kwargs)

        stats_shards = ray_xshards.transform_shards_with_actors(
            self.remote_workers, transform_func, gang_scheduling=True)

    worker_stats = stats_shards.collect()
    return worker_stats[0].copy()
def test_nnEstimator(self):
    """Check that Estimator.from_bigdl predictions agree across input kinds.

    Compares the estimator against an NNModel built from its model, against
    an estimator restored through save/load, and compares SparkXShards
    input against DataFrame input.
    """
    from zoo.pipeline.nnframes import NNModel
    linear_model = Sequential().add(Linear(2, 2))
    mse_criterion = MSECriterion()
    df = self.get_estimator_df()
    est = Estimator.from_bigdl(model=linear_model,
                               loss=mse_criterion,
                               optimizer=Adam(),
                               feature_preprocessing=SeqToTensor([2]),
                               label_preprocessing=SeqToTensor([2]))
    # Predicting before any training only has to run without error.
    res0 = est.predict(df)
    res0_c = res0.collect()
    est.fit(df, 1, batch_size=4)
    # An NNModel wrapping the trained model must predict the same values
    # as the estimator itself.
    nn_model = NNModel(est.get_model(),
                       feature_preprocessing=SeqToTensor([2]))
    res1 = nn_model.transform(df)
    res2 = est.predict(df)
    res1_c = res1.collect()
    res2_c = res2.collect()
    assert type(res1).__name__ == 'DataFrame'
    assert type(res2).__name__ == 'DataFrame'
    assert len(res1_c) == len(res2_c)
    for idx in range(len(res1_c)):
        assert res1_c[idx]["prediction"] == res2_c[idx]["prediction"]
    with tempfile.TemporaryDirectory() as tempdirname:
        # Save, load into a second estimator and compare its predictions.
        temp_path = os.path.join(tempdirname, "model")
        est.save(temp_path)
        est2 = Estimator.from_bigdl(model=linear_model, loss=mse_criterion)
        est2.load(temp_path,
                  optimizer=Adam(),
                  loss=mse_criterion,
                  feature_preprocessing=SeqToTensor([2]),
                  label_preprocessing=SeqToTensor([2]))
        # Setting and clearing gradient clipping only has to run without error.
        est2.set_constant_gradient_clipping(0.1, 1.2)
        est2.clear_gradient_clipping()
        res3 = est2.predict(df)
        res3_c = res3.collect()
        assert type(res3).__name__ == 'DataFrame'
        assert len(res1_c) == len(res3_c)
        for idx in range(len(res1_c)):
            assert res1_c[idx]["prediction"] == res3_c[idx]["prediction"]
        est2.fit(df, 4, batch_size=4)

    # The data as SparkXShards: every shard is a dict whose "x" and "y"
    # are lists of two arrays, each expanded by a leading axis.
    data = self.sc.parallelize([((2.0, 1.0), (1.0, 2.0)),
                                ((1.0, 2.0), (2.0, 1.0)),
                                ((2.0, 1.0), (1.0, 2.0)),
                                ((1.0, 2.0), (2.0, 1.0))])
    data_shard = SparkXShards(data)
    data_shard = data_shard.transform_shard(
        lambda feature_label_tuple: {
            "x": [
                np.expand_dims(np.array(feature_label_tuple[0][0]), axis=0
                               ),
                np.expand_dims(np.array(feature_label_tuple[0][1]), axis=0)
            ],
            "y": [
                np.expand_dims(np.array(feature_label_tuple[1][0]), axis=0
                               ),
                np.expand_dims(np.array(feature_label_tuple[1][1]), axis=0)
            ]
        })
    # XShards predictions must match the DataFrame predictions element-wise.
    res4 = est.predict(data_shard)
    res4_c = res4.collect()
    assert type(res4).__name__ == 'SparkXShards'
    for idx in range(len(res4_c)):
        assert abs(res4_c[idx]["prediction"][0][0] -
                   res3_c[idx]["prediction"][0]) == 0
        assert abs(res4_c[idx]["prediction"][0][1] -
                   res3_c[idx]["prediction"][1]) == 0
    # After fitting on XShards, both input kinds must still agree.
    est.fit(data_shard, 1, batch_size=4)
    res5 = est.predict(data_shard)
    res5_c = res5.collect()
    res6 = est.predict(df)
    res6_c = res6.collect()
    for idx in range(len(res5_c)):
        assert abs(res5_c[idx]["prediction"][0][0] -
                   res6_c[idx]["prediction"][0]) == 0
        assert abs(res5_c[idx]["prediction"][0][1] -
                   res6_c[idx]["prediction"][1]) == 0
def read_file_spark(file_path, file_type, **kwargs):
    """Read csv/json file(s) into a ``SparkXShards`` of pandas objects.

    The reader is selected by ``OrcaContext.pandas_read_backend``:

    * ``"pandas"``: the path(s) are expanded with ``extract_one_path``, distributed over
      at most ``node_num * core_num`` partitions and read on the executors by the
      HDFS, S3 or generic pandas file readers. ``kwargs`` is forwarded to those readers.
    * any other value: ``spark.read.csv`` / ``spark.read.json`` loads the data and every
      partition is converted into one pandas object. The pandas-style keywords
      ``names``, ``usecols``, ``dtype``, ``squeeze``, ``index_col``, ``header``,
      ``quotechar``, ``escapechar`` and ``comment`` are handled here; the remaining
      ``kwargs`` are passed to the Spark reader.

    :param file_path: a path string. With the pandas backend a list of path strings is
           accepted as well; the URL scheme of the first entry then selects the reader
           used for the whole list.
    :param file_type: type of the files. The Spark backend only accepts "csv" and "json".
    :param kwargs: extra reader options, see above.
    :return: a SparkXShards built from the resulting RDD.
    :raises Exception: if no file is found for ``file_path`` (pandas backend).
    :raises ValueError: for unsupported ``header``, ``comment``, ``names`` or ``usecols``
            values (Spark backend).
    """
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    backend = OrcaContext.pandas_read_backend

    if backend == "pandas":
        # Expand the input first: calling str.split on a list of paths would fail, so the
        # scheme is taken from the first entry when a list is given.
        if isinstance(file_path, list):
            file_paths = []
            for path in file_path:
                file_paths.extend(extract_one_path(path, os.environ))
            scheme_source = file_path[0] if file_path else ""
        else:
            file_paths = extract_one_path(file_path, os.environ)
            scheme_source = file_path

        if not file_paths:
            raise Exception(
                "The file path is invalid or empty, please check your data")
        prefix = scheme_source.split("://")[0]

        # Never create more partitions than there are files or available cores.
        num_files = len(file_paths)
        total_cores = node_num * core_num
        num_partitions = num_files if num_files < total_cores else total_cores
        rdd = sc.parallelize(file_paths, num_partitions)

        if prefix == "hdfs":
            pd_rdd = rdd.mapPartitions(
                lambda part: read_pd_hdfs_file_list(part, file_type, **kwargs))
        elif prefix == "s3":
            pd_rdd = rdd.mapPartitions(
                lambda part: read_pd_s3_file_list(part, file_type, **kwargs))
        else:
            def loadFile(iterator):
                for x in iterator:
                    df = read_pd_file(x, file_type, **kwargs)
                    yield df

            pd_rdd = rdd.mapPartitions(loadFile)
    else:
        # Spark backend; spark.read.csv/json accepts a folder path as input
        assert file_type == "json" or file_type == "csv", \
            "Unsupported file type: %s. Only csv and json files are supported for now" % file_type
        from pyspark.sql import SQLContext
        sqlContext = SQLContext.getOrCreate(sc)
        spark = sqlContext.sparkSession
        # TODO: add S3 credentials

        # The following implementation is adapted from
        # https://github.com/databricks/koalas/blob/master/databricks/koalas/namespace.py
        # with some modifications.

        if "mangle_dupe_cols" in kwargs:
            assert kwargs["mangle_dupe_cols"], "mangle_dupe_cols can only be True"
            kwargs.pop("mangle_dupe_cols")
        if "parse_dates" in kwargs:
            assert not kwargs["parse_dates"], "parse_dates can only be False"
            kwargs.pop("parse_dates")

        # These pandas-only options are emulated below and must not reach the Spark reader.
        names = kwargs.pop("names", None)
        usecols = kwargs.pop("usecols", None)
        dtype = kwargs.pop("dtype", None)
        squeeze = kwargs.pop("squeeze", False)
        index_col = kwargs.pop("index_col", None)

        if file_type == "csv":
            # Handle pandas-compatible keyword arguments
            kwargs["inferSchema"] = True
            header = kwargs.get("header", "infer")
            if isinstance(names, str):
                # A string is handed to Spark as the schema.
                kwargs["schema"] = names

            if header == "infer":
                header = 0 if names is None else None
            if header == 0:
                kwargs["header"] = True
            elif header is None:
                kwargs["header"] = False
            else:
                raise ValueError("Unknown header argument {}".format(header))

            # Translate pandas option names into their Spark counterparts.
            if "quotechar" in kwargs:
                kwargs["quote"] = kwargs.pop("quotechar")
            if "escapechar" in kwargs:
                kwargs["escape"] = kwargs.pop("escapechar")
            # sep and comment are the same as pandas
            if "comment" in kwargs:
                comment = kwargs["comment"]
                if not isinstance(comment, str) or len(comment) != 1:
                    raise ValueError(
                        "Only length-1 comment characters supported")

            df = spark.read.csv(file_path, **kwargs)
            if header is None:
                # Without a header row the columns are renamed to their positions.
                df = df.selectExpr(*[
                    "`%s` as `%s`" % (field.name, i)
                    for i, field in enumerate(df.schema)
                ])
        else:
            df = spark.read.json(file_path, **kwargs)

        # Handle pandas-compatible postprocessing arguments
        if usecols is not None and not callable(usecols):
            usecols = list(usecols)

        renamed = False
        if isinstance(names, list):
            if len(set(names)) != len(names):
                raise ValueError(
                    "Found duplicate names, please check your names input")
            if usecols is not None:
                if not callable(usecols):
                    # usecols is list
                    if len(names) != len(usecols) and len(names) != len(df.schema):
                        raise ValueError("Passed names did not match usecols")
                if len(names) == len(df.schema):
                    df = df.selectExpr(*[
                        "`%s` as `%s`" % (field.name, name)
                        for field, name in zip(df.schema, names)
                    ])
                    renamed = True
            else:
                if len(names) != len(df.schema):
                    raise ValueError(
                        "The number of names [%s] does not match the number "
                        "of columns [%d]. Try names by a Spark SQL DDL-formatted "
                        "string." % (len(names), len(df.schema)))
                df = df.selectExpr(*[
                    "`%s` as `%s`" % (field.name, name)
                    for field, name in zip(df.schema, names)
                ])
                renamed = True

        # Maps a column position to its current name; used for integer keys of dtype.
        index_map = {i: field.name for i, field in enumerate(df.schema)}

        if usecols is not None:
            if callable(usecols):
                cols = [
                    field.name for field in df.schema if usecols(field.name)
                ]
                missing = []
            elif all(isinstance(col, int) for col in usecols):
                cols = [
                    field.name for i, field in enumerate(df.schema)
                    if i in usecols
                ]
                missing = [
                    col for col in usecols
                    if col >= len(df.schema) or df.schema[col].name not in cols
                ]
            elif all(isinstance(col, str) for col in usecols):
                cols = [
                    field.name for field in df.schema
                    if field.name in usecols
                ]
                if isinstance(names, list):
                    missing = [c for c in usecols if c not in names]
                else:
                    missing = [col for col in usecols if col not in cols]
            else:
                raise ValueError(
                    "usecols must only be list-like of all strings, "
                    "all unicode, all integers or a callable.")
            if missing:
                raise ValueError(
                    "usecols do not match columns, columns expected but not found: %s"
                    % missing)
            if cols:
                df = df.select(cols)
                if isinstance(names, list):
                    if not renamed:
                        df = df.selectExpr(*[
                            "`%s` as `%s`" % (col, name)
                            for col, name in zip(cols, names)
                        ])
                        # update index map after rename
                        for index, col in index_map.items():
                            if col in cols:
                                index_map[index] = names[cols.index(col)]

        if df.rdd.getNumPartitions() < node_num:
            df = df.repartition(node_num)

        def to_pandas(columns, squeeze=False, index_col=None):
            # Build the per-partition converter from Spark rows to one pandas object.
            def f(rows):
                import pandas as pd
                data = list(rows)
                pd_df = pd.DataFrame(data, columns=columns)
                if dtype is not None:
                    if isinstance(dtype, dict):
                        # Keys may be column names or column positions.
                        for col, col_type in dtype.items():
                            if isinstance(col, str):
                                if col not in pd_df.columns:
                                    raise ValueError(
                                        "column to be set type is not"
                                        " in current dataframe")
                                pd_df[col] = pd_df[col].astype(col_type)
                            elif isinstance(col, int):
                                if index_map[col] not in pd_df.columns:
                                    raise ValueError(
                                        "column index to be set type is not"
                                        " in current dataframe")
                                pd_df[index_map[col]] = pd_df[
                                    index_map[col]].astype(col_type)
                    else:
                        pd_df = pd_df.astype(dtype)
                if squeeze and len(pd_df.columns) == 1:
                    pd_df = pd_df.iloc[:, 0]
                if index_col:
                    pd_df = pd_df.set_index(index_col)
                return [pd_df]

            return f

        pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns, squeeze, index_col))

    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception:
        alternative_backend = "pandas" if backend == "spark" else "spark"
        print(
            "An error occurred when reading files with '%s' backend, you may switch to '%s' "
            "backend for another try. You can set the backend using "
            "OrcaContext.pandas_read_backend" % (backend, alternative_backend))
        # Re-raise the original error unchanged after printing the hint.
        raise
    return data_shards
def predict(self, data):
    """
    Run inference on the given data.

    :param data: data to be predicted. XShards, numpy array and list of numpy arrays are
           supported. If data is XShards, each partition is a dictionary of {'x': feature},
           where feature is a numpy array or a list of numpy arrays.
    :return: predicted result.
             If the input data is XShards, the result is a XShards whose partitions are the
             input dictionaries with an extra 'prediction' entry.
             If the input data is a numpy array or a list of numpy arrays, the per-partition
             results are concatenated along the first axis into one numpy array.
    """

    def predict_transform(dict_data, batch_size):
        # Validate one shard and hand back its features for distributed prediction.
        assert isinstance(dict_data, dict), "each shard should be an dict"
        assert "x" in dict_data, "key x should in each shard"
        feature_data = dict_data["x"]

        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[1] <= batch_size, \
                "The batch size of input data (the second dim) should be less than " \
                "the model batch size, otherwise some inputs will be ignored."
            return feature_data

        if not isinstance(feature_data, list):
            raise ValueError(
                "x in each shard should be a ndarray or a list of ndarray.")

        for elem in feature_data:
            assert isinstance(elem, np.ndarray), \
                "Each element in the x list should be a ndarray, but get " + \
                elem.__class__.__name__
            assert elem.shape[1] <= batch_size, \
                "The batch size of each input data (the second dim) should be less " \
                "than the model batch size, otherwise some inputs will be ignored."
        return feature_data

    sc = init_nncontext()

    if isinstance(data, SparkXShards):
        assert sc is not None, "You should pass sc(spark context) if data is a XShards."
        from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
        validated_shards = data.transform_shard(predict_transform, self.batch_size)
        prediction_rdd = self.model.distributed_predict(validated_shards.rdd, sc)

        def update_shard(shard_and_prediction):
            # Attach the prediction to the shard dictionary it was computed from.
            shard, y = shard_and_prediction
            shard["prediction"] = y
            return shard

        return SparkXShards(data.rdd.zip(prediction_rdd).map(update_shard))

    if not isinstance(data, (np.ndarray, list)):
        raise ValueError(
            "Only XShards, a numpy array and a list of numpy arrays are supported "
            "as input data, but get " + data.__class__.__name__)

    total_core_num = self.core_num * self.node_num
    if isinstance(data, np.ndarray):
        assert data.shape[1] <= self.batch_size, \
            "The batch size of input data (the second dim) should be less than " \
            "the model batch size, otherwise some inputs will be ignored."
        split_num = min(total_core_num, data.shape[0])
        chunks = np.array_split(data, split_num)
        data_rdd = sc.parallelize(chunks, numSlices=split_num)
    else:
        flattened = nest.flatten(data)
        data_length = len(flattened[0])
        split_num = min(total_core_num, flattened[0].shape[0])
        # One bucket per partition; each bucket receives one chunk of every array.
        buckets = [[] for _ in range(split_num)]
        for x in flattened:
            assert isinstance(x, np.ndarray), \
                "the data in the data list should be ndarrays, but get " + \
                x.__class__.__name__
            assert len(x) == data_length, \
                "the ndarrays in data must all have the same size in first dimension" \
                ", got first ndarray of size {} and another {}".format(data_length, len(x))
            assert x.shape[1] <= self.batch_size, \
                "The batch size of each input data (the second dim) should be less " \
                "than the model batch size, otherwise some inputs will be ignored."
            for idx, x_part in enumerate(np.array_split(x, split_num)):
                buckets[idx].append(x_part)
        # Give every bucket the nesting structure of the original input.
        packed = [nest.pack_sequence_as(data, bucket) for bucket in buckets]
        data_rdd = sc.parallelize(packed, numSlices=split_num)

    prediction_rdd = self.model.distributed_predict(data_rdd, sc)
    return np.concatenate(prediction_rdd.collect(), axis=0)
def predict(self, data, feature_cols=None):
    """
    Predict input data

    :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of
           numpy arrays are supported. If data is XShards, each partition is a dictionary
           of {'x': feature}, where feature is a numpy array or a list of numpy arrays.
    :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
           DataFrame. Default: None.
    :return: predicted result.
             If the input data is XShards, the predict result is a XShards, each partition
             of the XShards is the input dictionary with an extra 'prediction' entry.
             If the input data is a Spark DataFrame, the value returned by
             ``convert_predict_rdd_to_dataframe`` for the input DataFrame and the
             predictions.
             If the input data is a numpy array or a list of numpy arrays, the per-batch
             results are cut back to the size of their input chunk and concatenated along
             the first axis into one numpy array.
    :raises ValueError: if data (or the 'x' entry of a shard) has an unsupported type.
    """
    from pyspark.sql import DataFrame

    def predict_transform(dict_data, batch_size):
        # Validate one shard and return its features. The batch dimension checked here
        # is the first one (shape[0]).
        assert isinstance(dict_data, dict), "each shard should be an dict"
        assert "x" in dict_data, "key x should in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[0] <= batch_size, \
                "The batch size of input data (the first dim) should be less than the model " \
                "batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                     "a ndarray, but get " + \
                                                     elem.__class__.__name__
                assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                    "first dim) should be less than the " \
                                                    "model batch size, otherwise some inputs " \
                                                    "will be ignored."
        else:
            raise ValueError(
                "x in each shard should be a ndarray or a list of ndarray.")
        return feature_data

    sc = init_nncontext()

    if isinstance(data, DataFrame):
        from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
        xshards, _ = dataframe_to_xshards(data,
                                          validation_data=None,
                                          feature_cols=feature_cols,
                                          label_cols=None,
                                          mode="predict")
        transformed_data = xshards.transform_shard(predict_transform, self.batch_size)
        result_rdd = self.model.distributed_predict(transformed_data.rdd, sc)

        def delete_useless_result(shard_and_prediction):
            # Keep only as many predictions as the shard has inputs.
            # NOTE(review): when shard["x"] is a list of ndarrays, len() counts the
            # arrays rather than the rows - confirm this is intended.
            shard, y = shard_and_prediction
            data_length = len(shard["x"])
            return y[:data_length]

        result_rdd = xshards.rdd.zip(result_rdd).map(delete_useless_result)
        return convert_predict_rdd_to_dataframe(
            data, result_rdd.flatMap(lambda predictions: predictions))

    if isinstance(data, SparkXShards):
        transformed_data = data.transform_shard(predict_transform, self.batch_size)
        result_rdd = self.model.distributed_predict(transformed_data.rdd, sc)

        def update_shard(shard_and_prediction):
            # Attach the predictions, cut to the number of inputs, to the shard.
            shard, y = shard_and_prediction
            data_length = len(shard["x"])
            shard["prediction"] = y[:data_length]
            return shard

        return SparkXShards(data.rdd.zip(result_rdd).map(update_shard))

    if not isinstance(data, (np.ndarray, list)):
        raise ValueError(
            "Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
            "ays are supported as input data, but get " + data.__class__.__name__)

    if isinstance(data, np.ndarray):
        # One chunk per model batch.
        split_num = math.ceil(len(data) / self.batch_size)
        arrays = np.array_split(data, split_num)
        data_length_list = [len(arr) for arr in arrays]
        data_rdd = sc.parallelize(arrays, numSlices=split_num)
    else:
        flattened = nest.flatten(data)
        data_length = len(flattened[0])
        split_num = math.ceil(flattened[0].shape[0] / self.batch_size)
        data_to_be_rdd = [[] for _ in range(split_num)]
        for x in flattened:
            assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                              "ndarrays, but get " + \
                                              x.__class__.__name__
            assert len(x) == data_length, \
                "the ndarrays in data must all have the same size in first dimension" \
                ", got first ndarray of size {} and another {}".format(data_length, len(x))
            x_parts = np.array_split(x, split_num)
            for idx, x_part in enumerate(x_parts):
                data_to_be_rdd[idx].append(x_part)
            # Number of rows in every chunk. This must be taken from the list of chunks
            # (x_parts), not from the last chunk, so that entry i describes chunk i.
            # All arrays share the first dimension, so each x gives the same sizes.
            data_length_list = [len(part) for part in x_parts]

        # Give every chunk group the nesting structure of the original input.
        data_to_be_rdd = [
            nest.pack_sequence_as(data, shard) for shard in data_to_be_rdd
        ]
        data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)

    result_rdd = self.model.distributed_predict(data_rdd, sc)
    result_arr_list = result_rdd.collect()
    # Cut every result back to the row count of the chunk it was computed from.
    for i, result in enumerate(result_arr_list):
        result_arr_list[i] = result[:data_length_list[i]]
    result_arr = np.concatenate(result_arr_list, axis=0)
    return result_arr