def to_dataset(data, batch_size, batch_per_thread, validation_data,
               feature_cols, labels_cols, hard_code_batch_size,
               sequential_order, shuffle, auto_shard_files):
    # todo wrap argument into kwargs
    # Validate that the validation data has the same type as the training data.
    if validation_data:
        if isinstance(data, SparkXShards):
            assert isinstance(validation_data, SparkXShards), \
                "train data and validation data should be both SparkXShards"
        if isinstance(data, Dataset):
            assert isinstance(validation_data, Dataset), \
                "train data and validation data should be both orca.data.tf.Dataset"
        if isinstance(data, DataFrame):
            assert isinstance(validation_data, DataFrame), \
                "train data and validation data should be both Spark DataFrame"
        if isinstance(data, tf.data.Dataset):
            assert isinstance(validation_data, tf.data.Dataset), \
                "train data and validation data should be both tf.data.Dataset"

    # Dispatch on the input type and convert it to a TFDataset.
    if isinstance(data, SparkXShards):
        dataset = xshards_to_tf_dataset(data,
                                        batch_size,
                                        batch_per_thread,
                                        validation_data,
                                        hard_code_batch_size=hard_code_batch_size,
                                        sequential_order=sequential_order,
                                        shuffle=shuffle)
    elif isinstance(data, Dataset):
        dataset = TFDataDataset2(data, batch_size=batch_size,
                                 batch_per_thread=batch_per_thread,
                                 validation_dataset=validation_data)
    elif isinstance(data, DataFrame):
        dataset = TFDataset.from_dataframe(data, feature_cols, labels_cols,
                                           batch_size, batch_per_thread,
                                           hard_code_batch_size,
                                           validation_data,
                                           sequential_order, shuffle)
    elif is_tf_data_dataset(data):
        dataset = TFDataset.from_tf_data_dataset(data,
                                                 batch_size,
                                                 batch_per_thread,
                                                 hard_code_batch_size,
                                                 validation_data,
                                                 sequential_order,
                                                 shuffle,
                                                 auto_shard_files=auto_shard_files)
    else:
        raise ValueError("data must be SparkXShards or orca.data.tf.Dataset or "
                         "Spark DataFrame or tf.data.Dataset")
    return dataset
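
# An illustrative call of to_dataset for the Spark DataFrame branch (a sketch
# only; `df` and the column names "feature"/"label" are hypothetical):
#
#   dataset = to_dataset(df, batch_size=32, batch_per_thread=-1,
#                        validation_data=None,
#                        feature_cols=["feature"], labels_cols=["label"],
#                        hard_code_batch_size=False, sequential_order=False,
#                        shuffle=True, auto_shard_files=False)
#
# Passing a SparkXShards, an orca.data.tf.Dataset or a tf.data.Dataset instead
# of a DataFrame takes the corresponding branch above.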
def fit(self,
        x=None,
        y=None,
        batch_size=None,
        epochs=1,
        validation_data=None,
        distributed=False,
        **kwargs
        ):
    """
    Train the model for a fixed number of epochs.

    Arguments:
    :param x: Input data. It could be:
        - a TFDataset object
        - a Numpy array (or array-like), or a list of arrays
            (in case the model has multiple inputs).
        - a dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
    :param y: Target data. Like the input data `x`, it should be consistent with `x`
        (you cannot have Numpy inputs and tensor targets, or vice versa).
        If `x` is a TFDataset, `y` should not be specified (since targets will be
        obtained from `x`).
    :param batch_size: Integer or `None`. Number of samples per gradient update.
        If `x` is a TFDataset, you do not need to specify batch_size.
    :param epochs: Integer. Number of epochs to train the model.
        An epoch is an iteration over the entire `x` and `y` data provided.
    :param validation_data: Data on which to evaluate the loss and any model metrics
        at the end of each epoch. The model will not be trained on this data.
        `validation_data` could be:
        - a tuple `(x_val, y_val)` of Numpy arrays or tensors
    :param distributed: Boolean. Whether to train the model in distributed mode or
        local mode. Default is False. In local mode, x must be a Numpy array.
    """
    if isinstance(x, TFDataset):
        # todo check arguments
        assert validation_data is None, "validation_data must be None when " \
                                        "using TFDataset as input, please " \
                                        "set the validation data in TFDataset"
        if not x.has_batch:
            raise ValueError("The batch_size of TFDataset must be "
                             "specified when used in KerasModel fit.")
        self._fit_distributed(x, epochs, **kwargs)
    elif distributed:
        # Wrap the Numpy arrays into a TFDataset and train on Spark.
        dataset = TFDataset.from_ndarrays((x, y), val_tensors=validation_data,
                                          batch_size=batch_size)
        self._fit_distributed(dataset, epochs, **kwargs)
    else:
        # Local mode: delegate directly to tf.keras Model.fit.
        self.model.fit(x=x,
                       y=y,
                       batch_size=batch_size,
                       epochs=epochs,
                       validation_data=validation_data,
                       **kwargs
                       )
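
# A minimal usage sketch for fit (illustrative only; `keras_model`, `x_train`
# and `y_train` are hypothetical, assuming `keras_model` is a compiled
# tf.keras model):
#
#   import numpy as np
#   model = KerasModel(keras_model)
#   x_train = np.random.rand(100, 10).astype(np.float32)
#   y_train = np.random.randint(0, 2, size=(100, 1))
#   # local mode: delegates to tf.keras Model.fit
#   model.fit(x_train, y_train, batch_size=32, epochs=2)
#   # distributed mode: the ndarrays are wrapped into a TFDataset first
#   model.fit(x_train, y_train, batch_size=32, epochs=2, distributed=True)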
def xshards_to_tf_dataset(data_shard,
                          batch_size=-1, batch_per_thread=-1,
                          validation_data_shard=None,
                          hard_code_batch_size=False,
                          sequential_order=False,
                          shuffle=True):
    # todo data_shard.head ?
    feature_spec, label_spec = data_shard._for_each(get_spec(allow_tuple=True,
                                                             allow_list=False)).first()

    feature_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1]) for spec in feature_spec]
    label_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1]) for spec in label_spec] \
        if label_spec is not None else None

    assert batch_size != -1 or batch_per_thread != -1, \
        "one of batch_size and batch_per_thread should be specified"

    val_rdd = None if validation_data_shard is None \
        else validation_data_shard.rdd.flatMap(flatten_xy(allow_tuple=True,
                                                          allow_list=False))

    dataset = TFDataset.from_rdd(data_shard.rdd.flatMap(flatten_xy(allow_tuple=True,
                                                                   allow_list=False)),
                                 features=feature_spec,
                                 labels=label_spec,
                                 batch_size=batch_size,
                                 batch_per_thread=batch_per_thread,
                                 val_rdd=val_rdd,
                                 hard_code_batch_size=hard_code_batch_size,
                                 sequential_order=sequential_order,
                                 shuffle=shuffle)
    return dataset
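
# An illustrative call (a sketch; `shards` is hypothetical and assumed to be a
# SparkXShards whose elements are dicts of "x"/"y" ndarrays):
#
#   train_dataset = xshards_to_tf_dataset(shards, batch_size=32)
#   # for inference/evaluation, specify batch_per_thread instead of batch_size:
#   pred_dataset = xshards_to_tf_dataset(shards, batch_per_thread=4)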
def predict(self,
            x,
            batch_per_thread=None,
            distributed=False):
    """
    Use a model to do prediction.

    :param x: Input data. It could be:
        - a TFDataset object
        - a Numpy array (or array-like), or a list of arrays
            (in case the model has multiple inputs).
        - a dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
    :param batch_per_thread: The default value is 1.
        When distributed is True, the total batch size is
        batch_per_thread * rdd.getNumPartitions.
        When distributed is False, the total batch size is
        batch_per_thread * numOfCores.
    :param distributed: Boolean. Whether to do prediction in distributed mode or
        local mode. Default is False. In local mode, x must be a Numpy array.
    """
    if isinstance(x, TFDataset):
        # todo check arguments
        if not x.has_batch:
            raise ValueError("The batch_per_thread of TFDataset "
                             "must be specified when used in KerasModel predict.")
        if isinstance(x, TFNdarrayDataset):
            x = _standarize_feature_dataset(x, self.model)
        return self._predict_distributed(x)
    else:
        if distributed:
            # Wrap the Numpy input into an RDD-backed TFDataset and predict on Spark.
            sc = getOrCreateSparkContext()
            rdd, types, shapes = _create_rdd_x(x, self.model._feed_input_names, sc)

            dataset = TFDataset.from_rdd(rdd,
                                         names=self.model._feed_input_names,
                                         types=types,
                                         shapes=shapes,
                                         batch_per_thread=-1 if batch_per_thread is None
                                         else batch_per_thread)
            results = self._predict_distributed(dataset).collect()
            output_num = len(self.model.outputs)
            if output_num == 1:
                return np.stack(results)
            else:
                # For multi-output models, stack predictions per output.
                predictions = []
                for i in range(0, output_num):
                    predictions.append(np.stack([res[i] for res in results]))
                return predictions
        else:
            return self.model.predict(x=x, batch_size=batch_per_thread)
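
# A minimal usage sketch for predict (illustrative only; `model` and `x_test`
# are hypothetical, continuing the fit example above):
#
#   x_test = np.random.rand(10, 10).astype(np.float32)
#   preds_local = model.predict(x_test)                   # local tf.keras predict
#   preds_dist = model.predict(x_test, distributed=True)  # prediction on Spark
#
# With a single model output, distributed predict returns one stacked ndarray;
# with multiple outputs it returns a list with one stacked ndarray per output.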
def evaluate(self,
             x=None,
             y=None,
             batch_per_thread=None,
             distributed=False
             ):
    """
    Evaluate a model on a given dataset.

    :param x: Input data. It could be:
        - a TFDataset object
        - a Numpy array (or array-like), or a list of arrays
            (in case the model has multiple inputs).
        - a dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
    :param y: Target data. Like the input data `x`, it should be consistent with `x`
        (you cannot have Numpy inputs and tensor targets, or vice versa).
        If `x` is a TFDataset, `y` should not be specified (since targets will be
        obtained from `x`).
    :param batch_per_thread: The default value is 1.
        When distributed is True, the total batch size is
        batch_per_thread * rdd.getNumPartitions.
        When distributed is False, the total batch size is
        batch_per_thread * numOfCores.
    :param distributed: Boolean. Whether to do evaluation in distributed mode or
        local mode. Default is False. In local mode, x must be a Numpy array.
    """
    if isinstance(x, TFDataset):
        if not x.has_batch:
            raise ValueError("The batch_per_thread of TFDataset must be "
                             "specified when used in KerasModel evaluate.")
        if isinstance(x, TFNdarrayDataset):
            x = _standarize_feature_label_dataset(x, self.model)
        # todo check arguments
        check_data_compatible(x, self.model, mode="evaluate")
        return self._evaluate_distributed(x)
    else:
        if distributed:
            dataset = TFDataset.from_ndarrays((x, y),
                                              batch_per_thread=-1 if batch_per_thread is None
                                              else batch_per_thread
                                              )
            dataset = _standarize_feature_label_dataset(dataset, self.model)
            return self._evaluate_distributed(dataset)
        else:
            results = self.model.evaluate(x=x,
                                          y=y,
                                          batch_size=batch_per_thread)
            results = dict(zip(self.metrics_names, results))
            return results
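
# A minimal usage sketch for evaluate (illustrative only; `model`, `x_test`
# and `y_test` are hypothetical):
#
#   metrics = model.evaluate(x_test, y_test)
#   # local mode returns a dict keyed by metrics_names, e.g. {"loss": ...}
#   metrics_dist = model.evaluate(x_test, y_test, distributed=True)
#   # distributed mode evaluates the same loss/metrics over a Spark-backed
#   # TFDataset built from the ndarrays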
def _xshards_to_tf_dataset(data_shard,
                           batch_size=-1, batch_per_thread=-1,
                           validation_data_shard=None):
    # todo data_shard.head ?
    import numpy as np

    def check_data_type_and_to_list(data):
        # Normalize a shard dict so that "x" and "y" are always lists of ndarrays.
        result = {}
        assert isinstance(data, dict), "each shard should be a dict"
        assert "x" in data, "key 'x' should be in each shard"
        x = data["x"]
        if isinstance(x, np.ndarray):
            new_x = [x]
        elif isinstance(x, tuple) and all([isinstance(xi, np.ndarray) for xi in x]):
            new_x = x
        else:
            raise ValueError("value of x should be a ndarray or a tuple of ndarrays")
        result["x"] = new_x
        if "y" in data:
            y = data["y"]
            if isinstance(y, np.ndarray):
                new_y = [y]
            elif isinstance(y, tuple) and all([isinstance(yi, np.ndarray) for yi in y]):
                new_y = y
            else:
                raise ValueError("value of y should be a ndarray or a tuple of ndarrays")
            result["y"] = new_y
        return result

    def get_spec(data):
        # Extract per-element (dtype, shape) specs, dropping the batch dimension.
        data = check_data_type_and_to_list(data)
        feature_spec = [(feat.dtype, feat.shape[1:])
                        for feat in data["x"]]
        if "y" in data:
            label_spec = [(label.dtype, label.shape[1:])
                          for label in data["y"]]
        else:
            label_spec = None
        return (feature_spec, label_spec)

    (feature_spec, label_spec) = data_shard.rdd.map(get_spec).first()

    feature_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1]) for spec in feature_spec]
    label_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1]) for spec in label_spec] \
        if label_spec is not None else None

    assert batch_size != -1 or batch_per_thread != -1, \
        "one of batch_size and batch_per_thread should be specified"

    # todo this might be very slow
    def flatten(data):
        # Unbatch a shard: yield one (features, labels) tuple per sample.
        data = check_data_type_and_to_list(data)
        features = data["x"]

        has_label = "y" in data
        labels = data["y"] if has_label else None
        length = features[0].shape[0]

        for i in range(length):
            fs = [feat[i] for feat in features]
            if has_label:
                ls = [l[i] for l in labels]
                yield (fs, ls)
            else:
                yield (fs,)

    val_rdd = None if validation_data_shard is None \
        else validation_data_shard.rdd.flatMap(flatten)

    dataset = TFDataset.from_rdd(data_shard.rdd.flatMap(flatten),
                                 features=feature_spec,
                                 labels=label_spec,
                                 batch_size=batch_size,
                                 batch_per_thread=batch_per_thread,
                                 val_rdd=val_rdd)
    return dataset
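
# To illustrate what the inner flatten helper does (a self-contained sketch,
# independent of Spark; the shard contents are hypothetical): a shard dict
# holding batched ndarrays is unbatched into one tuple per sample:
#
#   import numpy as np
#   shard = {"x": np.arange(6).reshape(3, 2), "y": np.array([0, 1, 0])}
#   # flatten(shard) yields 3 elements:
#   #   ([array([0, 1])], [0])
#   #   ([array([2, 3])], [1])
#   #   ([array([4, 5])], [0])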