def xshards_to_tf_dataset(data_shard,
                          batch_size=-1, batch_per_thread=-1,
                          validation_data_shard=None,
                          hard_code_batch_size=False,
                          sequential_order=False,
                          shuffle=True):
    # todo data_shard.head ?
    feature_spec, label_spec = data_shard._for_each(
        get_spec(allow_tuple=True, allow_list=False)).first()

    feature_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1])
                    for spec in feature_spec]
    label_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1])
                  for spec in label_spec] if label_spec is not None else None

    assert batch_size != -1 or batch_per_thread != -1, \
        "one of batch_size and batch_per_thread should be specified"

    val_rdd = None if validation_data_shard is None \
        else validation_data_shard.rdd.flatMap(
            flatten_xy(allow_tuple=True, allow_list=False))

    dataset = TFDataset.from_rdd(
        data_shard.rdd.flatMap(flatten_xy(allow_tuple=True, allow_list=False)),
        features=feature_spec,
        labels=label_spec,
        batch_size=batch_size,
        batch_per_thread=batch_per_thread,
        val_rdd=val_rdd,
        hard_code_batch_size=hard_code_batch_size,
        sequential_order=sequential_order,
        shuffle=shuffle)
    return dataset
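
# Illustrative usage sketch (not part of the library code above): builds a
# small XShards of {"x", "y"} numpy dicts and converts it to a TFDataset.
# The import path zoo.orca.data.XShards and the XShards.partition helper are
# assumptions about the surrounding code base and may differ across versions;
# a SparkContext is assumed to be initialized already.
def _example_xshards_to_tf_dataset():
    import numpy as np
    from zoo.orca.data import XShards  # assumed import path

    data = {"x": np.random.rand(100, 10).astype(np.float32),
            "y": np.random.randint(0, 2, size=(100, 1))}
    data_shard = XShards.partition(data)  # assumed helper that shards a dict of arrays

    # batch_size sets the global training batch size; for inference-style
    # traversal pass batch_per_thread instead (one of the two is required).
    return xshards_to_tf_dataset(data_shard, batch_size=32)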
def predict(self, x, batch_per_thread=None, distributed=False):
    """
    Use a model to do prediction.

    :param x: Input data. It could be:
        - a TFDataset object
        - a Numpy array (or array-like), or a list of arrays
          (in case the model has multiple inputs)
        - a dict mapping input names to the corresponding arrays/tensors,
          if the model has named inputs
    :param batch_per_thread: The default value is 1. When distributed is True,
        the total batch size is batch_per_thread * rdd.getNumPartitions.
        When distributed is False, the total batch size is
        batch_per_thread * numOfCores.
    :param distributed: Boolean. Whether to do prediction in distributed mode
        or local mode. Default is False. In local mode, x must be a Numpy array.
    """
    if isinstance(x, TFDataset):
        # todo check arguments
        if not x.has_batch:
            raise ValueError("The batch_per_thread of TFDataset must be"
                             " specified when used in KerasModel predict.")
        if isinstance(x, TFNdarrayDataset):
            x = _standarize_feature_dataset(x, self.model)
        return self._predict_distributed(x)
    else:
        if distributed:
            sc = getOrCreateSparkContext()
            rdd, types, shapes = _create_rdd_x(
                x, self.model._feed_input_names, sc)

            dataset = TFDataset.from_rdd(
                rdd,
                names=self.model._feed_input_names,
                types=types,
                shapes=shapes,
                batch_per_thread=-1 if batch_per_thread is None
                else batch_per_thread)
            results = self._predict_distributed(dataset).collect()
            output_num = len(self.model.outputs)

            if output_num == 1:
                return np.stack(results)
            else:
                predictions = []
                for i in range(0, output_num):
                    predictions.append(np.stack([res[i] for res in results]))
                return predictions
        else:
            return self.model.predict(x=x, batch_size=batch_per_thread)
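
# Illustrative usage sketch (an assumption, not part of the source): calling
# predict on the same numpy input in local versus distributed mode. The name
# keras_model stands for an instance of the KerasModel class this method
# belongs to; how that instance is constructed is outside this snippet.
def _example_keras_model_predict(keras_model, x_np):
    # Local mode: x must be a numpy array and prediction falls through to
    # the underlying tf.keras model with batch_size=batch_per_thread.
    local_preds = keras_model.predict(x_np, batch_per_thread=32,
                                      distributed=False)

    # Distributed mode: x is converted into an RDD-backed TFDataset and each
    # Spark partition is predicted batch_per_thread samples at a time.
    dist_preds = keras_model.predict(x_np, batch_per_thread=32,
                                     distributed=True)
    return local_preds, dist_preds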
def _xshards_to_tf_dataset(data_shard,
                           batch_size=-1, batch_per_thread=-1,
                           validation_data_shard=None):
    # todo data_shard.head ?
    import numpy as np

    def check_data_type_and_to_list(data):
        result = {}
        assert isinstance(data, dict), "each shard should be a dict"
        assert "x" in data, "key x should be in each shard"
        x = data["x"]
        if isinstance(x, np.ndarray):
            new_x = [x]
        elif isinstance(x, tuple) and all(
                [isinstance(xi, np.ndarray) for xi in x]):
            new_x = x
        else:
            raise ValueError(
                "value of x should be a ndarray or a tuple of ndarrays")
        result["x"] = new_x

        if "y" in data:
            y = data["y"]
            if isinstance(y, np.ndarray):
                new_y = [y]
            elif isinstance(y, tuple) and all(
                    [isinstance(yi, np.ndarray) for yi in y]):
                new_y = y
            else:
                raise ValueError(
                    "value of y should be a ndarray or a tuple of ndarrays")
            result["y"] = new_y
        return result

    def get_spec(data):
        data = check_data_type_and_to_list(data)
        feature_spec = [(feat.dtype, feat.shape[1:]) for feat in data["x"]]
        if "y" in data:
            label_spec = [(label.dtype, label.shape[1:])
                          for label in data["y"]]
        else:
            label_spec = None
        return (feature_spec, label_spec)

    (feature_spec, label_spec) = data_shard.rdd.map(get_spec).first()

    feature_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1])
                    for spec in feature_spec]
    label_spec = [(tf.dtypes.as_dtype(spec[0]), spec[1])
                  for spec in label_spec] if label_spec is not None else None

    assert batch_size != -1 or batch_per_thread != -1, \
        "one of batch_size and batch_per_thread should be specified"

    # todo this might be very slow
    def flatten(data):
        data = check_data_type_and_to_list(data)
        features = data["x"]
        has_label = "y" in data
        labels = data["y"] if has_label else None
        length = features[0].shape[0]

        for i in range(length):
            fs = [feat[i] for feat in features]
            if has_label:
                ls = [l[i] for l in labels]
                yield (fs, ls)
            else:
                yield (fs,)

    val_rdd = None if validation_data_shard is None \
        else validation_data_shard.rdd.flatMap(flatten)

    dataset = TFDataset.from_rdd(data_shard.rdd.flatMap(flatten),
                                 features=feature_spec,
                                 labels=label_spec,
                                 batch_size=batch_size,
                                 batch_per_thread=batch_per_thread,
                                 val_rdd=val_rdd)
    return dataset
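
# Illustrative sketch (an assumption, not library code): what the per-shard
# helpers above do to one dict of numpy arrays. check_data_type_and_to_list
# normalizes "x"/"y" to lists of arrays; flatten then yields one
# ([features...], [labels...]) tuple per row, which is the record layout
# TFDataset.from_rdd consumes.
def _example_flatten_semantics():
    import numpy as np

    shard = {"x": np.arange(6, dtype=np.float32).reshape(3, 2),
             "y": np.array([[0], [1], [0]])}
    # Mirrors check_data_type_and_to_list + flatten applied to a single shard:
    xs, ys = [shard["x"]], [shard["y"]]
    records = [([feat[i] for feat in xs], [label[i] for label in ys])
               for i in range(xs[0].shape[0])]
    # records[0] == ([array([0., 1.], dtype=float32)], [array([0])])
    return records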