def from_train_op(cls, train_op, loss, *, inputs=None, labels=None, metrics=None, updates=None,
                  sess=None, dataset=None, tensor_with_value=None, session_config=None,
                  model_dir=None):
    sess = TFOptimizer._get_or_create_session(sess)
    grads, variables = TFOptimizer._get_vars_grads_from_train_op(train_op)
    if dataset is None:
        dataset = TFOptimizer._get_dataset_from_loss(loss)
    _ = dataset.tensors  # trigger create tensors if not available
    dataset_inputs = dataset._original_tensors
    if isinstance(dataset_inputs, tuple) and len(dataset_inputs) == 2:
        if inputs is None:
            inputs = dataset_inputs[0]
        if labels is None:
            labels = dataset_inputs[1]
    else:
        if inputs is None:
            inputs = dataset_inputs
        if labels is None:
            labels = []

    inputs = nest.flatten(inputs)
    labels = nest.flatten(labels)
    return TFOptimizer._from_grads(loss=loss, sess=sess, inputs=inputs, labels=labels,
                                   grads=grads, variables=variables, dataset=dataset,
                                   metrics=metrics, tensor_with_value=tensor_with_value,
                                   optim_method=FakeOptimMethod(),
                                   session_config=session_config, updates=updates,
                                   model_dir=model_dir, train_op=train_op)
def evaluate(self, data, batch_size=32,
             feature_cols=None,
             labels_cols=None,
             hard_code_batch_size=False):
    assert self.metrics is not None, \
        "metrics is None, it should not be None in evaluate"

    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in evaluation"
        assert labels_cols is not None, \
            "label columns is None; it should not be None in evaluation"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None,
                         feature_cols=feature_cols, labels_cols=labels_cols,
                         hard_code_batch_size=hard_code_batch_size,
                         sequential_order=True, shuffle=False)

    flat_inputs = nest.flatten(self.inputs)
    flat_labels = nest.flatten(self.labels)

    return evaluate_metrics(flat_inputs + flat_labels,
                            sess=self.sess,
                            dataset=dataset, metrics=self.metrics)
def predict(self, data, batch_size=4,
            feature_cols=None,
            hard_code_batch_size=False):
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"

    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None,
                         feature_cols=feature_cols, labels_cols=None,
                         hard_code_batch_size=hard_code_batch_size,
                         sequential_order=True, shuffle=False)

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs, outputs=flat_outputs)
    predicted_rdd = tfnet.predict(dataset)
    if isinstance(data, DataFrame):
        return convert_predict_to_dataframe(data, predicted_rdd)
    else:
        return predicted_rdd
def predict(self, input_fn, checkpoint_path=None):
    with tf.Graph().as_default() as g:
        result = self.estimator._call_input_fn(input_fn, tf.estimator.ModeKeys.PREDICT)
        if isinstance(result, TFDataset):
            spec = self._call_model_fn(result.feature_tensors,
                                       None,
                                       tf.estimator.ModeKeys.PREDICT,
                                       self.config)
            latest_checkpoint = self.estimator.latest_checkpoint()

            if latest_checkpoint:
                checkpoint_path = latest_checkpoint

            with tf.Session() as sess:
                saver = tf.train.Saver()
                if checkpoint_path:
                    saver.restore(sess, checkpoint_path)
                else:
                    sess.run(tf.global_variables_initializer())
                inputs = nest.flatten(result.feature_tensors)
                outputs = nest.flatten(spec.predictions)
                tfnet = TFNet.from_session(sess, inputs=inputs, outputs=outputs)

                rdd = result.rdd.map(lambda t: Sample.from_ndarray(
                    nest.flatten(t), np.array([0.0])))

                results = tfnet.predict(rdd, result.batch_per_thread)
                return results

    return self.estimator.predict(input_fn, checkpoint_path=checkpoint_path)
def _create_placeholders(self):
    import tensorflow as tf
    if not self.hard_code_batch_size:
        tensors = nest.pack_sequence_as(
            self.tensor_structure,
            [tf.placeholder(name=t.name, dtype=t.dtype, shape=[None] + list(t.shape))
             for t in nest.flatten(self.tensor_structure)])
    else:
        if self.batch_per_thread > 0:
            tensors = nest.pack_sequence_as(
                self.tensor_structure,
                [tf.placeholder(name=t.name, dtype=t.dtype,
                                shape=[self.batch_per_thread] + list(t.shape))
                 for t in nest.flatten(self.tensor_structure)])
        else:
            tensors = nest.pack_sequence_as(
                self.tensor_structure,
                [tf.placeholder(name=t.name, dtype=t.dtype,
                                shape=[self.batch_size // self.total_core_num] + list(t.shape))
                 for t in nest.flatten(self.tensor_structure)])

    for tensor in nest.flatten(tensors):
        tf.get_default_graph().clear_collection(tensor.name)
        tf.add_to_collection(tensor.name, self)

    return tensors
def predict(self, data, batch_size=4,
            feature_cols=None,
            hard_code_batch_size=False,
            auto_shard_files=False):
    """
    Predict input data.

    :param data: data to be predicted. It can be XShards or Spark DataFrame.
           If data is XShards, each element needs to be
           {'x': a feature numpy array or a tuple of feature numpy arrays}.
    :param batch_size: batch size per thread.
    :param feature_cols: list of feature column names if input data is a Spark DataFrame.
    :param hard_code_batch_size: whether to hard code batch size for prediction.
           The default value is False.
    :param auto_shard_files: whether to automatically detect if the dataset is file-based and
           apply sharding on files, otherwise sharding on records. Default is False.
    :return: predicted result.
             If the input data is XShards or tf.data.Dataset, the predict result is an XShards,
             and the schema for each result is
             {'prediction': predicted numpy array or list of predicted numpy arrays}.
             If the input data is a Spark DataFrame, the predict result is a DataFrame that
             includes the original columns plus a 'prediction' column. The 'prediction' column
             can be FloatType, VectorUDT or Array of VectorUDT depending on the model
             output shape.
    """
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"

    assert not is_tf_data_dataset(data), \
        "tf.data.Dataset currently cannot be used for estimator prediction"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None,
                         feature_cols=feature_cols, label_cols=None,
                         hard_code_batch_size=hard_code_batch_size,
                         sequential_order=True, shuffle=False,
                         auto_shard_files=auto_shard_files)

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs, outputs=flat_outputs)
    predicted_rdd = tfnet.predict(dataset)
    if isinstance(data, DataFrame):
        return convert_predict_rdd_to_dataframe(data, predicted_rdd)
    elif isinstance(data, SparkXShards):
        return convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        return predicted_rdd
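# A minimal usage sketch for the predict method above, not taken from the source: it assumes
# `est` is an already-built Estimator exposing this predict, and uses XShards.partition
# (shown later in this section) to turn a local numpy array into an XShards. The array shape
# and the "features" column name for the DataFrame case are illustrative assumptions.
import numpy as np
from zoo.orca.data import XShards

feature_shards = XShards.partition({"x": np.random.rand(64, 10).astype(np.float32)})
pred_shards = est.predict(feature_shards, batch_size=4)
# each element of pred_shards is expected to be {"prediction": numpy array(s)}

# For a Spark DataFrame input, feature columns must be named explicitly, e.g.:
# pred_df = est.predict(df, batch_size=4, feature_cols=["features"])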
def __init__(self, file_path, parse_fn, batch_size,
             batch_per_thread, hard_code_batch_size=False, validation_file_path=None):
    import tensorflow as tf
    g = tf.Graph()
    with g.as_default():
        serialized_example = tf.placeholder(dtype=tf.string, shape=[])
        results = parse_fn(serialized_example)

        flattened = nest.flatten(results)
        output_names = [tf.cast(t, dtype=tf.float32).name for t in flattened]

    serialized_graph = bytearray(g.as_graph_def().SerializeToString())

    sc = getOrCreateSparkContext()
    train_rdd = callBigDlFunc("float", "createRDDFromTFRecords",
                              file_path, sc, serialized_graph,
                              serialized_example.name, output_names)
    validation_rdd = None
    if validation_file_path is not None:
        validation_rdd = callBigDlFunc("float", "createRDDFromTFRecords",
                                       validation_file_path, sc, serialized_graph,
                                       serialized_example.name, output_names)

    tensor_structure = nest.pack_sequence_as(
        results,
        [TensorMeta(tf.as_dtype(t.dtype), shape=t.shape, name="data_%s" % i)
         for i, t in enumerate(nest.flatten(results))])

    super(TFRecordDataset, self).__init__(tensor_structure, batch_size,
                                          batch_per_thread, hard_code_batch_size)

    self.train_rdd = train_rdd
    self.validation_rdd = validation_rdd
def predict(self, input_fn, checkpoint_path=None): """Outputs predictions for given features. :param input_fn: A function that constructs the features. * A `TFDataset` object, each elements of which is a tuple `(features, None)`. * A `tf.data.Dataset` object: Outputs of `Dataset` object must have same constraints as below. * features: A `tf.Tensor` or a dictionary of string feature name to `Tensor`. features are consumed by `model_fn`. They should satisfy the expectation of `model_fn` from inputs. * A tuple, in which case the first item is extracted as features. :param checkpoint_path: Path of a specific checkpoint to predict. If `None`, the latest checkpoint in `model_dir` is used. If there are no checkpoints in `model_dir`, prediction is run with newly initialized `Variables` instead of ones restored from checkpoint. Return: Evaluated values of `predictions` tensors. """ with tf.Graph().as_default() as g: result = self.estimator._call_input_fn(input_fn, tf.estimator.ModeKeys.PREDICT) if isinstance(result, TFDataset): spec = self._call_model_fn(result.feature_tensors, None, tf.estimator.ModeKeys.PREDICT, self.config) latest_checkpoint = self.estimator.latest_checkpoint() if latest_checkpoint: checkpoint_path = latest_checkpoint with tf.Session() as sess: if checkpoint_path: saver = tf.train.Saver() saver.restore(sess, checkpoint_path) else: sess.run(tf.global_variables_initializer()) inputs = nest.flatten(result._original_tensors[0]) outputs = nest.flatten(spec.predictions) tfnet = TFNet.from_session(sess, inputs=inputs, outputs=outputs) predictions = tfnet.predict(result.get_prediction_data(), mini_batch=True) # If predictions is a dict, add back the keys and results is a dict as well. if isinstance(spec.predictions, dict): # Given a list of outputs; return a dict of outputs. def zip_key(outs, keys): assert len(outs) == len(keys) res_dict = {} for out, key in zip(outs, keys): res_dict[key] = out return res_dict pred_keys = sorted(spec.predictions.keys()) predictions = predictions.map(lambda res: zip_key(res, pred_keys)) return predictions return list(self.estimator.predict(input_fn, checkpoint_path=checkpoint_path))
def __init__(self, rdd, tensor_structure, batch_size,
             batch_per_thread, hard_code_batch_size=False, val_rdd=None):
    """
    TFDataset represents a distributed collection of elements to be fed into a TensorFlow
    graph. A TFDataset can be created from an RDD, each record of which is one or more
    numpy.ndarrays sharing the same nested structure and representing the tensors to be fed
    into the TensorFlow graph on each iteration. TFDataset must be used with TFOptimizer or
    TFPredictor.
    """
    if batch_size > 0 and batch_per_thread > 0:
        raise ValueError("batch_size and batch_per_thread should not be set simultaneously")

    self.has_batch = True
    node_num, core_num = get_node_and_core_number()
    self.total_core_num = node_num * core_num
    if batch_size > 0:
        if batch_size % self.total_core_num != 0:
            raise ValueError("batch_size should be a multiple "
                             "of total core number, but got batch_size: "
                             "%s where total core number is %s" % (batch_size,
                                                                   self.total_core_num))
    if batch_size <= 0 and batch_per_thread <= 0:
        batch_per_thread = 1
        batch_size = self.total_core_num
        self.has_batch = False

    self.batch_size = batch_size
    self.batch_per_thread = batch_per_thread
    self.hard_code_batch_size = hard_code_batch_size
    self.tensor_structure = tensor_structure

    self.val_rdd = val_rdd

    if not self.hard_code_batch_size:
        self.output_shapes = nest.pack_sequence_as(
            self.tensor_structure, [[None] + list(t.shape)
                                    if t is not None else None
                                    for t in nest.flatten(self.tensor_structure)])
    else:
        if self.batch_per_thread > 0:
            self.output_shapes = nest.pack_sequence_as(
                self.tensor_structure, [[self.batch_per_thread] + list(t.shape)
                                        if t is not None else None
                                        for t in nest.flatten(self.tensor_structure)])
        else:
            self.output_shapes = nest.pack_sequence_as(
                self.tensor_structure, [[self.batch_size // self.total_core_num] + list(t.shape)
                                        if t is not None else None
                                        for t in nest.flatten(self.tensor_structure)])

    self.rdd = rdd
    self.input_names = nest.pack_sequence_as(
        self.tensor_structure, [t.name
                                if t is not None else None
                                for t in nest.flatten(self.tensor_structure)])

    self._tensors = None
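# A hedged illustration of building a TFDataset as described in the docstring above, via the
# from_rdd factory rather than this low-level __init__. The SparkContext `sc`, the record
# shapes, the batch size and the exact from_rdd keyword arguments are assumptions for
# illustration, not taken from the source.
import numpy as np
import tensorflow as tf
from zoo.tfpark import TFDataset

records = [(np.random.rand(2).astype(np.float32), np.array(i % 2)) for i in range(256)]
rdd = sc.parallelize(records)

# batch_size must be a multiple of the total core number, as enforced in __init__ above
dataset = TFDataset.from_rdd(rdd,
                             features=(tf.float32, [2]),
                             labels=(tf.int32, []),
                             batch_size=16)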
def evaluate(self, input_fn, eval_methods, steps=None, checkpoint_path=None):
    if not all(isinstance(metric, six.string_types) for metric in eval_methods):
        raise ValueError("All metrics should be string types")

    with tf.Graph().as_default() as g:
        result = self.estimator._call_input_fn(input_fn, tf.estimator.ModeKeys.EVAL)
        if isinstance(result, TFDataset):
            spec = self._call_model_fn(result.feature_tensors,
                                       result.label_tensors,
                                       tf.estimator.ModeKeys.PREDICT,
                                       self.config)
            latest_checkpoint = self.estimator.latest_checkpoint()

            if latest_checkpoint:
                checkpoint_path = latest_checkpoint

            with tf.Session() as sess:
                if checkpoint_path:
                    saver = tf.train.Saver()
                    saver.restore(sess, checkpoint_path)
                else:
                    sess.run(tf.global_variables_initializer())
                inputs = nest.flatten(result._original_tensors[0])
                outputs = nest.flatten(spec.predictions)
                tfnet = TFNet.from_session(sess, inputs=inputs, outputs=outputs)

                rdd = result.rdd.map(lambda t: Sample.from_ndarray(
                    nest.flatten(t[0]), nest.flatten(t[1])))

                if result.batch_per_thread < 0:
                    batch_size = result.batch_size
                else:
                    batch_size = result.batch_per_thread * result.rdd.getNumPartitions()

                eval_methods = [self._to_bigdl_metric(m) for m in eval_methods]
                results = tfnet.evaluate(rdd, batch_size, eval_methods)
                final_result = dict([(r.method, r.result) for r in results])
                return final_result

    return self.estimator.evaluate(input_fn, steps, checkpoint_path=checkpoint_path)
def predict(self, data, batch_size=32):
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"

    dataset = _to_dataset(data, batch_size=-1, batch_per_thread=batch_size)

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs, outputs=flat_outputs)
    return tfnet.predict(dataset)
def evaluate(self, data, batch_size=32,
             feature_cols=None,
             label_cols=None,
             auto_shard_files=False):
    """
    Evaluate model.

    :param data: evaluation data. It can be XShards, Spark DataFrame or tf.data.Dataset.
           If data is XShards, each partition is a dictionary of {'x': feature, 'y': label},
           where feature(label) is a numpy array or a tuple of numpy arrays.
           If data is tf.data.Dataset, each element is a tuple of input tensors.
    :param batch_size: batch size per thread.
    :param feature_cols: feature column names if the data is a Spark DataFrame.
    :param label_cols: label column names if the data is a Spark DataFrame.
    :param auto_shard_files: whether to automatically detect if the dataset is file-based and
           apply sharding on files, otherwise sharding on records. Default is False.
    :return: evaluation result as a dictionary of {'metric name': metric value}
    """
    assert self.metrics is not None, \
        "metrics is None, it should not be None in evaluate"

    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in evaluation"
        assert label_cols is not None, \
            "label columns is None; it should not be None in evaluation"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None,
                         feature_cols=feature_cols, label_cols=label_cols,
                         hard_code_batch_size=False,
                         sequential_order=True, shuffle=False,
                         auto_shard_files=auto_shard_files)

    flat_inputs = nest.flatten(self.inputs)
    flat_labels = nest.flatten(self.labels)

    return evaluate_metrics(flat_inputs + flat_labels,
                            sess=self.sess,
                            dataset=dataset, metrics=self.metrics)
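# A hedged usage sketch for the evaluate method above with a Spark DataFrame input: `est` is
# an assumed Estimator created with metrics defined, and `df`, "features" and "label" are
# illustrative variable/column names, not taken from the source.
metrics_dict = est.evaluate(df,
                            batch_size=32,
                            feature_cols=["features"],
                            label_cols=["label"])
# returns a dictionary of {metric name: metric value}, e.g. {"accuracy": 0.93}
print(metrics_dict)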
def evaluate(self, data, batch_size=32,
             feature_cols=None,
             labels_cols=None,
             hard_code_batch_size=False,
             auto_shard_files=True):
    """
    Evaluate model.

    :param data: evaluation data. It can be XShards, Spark DataFrame or tf.data.Dataset.
           If data is XShards, each element needs to be
           {'x': a feature numpy array or a tuple of feature numpy arrays,
            'y': a label numpy array or a tuple of label numpy arrays}.
           If data is tf.data.Dataset, each element is a tuple of input tensors.
    :param batch_size: batch size per thread.
    :param feature_cols: feature column names if the data is a Spark DataFrame.
    :param labels_cols: label column names if the data is a Spark DataFrame.
    :param hard_code_batch_size: whether to hard code batch size for evaluation.
    :param auto_shard_files: whether to automatically detect if the dataset is file-based and
           apply sharding on files, otherwise sharding on records. Default is True.
    :return: evaluation result as a dictionary of {'metric name': metric value}
    """
    assert self.metrics is not None, \
        "metrics is None, it should not be None in evaluate"

    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in evaluation"
        assert labels_cols is not None, \
            "label columns is None; it should not be None in evaluation"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None,
                         feature_cols=feature_cols, labels_cols=labels_cols,
                         hard_code_batch_size=hard_code_batch_size,
                         sequential_order=True, shuffle=False,
                         auto_shard_files=auto_shard_files)

    flat_inputs = nest.flatten(self.inputs)
    flat_labels = nest.flatten(self.labels)

    return evaluate_metrics(flat_inputs + flat_labels,
                            sess=self.sess,
                            dataset=dataset, metrics=self.metrics)
def evaluate(self, data, batch_size=32):
    assert self.metrics is not None, \
        "metrics is None, it should not be None in evaluate"

    dataset = _to_dataset(data, batch_size=-1, batch_per_thread=batch_size)

    flat_inputs = nest.flatten(self.inputs)
    flat_labels = nest.flatten(self.labels)

    return evaluate_metrics(flat_inputs + flat_labels,
                            sess=self.sess,
                            dataset=dataset, metrics=self.metrics)
def to_dataset(iter):
    data_list = list(iter)

    import tensorflow as tf
    if not data_list:
        return []

    datasets = [create_dataset_fn(data) for data in data_list]
    from functools import reduce
    dataset = reduce(lambda x, y: x.concatenate(y), datasets)
    dataset = dataset.batch(batch_per_shard, drop_remainder)
    iterator = dataset.make_initializable_iterator()
    train_next_ops = nest.flatten(iterator.get_next())
    output_types = [t.as_datatype_enum
                    for t in nest.flatten(dataset.output_types)]

    init_op_name = iterator.initializer.name
    table_init_op = tf.tables_initializer().name
    output_names = [op.name for op in train_next_ops]

    graph = train_next_ops[0].graph

    flatten_shapes = nest.flatten(dataset.output_shapes)
    flatten_shapes = [shape[1:] for shape in flatten_shapes]

    flatten_tensor_structure = [TensorMeta(dtype=output_types[i],
                                           shape=list(flatten_shapes[i]),
                                           name="zoo_input_{}".format(i))
                                for i in range(len(flatten_shapes))]
    structure = dataset.output_types
    if isinstance(structure, tf.DType):
        structure = (structure,)
    tensor_structure = nest.pack_sequence_as(structure, flatten_tensor_structure)

    meta_info = {
        "init_op_name": init_op_name,
        "table_init_op": table_init_op,
        "output_names": output_names,
        "output_types": output_types,
        "tensor_structure": tensor_structure
    }

    return [(bytearray(graph.as_graph_def().SerializeToString()), meta_info)]
def _get_arguments_from_loss(loss, optim_method, session, val_outputs, val_labels, val_method):
    import tensorflow as tf
    if session is None:
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
    else:
        sess = session

    grads_vars = tf.train.GradientDescentOptimizer(0).compute_gradients(loss)
    grads_vars.sort(key=lambda grad_var: grad_var[1].name)
    variables = []
    grads = []
    for (grad, var) in grads_vars:
        if grad is not None:
            variables.append(var)
            grads.append(grad)

    all_required_inputs = _find_placeholders([loss])
    dataset = tf.get_collection(all_required_inputs[0].name)[0]

    inputs = nest.flatten(dataset._original_tensors)

    return [loss, optim_method, sess, dataset, inputs,
            grads, variables, loss.graph, val_outputs, val_labels, val_method]
def _tensors_to_rdd(tensors, sc, splits):
    import tensorflow as tf
    if isinstance(tensors, np.ndarray):
        tensors = (tensors,)

    if isinstance(tensors, list):
        for i in range(len(tensors)):
            if tensors[i].dtype == np.dtype("float64"):
                tensors[i] = np.float32(tensors[i])

        data_list = _splits(tensors)
        rdd = sc.parallelize(data_list, splits)
        tensor_structure = [TensorMeta(tf.as_dtype(t.dtype),
                                       shape=t.shape[1:],
                                       name="input_%s" % i)
                            for i, t in enumerate(tensors)]
    else:
        flattened = nest.flatten(tensors)
        for i in range(len(flattened)):
            if flattened[i].dtype == np.dtype("float64"):
                flattened[i] = np.float32(flattened[i])
        data_list = _splits(flattened)
        rdd = sc.parallelize(data_list, splits)
        rdd = rdd.map(lambda x: nest.pack_sequence_as(tensors, x))
        tensor_structure = nest.pack_sequence_as(tensors,
                                                 [TensorMeta(tf.as_dtype(t.dtype),
                                                             shape=t.shape[1:],
                                                             name="input_%s" % i)
                                                  for i, t in enumerate(flattened)])
    return rdd, tensor_structure
def get_training_data(self):
    sample_rdd = self.rdd.map(
        lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
    fs = FeatureSet.sample_rdd(sample_rdd,
                               sequential_order=self.sequential_order,
                               shuffle=self.shuffle)
    return fs
def get_prediction_data(self):
    rdd = self.rdd.map(lambda t: Sample.from_ndarray(
        nest.flatten(t[0] if isinstance(t, tuple) else t), np.array([0.0])))
    rdd_wrapper = callZooFunc("float", "zooRDDSampleToMiniBatch",
                              rdd, self.batch_per_thread)
    return rdd_wrapper.value().toJavaRDD()
def predict(self, data, batch_size=32):
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"

    if isinstance(data, SparkXShards):
        dataset = _xshards_to_tf_dataset(data, batch_per_thread=batch_size)
    elif isinstance(data, Dataset):
        dataset = TFDataDataset2(data, batch_size=-1, batch_per_thread=batch_size)
    else:
        raise ValueError("data must be a SparkXShards or an orca.data.tf.Dataset")

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs, outputs=flat_outputs)
    return tfnet.predict(dataset)
def from_loss(cls, loss, optim_method, session=None, val_outputs=None,
              val_labels=None, val_method=None, val_split=0.0,
              clip_norm=None, clip_value=None, metrics=None,
              tensor_with_value=None, session_config=None, model_dir=None, updates=None):
    """
    Create a TFOptimizer from a TensorFlow loss tensor.
    The loss tensor must come from a TensorFlow graph that only takes TFDataset.tensors and
    the tensors in `tensor_with_value` as inputs.

    :param loss: The loss tensor of the TensorFlow model; it should be a scalar.
    :param optim_method: the optimization method to be used, such as
           bigdl.optim.optimizer.Adam
    :param session: the current TensorFlow Session. If you want to use a pre-trained model,
           you should use the Session to load the pre-trained variables and pass it
           to TFOptimizer.
    :param val_outputs: the validation output TensorFlow tensor to be used by val_methods
    :param val_labels: the validation label TensorFlow tensor to be used by val_methods
    :param val_method: the BigDL val_method(s) to be used.
    :param val_split: Float between 0 and 1. Fraction of the training data to be used as
           validation data.
    :param clip_norm: float >= 0. Gradients will be clipped when their L2 norm exceeds
           this value.
    :param clip_value: float >= 0. Gradients will be clipped when their absolute value
           exceeds this value.
    :param metrics: a dictionary. The key should be a string representing the metric's name
           and the value should be the corresponding TensorFlow tensor, which should be
           a scalar.
    :param tensor_with_value: a dictionary. The key is a TensorFlow tensor, usually a
           placeholder; the value is a tuple of two elements. The first element of the tuple
           is the value to feed to the tensor in the training phase and the second is the
           value to feed to the tensor in the validation phase.
    :return: a TFOptimizer
    """
    sess = TFOptimizer._get_or_create_session(session)
    grads, variables = TFOptimizer._get_vars_grads(loss)
    dataset = TFOptimizer._get_dataset_from_loss(loss)
    inputs = nest.flatten(dataset._original_tensors)

    if clip_value is not None:
        if isinstance(clip_value, float) or isinstance(clip_value, int):
            if clip_value <= 0:
                raise ValueError("The clip_value argument should be a positive number")
            clip_value = (-float(clip_value), float(clip_value))

        if not isinstance(clip_value, tuple):
            raise ValueError("The clip_value argument should be"
                             " a positive float/int which clips to"
                             " (-clip_value, clip_value); "
                             "or a tuple which clips to (min_value, max_value)")

    if val_method is not None:
        val_methods = to_list(val_method)
        if metrics is None:
            metrics = {}

        for i, method in enumerate(val_methods):
            metrics['bigdl_metric_' + str(i)] = BigDLMetric(method, val_outputs, val_labels)

    return TFOptimizer._from_grads(loss, sess, inputs, grads, variables, dataset,
                                   optim_method, val_split, clip_norm, clip_value,
                                   metrics, tensor_with_value, session_config,
                                   model_dir, updates)
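# A hedged end-to-end sketch for from_loss above: the loss is built directly from the
# TFDataset tensors so that the graph only depends on dataset.tensors, as the docstring
# requires. `dataset` is assumed to be a TFDataset yielding (features, labels); the import
# paths, the optimize() call and MaxEpoch follow the usual Analytics Zoo / BigDL workflow
# but are assumptions here, not taken from the source above.
import tensorflow as tf
from bigdl.optim.optimizer import Adam, MaxEpoch
from zoo.tfpark import TFOptimizer

features, labels = dataset.tensors
logits = tf.layers.dense(features, 2)
loss = tf.reduce_mean(
    tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits))

optimizer = TFOptimizer.from_loss(loss, Adam(1e-3))
optimizer.optimize(end_trigger=MaxEpoch(5))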
def partition(data, num_shards=None):
    """
    Partition local in-memory data and form a SparkXShards.

    :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
           made of tuple, list, dict with ndarray as the leaf value
    :param num_shards: the number of shards that the data will be partitioned into
    :return: a SparkXShards
    """
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    shard_num = node_num * core_num if num_shards is None else num_shards
    import numpy as np
    type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
    """.format(type(data))
    supported_types = {list, tuple, dict}
    if isinstance(data, np.ndarray):
        if data.shape[0] < shard_num:
            raise ValueError("The length of data {} is smaller than the total number "
                             "of shards {}. Please adjust the num_shards option to be "
                             "at most {}.".format(data.shape[0], shard_num, data.shape[0]))
        arrays = np.array_split(data, shard_num)
        rdd = sc.parallelize(arrays)
    else:
        assert type(data) in supported_types, type_err_msg
        flattened = nest.flatten(data)
        data_length = len(flattened[0])
        data_to_be_shard = []
        if data_length < shard_num:
            raise ValueError("The length of data {} is smaller than the total number "
                             "of shards {}. Please adjust the num_shards option to be "
                             "at most {}.".format(data_length, shard_num, data_length))
        for i in range(shard_num):
            data_to_be_shard.append([])
        for x in flattened:
            assert len(x) == data_length, \
                "the ndarrays in data must all have the same size in first dimension, " \
                "got first ndarray of size {} and another {}".format(data_length, len(x))
            x_parts = np.array_split(x, shard_num)
            for idx, x_part in enumerate(x_parts):
                data_to_be_shard[idx].append(x_part)

        data_to_be_shard = [nest.pack_sequence_as(data, shard) for shard in data_to_be_shard]
        rdd = sc.parallelize(data_to_be_shard)

    data_shards = SparkXShards(rdd)
    return data_shards
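# A small hedged example for the partition function above: a dict of numpy arrays is split
# into shards and one shard is inspected. The array sizes and num_shards value are arbitrary
# choices, and collect() is assumed here to gather the shard elements back to the driver.
import numpy as np
from zoo.orca.data import XShards

data = {"x": np.random.rand(100, 4), "y": np.random.randint(0, 2, size=(100,))}
shards = XShards.partition(data, num_shards=4)

# each shard keeps the original nested structure, with every leaf array split along dim 0
first_shard = shards.collect()[0]
print(first_shard["x"].shape, first_shard["y"].shape)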
def _get_evaluation_data(self):
    feature_length = len(nest.flatten(self.tensor_structure[0]))
    jvalue = callZooFunc("float", "createMiniBatchRDDFromTFDatasetEval",
                         self.rdd.map(lambda x: x[0]), self.init_op_name,
                         self.table_init_op, self.output_names,
                         self.output_types, self.shard_index_op_name,
                         feature_length)
    rdd = jvalue.value().toJavaRDD()
    return rdd
def get_validation_data(self):
    if self.val_rdd is not None:
        sample_rdd = self.val_rdd.map(
            lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
        return FeatureSet.sample_rdd(sample_rdd,
                                     sequential_order=self.sequential_order,
                                     shuffle=self.shuffle)
    return None
def get_validation_data(self):
    if self.val_rdd is not None:
        sample_rdd = self.val_rdd.map(
            lambda t: Sample.from_ndarray(nest.flatten(t), np.array([0.0])))
        fs = FeatureSet.sample_rdd(sample_rdd,
                                   sequential_order=self.sequential_order,
                                   shuffle=self.shuffle)
        fs = fs.transform(SampleToMiniBatch(self.batch_size))
        return fs
    return None
def from_train_op(cls, train_op, loss, metrics=None, updates=None, sess=None, dataset=None,
                  tensor_with_value=None, session_config=None, model_dir=None):
    sess = TFOptimizer._get_or_create_session(sess)
    grads, variables = TFOptimizer._get_vars_grads_from_train_op(train_op)
    if dataset is None:
        dataset = TFOptimizer._get_dataset_from_loss(loss)
    inputs = nest.flatten(dataset._original_tensors)
    return TFOptimizer._from_grads(loss=loss, sess=sess, inputs=inputs, grads=grads,
                                   variables=variables, dataset=dataset, metrics=metrics,
                                   tensor_with_value=tensor_with_value,
                                   optim_method=FakeOptimMethod(),
                                   session_config=session_config, updates=updates,
                                   model_dir=model_dir, train_op=train_op)
def _expand_inputs(inputs, tensors_with_value, loss):
    additional_inputs = []
    additional_values = []
    inputs = nest.flatten(inputs)
    names = set([i.name for i in inputs])

    if tensors_with_value:
        for t, v in tensors_with_value.items():
            if t.name in names:
                msg = f"tensor {t} already in inputs, cannot put it in tensor_with_value"
                raise ValueError(msg)
            additional_inputs.append(t)
            additional_values.append(v)

    return inputs, additional_inputs, additional_values
def _expand_inputs(inputs, tensors_with_value, loss):
    additional_inputs = []
    additional_values = []
    all_required_inputs = find_placeholders([loss])
    all_required_inputs_names = [v.name for v in all_required_inputs]
    if tensors_with_value:
        for t, v in tensors_with_value.items():
            if t.name in all_required_inputs_names:
                additional_inputs.append(t)
                additional_values.append(v)

    if not isinstance(inputs, list):
        inputs = nest.flatten(inputs)

    return inputs, additional_inputs, additional_values
def partition(data):
    """
    Partition local in-memory data and form a SparkXShards.

    :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
           made of tuple, list, dict with ndarray as the leaf value
    :return: a SparkXShards
    """
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    total_core_num = node_num * core_num

    import numpy as np
    type_err_msg = """
The types supported in zoo.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
    """.format(type(data))
    supported_types = {list, tuple, dict}
    if isinstance(data, np.ndarray):
        arrays = np.array_split(data, total_core_num)
        rdd = sc.parallelize(arrays)
    else:
        assert type(data) in supported_types, type_err_msg
        flattened = nest.flatten(data)
        data_length = len(flattened[0])
        data_to_be_shard = []
        for i in range(total_core_num):
            data_to_be_shard.append([])
        for x in flattened:
            assert len(x) == data_length, \
                "the ndarrays in data must all have the same size in first dimension, " \
                "got first ndarray of size {} and another {}".format(data_length, len(x))
            x_parts = np.array_split(x, total_core_num)
            for idx, x_part in enumerate(x_parts):
                data_to_be_shard[idx].append(x_part)

        data_to_be_shard = [nest.pack_sequence_as(data, shard) for shard in data_to_be_shard]
        rdd = sc.parallelize(data_to_be_shard)

    data_shards = SparkXShards(rdd)
    return data_shards
def _get_arguments_from_loss(loss, optim_method, session, val_outputs, val_labels, val_method):
    import tensorflow as tf
    if session is None:
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
    else:
        sess = session

    grads, variables = TFOptimizer._get_vars_grads(loss)

    all_required_inputs = _find_placeholders([loss])
    dataset = tf.get_collection(all_required_inputs[0].name)[0]

    inputs = nest.flatten(dataset._original_tensors)

    return [loss, optim_method, sess, dataset, inputs,
            grads, variables, loss.graph, val_outputs, val_labels, val_method]