def _get_arguments_from_loss(loss, optim_method, session, val_outputs,
                             val_labels, val_method):
    """Derive the full set of training arguments from a loss tensor.

    Walks the loss's graph to recover the gradients, trainable variables,
    input dataset and input tensors, so callers only need to supply the loss.

    :param loss: scalar loss tensor whose graph defines the model
    :param optim_method: optimization method, passed through unchanged
    :param session: an existing tf.Session, or None to create (and
        initialize) a fresh one
    :param val_outputs: validation output tensors, passed through unchanged
    :param val_labels: validation label tensors, passed through unchanged
    :param val_method: validation method, passed through unchanged
    :return: tuple of (loss, optim_method, sess, dataset, inputs, grads,
        variables, graph, val_outputs, val_labels, val_method)
    """
    import tensorflow as tf

    if session is not None:
        sess = session
    else:
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

    # A zero-learning-rate optimizer is used purely to enumerate the
    # (gradient, variable) pairs for the trainable variables of `loss`.
    grads_vars = tf.train.GradientDescentOptimizer(0).compute_gradients(loss)

    from zoo.util.tf import process_grad
    variables = [var for _, var in grads_vars]
    grads = [process_grad(g) for g, _ in grads_vars]

    # The dataset registered itself in a graph collection keyed by the name
    # of its first placeholder; recover it from there.
    all_required_inputs = _find_placeholders([loss])
    dataset = tf.get_collection(all_required_inputs[0].name)[0]
    inputs = dataset.tensors
    _check_the_same(all_required_inputs, inputs)

    return (loss, optim_method, sess, dataset, inputs,
            grads, variables, loss.graph,
            val_outputs, val_labels, val_method)
def compute_gradients(self, *args, **kwargs):
    """Compute gradients of all trainable variables.

    See Optimizer.compute_gradients() for more info.

    In DistributedOptimizer, compute_gradients() is overridden to also
    allreduce the gradients before returning them.
    """
    grad_var_pairs = self._optimizer.compute_gradients(*args, **kwargs)
    processed = []
    for grad, var in grad_var_pairs:
        reduced = process_grad(grad)
        # Anchor a named identity op on the gradient, depending on the
        # variable, so it can later be located by name in the graph.
        with tf.control_dependencies([var]):
            named_grad = tf.identity(reduced, name="zoo_identity_op_for_grad")
        processed.append((named_grad, var))
    return processed
def get_gradients_for_keras(optimizer, loss, params):
    """Compute per-parameter gradients for a tf.keras optimizer.

    Mirrors Keras' own gradient computation (including clipnorm/clipvalue
    handling) while wrapping each gradient with process_grad and a named
    identity op so it can be located in the graph afterwards.

    :param optimizer: a tf.keras optimizer (or a Keras TFOptimizer wrapper)
    :param loss: scalar loss tensor
    :param params: the (possibly nested) model parameters to differentiate
    :return: list of gradient tensors, one per flattened parameter
    :raises ValueError: if any parameter has no defined gradient
    """
    from tensorflow.python.util import nest
    from tensorflow.python.keras import backend
    from tensorflow.python.ops import gradients
    from tensorflow.python.ops import clip_ops
    from tensorflow.python.keras.optimizers import TFOptimizer

    params = nest.flatten(params)
    # A TFOptimizer wraps the real optimizer; unwrap to get its name scope.
    scope_name = (optimizer.optimizer._name
                  if isinstance(optimizer, TFOptimizer)
                  else optimizer._name)

    with backend.get_graph().as_default(), \
            backend.name_scope(scope_name + "/gradients"):
        raw_grads = gradients.gradients(loss, params)

        wrapped = []
        for raw_grad, param in zip(raw_grads, params):
            if raw_grad is None:
                raise ValueError(
                    "Variable {} has `None` for gradient. "
                    "Please make sure that all of your ops have a "
                    "gradient defined (i.e. are differentiable). "
                    "Common ops without gradient: "
                    "K.argmax, K.round, K.eval.".format(param))
            processed = process_grad(raw_grad)
            # Named identity tied to the parameter so the gradient op can be
            # found by name later on.
            with tf.control_dependencies([param]):
                named = tf.identity(processed,
                                    name="zoo_identity_op_for_grad")
            wrapped.append(named)
        grads = wrapped

        # Honour the optimizer's optional clipping attributes, as Keras does.
        if hasattr(optimizer, "clipnorm"):
            grads = [clip_ops.clip_by_norm(g, optimizer.clipnorm)
                     for g in grads]
        if hasattr(optimizer, "clipvalue"):
            grads = [clip_ops.clip_by_value(g,
                                            -optimizer.clipvalue,
                                            optimizer.clipvalue)
                     for g in grads]
    return grads
def _process_grads(graph, grads):
    """Run process_grad over every gradient inside the given graph's context.

    :param graph: the tf.Graph the gradient ops belong to
    :param grads: iterable of gradient tensors
    :return: list of processed gradient tensors
    """
    from zoo.util.tf import process_grad
    with graph.as_default():
        processed = []
        for g in grads:
            processed.append(process_grad(g))
    return processed
def __init__(self, loss, optim_method, sess=None, dataset=None, inputs=None,
             grads=None, variables=None, graph=None,
             val_outputs=None, val_labels=None, val_method=None, val_split=0.0,
             tensors_with_value=None, session_config=None):
    '''
    TFOptimizer is used for distributed training of TensorFlow on Spark/BigDL.

    :param loss: The loss tensor of the TensorFlow model, should be a scalar
    :param optim_method: the optimization method to be used,
        such as bigdl.optim.optimizer.Adam
    :param sess: the current TensorFlow Session, if you want to used a pre-trained model,
        you should use the Session to load the pre-trained variables and pass it to TFOptimizer.
    '''
    import tensorflow as tf
    from tensorflow.python.util import nest
    from zoo.util.tf import export_tf
    # If the caller only supplied a loss, recover the session, dataset,
    # inputs, gradients, variables, graph and validation tensors from the
    # loss's graph.
    if dataset is None:
        args = TFOptimizer._get_arguments_from_loss(loss, optim_method,
                                                    sess, val_outputs,
                                                    val_labels, val_method)
        loss, optim_method, sess, dataset, inputs = args[:5]
        grads, variables, graph, val_outputs, val_labels, val_method = args[5:]

    # Collect caller-provided default values for placeholders that are
    # actually required by the loss; others are ignored.
    additional_inputs = []
    additional_values = []
    all_required_inputs = _find_placeholders([loss])
    all_required_inputs_names = [v.name for v in all_required_inputs]
    if tensors_with_value:
        for t, v in tensors_with_value.items():
            if t.name in all_required_inputs_names:
                additional_inputs.append(t)
                additional_values.append(v)

    if not isinstance(inputs, list):
        inputs = nest.flatten(inputs)

    self.optim_method = optim_method
    self.sess = sess
    self.dataset = dataset
    self.inputs = inputs + additional_inputs
    self.graph = graph
    self.session_config = session_config

    from zoo.util.tf import process_grad
    grads = [process_grad(grad) for grad in grads]

    # Training requires batch_size (global), not batch_per_thread.
    if self.dataset.batch_size <= 0:
        raise ValueError("You should set batch_size instead of batch_per_thread for training")

    if val_outputs is not None and val_labels is not None:
        # Wrap the labels in identity ops so they become fetchable outputs
        # of the exported graph.
        with self.graph.as_default():
            val_labels = [tf.identity(v) for v in val_labels]
        outputs = val_outputs + val_labels + [loss]
    else:
        outputs = [loss]

    self.grads = grads
    self.outputs = outputs

    # Export the graph (with gradients and outputs) so the JVM side can
    # load and run it.
    self.export_dir = tempfile.mkdtemp()
    export_tf(self.sess, self.export_dir,
              inputs=self.inputs,
              outputs=self.grads + self.outputs)

    variable_names = [v.name for v in variables]
    grad_names = [g.name for g in grads]
    output_names = [o.name for o in outputs]

    def to_floats(vs):
        # Coerce a sequence of numbers to plain floats for JSON.
        return [float(v) for v in vs]

    # Metadata consumed by the JVM-side training helper to locate tensors
    # by name in the exported graph.
    meta = {
        "input_names": [i.name for i in self.inputs],
        "output_names": output_names,
        "variables": variable_names,
        "grad_variables": grad_names,
        "default_tensor_values": [to_floats(v) for v in additional_values]
    }

    with open(os.path.join(self.export_dir, "training_meta.json"), "w") as f:
        f.write(json.dumps(meta))

    # Placeholder-fed assign ops used to push updated variable values from
    # the distributed optimizer back into this Python-side session.
    self.variable_placeholders = []
    with self.graph.as_default():
        assigns = []
        for v in variables:
            p = tf.placeholder(dtype=tf.float32, shape=v.shape)
            a = tf.assign(v, p)
            self.variable_placeholders.append(p)
            assigns.append(a)
        assign = tf.group(*assigns)
    self.assign = assign
    try:
        self.training_helper_layer = TFTrainingHelper(self.export_dir,
                                                      session_config)
    except Py4JJavaError as e:
        # Known TF issue with tf.keras embedding layers whose variables are
        # created outside the exported graph; surface an actionable message.
        if "expects to be colocated with unknown node" in str(e):
            raise Exception("""
If you are using the embedding layer in tf.keras, then this is a
known issue of TensorFlow, see https://github.com/tensorflow/tensorflow/issues/21889.
Please add zoo.util.tf.variable_creator_scope before model construction.
For example:
from zoo.util.tf import variable_creator_scope
with variable_creator_scope():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(1, 1, input_length=1)])
""")
        else:
            raise e

    data = self.dataset.rdd
    batch_size = self.dataset.batch_size

    def to_sample(t):
        # Convert one dataset record into a BigDL Sample; the dummy label
        # [0.0] is required by the Sample API but unused (IdentityCriterion).
        if isinstance(t, list):
            t = tuple(t)
        return Sample.from_ndarray(nest.flatten(t), [np.array([0.0])])

    sample_rdd = data.map(to_sample)
    if val_outputs is not None and val_labels is not None:
        if self.dataset.val_rdd is not None:
            # Use the explicitly provided validation RDD.
            val_rdd = self.dataset.val_rdd.map(to_sample)
            val_method = [TFValidationMethod(m, len(val_outputs), len(val_labels))
                          for m in to_list(val_method)]
            training_rdd = sample_rdd
        elif val_split != 0.0:
            # No explicit validation RDD; split the training data instead.
            training_rdd, val_rdd = sample_rdd.randomSplit([1 - val_split, val_split])
            val_method = [TFValidationMethod(m, len(val_outputs), len(val_labels))
                          for m in to_list(val_method)]
        else:
            raise ValueError("Validation data is not specified. Please set " +
                             "val rdd in TFDataset, or set val_split larger than zero")
        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          training_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)
        self.optimizer.set_validation(self.dataset.batch_size,
                                      val_rdd,
                                      EveryEpoch(),
                                      val_method)
    else:
        training_rdd = sample_rdd
        self.optimizer = Optimizer.create(self.training_helper_layer,
                                          training_rdd,
                                          IdentityCriterion(),
                                          batch_size=batch_size,
                                          optim_method=self.optim_method)
def __init__(self, loss, optim_method, sess=None, val_outputs=None,
             val_labels=None, val_method=None):
    import tensorflow as tf
    from zoo.util.tf import export_tf
    '''
    TFOptimizer is used for distributed training of tensorflow on Spark/BigDL.

    :param loss: The loss tensor of the tensorflow model, should be a scalar
    :param optim_method: the optimization method to be used, such as bigdl.optim.optimizer.Adam
    :param sess: the current tensorflow Session, if you want to used a pre-trained model,
    you should use the Session to load the pre-trained variables and pass it to TFOptimizer.
    '''
    self.optim_method = optim_method
    if sess is None:
        # No session supplied: create one and initialize all variables.
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
    else:
        self.sess = sess
    # A zero-learning-rate optimizer is used purely to enumerate the
    # (gradient, variable) pairs for the trainable variables of `loss`.
    grads_vars = tf.train.GradientDescentOptimizer(0).compute_gradients(
        loss)
    variables = []
    grads = []
    from zoo.util.tf import process_grad
    for (grad, var) in grads_vars:
        variables.append(var)
        grad = process_grad(grad)
        grads.append(grad)
    self.export_dir = tempfile.mkdtemp()
    # The dataset registered itself in a graph collection keyed by the name
    # of its first placeholder; recover it from there.
    all_required_inputs = _find_placeholders([loss])
    self.dataset = tf.get_collection(all_required_inputs[0].name)[0]

    # Training requires batch_size (global), not batch_per_thread.
    if self.dataset.batch_size <= 0:
        raise ValueError(
            "You should set batch_size instead of batch_per_thread for training"
        )

    self.inputs = self.dataset.tensors

    _check_the_same(all_required_inputs, self.inputs)

    if val_outputs is not None and val_labels is not None:
        outputs = val_outputs + val_labels + [loss]
    else:
        outputs = [loss]

    # Export the graph (gradients first, then outputs) so the JVM side can
    # load and run it.
    export_tf(self.sess, self.export_dir,
              inputs=self.inputs,
              outputs=grads + outputs)

    variable_names = [v.name for v in variables]
    grad_names = [g.name for g in grads]
    output_names = [o.name for o in outputs]
    # Metadata consumed by the JVM-side training helper to locate tensors
    # by name in the exported graph.
    meta = {
        "input_names": [i.name for i in self.inputs],
        "output_names": output_names,
        "variables": variable_names,
        "grad_variables": grad_names
    }

    with open(os.path.join(self.export_dir, "training_meta.json"), "w") as f:
        f.write(json.dumps(meta))

    self.training_helper_layer = TFTrainingHelper(self.export_dir)

    # Placeholder-fed assign ops used to push updated variable values from
    # the distributed optimizer back into this Python-side session.
    self.variable_placeholders = []
    assigns = []
    for v in variables:
        p = tf.placeholder(dtype=tf.float32, shape=v.shape)
        a = tf.assign(v, p)
        self.variable_placeholders.append(p)
        assigns.append(a)
    self.assign = tf.group(*assigns)

    data = self.dataset.rdd
    batch_size = self.dataset.batch_size
    # The dummy label [0.0] is required by the Sample API but unused
    # (training uses IdentityCriterion).
    sample_rdd = data.map(
        lambda t: Sample.from_ndarray(t, [np.array([0.0])]))

    self.optimizer = Optimizer.create(self.training_helper_layer,
                                      sample_rdd,
                                      IdentityCriterion(),
                                      batch_size=batch_size,
                                      optim_method=self.optim_method)

    if val_outputs is not None and val_labels is not None:
        val_sample_rdd = self.dataset.val_rdd\
            .map(lambda t: Sample.from_ndarray(t, [np.array([0.0])]))
        val_method = TFValidationMethod(val_method,
                                        len(val_outputs), len(val_labels))
        self.optimizer.set_validation(self.dataset.batch_size,
                                      val_sample_rdd,
                                      EveryEpoch(),
                                      val_method)
def create(loss, sess, inputs, grads, variables, graph,
           tensors_with_value, session_config, metrics):
    """Export the training graph and build a TFModel around it.

    Fixes over the previous revision: the redundant second
    `import tensorflow as tf` inside the session_config branch and the
    no-op statements `session_config = session_config` and
    `assign = assign` are removed; behavior is otherwise unchanged.

    :param loss: scalar loss tensor of the model
    :param sess: tf.Session holding the variable values to export
    :param inputs: input tensors of the model (list or nested structure)
    :param grads: gradient tensors, one per variable
    :param variables: trainable variables to be updated during training
    :param graph: the tf.Graph containing all of the above
    :param tensors_with_value: optional dict mapping placeholder tensors to
        their default values; only placeholders required by the loss are kept
    :param session_config: optional tf.ConfigProto for the JVM-side session
    :param metrics: optional dict of metric name -> metric (either a numeric
        tensor or an object with .outputs/.labels/.val_method)
    :return: a TFModel wrapping the exported training helper layer
    '''"""
    import tensorflow as tf
    from zoo.util.tf import export_tf, process_grad

    # Collect caller-provided default values for placeholders that are
    # actually required by the loss; others are ignored.
    additional_inputs = []
    additional_values = []
    all_required_inputs = _find_placeholders([loss])
    all_required_inputs_names = [v.name for v in all_required_inputs]
    if tensors_with_value:
        for t, v in tensors_with_value.items():
            if t.name in all_required_inputs_names:
                additional_inputs.append(t)
                additional_values.append(v)

    if not isinstance(inputs, list):
        inputs = nest.flatten(inputs)
    inputs = inputs + additional_inputs

    if session_config is not None:
        assert isinstance(session_config, tf.ConfigProto),\
            "session_config should be a tf.ConfigProto"
        session_config.use_per_session_threads = True

    grads = [process_grad(grad) for grad in grads]

    # Build the list of extra output tensors and the matching validation
    # methods, tracking each metric's position (idx) in the outputs.
    outputs = []
    val_methods = None
    if metrics is not None:
        idx = 0
        val_methods = []
        for metric_name in metrics:
            metric = metrics[metric_name]
            if tf.is_numeric_tensor(metric):
                # A plain tensor metric occupies a single output slot.
                outputs.append(metric)
                val_methods.append(StatelessMetric(metric_name, idx))
                idx += 1
            else:
                # A structured metric contributes its outputs followed by
                # identity-wrapped labels; record both index ranges.
                outputs += metric.outputs
                with graph.as_default():
                    val_labels = [tf.identity(v) for v in metric.labels]
                outputs += val_labels
                method = TFValidationMethod(
                    metric.val_method, metric_name,
                    list(range(idx, idx + len(metric.outputs))),
                    list(range(idx + len(metric.outputs),
                               idx + len(metric.outputs) + len(val_labels))))
                val_methods.append(method)
                idx += len(metric.outputs) + len(val_labels)

    # Expose the actual per-partition batch size and the loss as the final
    # two outputs.
    with graph.as_default():
        real_batch_size = tf.shape(inputs[0])[0]
    outputs.append(real_batch_size)
    outputs.append(loss)

    # Export the graph (gradients first, then outputs) so the JVM side can
    # load and run it.
    export_dir = tempfile.mkdtemp()
    export_tf(sess, export_dir, inputs=inputs, outputs=grads + outputs)

    variable_names = [v.name for v in variables]
    grad_names = [g.name for g in grads]
    output_names = [o.name for o in outputs]

    def to_floats(vs):
        # Coerce a sequence of numbers to plain floats for JSON.
        return [float(v) for v in vs]

    # Metadata consumed by the JVM-side training helper to locate tensors
    # by name in the exported graph.
    meta = {
        "input_names": [i.name for i in inputs],
        "output_names": output_names,
        "variables": variable_names,
        "grad_variables": grad_names,
        "default_tensor_values": [to_floats(v) for v in additional_values]
    }

    with open(os.path.join(export_dir, "training_meta.json"), "w") as f:
        f.write(json.dumps(meta))

    # Placeholder-fed assign ops used to push updated variable values from
    # the distributed optimizer back into the Python-side session.
    variable_placeholders = []
    with graph.as_default():
        assigns = []
        for v in variables:
            p = tf.placeholder(dtype=tf.float32, shape=v.shape)
            a = tf.assign(v, p)
            variable_placeholders.append(p)
            assigns.append(a)
        assign = tf.group(*assigns)

    training_helper_layer = TFTrainingHelper(export_dir, session_config,
                                             assign, variable_placeholders)
    criterion = IdentityCriterion()
    return TFModel(training_helper_layer, criterion, val_methods)