def test_tf_optimizer_metrics(self):
    """Train with one optim method per variable prefix and check which weights move.

    The optim-method map pairs the "dense/" prefix with Adam(1e-3) and the
    "dense_1/" prefix with SGD(0.0). After one epoch the first two weight
    arrays must differ from their initial values and the next two must not.
    """
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays(
            (features, labels),
            batch_size=4,
            val_tensors=(features, labels),
        )
        feature_tensor, label_tensor = dataset.tensors

        # Keep this creation order: the map below addresses the layers by the
        # "dense/" and "dense_1/" prefixes (presumably the default layer names).
        hidden = tf.layers.dense(feature_tensor, 8)
        output = tf.layers.dense(hidden, 10)
        cross_entropy = tf.losses.sparse_softmax_cross_entropy(
            logits=output, labels=label_tensor)
        loss = tf.reduce_mean(cross_entropy)

        optim_methods = {"dense/": Adam(1e-3), "dense_1/": SGD(0.0)}
        optimizer = TFOptimizer.from_loss(
            loss,
            optim_methods,
            val_outputs=[output],
            val_labels=[label_tensor],
            val_method=Accuracy(),
            metrics={"loss": loss},
        )

        before = optimizer.tf_model.training_helper_layer.get_weights()
        optimizer.optimize(end_trigger=MaxEpoch(1))
        after = optimizer.tf_model.training_helper_layer.get_weights()

        # weights and bias combined with "dense/" should be updated
        for idx in range(2):
            assert not np.allclose(before[idx], after[idx])

        # weights and bias combined with "dense_1" should be unchanged
        for idx in range(2, 4):
            assert np.allclose(before[idx], after[idx])

        optimizer.sess.close()
def test_tf_optimizer_with_sparse_gradient_using_keras(self):
    """Run TFOptimizer.from_keras on a tiny Keras model built around an Embedding.

    The test only checks that optimization completes; it makes no assertion
    on the trained weights.
    """
    import tensorflow as tf

    ids = np.random.randint(0, 10, size=[40])
    labels = np.random.randint(0, 5, size=[40])

    # Pair every id with its label as a two-element record.
    id_rdd = self.sc.parallelize(ids)
    label_rdd = self.sc.parallelize(labels)
    zipped_rdd = id_rdd.zip(label_rdd)
    training_rdd = zipped_rdd.map(lambda pair: [pair[0], pair[1]])

    dataset = TFDataset.from_rdd(
        training_rdd,
        features=(tf.int32, []),
        labels=(tf.int32, []),
        batch_size=8,
    )

    words_input = tf.keras.layers.Input(shape=(), name='words_input')
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=10, output_dim=5, name='word_embedding')
    word_embeddings = embedding_layer(words_input)
    flattened = tf.keras.layers.Flatten()(word_embeddings)
    probabilities = tf.keras.layers.Dense(5, activation="softmax")(flattened)

    model = tf.keras.models.Model(inputs=[words_input], outputs=[probabilities])
    model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy")

    TFOptimizer.from_keras(model, dataset).optimize()
def test_control_inputs(self):
    """Train a model whose dropout is driven by a fed boolean placeholder.

    The placeholder is supplied through ``tensor_with_value`` as the pair
    (True, False); the test only checks that one epoch completes.
    """
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays(
            (features, labels),
            batch_size=4,
            val_tensors=(features, labels),
        )
        is_training = tf.placeholder(dtype=tf.bool, shape=())
        feature_tensor, label_tensor = dataset.tensors

        hidden = tf.layers.dense(feature_tensor, 8)
        dropped = tf.layers.dropout(hidden, training=is_training)
        output = tf.layers.dense(dropped, 10)
        cross_entropy = tf.losses.sparse_softmax_cross_entropy(
            logits=output, labels=label_tensor)
        loss = tf.reduce_mean(cross_entropy)

        # presumably (training value, validation value) - confirm against TFOptimizer
        placeholder_values = {is_training: (True, False)}
        optimizer = TFOptimizer.from_loss(
            loss,
            Adam(),
            val_outputs=[output],
            val_labels=[label_tensor],
            val_method=Accuracy(),
            tensor_with_value=placeholder_values,
            metrics={"loss": loss},
        )
        optimizer.optimize(end_trigger=MaxEpoch(1))
        optimizer.sess.close()
def main(max_epoch, data_num):
    """Train LeNet on MNIST read from /tmp/mnist and save the session.

    :param max_epoch: number of epochs passed to the MaxEpoch end trigger.
    :param data_num: number of leading samples taken from each split.
    """
    sc = init_nncontext()

    def load_split(split_name):
        # Read one split, keep the first data_num samples, and pair each
        # normalized image with its label as a two-element record.
        images_data, labels_data = mnist.read_data_sets("/tmp/mnist", split_name)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        return image_rdd.zip(labels_rdd).map(
            lambda rec_tuple: [normalizer(rec_tuple[0],
                                          mnist.TRAIN_MEAN,
                                          mnist.TRAIN_STD),
                               np.array(rec_tuple[1])])

    training_rdd = load_split("train")
    testing_rdd = load_split("test")
    dataset = TFDataset.from_rdd(
        training_rdd,
        names=["features", "labels"],
        shapes=[[28, 28, 1], []],
        types=[tf.float32, tf.int32],
        batch_size=280,
        val_rdd=testing_rdd,
    )

    # Build the model directly on the dataset's tensors.
    images, labels = dataset.tensors
    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, _end_points = lenet.lenet(images, num_classes=10, is_training=True)

    cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
    loss = tf.reduce_mean(cross_entropy)

    optimizer = TFOptimizer(
        loss,
        Adam(1e-3),
        val_outputs=[logits],
        val_labels=[labels],
        val_method=Top1Accuracy(),
        model_dir="/tmp/lenet/",
    )
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    # Persist the trained session with a plain TensorFlow saver.
    tf.train.Saver().save(optimizer.sess, "/tmp/lenet/model")
def main():
    """Build the model loss on a 256-sample-batch dataset and run 20 SGD iterations."""
    # The context is initialized for its side effect; the handle is not used below.
    _context = init_nncontext()

    global_batch_size = 256
    dataset = creat_dataset(global_batch_size)
    loss = create_model(dataset)

    optimizer = TFOptimizer.from_loss(loss, SGD(1e-3), model_dir="/tmp/lenet/")
    optimizer.optimize(end_trigger=MaxIteration(20))
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, labels_cols=None,
        validation_data=None, hard_code_batch_size=False, session_config=None,
        feed_dict=None):
    """
    Train the graph held by this estimator for ``epochs`` epochs.

    :param data: train data, converted with ``to_dataset``.
    :param epochs: number of epochs passed to the MaxEpoch end trigger.
    :param batch_size: batch size forwarded to ``to_dataset``.
    :param feature_cols: feature column names; required when data is a DataFrame.
    :param labels_cols: label column names; required when data is a DataFrame.
    :param validation_data: validation data forwarded to ``to_dataset``.
    :param hard_code_batch_size: forwarded to ``to_dataset``.
    :param session_config: forwarded to ``TFOptimizer.from_train_op``.
    :param feed_dict: optional mapping of tensor to value; each value is
           paired with itself before being handed to the optimizer.
    :return: self
    """
    assert self.labels is not None, \
        "labels is None; it should not be None in training"
    assert self.loss is not None, \
        "loss is None; it should not be None in training"
    assert self.optimizer is not None, \
        "optimizer is None; it should not be None in training"

    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in training"
        assert labels_cols is not None, \
            "label columns is None; it should not be None in training"

    train_dataset = to_dataset(
        data,
        batch_size=batch_size,
        batch_per_thread=-1,
        validation_data=validation_data,
        feature_cols=feature_cols,
        labels_cols=labels_cols,
        hard_code_batch_size=hard_code_batch_size,
        sequential_order=False,
        shuffle=True,
    )

    # The same fed value is used for both slots of the pair.
    tensor_with_value = None
    if feed_dict is not None:
        tensor_with_value = {}
        for tensor, fed_value in feed_dict.items():
            tensor_with_value[tensor] = (fed_value, fed_value)

    trainer = TFOptimizer.from_train_op(
        train_op=self.train_op,
        loss=self.loss,
        inputs=self.inputs,
        labels=self.labels,
        dataset=train_dataset,
        metrics=self.metrics,
        updates=self.updates,
        sess=self.sess,
        tensor_with_value=tensor_with_value,
        session_config=session_config,
        model_dir=self.model_dir,
    )
    trainer.optimize(end_trigger=MaxEpoch(epochs))
    return self
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, labels_cols=None,
        validation_data=None, hard_code_batch_size=False, session_config=None,
        checkpoint_trigger=None):
    """
    Fit the wrapped keras model on the given train data.

    :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
           If data is XShards, each element needs to be {'x': a feature numpy array
           or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
           label numpy arrays}
           If data is tf.data.Dataset, each element is
           [feature tensor tuple, label tensor tuple]
    :param epochs: number of epochs to train.
    :param batch_size: total batch size for each iteration.
    :param feature_cols: feature column names if train data is Spark DataFrame.
    :param labels_cols: label column names if train data is Spark DataFrame.
    :param validation_data: validation data. Validation data type should be the same
           as train data.
    :param hard_code_batch_size: whether hard code batch size for training.
           Default is False.
    :param session_config: tensorflow session configuration for training.
           Should be object of tf.ConfigProto
    :param checkpoint_trigger: when to trigger checkpoint during training.
           Should be a bigdl optimizer trigger, like EveryEpoch(),
           SeveralIteration(num_iterations), etc.
    :return: self
    """
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in training"
        assert labels_cols is not None, \
            "label columns is None; it should not be None in training"

    train_dataset = to_dataset(
        data,
        batch_size=batch_size,
        batch_per_thread=-1,
        validation_data=validation_data,
        feature_cols=feature_cols,
        labels_cols=labels_cols,
        hard_code_batch_size=hard_code_batch_size,
        sequential_order=False,
        shuffle=True,
    )

    self.tf_optimizer = TFOptimizer.from_keras(
        self.model.model,
        train_dataset,
        model_dir=self.model.model_dir,
        session_config=session_config,
        metrics=self.metrics,
    )
    trainer = self.tf_optimizer

    # Resume from a previously registered checkpoint when one was requested.
    if self.load_checkpoint:
        trainer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

    # Both a log directory and an application name are needed for tensorboard.
    if self.log_dir and self.app_name:
        trainer.estimator.set_tensorboad(self.log_dir, self.app_name)

    trainer.optimize(MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger)
    return self
def fit(self, data, steps, batch_size=32, validation_data=None, feed_dict=None,
        session_config=None):
    """
    Train the graph held by this estimator for a fixed number of iterations.

    :param data: train data; must be a SparkXShards or a Dataset instance.
    :param steps: number of iterations passed to the MaxIteration end trigger.
    :param batch_size: batch size used when building the training dataset.
    :param validation_data: validation data; same kind of object as ``data``.
    :param feed_dict: optional mapping of tensor to value; each value is
           paired with itself before being handed to the optimizer.
    :param session_config: forwarded to ``TFOptimizer.from_train_op``.
    :return: self
    :raises ValueError: if ``data`` is neither SparkXShards nor Dataset.
    """
    assert self.labels is not None, \
        "labels is None; it should not be None in training"
    assert self.loss is not None, \
        "loss is None; it should not be None in training"
    assert self.optimizer is not None, \
        "optimizer is None; it should not be None in training"

    if isinstance(data, SparkXShards):
        dataset = _xshards_to_tf_dataset(
            data,
            batch_size=batch_size,
            validation_data_shard=validation_data)
    elif isinstance(data, Dataset):
        dataset = TFDataDataset2(
            data,
            batch_size=batch_size,
            batch_per_thread=-1,
            validation_dataset=validation_data)
    else:
        # Fill the placeholder so the caller can see which type was rejected.
        raise ValueError("data type {} is not supported; "
                         "it must be created by zoo.orca.data.package"
                         .format(type(data)))

    if feed_dict is not None:
        # The same fed value is used for both slots of the pair.
        tensor_with_value = {
            key: (value, value) for key, value in feed_dict.items()
        }
    else:
        tensor_with_value = None

    optimizer = TFOptimizer.from_train_op(
        train_op=self.train_op,
        loss=self.loss,
        inputs=self.inputs,
        labels=self.labels,
        dataset=dataset,
        metrics=self.metrics,
        updates=self.updates,
        sess=self.sess,
        tensor_with_value=tensor_with_value,
        session_config=session_config,
        model_dir=self.model_dir)
    optimizer.optimize(end_trigger=MaxIteration(steps))
    return self
def test_tfdataset_with_tfrecord(self):
    """Train a one-layer classifier on a TFDataset built from TFRecord files.

    The test only checks that optimization completes.
    """
    record_paths = [
        os.path.join(resource_path, "tfrecord/mnist_train.tfrecord"),
        os.path.join(resource_path, "tfrecord/mnist_test.tfrecord"),
    ]
    train_path, test_path = record_paths

    dataset = TFDataset.from_tfrecord_file(
        self.sc,
        train_path,
        batch_size=16,
        validation_file_path=test_path,
    )

    # The dataset exposes the serialized records; parse_fn decodes them.
    serialized = dataset.tensors[0]
    images, labels = parse_fn(serialized)

    flattened = tf.layers.flatten(images)
    logits = tf.layers.dense(flattened, 10)
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
    loss = tf.reduce_mean(cross_entropy)

    TFOptimizer.from_loss(loss, Adam()).optimize()
def test_checkpoint(self):
    """Train for one epoch, locate the written checkpoint, reload it and train again.

    The checkpoint directory is discovered by walking ``model_dir`` for files
    named ``optimMethod-TFParkTraining.<version>``; the highest version found
    in the first matching directory is loaded.
    """
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels),
                                          batch_size=4,
                                          val_tensors=(features, labels))
        feature_tensor, label_tensor = dataset.tensors
        features = tf.layers.dense(feature_tensor, 8)
        output = tf.layers.dense(features, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor))
        model_dir = tempfile.mkdtemp()
        try:
            optimizer = TFOptimizer.from_loss(loss, Adam(),
                                              val_outputs=[output],
                                              val_labels=[label_tensor],
                                              val_method=Accuracy(),
                                              metrics={"loss": loss},
                                              model_dir=model_dir)
            optimizer.optimize(end_trigger=MaxEpoch(1))

            import re
            # Raw string: "\." in a normal string literal is an invalid escape.
            ckpt_pattern = re.compile(r"^optimMethod-TFParkTraining\.[0-9]+$")
            ckpt_path = None
            versions = []
            for (root, dirs, files) in os.walk(model_dir, topdown=True):
                temp_versions = [int(file_name.split(".")[1])
                                 for file_name in files
                                 if ckpt_pattern.match(file_name) is not None]
                if temp_versions:
                    # Stop at the first directory that holds checkpoint files.
                    ckpt_path = root
                    versions = temp_versions
                    break
            assert ckpt_path is not None, "Cannot find checkpoint file"

            optimizer.load_checkpoint(ckpt_path, max(versions))
            optimizer.optimize(end_trigger=MaxEpoch(1))
            optimizer.sess.close()
        finally:
            # Always remove the temporary model directory.
            import shutil
            shutil.rmtree(model_dir)
def test_tfdataset_with_tf_data_dataset_which_contains_bool(self):
    """Train on a tf.data dataset whose third component is a boolean mask.

    The mask is cast to float and multiplied into the features before the
    classifier; the test only checks that optimization completes.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (np.random.randn(102, 28, 28, 1),
         np.random.randint(0, 10, size=(102, )),
         # Builtin bool: the np.bool alias was removed from recent NumPy releases.
         np.ones(shape=(102, 28, 28, 1), dtype=bool)))
    dataset = TFDataset.from_tf_data_dataset(dataset, batch_size=16)

    feature, labels, mask = dataset.tensors
    float_mask = tf.to_float(mask)
    masked_feature = tf.to_float(feature) * float_mask
    flatten = tf.layers.flatten(masked_feature)
    logits = tf.layers.dense(flatten, 10)
    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    opt = TFOptimizer.from_loss(loss, Adam())
    opt.optimize()
def main(max_epoch, data_num):
    """Train a small dense Keras classifier on MNIST and save its weights.

    :param max_epoch: number of epochs passed to the MaxEpoch end trigger.
    :param data_num: number of leading samples taken from each split.
    """
    sc = init_nncontext()

    def load_split(split_name):
        # Read one split, keep the first data_num samples, and pair each
        # normalized image with its label as a two-element record.
        images_data, labels_data = mnist.read_data_sets("/tmp/mnist", split_name)
        image_rdd = sc.parallelize(images_data[:data_num])
        labels_rdd = sc.parallelize(labels_data[:data_num])
        return image_rdd.zip(labels_rdd).map(
            lambda rec_tuple: [normalizer(rec_tuple[0],
                                          mnist.TRAIN_MEAN,
                                          mnist.TRAIN_STD),
                               np.array(rec_tuple[1])])

    training_rdd = load_split("train")
    testing_rdd = load_split("test")
    dataset = TFDataset.from_rdd(
        training_rdd,
        names=["features", "labels"],
        shapes=[[28, 28, 1], []],
        types=[tf.float32, tf.int32],
        batch_size=280,
        val_rdd=testing_rdd,
    )

    # Flatten -> Dense(64) -> Dense(64) -> softmax over 10 classes.
    image_input = Input(shape=[28, 28, 1])
    hidden = Flatten()(image_input)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    predictions = Dense(10, activation='softmax')(hidden)

    model = Model(inputs=image_input, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    optimizer = TFOptimizer.from_keras(model, dataset, model_dir="/tmp/mnist_keras")
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    model.save_weights("/tmp/mnist_keras/mnist_keras.h5")
def fit(self, data, epochs=1, batch_size=32, validation_data=None, session_config=None,
        feed_dict=None):
    """
    Train the graph held by this estimator for ``epochs`` epochs.

    :param data: train data, converted with ``_to_dataset``.
    :param epochs: number of epochs passed to the MaxEpoch end trigger.
    :param batch_size: batch size forwarded to ``_to_dataset``.
    :param validation_data: validation data forwarded to ``_to_dataset``.
    :param session_config: forwarded to ``TFOptimizer.from_train_op``.
    :param feed_dict: optional mapping of tensor to value; each value is
           paired with itself before being handed to the optimizer.
    :return: self
    """
    assert self.labels is not None, \
        "labels is None; it should not be None in training"
    assert self.loss is not None, \
        "loss is None; it should not be None in training"
    assert self.optimizer is not None, \
        "optimizer is None; it should not be None in training"

    dataset = _to_dataset(data,
                          batch_size=batch_size,
                          batch_per_thread=-1,
                          validation_data=validation_data)

    if feed_dict is not None:
        # The same fed value is used for both slots of the pair.
        tensor_with_value = {
            key: (value, value) for key, value in feed_dict.items()
        }
    else:
        tensor_with_value = None

    optimizer = TFOptimizer.from_train_op(
        train_op=self.train_op,
        loss=self.loss,
        inputs=self.inputs,
        labels=self.labels,
        dataset=dataset,
        metrics=self.metrics,
        updates=self.updates,
        sess=self.sess,
        tensor_with_value=tensor_with_value,
        session_config=session_config,
        model_dir=self.model_dir)
    optimizer.optimize(end_trigger=MaxEpoch(epochs))
    return self
def main(max_epoch, data_num):
    """Train LeNet on MNIST ndarrays, tracking accuracy, and save the session.

    :param max_epoch: number of epochs passed to the MaxEpoch end trigger.
    :param data_num: number of leading samples taken from each split.
    """
    sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (train_images_data, train_labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    # Validation must come from the held-out split, not from the training data.
    (test_images_data, test_labels_data) = mnist.read_data_sets("/tmp/mnist", "test")

    # Both splits are normalized with the training mean and standard deviation.
    train_images_data = (train_images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    # Builtin int: the np.int alias was removed from recent NumPy releases.
    train_labels_data = train_labels_data[:data_num].astype(int)
    test_images_data = (test_images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    test_labels_data = (test_labels_data[:data_num]).astype(int)

    dataset = TFDataset.from_ndarrays(
        (train_images_data, train_labels_data),
        batch_size=360,
        val_tensors=(test_images_data, test_labels_data))

    # construct the model from TFDataset
    images, labels = dataset.tensors
    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=True)
    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))
    acc = accuracy(logits, labels)

    # create a optimizer
    optimizer = TFOptimizer.from_loss(loss, Adam(1e-3),
                                      metrics={"acc": acc},
                                      model_dir="/tmp/lenet/")
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/model")
def test_tf_optimizer_with_sparse_gradient(self):
    """Train a loss computed directly on an embedding lookup for one epoch.

    The test only checks that optimization completes.
    """
    ids = np.random.randint(0, 10, size=[40])
    labels = np.random.randint(0, 5, size=[40])

    # Pair every id with its label as a two-element record.
    id_rdd = self.sc.parallelize(ids)
    label_rdd = self.sc.parallelize(labels)
    zipped_rdd = id_rdd.zip(label_rdd)
    training_rdd = zipped_rdd.map(lambda pair: [pair[0], pair[1]])

    with tf.Graph().as_default():
        dataset = TFDataset.from_rdd(
            training_rdd,
            names=["ids", "labels"],
            shapes=[[], []],
            types=[tf.int32, tf.int32],
            batch_size=8,
        )
        id_tensor, label_tensor = dataset.tensors

        # The looked-up rows of the [10, 5] table are used directly as logits.
        embedding_table = tf.get_variable(name="word_embedding", shape=[10, 5])
        looked_up = tf.nn.embedding_lookup(embedding_table, id_tensor)
        cross_entropy = tf.losses.sparse_softmax_cross_entropy(
            logits=looked_up, labels=label_tensor)
        loss = tf.reduce_mean(cross_entropy)

        optimizer = TFOptimizer.from_loss(loss, Adam(1e-3))
        optimizer.optimize(end_trigger=MaxEpoch(1))
        optimizer.sess.close()
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, label_cols=None,
        validation_data=None, session_config=None, checkpoint_trigger=None,
        auto_shard_files=True):
    """
    Train this keras model with train data.

    :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
           If data is XShards, each partition can be Pandas Dataframe or a dictionary of
           {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of
           numpy arrays.
           If data is tf.data.Dataset, each element is
           [feature tensor tuple, label tensor tuple]
    :param epochs: number of epochs to train.
    :param batch_size: total batch size for each iteration.
    :param feature_cols: feature column names if train data is Spark DataFrame or XShards
           of Pandas DataFrame.
    :param label_cols: label column names if train data is Spark DataFrame or XShards of
           Pandas DataFrame.
    :param validation_data: validation data. Validation data type should be the same
           as train data. May be None.
    :param session_config: tensorflow session configuration for training.
           Should be object of tf.ConfigProto
    :param checkpoint_trigger: when to trigger checkpoint during training.
           Should be a zoo.orca.learn.trigger, like EveryEpoch(),
           SeveralIteration(num_iterations), etc.
    :param auto_shard_files: whether to automatically detect if the dataset is file-based
           and apply sharding on files, otherwise sharding on records. Default is True.
    :return: self
    """
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in training"
        assert label_cols is not None, \
            "label columns is None; it should not be None in training"

    if isinstance(data, tf.data.Dataset):
        assert isinstance(data.element_spec, tuple), \
            "If data is tf.data.Dataset, each element should be " \
            "(feature tensors, label tensor), where each feature/label tensor can be " \
            "either a single tensor or a tuple of tensors"
        if validation_data is not None:
            assert isinstance(validation_data, tf.data.Dataset), \
                "train data and validation data should be both tf.data.Dataset"
            assert isinstance(validation_data.element_spec, tuple), \
                "If validation_data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"

    if isinstance(data, SparkXShards):
        if data._get_class_name() == 'pandas.core.frame.DataFrame':
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in training"
            assert label_cols is not None, \
                "label columns is None; it should not be None in training"
            data, validation_data = process_xshards_of_pandas_dataframe(
                data, feature_cols, label_cols, validation_data, "fit")

    if checkpoint_trigger is not None:
        checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

    if is_tf_data_dataset(data):
        data = data.map(_standardize_keras_target_data)
        # validation_data defaults to None; only standardize it when supplied.
        if validation_data is not None:
            validation_data = validation_data.map(_standardize_keras_target_data)

    memory_type = OrcaContext.train_data_store
    dataset = to_dataset(data,
                         batch_size=batch_size,
                         batch_per_thread=-1,
                         validation_data=validation_data,
                         feature_cols=feature_cols,
                         label_cols=label_cols,
                         hard_code_batch_size=False,
                         sequential_order=False,
                         shuffle=True,
                         auto_shard_files=auto_shard_files,
                         memory_type=memory_type)

    self.tf_optimizer = TFOptimizer.from_keras(
        self.model.model,
        dataset,
        model_dir=self.model.model_dir,
        session_config=session_config,
        metrics=self.metrics,
        optimizer=self.optimizer)

    if self.clip_norm:
        self.tf_optimizer.set_gradient_clipping_by_l2_norm(
            clip_norm=self.clip_norm)
    if self.clip_min and self.clip_max:
        self.tf_optimizer.set_constant_gradient_clipping(
            self.clip_min, self.clip_max)

    # Resume from a previously registered checkpoint when one was requested.
    if self.load_checkpoint:
        self.tf_optimizer.load_checkpoint(self.checkpoint_path,
                                          self.checkpoint_version)

    # Both a log directory and an application name are needed for tensorboard.
    if self.log_dir and self.app_name:
        self.tf_optimizer.estimator.set_tensorboard(
            self.log_dir, self.app_name)

    self.tf_optimizer.optimize(MaxEpoch(epochs),
                               checkpoint_trigger=checkpoint_trigger)
    return self
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, label_cols=None,
        validation_data=None, session_config=None, checkpoint_trigger=None,
        auto_shard_files=False, feed_dict=None):
    """
    Fit this graph model on the given train data.

    :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
           If data is XShards, each partition can be Pandas Dataframe or a dictionary of
           {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of
           numpy arrays.
           If data is tf.data.Dataset, each element is a tuple of input tensors.
    :param epochs: number of epochs to train.
    :param batch_size: total batch size for each iteration.
    :param feature_cols: feature column names if train data is Spark DataFrame or XShards
           of Pandas Dataframe.
    :param label_cols: label column names if train data is Spark DataFrame or XShards of
           Pandas Dataframe.
    :param validation_data: validation data. Validation data type should be the same
           as train data.
    :param session_config: tensorflow session configuration for training.
           Should be object of tf.ConfigProto
    :param checkpoint_trigger: when to trigger checkpoint during training.
           Should be a zoo.orca.learn.trigger, like EveryEpoch(),
           SeveralIteration(num_iterations), etc.
    :param auto_shard_files: whether to automatically detect if the dataset is file-based
           and apply sharding on files, otherwise sharding on records. Default is False.
    :param feed_dict: a dictionary. The key is TensorFlow tensor, usually a
           placeholder, the value of the dictionary is a tuple of two elements. The first
           one of the tuple is the value to feed to the tensor in training phase and the
           second one is the value to feed to the tensor in validation phase.
    :return: self
    """
    assert self.labels is not None, \
        "labels is None; it should not be None in training"
    assert self.loss is not None, \
        "loss is None; it should not be None in training"
    assert self.optimizer is not None, \
        "optimizer is None; it should not be None in training"

    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in training"
        assert label_cols is not None, \
            "label columns is None; it should not be None in training"

    is_pandas_shards = (isinstance(data, SparkXShards)
                        and data._get_class_name() == 'pandas.core.frame.DataFrame')
    if is_pandas_shards:
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in training"
        assert label_cols is not None, \
            "label columns is None; it should not be None in training"
        data, validation_data = process_xshards_of_pandas_dataframe(
            data, feature_cols, label_cols, validation_data, "fit")

    if checkpoint_trigger is not None:
        checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

    train_dataset = to_dataset(
        data,
        batch_size=batch_size,
        batch_per_thread=-1,
        validation_data=validation_data,
        feature_cols=feature_cols,
        label_cols=label_cols,
        hard_code_batch_size=False,
        sequential_order=False,
        shuffle=True,
        auto_shard_files=auto_shard_files,
        memory_type=OrcaContext.train_data_store,
    )

    # Keep only the first two entries of every fed value, as a pair.
    tensor_with_value = None
    if feed_dict is not None:
        tensor_with_value = {}
        for tensor, fed_values in feed_dict.items():
            tensor_with_value[tensor] = (fed_values[0], fed_values[1])

    if self.use_bigdl_optim:
        self.tf_optimizer = TFOptimizer.from_loss(
            self.loss,
            self.optimizer,
            session=self.sess,
            inputs=(self.inputs, self.labels),
            dataset=train_dataset,
            clip_norm=self.clip_norm,
            clip_value=self.clip_value,
            metrics=self.metrics,
            tensor_with_value=tensor_with_value,
            session_config=session_config,
            model_dir=self.model_dir,
            updates=self.updates,
        )
    else:
        self.tf_optimizer = TFOptimizer.from_train_op(
            train_op=self.train_op,
            loss=self.loss,
            inputs=self.inputs,
            labels=self.labels,
            dataset=train_dataset,
            metrics=self.metrics,
            updates=self.updates,
            sess=self.sess,
            tensor_with_value=tensor_with_value,
            session_config=session_config,
            model_dir=self.model_dir,
        )
    trainer = self.tf_optimizer

    # Resume from a previously registered checkpoint when one was requested.
    if self.load_checkpoint:
        trainer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

    # Both a log directory and an application name are needed for tensorboard.
    if self.log_dir and self.app_name:
        trainer.estimator.set_tensorboard(self.log_dir, self.app_name)

    trainer.optimize(end_trigger=MaxEpoch(epochs),
                     checkpoint_trigger=checkpoint_trigger)
    return self
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, labels_cols=None,
        validation_data=None, hard_code_batch_size=False, session_config=None,
        feed_dict=None, checkpoint_trigger=None):
    """
    Fit this graph model on the given train data.

    :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
           If data is XShards, each element needs to be {'x': a feature numpy array
           or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
           label numpy arrays}
           If data is tf.data.Dataset, each element is a tuple of input tensors.
    :param epochs: number of epochs to train.
    :param batch_size: total batch size for each iteration.
    :param feature_cols: feature column names if train data is Spark DataFrame.
    :param labels_cols: label column names if train data is Spark DataFrame.
    :param validation_data: validation data. Validation data type should be the same
           as train data.
    :param hard_code_batch_size: whether hard code batch size for training.
           Default is False.
    :param session_config: tensorflow session configuration for training.
           Should be object of tf.ConfigProto
    :param feed_dict: a dictionary. The key is TensorFlow tensor, usually a
           placeholder, the value of the dictionary is a tuple of two elements. The first
           one of the tuple is the value to feed to the tensor in training phase and the
           second one is the value to feed to the tensor in validation phase.
    :param checkpoint_trigger: when to trigger checkpoint during training.
           Should be a bigdl optimizer trigger, like EveryEpoch(),
           SeveralIteration(num_iterations), etc.
    :return: self
    """
    assert self.labels is not None, \
        "labels is None; it should not be None in training"
    assert self.loss is not None, \
        "loss is None; it should not be None in training"
    assert self.optimizer is not None, \
        "optimizer is None; it should not be None in training"

    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in training"
        assert labels_cols is not None, \
            "label columns is None; it should not be None in training"

    train_dataset = to_dataset(
        data,
        batch_size=batch_size,
        batch_per_thread=-1,
        validation_data=validation_data,
        feature_cols=feature_cols,
        labels_cols=labels_cols,
        hard_code_batch_size=hard_code_batch_size,
        sequential_order=False,
        shuffle=True,
    )

    # Keep only the first two entries of every fed value, as a pair.
    tensor_with_value = None
    if feed_dict is not None:
        tensor_with_value = {}
        for tensor, fed_values in feed_dict.items():
            tensor_with_value[tensor] = (fed_values[0], fed_values[1])

    self.tf_optimizer = TFOptimizer.from_train_op(
        train_op=self.train_op,
        loss=self.loss,
        inputs=self.inputs,
        labels=self.labels,
        dataset=train_dataset,
        metrics=self.metrics,
        updates=self.updates,
        sess=self.sess,
        tensor_with_value=tensor_with_value,
        session_config=session_config,
        model_dir=self.model_dir,
    )
    trainer = self.tf_optimizer

    # Resume from a previously registered checkpoint when one was requested.
    if self.load_checkpoint:
        trainer.load_checkpoint(self.checkpoint_path, self.checkpoint_version)

    # Both a log directory and an application name are needed for tensorboard.
    if self.log_dir and self.app_name:
        trainer.estimator.set_tensorboad(self.log_dir, self.app_name)

    trainer.optimize(end_trigger=MaxEpoch(epochs),
                     checkpoint_trigger=checkpoint_trigger)
    return self
# Fragment of a training script: the names used here (lrSchedule, options,
# loss, logits, zero_based_label, is_training, ...) are defined before this
# point, outside the visible source.

# Append a Poly(0.5, maxIteration) stage to the learning-rate schedule.
lrSchedule.add(Poly(0.5, maxIteration), polyIteration)
# NOTE(review): the keyword is spelled `leaningrate_schedule`; presumably this
# matches the SGD signature - confirm before "fixing" the spelling.
optim = SGD(learningrate=options.learningRate, learningrate_decay=0.0,
            weightdecay=options.weightDecay, momentum=0.9, dampening=0.0,
            nesterov=False, leaningrate_schedule=lrSchedule)

# Epoch-based runs checkpoint every epoch; iteration-based runs checkpoint
# every options.checkpointIteration iterations.
if options.maxEpoch:
    checkpoint_trigger = EveryEpoch()
    end_trigger = MaxEpoch(options.maxEpoch)
else:
    checkpoint_trigger = SeveralIteration(options.checkpointIteration)
    end_trigger = MaxIteration(options.maxIteration)

# is_training is fed the pair [True, False]
# (presumably training value, then validation value - confirm).
optimizer = TFOptimizer.from_loss(loss, optim,
                                  val_outputs=[logits],
                                  val_labels=[zero_based_label],
                                  val_method=[Accuracy(), Top5Accuracy(), Loss()],
                                  tensor_with_value={is_training: [True, False]},
                                  model_dir="/tmp/logs")

# Resuming needs both the checkpoint path and the version to load.
if options.resumeTrainingCheckpoint is not None:
    assert options.resumeTrainingVersion is not None,\
        "--resumeTrainingVersion must be specified when --resumeTrainingCheckpoint is."
    optimizer.load_checkpoint(options.resumeTrainingCheckpoint,
                              options.resumeTrainingVersion)

optimizer.optimize(end_trigger=end_trigger, checkpoint_trigger=checkpoint_trigger)

# Optionally save the trained session with a plain TensorFlow saver.
if options.checkpoint:
    saver = tf.train.Saver()
    saver.save(optimizer.sess, options.checkpoint)
def train(self, input_fn, end_trigger):
    """Build the GAN training graph and run it until ``end_trigger`` fires.

    The global step selects the phase: while ``step % period`` is below
    ``self._discriminator_steps`` the discriminator is updated, otherwise
    the generator is, where ``period`` is the sum of the discriminator and
    generator step counts. The latest checkpoint in ``self.model_dir`` is
    restored when present, and the session is saved to
    ``self.checkpoint_path`` after training.

    :param input_fn: callable returning the dataset; ``dataset.tensors[0]``
           is used as generator input and ``dataset.tensors[1]`` as real data.
    :param end_trigger: trigger passed to the optimizer's ``optimize`` call.
    """
    with tf.Graph().as_default() as g:
        dataset = input_fn()
        generator_inputs = dataset.tensors[0]
        real_data = dataset.tensors[1]

        counter = tf.train.get_or_create_global_step()
        period = self._discriminator_steps + self._generator_steps
        is_discriminator_phase = tf.less(tf.mod(counter, period),
                                         self._discriminator_steps)

        with tf.variable_scope("Generator"):
            gen_data = self._call_fn_maybe_with_counter(self._generator_fn, counter,
                                                        generator_inputs)

        with tf.variable_scope("Discriminator"):
            fake_d_outputs = self._call_fn_maybe_with_counter(self._discriminator_fn,
                                                              counter,
                                                              gen_data, generator_inputs)

        # Second discriminator pass reuses the variables created above.
        with tf.variable_scope("Discriminator", reuse=True):
            real_d_outputs = self._call_fn_maybe_with_counter(self._discriminator_fn,
                                                              counter,
                                                              real_data, generator_inputs)

        with tf.name_scope("Generator_loss"):
            generator_loss = self._call_fn_maybe_with_counter(self._generator_loss_fn,
                                                              counter,
                                                              fake_d_outputs)
            gen_reg_loss = tf.losses.get_regularization_loss("Generator")
            generator_loss = generator_loss + gen_reg_loss

        with tf.name_scope("Discriminator_loss"):
            discriminator_loss = self._call_fn_maybe_with_counter(
                self._discriminator_loss_fn,
                counter,
                real_d_outputs, fake_d_outputs)
            dis_reg_loss = tf.losses.get_regularization_loss("Discriminator")
            discriminator_loss = discriminator_loss + dis_reg_loss

        generator_variables = tf.trainable_variables("Generator")
        discriminator_variables = tf.trainable_variables("Discriminator")

        # Both branches return gradients for generator variables followed by
        # discriminator variables; the inactive side is filled with zeros so
        # the two tf.cond branches have the same structure.
        def run_gen_compute():
            gen_grads_vars = self._gen_opt.compute_gradients(generator_loss,
                                                             var_list=generator_variables)
            gen_grads = [grad for grad, var in gen_grads_vars]
            dis_grads = [tf.zeros_like(var) for var in discriminator_variables]
            return gen_grads + dis_grads

        def run_dis_compute():
            # Use the discriminator optimizer here, matching the optimizer
            # that applies these gradients below.
            dis_grads_vars = self._dis_opt.compute_gradients(
                discriminator_loss,
                var_list=discriminator_variables)
            dis_grads = [grad for grad, var in dis_grads_vars]
            gen_grads = [tf.zeros_like(var) for var in generator_variables]
            return gen_grads + dis_grads

        grads = tf.cond(is_discriminator_phase, run_dis_compute, run_gen_compute)

        grads_vars = list(zip(grads, generator_variables + discriminator_variables))
        gen_grads_vars = grads_vars[:len(generator_variables)]
        dis_grads_vars = grads_vars[len(generator_variables):]

        grads = [grad for grad, var in grads_vars]

        _train_op = tf.cond(is_discriminator_phase,
                            lambda: self._dis_opt.apply_gradients(dis_grads_vars),
                            lambda: self._gen_opt.apply_gradients(gen_grads_vars))

        variables = generator_variables + discriminator_variables

        loss = tf.cond(is_discriminator_phase,
                       lambda: discriminator_loss,
                       lambda: generator_loss)

        # Chain: apply gradients -> increment the step counter -> train_op.
        with tf.control_dependencies([_train_op]):
            increase_counter = tf.assign_add(counter, 1)

        with tf.control_dependencies([increase_counter]):
            train_op = tf.no_op()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            kpt = tf.train.latest_checkpoint(self.model_dir)
            if kpt is not None:
                saver.restore(sess, kpt)
            opt = TFOptimizer._from_grads(loss, sess,
                                          inputs=nest.flatten(dataset._original_tensors),
                                          labels=[],
                                          grads=grads, variables=variables,
                                          dataset=dataset,
                                          optim_method=FakeOptimMethod(),
                                          session_config=self._session_config,
                                          model_dir=os.path.join(self.model_dir, "tmp"),
                                          train_op=train_op)
            opt.optimize(end_trigger)
            # A fresh saver is created after optimization, as in the original.
            saver = tf.train.Saver()
            saver.save(sess, self.checkpoint_path, global_step=counter)
def fit(
        self,
        data,
        epochs=1,
        batch_size=32,
        feature_cols=None,
        labels_cols=None,
        validation_data=None,
        hard_code_batch_size=False,
        session_config=None,
        checkpoint_trigger=None,
        auto_shard_files=True,
):
    """
    Train this keras model with train data.

    :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset.
           If data is XShards, each element needs to be {'x': a feature numpy array
           or a tuple of feature numpy arrays, 'y': a label numpy array or a tuple of
           label numpy arrays}
           If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor tuple]
    :param epochs: number of epochs to train.
    :param batch_size: total batch size for each iteration.
    :param feature_cols: feature column names if train data is Spark DataFrame.
    :param labels_cols: label column names if train data is Spark DataFrame.
    :param validation_data: validation data. Validation data type should be the same
           as train data. May be None, in which case no validation data is used.
    :param hard_code_batch_size: whether hard code batch size for training. Default is False.
    :param session_config: tensorflow session configuration for training.
           Should be object of tf.ConfigProto
    :param checkpoint_trigger: when to trigger checkpoint during training.
           Should be a zoo.orca.learn.trigger, like EveryEpoch(),
           SeveralIteration(num_iterations),etc.
    :param auto_shard_files: forwarded unchanged to ``to_dataset``. Default is True.
    :return: this estimator instance (``self``).
    """
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in training"
        assert labels_cols is not None, \
            "label columns is None; it should not be None in training"

    if isinstance(data, tf.data.Dataset):
        assert isinstance(data.element_spec, tuple), \
            "If data is tf.data.Dataset, each element should be " \
            "(feature tensors, label tensor), where each feature/label tensor can be " \
            "either a single tensor or a tuple of tensors"
        if validation_data is not None:
            assert isinstance(validation_data, tf.data.Dataset), \
                "train data and validation data should be both tf.data.Dataset"
            assert isinstance(validation_data.element_spec, tuple), \
                "If validation_data is tf.data.Dataset, each element should be " \
                "(feature tensors, label tensor), where each feature/label tensor can be " \
                "either a single tensor or a tuple of tensors"

    if checkpoint_trigger is not None:
        checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger)

    if is_tf_data_dataset(data):
        data = data.map(_standardize_keras_target_data)
        # validation_data is optional; only standardize it when supplied,
        # otherwise calling .map on None would raise AttributeError.
        if validation_data is not None:
            validation_data = validation_data.map(
                _standardize_keras_target_data)

    dataset = to_dataset(data, batch_size=batch_size, batch_per_thread=-1,
                         validation_data=validation_data,
                         feature_cols=feature_cols, labels_cols=labels_cols,
                         hard_code_batch_size=hard_code_batch_size,
                         sequential_order=False, shuffle=True,
                         auto_shard_files=auto_shard_files)

    if isinstance(dataset, TFNdarrayDataset):
        dataset = _standarize_feature_label_dataset(
            dataset, self.model.model)

    self.tf_optimizer = TFOptimizer.from_keras(
        self.model.model, dataset,
        model_dir=self.model.model_dir,
        session_config=session_config,
        metrics=self.metrics,
        optimizer=self.optimizer)

    if self.clip_norm:
        self.tf_optimizer.set_gradient_clipping_by_l2_norm(
            clip_norm=self.clip_norm)
    # NOTE(review): this is a truthiness test, so constant clipping is skipped
    # when either bound is 0 -- confirm that is intended.
    if self.clip_min and self.clip_max:
        self.tf_optimizer.set_constant_gradient_clipping(
            self.clip_min, self.clip_max)

    if self.load_checkpoint:
        self.tf_optimizer.load_checkpoint(self.checkpoint_path,
                                          self.checkpoint_version)

    # TensorBoard logging is enabled only when both log_dir and app_name are set.
    if self.log_dir and self.app_name:
        self.tf_optimizer.estimator.set_tensorboard(
            self.log_dir, self.app_name)

    self.tf_optimizer.optimize(MaxEpoch(epochs),
                               checkpoint_trigger=checkpoint_trigger)

    return self
def train(self, dataset, end_trigger):
    """
    Train the GAN on ``dataset``, alternating discriminator and generator phases.

    An int32 counter variable selects the phase of each iteration: within
    every period of ``discriminator_steps + generator_steps`` iterations, the
    first ``discriminator_steps`` are discriminator iterations and the rest
    are generator iterations. Gradients of the inactive network are replaced
    by zeros. The combined gradient list (generator first, then
    discriminator) is handed to a ``GanOptimMethod`` through ``TFOptimizer``.
    After training, a checkpoint is saved to ``self.checkpoint_path``.

    :param dataset: training dataset. ``dataset.tensors[0]`` is used as the
           generator input and ``dataset.tensors[1]`` as the real data.
    :param end_trigger: forwarded to ``TFOptimizer.optimize`` to decide when
           training stops.
    """
    with tf.Graph().as_default() as g:

        generator_inputs = dataset.tensors[0]
        real_data = dataset.tensors[1]

        counter = tf.Variable(0, dtype=tf.int32)

        period = self._discriminator_steps + self._generator_steps

        is_discriminator_phase = tf.less(tf.mod(counter, period),
                                         self._discriminator_steps)

        with tf.variable_scope("generator"):
            gen_data = self._call_fn_maybe_with_counter(
                self._generator_fn, counter, generator_inputs)

        with tf.variable_scope("discriminator"):
            fake_d_outputs = self._call_fn_maybe_with_counter(
                self._discriminator_fn, counter, gen_data, generator_inputs)

        # Second discriminator call shares the variables created above.
        with tf.variable_scope("discriminator", reuse=True):
            real_d_outputs = self._call_fn_maybe_with_counter(
                self._discriminator_fn, counter, real_data, generator_inputs)

        with tf.name_scope("generator_loss"):
            generator_loss = self._call_fn_maybe_with_counter(
                self._generator_loss_fn, counter, fake_d_outputs)

        with tf.name_scope("discriminator_loss"):
            discriminator_loss = self._call_fn_maybe_with_counter(
                self._discriminator_loss_fn, counter, real_d_outputs, fake_d_outputs)

        generator_variables = tf.trainable_variables("generator")
        generator_grads = tf.gradients(generator_loss, generator_variables)
        discriminator_variables = tf.trainable_variables("discriminator")
        discriminator_grads = tf.gradients(discriminator_loss, discriminator_variables)

        variables = generator_variables + discriminator_variables

        # In the discriminator phase the generator receives zero gradients...
        def true_fn():
            return [tf.zeros_like(grad) for grad in generator_grads]

        def false_fn():
            return generator_grads

        g_grads = tf.cond(is_discriminator_phase, true_fn=true_fn, false_fn=false_fn)
        # ...and in the generator phase the discriminator receives zeros.
        d_grads = tf.cond(
            is_discriminator_phase,
            lambda: discriminator_grads,
            lambda: [tf.zeros_like(grad) for grad in discriminator_grads])
        loss = tf.cond(is_discriminator_phase,
                       lambda: discriminator_loss,
                       lambda: generator_loss)

        grads = g_grads + d_grads

        # The counter is incremented only after the gradients are computed.
        with tf.control_dependencies(grads):
            increase_counter = tf.assign_add(counter, 1)

        # Total number of generator parameters. The loop variable is
        # deliberately not named `g`, so it can never clobber the graph `g`
        # that is passed to TFModel below.
        g_param_size = sum(np.prod(grad.shape) for grad in g_grads)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            tf_model = TFModel.create_for_unfreeze(
                loss, sess,
                inputs=dataset._original_tensors,
                grads=grads, variables=variables, graph=g,
                tensors_with_value=None, session_config=None,
                metrics=None, updates=[increase_counter],
                model_dir=self.checkpoint_path)

            optimizer = TFOptimizer(tf_model,
                                    GanOptimMethod(self._discriminator_optim_method,
                                                   self._generator_optim_method,
                                                   g_param_size.value,
                                                   self._discriminator_steps,
                                                   self._generator_steps),
                                    sess=sess,
                                    dataset=dataset, model_dir=self.checkpoint_path)
            optimizer.optimize(end_trigger)
            # Read the final counter value to use as the checkpoint's global step.
            steps = sess.run(counter)
            saver = tf.train.Saver()
            saver.save(optimizer.sess, self.checkpoint_path, global_step=steps)