def test_estimator_graph_checkpoint(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    temp = tempfile.mkdtemp()
    model_dir = os.path.join(temp, "test_model")

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss},
        model_dir=model_dir
    )
    # train, writing an Orca checkpoint every 4 iterations
    est.fit(data=data_shard,
            batch_size=8,
            epochs=6,
            validation_data=data_shard,
            checkpoint_trigger=SeveralIteration(4))
    est.sess.close()

    tf.reset_default_graph()
    model = SimpleModel()
    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss},
        model_dir=model_dir
    )
    # rebuild the graph and resume training from the latest checkpoint
    est.load_orca_checkpoint(model_dir)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)
    result = est.evaluate(data_shard)
    assert "loss" in result
    print(result)
    shutil.rmtree(temp)

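# NOTE: SimpleModel is used throughout the graph tests in this file but its
# definition is not part of this excerpt. Below is a minimal, hypothetical
# sketch of what it needs to expose (user/item/label placeholders plus logits
# and loss tensors); the real class may differ. Two output classes match the
# `prediction.shape[1] == 2` assertions elsewhere in these tests.
class SimpleModel(object):
    def __init__(self):
        self.user = tf.placeholder(dtype=tf.int32, shape=(None,))
        self.item = tf.placeholder(dtype=tf.int32, shape=(None,))
        self.label = tf.placeholder(dtype=tf.int32, shape=(None,))
        # cast the integer ids to floats and stack them into a (N, 2) feature
        feat = tf.stack([tf.cast(self.user, tf.float32),
                         tf.cast(self.item, tf.float32)], axis=1)
        hidden = tf.layers.dense(feat, 10, activation=tf.nn.relu)
        self.logits = tf.layers.dense(hidden, 2)
        self.loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=self.logits,
                                                   labels=self.label))
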
def test_estimator_graph_pandas_dataframe(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            feature_cols=['user', 'item'],
            label_cols=['label'],
            validation_data=data_shard)

    result = est.evaluate(data_shard,
                          feature_cols=['user', 'item'],
                          label_cols=['label'])
    assert "loss" in result
    print(result)

    # a separate inference-only estimator needs just inputs and outputs
    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        outputs=[model.logits])
    predictions = est.predict(data_shard,
                              feature_cols=['user', 'item']).collect()
    print(predictions)

def test_estimator_keras_tensorboard(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = self.create_model()
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    temp = tempfile.mkdtemp()
    model_dir = os.path.join(temp, "test_model")
    est = Estimator.from_keras(keras_model=model, model_dir=model_dir)
    assert est.get_train_summary("Loss") is None
    assert est.get_validation_summary("Top1Accuracy") is None
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)

    train_loss = est.get_train_summary("Loss")
    assert len(train_loss) > 0
    val_scores = est.get_validation_summary("Top1Accuracy")
    assert len(val_scores) > 0

    tf.reset_default_graph()
    # no model dir
    model = self.create_model()
    est = Estimator.from_keras(keras_model=model)
    log_dir = os.path.join(temp, "log")
    est.set_tensorboard(log_dir, "test")
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)
    assert os.path.exists(os.path.join(log_dir, "test/train"))
    assert os.path.exists(os.path.join(log_dir, "test/validation"))
    train_loss = est.get_train_summary("Loss")
    val_scores = est.get_validation_summary("Loss")
    assert len(train_loss) > 0
    assert len(val_scores) > 0
    shutil.rmtree(temp)

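# NOTE: create_model() is referenced by all Keras tests here but is not defined
# in this excerpt. A hypothetical sketch of a compatible helper: a compiled
# two-input model whose prediction has shape (N, 2), matching the assertions
# (`prediction.shape[1] == 2`, 'acc Top1Accuracy') used in these tests.
def create_model(self):
    user = tf.keras.layers.Input(shape=[1])
    item = tf.keras.layers.Input(shape=[1])
    feat = tf.keras.layers.concatenate([user, item], axis=1)
    predictions = tf.keras.layers.Dense(2, activation='softmax')(feat)
    model = tf.keras.models.Model(inputs=[user, item], outputs=predictions)
    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model
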
def test_estimator_keras_save_load(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = self.create_model()
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)
    eval_result = est.evaluate(data_shard)
    print(eval_result)

    temp = tempfile.mkdtemp()
    model_path = os.path.join(temp, 'test.h5')
    est.save_keras_model(model_path)

    tf.reset_default_graph()

    from tensorflow.python.keras import models
    from zoo.common.utils import load_from_file

    def load_func(file_path):
        return models.load_model(file_path)

    model = load_from_file(load_func, model_path)
    est = Estimator.from_keras(keras_model=model)

    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
        }
        return result

    data_shard = data_shard.transform_shard(transform)
    predictions = est.predict(data_shard).collect()
    assert predictions[0]['prediction'].shape[1] == 2
    shutil.rmtree(temp)

def test_estimator_keras_xshards_options(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = self.create_model()
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_keras(keras_model=model)
    # train with no validation
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10)
    # train again with a freshly created estimator
    est = Estimator.from_keras(keras_model=model)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10)
    # train with a custom session config
    tf_session_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                       intra_op_parallelism_threads=1)
    est = Estimator.from_keras(keras_model=model)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            session_config=tf_session_config)
    # train with a model dir
    temp = tempfile.mkdtemp()
    model_dir = os.path.join(temp, "model")
    est = Estimator.from_keras(keras_model=model, model_dir=model_dir)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)
    assert len(os.listdir(model_dir)) > 0
    shutil.rmtree(temp)

def test_estimator_keras_xshards_checkpoint(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = self.create_model()
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    temp = tempfile.mkdtemp()
    model_dir = os.path.join(temp, "test_model")
    est = Estimator.from_keras(keras_model=model, model_dir=model_dir)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=6,
            validation_data=data_shard,
            checkpoint_trigger=SeveralIteration(4))
    eval_result = est.evaluate(data_shard)
    print(eval_result)

    tf.reset_default_graph()
    model = self.create_model()
    est = Estimator.from_keras(keras_model=model, model_dir=model_dir)
    est.load_orca_checkpoint(model_dir)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard,
            checkpoint_trigger=SeveralIteration(4))
    eval_result = est.evaluate(data_shard)
    print(eval_result)
    shutil.rmtree(temp)

def test_estimator_keras_dataframe_mem_type(self):
    tf.reset_default_graph()
    model = self.create_model()
    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)
    from pyspark.sql.functions import array
    df = df.withColumn('user', array('user')) \
        .withColumn('item', array('item'))

    est = Estimator.from_keras(keras_model=model)
    OrcaContext.train_data_store = "DISK_2"
    est.fit(data=df,
            batch_size=4,
            epochs=4,
            feature_cols=['user', 'item'],
            label_cols=['label'],
            validation_data=df)
    eval_result = est.evaluate(df, feature_cols=['user', 'item'],
                               label_cols=['label'])
    assert 'acc Top1Accuracy' in eval_result
    prediction_df = est.predict(df, batch_size=4, feature_cols=['user', 'item'])
    assert 'prediction' in prediction_df.columns
    predictions = prediction_df.collect()
    assert len(predictions) == 48
    OrcaContext.train_data_store = "DRAM"

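# NOTE on OrcaContext.train_data_store: "DRAM" (the default) caches training
# data in memory, while "DISK_n" is understood to spill cached training data to
# disk, with n controlling how the data is split between memory and disk. The
# exact semantics are an assumption here; the mem-type tests only verify that
# training still works under the setting and restore "DRAM" afterwards.
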
def test_estimator_graph_dataframe(self):
    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")

    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        outputs=[model.logits],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    est.fit(data=df,
            batch_size=8,
            epochs=10,
            feature_cols=['user', 'item'],
            label_cols=['label'],
            validation_data=df)

    result = est.evaluate(df, batch_size=4,
                          feature_cols=['user', 'item'],
                          label_cols=['label'])
    print(result)

    prediction_df = est.predict(df, batch_size=4, feature_cols=['user', 'item'])
    assert 'prediction' in prediction_df.columns
    predictions = prediction_df.collect()
    assert len(predictions) == 48

def test_estimator_graph_tf_dataset(self):
    tf.reset_default_graph()
    model = SimpleModel()

    dataset = tf.data.Dataset.from_tensor_slices(
        (np.random.randint(0, 200, size=(100,)),
         np.random.randint(0, 50, size=(100,)),
         np.ones(shape=(100,), dtype=np.int32)))

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        outputs=[model.logits],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    est.fit(data=dataset,
            batch_size=8,
            epochs=10,
            validation_data=dataset)

    result = est.evaluate(dataset, batch_size=4)
    assert 'loss' in result

    predict_dataset = tf.data.Dataset.from_tensor_slices(
        (np.random.randint(0, 200, size=(20,)),
         np.random.randint(0, 50, size=(20,))))
    predictions = est.predict(predict_dataset).collect()
    assert predictions[0]['prediction'].shape[1] == 2

def main(max_epoch, dataset_dir):
    mnist_train = tfds.load(name="mnist", split="train", data_dir=dataset_dir)
    mnist_test = tfds.load(name="mnist", split="test", data_dir=dataset_dir)

    mnist_train = mnist_train.map(preprocess)
    mnist_test = mnist_test.map(preprocess)

    # tensorflow inputs
    images = tf.placeholder(dtype=tf.float32, shape=(None, 28, 28, 1))
    # tensorflow labels
    labels = tf.placeholder(dtype=tf.int32, shape=(None,))

    logits = lenet(images)
    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))
    acc = accuracy(logits, labels)

    # create an estimator
    est = Estimator.from_graph(inputs=images,
                               outputs=logits,
                               labels=labels,
                               loss=loss,
                               optimizer=tf.train.AdamOptimizer(),
                               metrics={"acc": acc})
    est.fit(data=mnist_train,
            batch_size=320,
            epochs=max_epoch,
            validation_data=mnist_test)

    result = est.evaluate(mnist_test)
    print(result)

    est.save_tf_checkpoint("/tmp/lenet/model")

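# The preprocess() and accuracy() helpers used above are not shown in this
# excerpt. Hypothetical sketches consistent with how they are called: tfds
# yields dicts of uint8 image tensors, while the placeholders expect float32
# images and int32 labels.
def preprocess(data):
    image = tf.cast(data['image'], tf.float32) / 255.
    label = tf.cast(data['label'], tf.int32)
    return image, label


def accuracy(logits, labels):
    # fraction of samples whose argmax over the logits matches the label
    predictions = tf.argmax(logits, axis=1, output_type=labels.dtype)
    return tf.reduce_mean(tf.cast(tf.equal(predictions, labels), tf.float32))
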
def test_estimator_graph_with_bigdl_optim_method(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    from zoo.orca.learn.optimizers import SGD
    from zoo.orca.learn.optimizers.schedule import Plateau
    sgd = SGD(learningrate=0.1,
              learningrate_schedule=Plateau("score",
                                            factor=0.1,
                                            patience=10,
                                            mode="min"))
    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        outputs=[model.logits],
        loss=model.loss,
        optimizer=sgd,
        metrics={"loss": model.loss})
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)

def test_estimator_graph_fit_dataset(estimator_for_spark_fixture):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = SimpleModel()
    sc = estimator_for_spark_fixture
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path, sc)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)
    dataset = Dataset.from_tensor_slices(data_shard)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    est.fit(data=dataset,
            batch_size=8,
            steps=10,
            validation_data=dataset)

    result = est.evaluate(dataset, batch_size=4)
    assert 'loss' in result

def test_estimator_graph_evaluate(estimator_for_spark_fixture):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = SimpleModel()
    sc = estimator_for_spark_fixture
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path, sc)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    result = est.evaluate(data_shard)
    assert "loss" in result
    print(result)

def test_estimator_graph_fit(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)

def test_estimator_keras_xshards_with_mem_type(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = self.create_model()
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_keras(keras_model=model)
    OrcaContext.train_data_store = "DISK_2"
    est.fit(data=data_shard,
            batch_size=4,
            epochs=10,
            validation_data=data_shard)
    eval_result = est.evaluate(data_shard)
    print(eval_result)
    OrcaContext.train_data_store = "DRAM"

def test_estimator_keras_xshards_clip(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = self.create_model_with_clip()
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)

def main(max_epoch):
    sc = init_orca_context(cores=4, memory="2g")

    # get DataSet
    # as_supervised returns tuple (img, label) instead of dict {'image': img, 'label': label}
    mnist_train = tfds.load(name="mnist", split="train", as_supervised=True)
    mnist_test = tfds.load(name="mnist", split="test", as_supervised=True)

    # Normalizes images, uint8 -> float32
    def normalize_img(image, label):
        return tf.cast(image, tf.float32) / 255., label

    mnist_train = mnist_train.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    mnist_test = mnist_test.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh',
                               input_shape=(28, 28, 1), padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh', padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(500, activation='tanh'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=mnist_train,
            batch_size=320,
            epochs=max_epoch,
            validation_data=mnist_test)

    result = est.evaluate(mnist_test)
    print(result)

    est.save_keras_model("/tmp/mnist_keras.h5")
    stop_orca_context()

def main(max_epoch):
    # get DataSet
    (train_feature, train_label), (val_feature, val_label) = \
        tf.keras.datasets.mnist.load_data()

    # tf.data.Dataset.from_tensor_slices is for demo only. For production use,
    # please use a file-based approach (e.g. tfrecord).
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_feature, train_label))
    train_dataset = train_dataset.map(preprocess)
    val_dataset = tf.data.Dataset.from_tensor_slices((val_feature, val_label))
    val_dataset = val_dataset.map(preprocess)

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh',
                               input_shape=(28, 28, 1), padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh', padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(500, activation='tanh'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=train_dataset,
            batch_size=320,
            epochs=max_epoch,
            validation_data=val_dataset)

    result = est.evaluate(val_dataset)
    print(result)

    est.save_keras_model("/tmp/mnist_keras.h5")

def test_train_simple(orca_context_fixture):
    sc = orca_context_fixture
    temp_dir = tempfile.mkdtemp()

    try:
        _write_ndarrays(images=np.random.randn(500, 28, 28, 1).astype(np.float32),
                        labels=np.random.randint(0, 10, (500,)).astype(np.int32),
                        output_path=temp_dir)
        dataset = ParquetDataset.read_as_tf(temp_dir)

        def preprocess(data):
            return data['image'], data["label"]

        dataset = dataset.map(preprocess)

        import tensorflow as tf
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1),
                                   activation='tanh',
                                   input_shape=(28, 28, 1), padding='valid'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                         padding='valid'),
            tf.keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1),
                                   activation='tanh', padding='valid'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                         padding='valid'),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(500, activation='tanh'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ])

        model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        est = Estimator.from_keras(keras_model=model)
        est.fit(data=dataset,
                batch_size=100,
                epochs=1)
    finally:
        shutil.rmtree(temp_dir)

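# _write_ndarrays() is not shown in this excerpt; presumably it persists the
# arrays in the layout ParquetDataset.read_as_tf expects. A hypothetical sketch
# assuming a ParquetDataset.write(path, generator, schema) API with
# SchemaField/FeatureType/DType descriptors (names and import paths unverified;
# check zoo.orca.data locally):
def _write_ndarrays(images, labels, output_path):
    schema = {
        "image": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.FLOAT32, shape=(28, 28, 1)),
        "label": SchemaField(feature_type=FeatureType.NDARRAY,
                             dtype=DType.INT32, shape=()),
    }

    def make_generator():
        # yield one record dict per sample
        for i in range(len(images)):
            yield {"image": images[i], "label": labels[i]}

    ParquetDataset.write(output_path, make_generator(), schema)
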
def main(max_epoch):
    sc = init_orca_context(cores=4, memory="2g")

    # get DataSet
    mnist_train = tfds.load(name="mnist", split="train")
    mnist_test = tfds.load(name="mnist", split="test")

    # Normalizes images
    def normalize_img(data):
        data['image'] = tf.cast(data["image"], tf.float32) / 255.
        return data

    mnist_train = mnist_train.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    mnist_test = mnist_test.map(
        normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # tensorflow inputs
    images = tf.placeholder(dtype=tf.float32, shape=(None, 28, 28, 1))
    # tensorflow labels
    labels = tf.placeholder(dtype=tf.int32, shape=(None,))

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10,
                                         is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))
    acc = accuracy(logits, labels)

    # create an estimator
    est = Estimator.from_graph(inputs=images,
                               outputs=logits,
                               labels=labels,
                               loss=loss,
                               optimizer=tf.train.AdamOptimizer(),
                               metrics={"acc": acc})
    est.fit(data=mnist_train,
            batch_size=320,
            epochs=max_epoch,
            validation_data=mnist_test)

    result = est.evaluate(mnist_test)
    print(result)

    est.save_tf_checkpoint("/tmp/lenet/model")
    stop_orca_context()

def test_estimator_keras_tf_dataset(self):
    tf.reset_default_graph()
    model = self.create_model()

    dataset = tf.data.Dataset.from_tensor_slices(
        (np.random.randint(0, 200, size=(100, 1)),
         np.random.randint(0, 50, size=(100, 1)),
         np.ones(shape=(100,), dtype=np.int32)))
    # regroup the three tensors into the ((user, item), label) structure
    # the two-input Keras model expects
    dataset = dataset.map(lambda user, item, label: [(user, item), label])

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=dataset,
            batch_size=8,
            epochs=10,
            validation_data=dataset)

    eval_result = est.evaluate(dataset)
    assert 'acc Top1Accuracy' in eval_result

def test_estimator_graph_dataframe_exception(self):
    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")

    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        outputs=[model.logits],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})

    # fitting a DataFrame without label columns should fail
    with self.assertRaises(Exception) as context:
        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                validation_data=df)
    self.assertTrue(
        'label columns is None; it should not be None in training'
        in str(context.exception))

    est.fit(data=df,
            batch_size=8,
            epochs=10,
            feature_cols=['user', 'item'],
            label_cols=['label'])

    # predicting without feature columns should fail
    with self.assertRaises(Exception) as context:
        predictions = est.predict(df, batch_size=4).collect()
    self.assertTrue(
        'feature columns is None; it should not be None in prediction'
        in str(context.exception))

    # validation data must also be a Spark DataFrame
    with self.assertRaises(Exception) as context:
        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                label_cols=['label'],
                validation_data=[1, 2, 3])
    self.assertTrue(
        'train data and validation data should be both Spark DataFrame'
        in str(context.exception))

def _test_estimator_graph_tf_dataset(self, dataset_creator):
    tf.reset_default_graph()
    model = SimpleModel()

    dataset = dataset_creator()

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        outputs=[model.logits],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    est.fit(data=dataset,
            batch_size=8,
            epochs=10,
            validation_data=dataset)

    result = est.evaluate(dataset, batch_size=4)
    assert 'loss' in result

def main(max_epoch, dataset_dir):
    mnist_train = tfds.load(name="mnist", split="train", data_dir=dataset_dir)
    mnist_test = tfds.load(name="mnist", split="test", data_dir=dataset_dir)

    mnist_train = mnist_train.map(preprocess)
    mnist_test = mnist_test.map(preprocess)

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(20, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh',
                               input_shape=(28, 28, 1), padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Conv2D(50, kernel_size=(5, 5), strides=(1, 1),
                               activation='tanh', padding='valid'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                                     padding='valid'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(500, activation='tanh'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=mnist_train,
            batch_size=320,
            epochs=max_epoch,
            validation_data=mnist_test,
            auto_shard_files=False)

    result = est.evaluate(mnist_test, auto_shard_files=False)
    print(result)

    est.save_keras_model("/tmp/mnist_keras.h5")

def train(train_data, test_data, user_size, item_size):
    model = NCF(opt.embedding_size, user_size, item_size)

    estimator = Estimator.from_graph(inputs=[model.user, model.item],
                                     outputs=[model.class_number],
                                     labels=[model.label],
                                     loss=model.loss,
                                     optimizer=model.optim,
                                     model_dir=opt.model_dir,
                                     metrics={"loss": model.loss})
    estimator.fit(data=train_data,
                  batch_size=opt.batch_size,
                  epochs=opt.epochs,
                  validation_data=test_data)

    checkpoint_path = os.path.join(opt.model_dir, "NCF.ckpt")
    estimator.save_tf_checkpoint(checkpoint_path)
    estimator.sess.close()

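# To serve or resume from the checkpoint saved above, a new estimator built
# from the same NCF graph can restore it later; a sketch, assuming
# load_tf_checkpoint mirrors save_tf_checkpoint in this API:
#
#   new_model = NCF(opt.embedding_size, user_size, item_size)
#   new_est = Estimator.from_graph(inputs=[new_model.user, new_model.item],
#                                  outputs=[new_model.class_number])
#   new_est.load_tf_checkpoint(checkpoint_path)
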
def test_estimator_graph(estimator_for_spark_fixture):
    import zoo.orca.data.pandas

    sc = estimator_for_spark_fixture
    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path, sc)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        outputs=[model.logits],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})
    est.fit(data=data_shard,
            batch_size=8,
            steps=10,
            validation_data=data_shard)

    data_shard = zoo.orca.data.pandas.read_csv(file_path, sc)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
        }
        return result

    data_shard = data_shard.transform_shard(transform)
    predictions = est.predict(data_shard).collect()
    print(predictions)

def test_estimator_graph_predict_dataset(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        outputs=[model.logits])

    def transform(df):
        result = {
            "x": (df['user'].to_numpy(), df['item'].to_numpy()),
        }
        return result

    data_shard = data_shard.transform_shard(transform)
    dataset = Dataset.from_tensor_slices(data_shard)
    predictions = est.predict(dataset).collect()
    assert len(predictions) == 10

def test_estimator_keras_get_model(self):
    tf.reset_default_graph()
    model = self.create_model()
    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)
    from pyspark.sql.functions import array
    df = df.withColumn('user', array('user')) \
        .withColumn('item', array('item'))

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=df,
            batch_size=4,
            epochs=4,
            feature_cols=['user', 'item'],
            label_cols=['label'],
            validation_data=df)
    assert est.get_model() is model

def test_submodel_in_keras_sequential(self):
    mnet = tf.keras.applications.MobileNetV2(input_shape=(160, 160, 3),
                                             include_top=False,
                                             weights='imagenet')
    model = tf.keras.Sequential([
        mnet,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=0.0001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # binary labels in {0, 1} to match the sigmoid output and
    # binary_crossentropy loss
    dataset = tf.data.Dataset.from_tensor_slices(
        (np.random.randn(16, 160, 160, 3),
         np.random.randint(0, 2, (16, 1))))
    est = Estimator.from_keras(keras_model=model)
    est.fit(data=dataset,
            batch_size=4,
            epochs=1,
            validation_data=dataset)

def test_estimator_keras_xshards(self):
    import zoo.orca.data.pandas

    tf.reset_default_graph()
    model = self.create_model()
    file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
            "y": df['label'].to_numpy()
        }
        return result

    data_shard = data_shard.transform_shard(transform)

    est = Estimator.from_keras(keras_model=model)
    est.fit(data=data_shard,
            batch_size=8,
            epochs=10,
            validation_data=data_shard)
    eval_result = est.evaluate(data_shard)
    print(eval_result)

    data_shard = zoo.orca.data.pandas.read_csv(file_path)

    def transform(df):
        result = {
            "x": (df['user'].to_numpy().reshape([-1, 1]),
                  df['item'].to_numpy().reshape([-1, 1])),
        }
        return result

    data_shard = data_shard.transform_shard(transform)
    predictions = est.predict(data_shard).collect()
    assert predictions[0]['prediction'].shape[1] == 2
