def test_linear_classifier(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        # Build the input pipeline and a canned LinearClassifier.
        train_input_fn, eval_input_fn = get_input_fns()
        x_feature = tf.feature_column.numeric_column("x", shape=(28, 28))
        estimator = tf.compat.v1.estimator.LinearClassifier(
            feature_columns=[x_feature],
            model_dir="/tmp/mnist_linear_classifier",
            n_classes=10,
        )

        # Script mode attaches the hook explicitly; otherwise the
        # zero-code-change path is expected to install one itself.
        train_hooks = []
        if script_mode:
            train_hooks.append(smd.EstimatorHook(out_dir=sim.out_dir))
        estimator.train(input_fn=train_input_fn, steps=100, hooks=train_hooks)

        # The hook must exist and must have persisted tensors to disk.
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
def test_keras_v1(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    tf.keras.backend.clear_session()
    with SagemakerSimulator() as sim:
        model = get_keras_model_v1()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=tf.keras.optimizers.RMSprop(),
            metrics=["accuracy"],
        )

        # In script mode the KerasHook is passed as a Keras callback;
        # otherwise the zero-code-change path should register it.
        callbacks = []
        if script_mode:
            callbacks.append(smd.KerasHook(out_dir=sim.out_dir))
        model.fit(
            x_train,
            y_train,
            batch_size=64,
            epochs=5,
            validation_split=0.2,
            callbacks=callbacks,
        )
        model.evaluate(x_test, y_test, verbose=2, callbacks=callbacks)

        # The hook must exist and must have persisted tensors to disk.
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
def test_monitored_session(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "HookParameters" : {
            "save_interval": "100"
        }
    }
    """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.global_variables_initializer()
        mnist = get_data()

        # Script mode wires the SessionHook in explicitly; otherwise the
        # zero-code-change path is expected to register one itself.
        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for _ in range(100):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # The hook must exist and must have persisted tensors to disk.
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
def test_monitored_session(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        # Script mode wires the SessionHook in explicitly; otherwise the
        # zero-code-change path is expected to register one itself.
        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for _ in range(100):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # The hook must exist and must have persisted tensors to disk.
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
def test_estimator(script_mode: bool):
    """ Works as intended.

    In script mode the EstimatorHook is attached manually and toggled
    between TRAIN and EVAL modes; otherwise zero-code-change is expected
    to install the hook automatically.
    """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        # Setup
        mnist_classifier = get_estimator()
        train_input_fn, eval_input_fn = get_input_fns()

        # Train and evaluate
        train_steps, eval_steps = 80, 20
        if script_mode:
            hook = smd.EstimatorHook(out_dir=sim.out_dir)
            hook.set_mode(mode=smd.modes.TRAIN)
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps, hooks=[hook])
            hook.set_mode(mode=smd.modes.EVAL)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps, hooks=[hook])
        else:
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        print(trial)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        # Expect a save at step 0 and at the end of training only.
        assert trial.steps() == [0, train_steps], "Wrong step count for trial."
def test_keras_gradients(script_mode, tf_optimizer):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    tf.keras.backend.clear_session()
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "CollectionConfigurations": [
            {
                "CollectionName": "gradients"
            },
            {
                "CollectionName": "optimizer_variables"
            },
            {
                "CollectionName": "losses"
            }
        ]
    }
    """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = get_keras_model_v1()
        (x_train, y_train), (x_test, y_test) = get_keras_data()

        # Pick either a native TF optimizer or a Keras one.
        opt = tf.train.RMSPropOptimizer(0.1) if tf_optimizer else tf.keras.optimizers.RMSprop()

        # Script mode wraps the optimizer and registers the hook as a callback.
        callbacks = []
        if script_mode:
            hook = smd.KerasHook(
                out_dir=sim.out_dir,
                include_collections=["gradients", "optimizer_variables", "losses"],
            )
            opt = hook.wrap_optimizer(opt)
            callbacks.append(hook)

        model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
        model.fit(
            x_train, y_train, batch_size=16, epochs=5, validation_split=0.2, callbacks=callbacks
        )
        model.evaluate(x_test, y_test, verbose=2, callbacks=callbacks)

        # Gradients must always be captured; optimizer variables only for
        # Keras optimizers.
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
        if not tf_optimizer:
            # as this is only supported for keras optimizers currently
            assert len(trial.tensor_names(collection="optimizer_variables")) > 0
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode:
        tf.compat.v1.disable_eager_execution()
    with SagemakerSimulator() as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255
        opt = tf.keras.optimizers.RMSprop()

        # Script mode wraps the optimizer and passes the hook as a callback;
        # otherwise zero-code-change installs the hook automatically.
        callbacks = []
        if script_mode:
            hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            opt = hook.wrap_optimizer(opt)
            callbacks.append(hook)

        model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
        model.fit(
            x_train, y_train, batch_size=64, epochs=2, validation_split=0.2, callbacks=callbacks
        )
        model.evaluate(x_test, y_test, verbose=2, callbacks=callbacks)

        hook = smd.get_hook()
        assert hook
        hook.close()

        # Under the default configuration only losses and metrics are saved;
        # weights and biases must be absent.
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
        ), "Weights were not expected to be saved by default"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
        ), "Biases were not expected to be saved by default"
def helper_test_keras_v2_json_config(
    json_file_contents, script_mode: bool = False, eager_mode: bool = True
):
    """ Tests ZCC with custom hook configs """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode:
        tf.compat.v1.disable_eager_execution()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255
        opt = tf.keras.optimizers.RMSprop()

        # Script mode builds the hook from the JSON config written to disk.
        callbacks = []
        if script_mode:
            hook = smd.KerasHook.create_from_json_file()
            opt = hook.wrap_optimizer(opt)
            callbacks.append(hook)

        model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
        model.fit(
            x_train, y_train, batch_size=64, epochs=2, validation_split=0.2, callbacks=callbacks
        )
        model.evaluate(x_test, y_test, verbose=2, callbacks=callbacks)

        hook = smd.get_hook()
        assert hook
        hook.close()

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        if not eager_mode:
            # Gradients are only captured when eager execution is disabled.
            assert len(trial.tensor_names(collection="gradients")) > 0
        assert len(trial.tensor_names(collection="weights")) > 0
        assert len(trial.tensor_names(collection="losses")) > 0
def helper_test_keras_v2_gradienttape(script_mode: bool = False, json_file_contents="{}", default=False):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        helper_keras_gradienttape_train(
            script_mode=script_mode, json_file_contents=json_file_contents, sim=sim
        )
        hook = smd.get_hook()

        if not script_mode:
            # ZCC GradientTape support requires TF >= 2.1.2; on older
            # versions no hook should have been created at all.
            if version.parse(tf.__version__) < version.parse("2.1.2"):
                assert not hook
                return

        assert hook
        if script_mode and default:
            assert hook.has_default_hook_configuration()
        hook.close()

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="losses")) > 0

        if not script_mode and is_tf_2_2() and default is False:
            # Inputs and Outputs are not saved with the default collection
            # configurations.
            assert len(trial.tensor_names(collection="inputs")) > 0
            assert len(trial.tensor_names(collection="outputs")) > 0
            assert trial.tensor_names(collection="outputs") == ["predictions"]
            if "dense_layers" in json_file_contents:
                # Only assert for test_keras_v2_multi_collections
                # which defines this custom collection
                assert len(trial.tensor_names(collection="dense_layers")) > 0
            else:
                assert len(trial.tensor_names(collection="dense_layers")) == 0
def test_outdir_sagemaker(monkeypatch):
    """Verify the hook picks up ``LocalPath`` from the SageMaker JSON config.

    The temporary directory path is JSON-encoded with ``json.dumps`` so the
    generated config stays valid even when the path contains characters that
    need escaping in JSON (backslashes on Windows, quotes, etc.) — the
    previous raw f-string interpolation would silently produce invalid JSON
    in that case.
    """
    import json

    with TemporaryDirectory() as dir_name:
        json_file_contents = f"""
        {{
            "S3OutputPath": "s3://sagemaker-test",
            "LocalPath": {json.dumps(dir_name)},
            "HookParameters" : {{
                "save_interval": "2",
                "include_workers": "all"
            }}
        }}
        """
        from smdebug.tensorflow import get_hook

        with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
            hook = get_hook("keras", create_if_not_exists=True)
            # The hook's output directory must be exactly the configured LocalPath.
            assert hook.out_dir == dir_name
def test_estimator(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        # The shared helper runs train + eval with/without an explicit hook.
        train_steps, eval_steps = 80, 20
        helper_train(
            script_mode=script_mode, sim=sim, train_steps=train_steps, eval_steps=eval_steps
        )

        # A save is expected at step 0 and at the end of training only.
        trial = smd.create_trial(path=sim.out_dir)
        print(trial)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert trial.steps() == [0, train_steps], "Wrong step count for trial."
def test_monitored_session_gradients_zcc():
    """ Works as intended. """
    smd.del_hook()
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "CollectionConfigurations": [
            {
                "CollectionName": "gradients"
            },
            {
                "CollectionName": "losses"
            }
        ]
    }
    """
    tf.reset_default_graph()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        # No hook is attached here: zero-code-change must install one.
        sess = tf.train.MonitoredSession()
        with sess:
            sess.run(init)
            for _ in range(100):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Gradients were requested via the JSON config and must be present.
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()
    # NOTE(review): TensorBoard export is skipped on TF 2.0.2 and TF 2.3.x —
    # presumably unsupported on those versions; confirm before changing.
    enable_tb = False if (tf.__version__ == "2.0.2" or is_tf_2_3()) else True
    run_eagerly = None
    if is_tf_2_2() or is_tf_2_3():
        # run_eagerly is only forwarded to model.compile on TF >= 2.2
        # (see helper_keras_fit) — older versions ignore it.
        run_eagerly = eager_mode
    with SagemakerSimulator(enable_tb=enable_tb) as sim:
        helper_keras_fit(
            script_mode=script_mode,
            eager_mode=eager_mode,
            run_eagerly=run_eagerly,
            sim=sim,
        )
        hook = smd.get_hook()
        assert hook
        # Check if the hook was executed with the default
        # hook configuration
        assert hook.has_default_hook_configuration()
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        # DEFAULT TENSORS SAVED: only losses and metrics, never weights/biases.
        assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
        ), "Weights were not expected to be saved by default"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
        ), "Biases were not expected to be saved by default"
def test_bert_simple():
    """Fine-tune BERT briefly and check the default smdebug configuration."""
    smd.del_hook()
    with SagemakerSimulator(enable_tb=False) as sim:
        epochs = 1
        # Pretrained BERT + GLUE/MRPC features, truncated to a short run.
        model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        data = tfds.load("glue/mrpc")
        train_dataset = glue_convert_examples_to_features(
            data["train"], tokenizer, max_length=128, task="mrpc"
        )
        train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)

        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        model.compile(optimizer=optimizer, loss=loss)
        model.fit(train_dataset, epochs=epochs, steps_per_epoch=10)

        hook = smd.get_hook()
        assert hook.has_default_hook_configuration()
        hook.close()

        # Default configuration: losses and metrics present, weights/biases absent.
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
        ), "Weights were not expected to be saved by default"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
        ), "Biases were not expected to be saved by default"
def test_sagemaker():
    """A hook built from a SageMaker JSON config honors its collection list."""
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "HookParameters": null,
        "CollectionConfigurations": [
            {
                "CollectionName": "weights",
                "CollectionParameters": null
            },
            {
                "CollectionName": "losses",
                "CollectionParameters": null
            }
        ],
        "DebugHookSpecification": null
    }
    """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        smd.del_hook()
        hook = smd.get_hook(hook_type="session", create_if_not_exists=True)
        print(hook)
        # "weights" was requested in the config, so it must be included.
        assert "weights" in hook.include_collections, hook
def test_tensorflow2_with_unsupported_version(eager_mode: bool = True):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    helper_keras_fit()
    # On an unsupported TF version, ZCC must not register any hook.
    assert smd.get_hook() is None
def test_estimator_gradients_zcc(nested=False, mirrored=False):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "HookParameters" : {
            "save_interval": "2",
            "include_workers": "all"
        },
        "CollectionConfigurations": [
            {
                "CollectionName": "gradients"
            },
            {
                "CollectionName": "weights"
            },
            {
                "CollectionName": "losses"
            },
            {
                "CollectionName": "biases"
            }
        ]
    }
    """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        if mirrored:
            test_basic("/opt/ml/output/tensors", zcc=True)
        else:
            # Run train + eval without attaching a hook manually; the
            # zero-code-change path is expected to install one.
            mnist_classifier = get_estimator(nested_optimizer=nested, mirrored=mirrored)
            train_input_fn, eval_input_fn = get_input_fns()
            train_steps, eval_steps = 10, 10
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)

            # save_interval=2 over 10 train + 10 eval steps -> every even step.
            trial = smd.create_trial(path=sim.out_dir)
            print(trial)
            assert smd.get_hook() is not None, "Hook was not created."
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert trial.steps() == list(range(0, 20, 2)), "Wrong step count for trial."
            print(trial.tensor_names(collection="gradients"))
            assert len(trial.tensor_names(collection="gradients")) > 0
            assert len(trial.tensor_names(collection="weights")) > 0
            assert len(trial.tensor_names(collection="losses")) > 0
            assert len(trial.tensor(trial.tensor_names(collection="gradients")[0]).steps()) > 0
            # Both TRAIN and EVAL modes must have produced data.
            assert len(trial.modes()) == 2
def helper_test_keras_v2_gradienttape(script_mode: bool = False, json_file_contents="{}", default=False):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential(
            [
                tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # WA for TF issue #36279
                tf.keras.layers.Dense(128, activation="relu"),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(10, activation="softmax"),
            ]
        )
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64))
        )
        dataset = dataset.shuffle(1000).batch(64)
        opt = tf.keras.optimizers.RMSprop()
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 1
        if script_mode:
            # Script mode: the tape is wrapped explicitly so smdebug can
            # intercept gradient computation.
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        logits = model(data, training=True)  # (32,10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                    # Manually push the accuracy scalar into the metrics collection.
                    hook.save_tensor(
                        tensor_name="accuracy", tensor_value=acc, collections_to_write="metrics"
                    )
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert hook
            if default:
                assert hook.has_default_hook_configuration()
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            # ZCC support added from smdebug v0.8.0)
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with tf.GradientTape(persistent=True) as tape:
                        logits = model(data, training=True)  # (32,10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            if not (is_tf_2_2() or is_tf_2_3()):
                assert not hook  # only supported on TF 2.2 and greater
                return
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
            if is_tf_2_2() and default is False:
                # Inputs and Outputs are not saved with the default collection configurations.
                assert len(trial.tensor_names(collection="inputs")) > 0
                assert len(trial.tensor_names(collection="outputs")) > 0
                assert trial.tensor_names(collection="outputs") == ["predictions"]
                if "dense_layers" in json_file_contents:
                    # Only assert for test_keras_v2_multi_collections
                    # which defines this custom collection
                    assert len(trial.tensor_names(collection="dense_layers")) > 0
                else:
                    assert len(trial.tensor_names(collection="dense_layers")) == 0
def cnn_model_fn(features, labels, mode):
    """Model function for CNN.

    Builds a two-conv-layer MNIST classifier and returns the EstimatorSpec
    for the requested mode (PREDICT / TRAIN / EVAL). In TRAIN mode the
    optimizer is wrapped by the smdebug hook so gradients can be captured.
    """
    # Input Layer
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
    # Convolutional Layer #1
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu,
    )
    # Pooling Layer #1
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    # Convolutional Layer #2 and Pooling Layer #2
    conv2 = tf.layers.conv2d(
        inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
    )
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    # Dense Layer
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
    # Dropout is only active during training.
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN
    )
    # Logits Layer
    logits = tf.layers.dense(inputs=dropout, units=10)
    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        # Wrap the optimizer so the smdebug hook can capture gradients.
        optimizer = smd.get_hook().wrap_optimizer(optimizer)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def helper_mirrored(
    trial_dir,
    save_all=False,
    num_steps=3,
    save_config=None,
    reduction_config=None,
    include_collections=None,
    steps=None,
    zcc=False,
    eval_distributed=False,
    include_workers="all",
):
    """Run an Estimator under MirroredStrategy with or without an explicit hook.

    ``steps`` is a list of phase names ("train", "eval", "predict"); defaults
    to ["train"]. When ``zcc`` is True no SessionHook is attached manually and
    ``include_collections``/``save_config`` are ignored. Returns the
    distribution strategy used.
    """
    num_gpus = get_available_gpus()
    num_devices = num_gpus if num_gpus > 0 else 1
    batch_size = 10 * num_devices
    # input_fn which serves Dataset
    input_fn_provider = InputFnProvider(per_device_batch_size(batch_size, num_devices))
    # Use multiple GPUs by MirroredStragtegy.
    # All avaiable GPUs will be used if `num_gpus` is omitted.
    # if num_devices > 1:
    distribution = tf.contrib.distribute.MirroredStrategy()
    # print("### Doing Multi GPU Training")
    # else:
    #     distribution = None
    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution,
        eval_distribute=distribution if eval_distributed else None,
        model_dir="/tmp/mnist_convnet_model",
    )
    if save_config is None:
        save_config = smd.SaveConfig(save_interval=2)
    if include_collections is None:
        include_collections = [
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
        ]
    if not zcc:
        ts_hook = smd.SessionHook(
            out_dir=trial_dir,
            save_all=save_all,
            include_collections=include_collections,
            save_config=save_config,
            reduction_config=reduction_config,
            include_workers=include_workers,
        )
    else:
        print("zcc is passed. ignoring include_collections and save_config")
    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, config=config)
    if steps is None:
        steps = ["train"]
    for s in steps:
        if s == "train":
            print("Starting train")
            if not zcc:
                ts_hook.set_mode(smd.modes.TRAIN)
                # Train the model
                mnist_classifier.train(
                    input_fn=input_fn_provider.train_input_fn, steps=num_steps, hooks=[ts_hook]
                )
            else:
                mnist_classifier.train(input_fn=input_fn_provider.train_input_fn, steps=num_steps)
        elif s == "eval":
            print("Starting eval")
            if not zcc:
                ts_hook.set_mode(smd.modes.EVAL)
                # Evaluate the model and print results
                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn, steps=num_steps, hooks=[ts_hook]
                )
            else:
                mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn, steps=num_steps)
        elif s == "predict":
            print("Starting predict")
            if not zcc:
                ts_hook.set_mode(smd.modes.PREDICT)
                # Evaluate the model and print results
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook]
                )
            else:
                p = mnist_classifier.predict(input_fn=input_fn_provider.eval_input_fn)
            # predict() returns a generator; pull num_steps predictions
            # so the prediction graph actually executes.
            for i in range(num_steps):
                next(p)
    get_hook()._cleanup()
    return distribution
def helper_test_keras_v2_json_config(
    json_file_contents, script_mode: bool = False, eager_mode: bool = True, custom_classifier=False
):
    """ Tests ZCC with custom hook configs """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()
    run_eagerly = None
    if is_tf_2_2() or is_tf_2_3():
        # run_eagerly is only honored by model.compile on TF >= 2.2.
        run_eagerly = eager_mode
    # NOTE(review): TB export is skipped on TF 2.0.2 and TF 2.3.x —
    # presumably unsupported there; confirm before changing.
    enable_tb = False if (tf.__version__ == "2.0.2" or is_tf_2_3()) else True
    with SagemakerSimulator(json_file_contents=json_file_contents, enable_tb=enable_tb) as sim:
        if custom_classifier:
            # Exercise a user-subclassed model instead of the stock Sequential.
            model = CustomClassifierModel(
                [
                    tf.keras.layers.Flatten(input_shape=(28, 28)),
                    tf.keras.layers.Dense(128, activation="relu"),
                    tf.keras.layers.Dropout(0.2),
                    tf.keras.layers.Dense(10, activation="softmax"),
                ]
            )
        else:
            model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255
        opt = tf.keras.optimizers.RMSprop()
        if script_mode:
            # Script mode: build the hook from the JSON config on disk.
            hook = smd.KerasHook.create_from_json_file()
            opt = hook.wrap_optimizer(opt)
            model.compile(
                loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["accuracy"],
                run_eagerly=run_eagerly,
            )
            history = model.fit(
                x_train, y_train, batch_size=64, epochs=2, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["accuracy"],
                run_eagerly=run_eagerly,
            )
            history = model.fit(x_train, y_train, epochs=2, batch_size=64, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)
        hook = smd.get_hook()
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        if not eager_mode and is_tf_2_2():
            assert len(trial.tensor_names(collection="gradients")) > 0
        assert len(trial.tensor_names(collection="weights")) > 0
        assert len(trial.tensor_names(collection="losses")) > 0
        if is_tf_2_2():
            assert len(trial.tensor_names(collection="inputs")) > 0
            assert len(trial.tensor_names(collection="outputs")) > 0
            if "dense_layers" in json_file_contents:
                # Only assert for test_keras_v2_multi_collections
                # which defines this custom collection
                assert len(trial.tensor_names(collection="dense_layers")) > 0
            else:
                assert len(trial.tensor_names(collection="dense_layers")) == 0
def train_model(
    trial_dir,
    save_all=False,
    hook=None,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    eager=True,
    strategy=None,
    steps=None,
    add_callbacks=None,
    include_workers="all",
):
    """Train a small CNN on MNIST under a tf.distribute strategy with a KerasHook.

    Builds (or reuses) a KerasHook writing to ``trial_dir``, trains/evaluates/
    predicts according to ``steps``, and returns the strategy plus a record of
    the scalar saved via ``hook.save_scalar``.

    :param trial_dir: output directory for the hook's tensors.
    :param save_all: forwarded to KerasHook (save every tensor).
    :param hook: pre-built hook; if None one is created here.
    :param include_collections: collections the hook should save.
    :param reduction_config: forwarded to KerasHook.
    :param save_config: SaveConfig for the hook; defaults to save_interval=3.
    :param eager: if False, disable eager execution before building anything.
    :param strategy: tf.distribute strategy; defaults to MirroredStrategy.
    :param steps: list drawn from {"train", "eval", "predict"}; None means ["train"].
    :param add_callbacks: extra callback names; only "tensorboard" is recognized.
    :param include_workers: worker-saving mode forwarded to KerasHook.
    :returns: (strategy, scalars_to_be_saved) where the dict maps scalar name
        to (timestamp, steps-as-passed-by-the-caller).
    """
    tf.keras.backend.clear_session()
    if not eager:
        tf.compat.v1.disable_eager_execution()
    datasets, info = tfds.load(name="mnist", with_info=True, as_supervised=True)
    mnist_train, mnist_test = datasets["train"], datasets["test"]
    if strategy is None:
        strategy = tf.distribute.MirroredStrategy()
    # You can also do info.splits.total_num_examples to get the total
    # number of examples in the dataset.
    BUFFER_SIZE = 10000
    BATCH_SIZE_PER_REPLICA = 64
    # Global batch size scales with the number of replicas in the strategy.
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

    def scale(image, label):
        # Normalize uint8 pixels to [0, 1] floats.
        image = tf.cast(image, tf.float32)
        image /= 255
        return image, label

    train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)

        hook = KerasHook(
            out_dir=trial_dir,
            save_config=save_config,
            reduction_config=reduction_config,
            include_collections=include_collections,
            save_all=save_all,
            include_workers=include_workers,
        )

        if not save_all and include_collections is not None:
            # Effectively disable saving for any default collection the caller
            # did not explicitly request (end_step=0 means "never save").
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(end_step=0)

    opt = tf.keras.optimizers.Adam()
    # Wrap the optimizer so the hook can capture gradients.
    opt = hook.wrap_optimizer(opt)

    # Model must be created and compiled inside the strategy scope.
    with strategy.scope():
        relu_layer = tf.keras.layers.Dense(64, activation="relu")
        model = tf.keras.Sequential(
            [
                tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)),
                tf.keras.layers.MaxPooling2D(),
                tf.keras.layers.Flatten(),
                relu_layer,
                tf.keras.layers.Dense(10, activation="softmax"),
            ]
        )
        model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

    hooks = []
    if add_callbacks:
        if "tensorboard" in add_callbacks:
            hooks.append(
                # write_grads = True causes crash saying handle must be created in scope
                # error like this https://stackoverflow.com/questions/56836895/custom-training-loop-using-tensorflow-gpu-1-14-and-tf-distribute-mirroredstrateg
                # this crash is even if callback is off
                tf.keras.callbacks.TensorBoard(
                    log_dir="/tmp/logs", histogram_freq=4, write_images=True
                )
            )
    hooks.append(hook)

    scalars_to_be_saved = dict()
    ts = time.time()
    # NOTE(review): this records the caller's `steps` argument as-is, which is
    # None when defaulted below — presumably intentional so callers can check
    # what was requested; confirm against the asserting tests.
    scalars_to_be_saved["scalar/foobar"] = (ts, steps)
    hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts)

    if steps is None:
        steps = ["train"]
    for step in steps:
        if step == "train":
            model.fit(train_dataset, epochs=1, steps_per_epoch=10, callbacks=hooks, verbose=0)
        elif step == "eval":
            model.evaluate(eval_dataset, steps=10, callbacks=hooks, verbose=0)
        elif step == "predict":
            model.predict(train_dataset, steps=4, callbacks=hooks, verbose=0)

    smd.get_hook().close()
    return strategy, scalars_to_be_saved
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """Verify ZCC's default behavior: only losses and metrics are saved
    (no weights or biases), in both eager and non-eager execution."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()
    enable_tb = False if (tf.__version__ == "2.0.2" or is_tf_2_3()) else True
    run_eagerly = None
    if is_tf_2_2() or is_tf_2_3():
        run_eagerly = eager_mode
    with SagemakerSimulator(enable_tb=enable_tb) as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        optimizer = tf.keras.optimizers.RMSprop()
        training_callbacks = []
        if script_mode:
            # Script mode attaches the hook explicitly and wraps the optimizer.
            hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            optimizer = hook.wrap_optimizer(optimizer)
            training_callbacks.append(hook)
        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=optimizer,
            metrics=["accuracy"],
            run_eagerly=run_eagerly,
        )
        model.fit(
            x_train,
            y_train,
            batch_size=64,
            epochs=1,
            validation_split=0.2,
            callbacks=training_callbacks,
        )
        model.evaluate(x_test, y_test, verbose=2, callbacks=training_callbacks)

        hook = smd.get_hook()
        assert hook
        # Check if the hook was executed with the default hook configuration.
        assert hook.has_default_hook_configuration()
        hook.close()

        # Check that hook created and tensors saved.
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        # DEFAULT TENSORS SAVED
        assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
        ), "Weights were not expected to be saved by default"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
        ), "Biases were not expected to be saved by default"
def cnn_model_fn(features, labels, mode):
    """Estimator model_fn for a small MNIST CNN.

    Builds conv/pool/dense layers and returns the EstimatorSpec appropriate
    for PREDICT, TRAIN, or EVAL mode.
    """
    # Reshape flat MNIST pixels into a 4-D NHWC tensor: [batch, 28, 28, 1].
    images = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Two conv+pool stages with "same" padding:
    # [b,28,28,1] -> [b,28,28,32] -> [b,14,14,32] -> [b,14,14,64] -> [b,7,7,64].
    conv1 = tf.layers.conv2d(
        inputs=images, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
    )
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    conv2 = tf.layers.conv2d(
        inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
    )
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten to [batch, 7*7*64], then a 1024-unit dense layer with dropout
    # (rate 0.4, applied only during training).
    flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN
    )

    # Final logits over the 10 digit classes: [batch, 10].
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode).
        "classes": tf.argmax(input=logits, axis=1),
        # `softmax_tensor` is referenced by name for PREDICT and by the
        # logging hook.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss is shared by TRAIN and EVAL modes.
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        # Wrap the optimizer so the smdebug hook can capture gradients.
        optimizer = smd.get_hook().wrap_optimizer(optimizer)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # EVAL mode: report accuracy alongside the loss.
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def helper_test_keras_v2_gradienttape(script_mode: bool = False, json_file_contents="{}"):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes.

    Exercises a manual tf.GradientTape training loop. In script mode the tape
    is wrapped with hook.wrap_tape so tensors are captured and asserted on; in
    ZCC mode no hook is expected (custom loops are unsupported as of v0.7.2).

    :param script_mode: if True, create the hook explicitly and wrap the tape.
    :param json_file_contents: hook JSON config; "{}" means build the hook
        directly from sim.out_dir instead of from the JSON file.
    """
    smd.del_hook()
    tf.keras.backend.clear_session()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # WA for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        # Normalize pixels to [0, 1] floats and add a channel axis.
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64))
        )
        dataset = dataset.shuffle(1000).batch(64)
        opt = tf.keras.optimizers.RMSprop()
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 2
        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    # Keep the sparse labels for the accuracy metric; the loss
                    # takes one-hot labels.
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    # Wrapping the tape lets the hook observe the forward pass
                    # and gradients.
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        logits = model(data, training=True)  # (32,10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                    # Manually record the accuracy scalar with the hook.
                    hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            # ZCC doesn't support yet (as of smdebug v0.7.2)
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with tf.GradientTape(persistent=True) as tape:
                        logits = model(data, training=True)  # (32,10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            # No hook should have been auto-created for a custom training loop.
            assert not hook