def test_save_all(out_dir, tf_eager_mode, workers):
    """Train with ``save_all=True`` and verify the saved tensors.

    Checks the total number of tensor names (which depends on eager vs.
    non-eager execution and on ``is_tf_2_2()`` / ``is_tf_2_3()``), the number
    of saved steps, the number of workers recorded per tensor, and finally
    hands the output directory to ``verify_files``.
    """
    save_config = SaveConfig(save_steps=[5])
    strategy, saved_scalars = train_model(
        out_dir,
        include_collections=None,
        save_all=True,
        save_config=save_config,
        steps=["train"],
        eager=tf_eager_mode,
        include_workers=workers,
    )
    tr = create_trial_fast_refresh(out_dir)
    print(tr.tensor_names())
    if tf_eager_mode:
        if is_tf_2_2():
            # weights, metrics, losses, optimizer variables, scalar, inputs,
            # outputs, gradients, layers
            # (the former inner ``if is_tf_2_2() else ...`` arm could never be
            # taken inside this branch, so it has been dropped)
            assert len(tr.tensor_names()) == 6 + 2 + 1 + 5 + 1 + 1 + 2 + 8 + 8
        else:
            assert len(tr.tensor_names()) == (
                6 + 2 + 1 + 5 + 1 if is_tf_2_3() else 6 + 3 + 1 + 5 + 1
            )
    else:
        # weights, grads, optimizer_variables, metrics, losses, outputs
        assert (
            len(tr.tensor_names())
            == 6
            + 6
            + 5
            + 3
            + 1
            + 3 * strategy.num_replicas_in_sync
            + 2 * strategy.num_replicas_in_sync
        )
    assert len(tr.steps()) == 3
    for tname in tr.tensor_names():
        assert len(tr.tensor(tname).workers(0)) == (
            1 if workers == "one" else strategy.num_replicas_in_sync
        )
    verify_files(out_dir, save_config, saved_scalars)
def test_regex_filtering_for_default_collections(out_dir):
    """Add include-regexes to the LAYERS and GRADIENTS collections and check
    that every saved tensor name in those collections matches its pattern."""
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=[CollectionKeys.LAYERS, CollectionKeys.GRADIENTS],
    )
    hook.get_collection(CollectionKeys.LAYERS).include("^dense")
    hook.get_collection(CollectionKeys.GRADIENTS).include("gradients/dense")
    helper_keras_fit(
        out_dir,
        hook=hook,
        save_config=SaveConfig(save_interval=10),
        steps=["train"],
        run_eagerly=True,
    )

    tr = create_trial_fast_refresh(out_dir)
    layer_tnames = tr.tensor_names(collection=CollectionKeys.LAYERS)
    gradient_tnames = tr.tensor_names(collection=CollectionKeys.GRADIENTS)
    assert len(layer_tnames) == (4 if is_tf_2_2() else 0)
    assert len(gradient_tnames) == (4 if is_tf_2_2() else 0)

    # Pair each group of names with the pattern its members must match.
    checks = (
        (layer_tnames, r"^(dense)(_\d+)?\/(inputs|outputs)"),
        (gradient_tnames, r"gradients/dense"),
    )
    for names, expected_pattern in checks:
        for tname in names:
            assert tr.tensor(tname).value(0) is not None
            assert re.match(pattern=expected_pattern, string=tname) is not None
def test_keras_fit(out_dir, tf_eager_mode, saveall):
    """Run train/eval/predict/train with a KerasHook and check saved tensors.

    With ``saveall`` the total tensor count and the BIASES, WEIGHTS and
    OPTIMIZER_VARIABLES collections are checked, including that no optimizer
    variables are reported for EVAL mode. Otherwise only the default losses
    and metrics are expected.
    """
    hook = smd.KerasHook(out_dir=out_dir, save_all=saveall)
    helper_keras_fit(
        trial_dir=out_dir,
        hook=hook,
        eager=tf_eager_mode,
        steps=["train", "eval", "predict", "train"],
    )

    trial = smd.create_trial(path=out_dir)
    # can't save gradients in TF 2.x eager mode
    if saveall:  # save losses, metrics, weights, biases
        if tf_eager_mode:
            assert len(trial.tensor_names()) == (12 if is_tf_2_2() else 13)
        else:
            assert len(trial.tensor_names()) == 21
        assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2
        assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2
        assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5
        # Condition and message must not share one pair of parentheses: that
        # builds a non-empty tuple, which is always truthy, so the assertion
        # could never fail.
        eval_optimizer_variables = trial.tensor_names(
            collection=CollectionKeys.OPTIMIZER_VARIABLES, mode=ModeKeys.EVAL
        )
        assert (
            len(eval_optimizer_variables) == 0
        ), "No Optimizer Variables Should be Saved in EVAL Mode"
    else:  # save the default losses and metrics
        assert len(trial.tensor_names()) == (3 if is_tf_2_2() and tf_eager_mode else 4)
    assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1
    assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == (
        2 if is_tf_2_2() and tf_eager_mode else 3
    )
def test_include_collections(out_dir, tf_eager_mode):
    """Save an explicit list of collections during fit/eval/predict and check
    the number of tensors found in each collection."""
    include_collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.GRADIENTS,
        CollectionKeys.LOSSES,
        CollectionKeys.OUTPUTS,
        CollectionKeys.METRICS,
        CollectionKeys.OPTIMIZER_VARIABLES,
    ]
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=include_collections,
        reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
    )
    helper_keras_fit(out_dir, hook=hook, steps=["train", "eval", "predict"], eager=tf_eager_mode)

    trial = smd.create_trial(path=out_dir)

    def count(**query):
        # Number of tensor names the trial reports for the given filter.
        return len(trial.tensor_names(**query))

    # can't save gradients in TF 2.x
    if tf_eager_mode:
        assert count() == (7 if is_tf_2_2() else 8)
    else:
        assert count() == 18
        assert count(collection=CollectionKeys.GRADIENTS) == 4
        assert count(collection=CollectionKeys.OPTIMIZER_VARIABLES) == 5
    assert count(collection=CollectionKeys.BIASES) == 2
    assert count(collection=CollectionKeys.WEIGHTS) == 2
    assert count(collection=CollectionKeys.LOSSES) == 1
    assert count(collection=CollectionKeys.METRICS) == (
        2 if is_tf_2_2() and tf_eager_mode else 3
    )
def test_keras_fit_pure_eager(out_dir, tf_eager_mode):
    """
    Test save all and save default collection in fit() pure eager mode
    """
    hook = smd.KerasHook(
        out_dir=out_dir, save_all=True, save_config=SaveConfig(save_interval=3)
    )
    helper_keras_fit(trial_dir=out_dir, hook=hook, eager=tf_eager_mode, run_eagerly=True)

    trial = smd.create_trial(path=out_dir)

    def count(**query):
        # Number of tensor names the trial reports for the given filter.
        return len(trial.tensor_names(**query))

    if is_tf_2_2():
        assert count() == 27
    else:
        assert count() == (20 if is_tf_2_3() else 21)
    assert count(collection=CollectionKeys.BIASES) == 2
    assert count(collection=CollectionKeys.WEIGHTS) == 2
    assert count(collection=CollectionKeys.OPTIMIZER_VARIABLES) == 5
    # Inputs and outputs are only expected when is_tf_2_2() is true.
    assert count(collection=CollectionKeys.INPUTS) == (1 if is_tf_2_2() else 0)
    assert count(collection=CollectionKeys.OUTPUTS) == (2 if is_tf_2_2() else 0)
def test_keras_fit(out_dir, tf_eager_mode, saveall):
    """Run train/eval/predict/train with a KerasHook and check saved tensors.

    A scalar named ``foobar`` is saved through the hook before training. With
    ``saveall`` the total tensor count and several collections are checked,
    including that no optimizer variables are reported for EVAL mode;
    otherwise only the default losses and metrics (plus the scalar) are
    expected. Every saved tensor must have a value at step 0.
    """
    hook = smd.KerasHook(out_dir=out_dir, save_all=saveall)
    ts = time.time()
    hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts)
    helper_keras_fit(
        trial_dir=out_dir,
        hook=hook,
        run_eagerly=tf_eager_mode,
        steps=["train", "eval", "predict", "train"],
    )

    trial = smd.create_trial(path=out_dir)
    # can't save gradients in TF 2.x eager mode
    if saveall:  # save losses, metrics, weights, biases, scalar
        if tf_eager_mode:
            if is_tf_2_2():
                assert len(trial.tensor_names()) == 28
            else:
                assert len(trial.tensor_names()) == (21 if is_tf_2_3() else 14)
            assert len(trial.tensor_names(collection=CollectionKeys.INPUTS)) == (
                1 if is_tf_2_2() else 0
            )
            assert len(trial.tensor_names(collection=CollectionKeys.OUTPUTS)) == (
                2 if is_tf_2_2() else 0
            )
        else:
            assert len(trial.tensor_names()) == 21
        assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2
        assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2
        assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5
        # Condition and message must not share one pair of parentheses: that
        # builds a non-empty tuple, which is always truthy, so the assertion
        # could never fail.
        eval_optimizer_variables = trial.tensor_names(
            collection=CollectionKeys.OPTIMIZER_VARIABLES, mode=ModeKeys.EVAL
        )
        assert (
            len(eval_optimizer_variables) == 0
        ), "No Optimizer Variables Should be Saved in EVAL Mode"
    else:  # save the default losses and metrics
        assert len(trial.tensor_names()) == (
            4 if (is_tf_2_2() or is_tf_2_3()) and tf_eager_mode else 5
        )
    assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1
    assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == (
        2 if (is_tf_2_2() or is_tf_2_3()) and tf_eager_mode else 3
    )
    for tname in trial.tensor_names():
        assert trial.tensor(tname).value(0) is not None
def test_include_regex(out_dir, tf_eager_mode, workers):
    """Route tensors matching ``dense`` into a custom collection and verify
    the number of names, their step-0 values and their worker counts."""
    collection_name = "custom_coll"
    hook = KerasHook(
        out_dir=out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=[collection_name],
        include_workers=workers,
    )
    hook.get_collection(collection_name).include("dense")

    strategy, _ = train_model(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode)

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection=collection_name)

    if tf_eager_mode:
        expected_count = 12 if is_tf_2_2() else 4
    else:
        expected_count = 4 + 3 * strategy.num_replicas_in_sync
    assert len(tnames) == expected_count

    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None
        assert len(tr.tensor(tname).workers(0)) == (
            1 if workers == "one" else strategy.num_replicas_in_sync
        )
def test_gradtape_include_collections(out_dir):
    """
    This test ensures that a training script written with GradientTape
    handles the case where hook config contains all collections mentioned
    through include collections
    """
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
            CollectionKeys.OUTPUTS,
            CollectionKeys.METRICS,
            CollectionKeys.OPTIMIZER_VARIABLES,
        ],
        reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
    )
    helper_keras_gradtape(out_dir, hook=hook)

    trial = smd.create_trial(path=out_dir)
    # can't save gradients in TF 2.x
    assert len(trial.tensor_names()) == (16 if is_tf_2_2() else 15)

    # Expected number of tensor names per collection.
    expected_per_collection = (
        (CollectionKeys.GRADIENTS, 4),
        (CollectionKeys.OPTIMIZER_VARIABLES, 5),
        (CollectionKeys.BIASES, 2),
        (CollectionKeys.WEIGHTS, 2),
        (CollectionKeys.LOSSES, 1),
        (CollectionKeys.METRICS, 1),
    )
    for collection, expected in expected_per_collection:
        assert len(trial.tensor_names(collection=collection)) == expected
def test_hook_from_json(out_dir, tf_eager_mode, monkeypatch):
    """Create the hook from the JSON file named by the config env variable
    and check the tensors saved during a train run."""
    json_config = "tests/tensorflow/hooks/test_json_configs/test_collection_defaults.json"
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, json_config)
    hook = smd.KerasHook.create_from_json_file()
    helper_keras_fit(out_dir, hook=hook, steps=["train"], run_eagerly=tf_eager_mode)

    trial = smd.create_trial(path=out_dir)

    def count(**query):
        # Number of tensor names the trial reports for the given filter.
        return len(trial.tensor_names(**query))

    # can't save gradients in TF 2.x
    assert count() == (5 if (is_tf_2_2() or is_tf_2_3()) and tf_eager_mode else 6)
    assert count(collection=CollectionKeys.BIASES) == 0
    assert count(collection=CollectionKeys.WEIGHTS) == 2
    assert count(collection=CollectionKeys.LOSSES) == 1
    assert count(collection=CollectionKeys.METRICS) == (
        2 if (is_tf_2_2() or is_tf_2_3()) and tf_eager_mode else 3
    )
def test_weights_collections(out_dir, tf_eager_mode):
    """Include only the WEIGHTS collection and check which tensors show up
    in the trial after a train run."""
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=[CollectionKeys.WEIGHTS],
    )
    helper_keras_fit(out_dir, hook=hook, steps=["train"], run_eagerly=tf_eager_mode)

    trial = smd.create_trial(path=out_dir)

    def count(**query):
        # Number of tensor names the trial reports for the given filter.
        return len(trial.tensor_names(**query))

    # can't save gradients in TF 2.x
    assert count() == (5 if (is_tf_2_2() or is_tf_2_3()) and tf_eager_mode else 6)
    assert count(collection=CollectionKeys.BIASES) == 0
    assert count(collection=CollectionKeys.WEIGHTS) == 2
    assert count(collection=CollectionKeys.LOSSES) == 1
    assert count(collection=CollectionKeys.METRICS) == (
        2 if (is_tf_2_2() or is_tf_2_3()) and tf_eager_mode else 3
    )
def test_include_collections(out_dir, tf_eager_mode):
    """Save built-in collections plus a custom ``Adam`` regex collection and
    check the number of tensors found in each."""
    custom_collection = "custom_optimizer_variables"
    include_collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.GRADIENTS,
        CollectionKeys.LOSSES,
        CollectionKeys.METRICS,
        CollectionKeys.OPTIMIZER_VARIABLES,
        custom_collection,
    ]
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=include_collections,
        reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
    )
    hook.get_collection(custom_collection).include("Adam")
    helper_keras_fit(
        out_dir, hook=hook, steps=["train", "eval", "predict"], run_eagerly=tf_eager_mode
    )

    trial = smd.create_trial(path=out_dir)

    def count(**query):
        # Number of tensor names the trial reports for the given filter.
        return len(trial.tensor_names(**query))

    # can't save gradients in TF 2.x
    if not tf_eager_mode:
        assert count() == 18
        assert count(collection=CollectionKeys.GRADIENTS) == 4
    elif is_tf_2_2():
        assert count() == 16
    else:
        assert count() == (12 if is_tf_2_3() else 13)
    assert count(collection=CollectionKeys.OPTIMIZER_VARIABLES) == 5
    assert count(collection=custom_collection) == 5
    assert count(collection=CollectionKeys.BIASES) == 2
    assert count(collection=CollectionKeys.WEIGHTS) == 2
    assert count(collection=CollectionKeys.LOSSES) == 1
    assert count(collection=CollectionKeys.METRICS) == (
        2 if (is_tf_2_2() or is_tf_2_3()) and tf_eager_mode else 3
    )
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """Run ``helper_keras_fit`` inside a ``SagemakerSimulator`` and verify the
    default hook output.

    Asserts that a hook exists and reports the default hook configuration,
    that losses and metrics were saved, and that weights and biases were not.
    """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()

    enable_tb = not (tf.__version__ == "2.0.2" or is_tf_2_3())
    run_eagerly = eager_mode if (is_tf_2_2() or is_tf_2_3()) else None

    with SagemakerSimulator(enable_tb=enable_tb) as sim:
        helper_keras_fit(
            script_mode=script_mode, eager_mode=eager_mode, run_eagerly=run_eagerly, sim=sim
        )
        hook = smd.get_hook()
        assert hook
        # Check if the hook was executed with the default
        # hook configuration
        assert hook.has_default_hook_configuration()
        hook.close()

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)

        def saved(collection):
            # Number of tensor names stored under ``collection``.
            return len(trial.tensor_names(collection=collection))

        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

        # DEFAULT TENSORS SAVED
        assert saved(CollectionKeys.LOSSES) > 0, "No Losses Saved"
        assert saved(CollectionKeys.METRICS) > 0, "No Metrics Saved"
        assert (
            saved(CollectionKeys.WEIGHTS) == 0
        ), "Weights were not expected to be saved by default"
        assert (
            saved(CollectionKeys.BIASES) == 0
        ), "Biases were not expected to be saved by default"
def helper_test_keras_v2_gradienttape(
    script_mode: bool = False, json_file_contents="{}", default=False
):
    """Run the GradientTape training helper inside a ``SagemakerSimulator``
    and verify what the hook saved.

    In script mode a hook must exist (and, when ``default`` is set, report
    the default configuration). Outside script mode no hook is expected when
    the TF version is below 2.1.2; otherwise the hook must exist and, when
    ``is_tf_2_2()`` is true and ``default`` is False, inputs, outputs and the
    optional ``dense_layers`` collection are checked as well.
    """
    smd.del_hook()
    tf.keras.backend.clear_session()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        helper_keras_gradienttape_train(
            script_mode=script_mode, json_file_contents=json_file_contents, sim=sim
        )
        hook = smd.get_hook()

        if not script_mode and version.parse(tf.__version__) < version.parse("2.1.2"):
            assert not hook  # only supported on TF 2.1.2 and greater
            return

        assert hook
        if script_mode and default:
            assert hook.has_default_hook_configuration()
        hook.close()

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="losses")) > 0

        if script_mode:
            return

        if is_tf_2_2() and default is False:
            # Inputs and Outputs are not saved with the default collection configurations.
            assert len(trial.tensor_names(collection="inputs")) > 0
            assert len(trial.tensor_names(collection="outputs")) > 0
            assert trial.tensor_names(collection="outputs") == ["predictions"]
            dense_layer_names = trial.tensor_names(collection="dense_layers")
            if "dense_layers" in json_file_contents:
                # Only assert for test_keras_v2_multi_collections
                # which defines this custom collection
                assert len(dense_layer_names) > 0
            else:
                assert len(dense_layer_names) == 0
def test_keras_fit(out_dir, tf_eager_mode, saveall):
    """Run train/eval/predict/train with a KerasHook and check the number of
    saved tensors, overall and for selected collections."""
    hook = smd.KerasHook(out_dir=out_dir, save_all=saveall)
    helper_keras_fit(
        trial_dir=out_dir,
        hook=hook,
        eager=tf_eager_mode,
        steps=["train", "eval", "predict", "train"],
    )

    trial = smd.create_trial(path=out_dir)

    def count(**query):
        # Number of tensor names the trial reports for the given filter.
        return len(trial.tensor_names(**query))

    # can't save gradients in TF 2.x eager mode
    if not saveall:
        # save the default losses and metrics
        assert count() == (3 if is_tf_2_2() and tf_eager_mode else 4)
    else:
        # save losses, metrics, weights, biases
        if tf_eager_mode:
            assert count() == (7 if is_tf_2_2() else 8)
        else:
            assert count() == 21
        assert count(collection=CollectionKeys.BIASES) == 2
        assert count(collection=CollectionKeys.WEIGHTS) == 2
    assert count(collection=CollectionKeys.LOSSES) == 1
    assert count(collection=CollectionKeys.METRICS) == (
        2 if is_tf_2_2() and tf_eager_mode else 3
    )
def test_include_only_custom_collection(out_dir, tf_eager_mode):
    """Include only a custom ``Adam`` regex collection and check the total
    tensor count and the size of that collection."""
    custom_collection = "custom_optimizer_variables"
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=[custom_collection],
        reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
    )
    hook.get_collection(custom_collection).include("Adam")
    helper_keras_fit(out_dir, hook=hook, steps=["train", "eval", "predict"], eager=tf_eager_mode)

    trial = smd.create_trial(path=out_dir)
    expected_total = 8 if is_tf_2_2() and tf_eager_mode else 9
    assert len(trial.tensor_names()) == expected_total
    assert len(trial.tensor_names(collection=custom_collection)) == 5
def test_clash_with_tb_callback(out_dir):
    """Fit with the ``tensorboard`` callback added alongside the hook and
    check the total number of saved tensors."""
    # this test cannot be run in non-eager mode
    collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.LOSSES,
        CollectionKeys.METRICS,
    ]
    helper_keras_fit(
        out_dir,
        save_config=SaveConfig(save_interval=9),
        steps=["train"],
        include_collections=collections,
        add_callbacks=["tensorboard"],
    )
    tr = create_trial_fast_refresh(out_dir)
    expected_total = 7 if (is_tf_2_2() or is_tf_2_3()) else 8
    assert len(tr.tensor_names()) == expected_total
def test_gradtape_include_regex(out_dir):
    """
    Test custom collection with regex
    """
    collection_name = "custom_coll"
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=[collection_name],
    )
    hook.get_collection(collection_name).include("dense")
    helper_keras_gradtape(out_dir, hook=hook, save_config=SaveConfig(save_interval=9))

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection=collection_name)
    expected_count = 12 if is_tf_2_2() else 8
    assert len(tnames) == expected_count
    # Every tensor in the custom collection must have a value at step 0.
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None
def test_clash_with_tb_callback(out_dir):
    """Train with the ``tensorboard`` callback added alongside the hook and
    check the total number of saved tensors."""
    collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.GRADIENTS,
        CollectionKeys.LOSSES,
        CollectionKeys.METRICS,
    ]
    train_model(
        out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=collections,
        steps=["train"],
        add_callbacks=["tensorboard"],
    )
    tr = create_trial_fast_refresh(out_dir)
    expected_total = 10 if is_tf_2_2() else 11
    assert len(tr.tensor_names()) == expected_total
def test_gradtape_persistent(out_dir, saveall):
    """
    Test save all and save default collection
    """
    hook = smd.KerasHook(
        out_dir=out_dir, save_all=saveall, save_config=SaveConfig(save_interval=3)
    )
    helper_keras_gradtape(trial_dir=out_dir, hook=hook, persistent=True)

    trial = smd.create_trial(path=out_dir)

    def count(**query):
        # Number of tensor names the trial reports for the given filter.
        return len(trial.tensor_names(**query))

    if not saveall:
        # save the default losses and metrics
        assert count() == 2
    else:
        # save losses, metrics, weights, biases
        assert count() == (25 if is_tf_2_2() else 15)
        assert count(collection=CollectionKeys.BIASES) == 2
        assert count(collection=CollectionKeys.WEIGHTS) == 2
        assert count(collection=CollectionKeys.OPTIMIZER_VARIABLES) == 5
    assert count(collection=CollectionKeys.LOSSES) == 1
    assert count(collection=CollectionKeys.METRICS) == 1
def test_include_regex(out_dir, tf_eager_mode):
    """Route tensors matching ``dense`` into a custom collection during fit
    and check their count and step-0 values."""
    collection_name = "custom_coll"
    hook = smd.KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=[collection_name],
    )
    hook.get_collection(collection_name).include("dense")
    helper_keras_fit(
        out_dir,
        hook=hook,
        save_config=SaveConfig(save_interval=9),
        steps=["train"],
        run_eagerly=tf_eager_mode,
    )

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection=collection_name)
    expected_count = 12 if is_tf_2_2() else 4
    assert len(tnames) == expected_count
    # Every tensor in the custom collection must have a value at step 0.
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None
def test_subclassed_model(out_dir):
    """Fit a ``MyModel`` instance on MNIST with a save-all hook registered on
    the model and check the saved layer, loss, input, output and gradient
    tensor names."""
    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data("MNIST-data")
    x_train, x_test = x_train / 255.0, x_test / 255.0

    # Add a channels dimension
    x_train = x_train[..., tf.newaxis]
    x_test = x_test[..., tf.newaxis]

    # Create an instance of the model
    model = MyModel()

    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    dataset = dataset.shuffle(10000, seed=123)
    train_ds = dataset.batch(2)

    # The hook is stored on the class and registered with the model instance.
    MyModel.hook = smd.KerasHook(
        out_dir,
        save_all=True,
        save_config=smd.SaveConfig(save_steps=list(range(10)), save_interval=1),
    )
    MyModel.hook.register_model(model)

    model.compile(optimizer="Adam", loss="mse", run_eagerly=True)
    model.fit(train_ds, epochs=1, steps_per_epoch=10, callbacks=[MyModel.hook])

    trial = smd.create_trial(out_dir)
    keys = smd.CollectionKeys
    assert len(trial.tensor_names(collection=keys.LAYERS)) == 8
    assert trial.tensor_names(collection=keys.LOSSES) == ["loss"]
    if is_tf_2_2():
        # Feature to save model inputs and outputs was first added for TF 2.2.0
        assert trial.tensor_names(collection=keys.INPUTS) == ["model_input"]
        assert trial.tensor_names(collection=keys.OUTPUTS) == ["labels", "predictions"]
        assert len(trial.tensor_names(collection=keys.GRADIENTS)) == 6
def mode_allworkers(out_dir, mode):
    """Launch the Horovod TF2 MNIST script with ``include_workers="all"`` and
    check that the trial reports every worker.

    ``mode == "cpu"`` adds the ``--use_only_cpu true`` script arguments.
    """
    config_path = build_json(
        out_dir,
        include_workers="all",
        include_collections=["weights", "optimizer_variables"],
    )
    num_workers = len(get_available_gpus())

    mode_args = [*HOROVOD_KERAS_TEST_SCRIPT_ARGS, "--model_dir", out_dir]
    if mode == "cpu":
        mode_args.extend(["--use_only_cpu", "true"])

    launch_horovod_job(
        script_file_path=HOROVOD_TF2_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=config_path,
        mode=mode,
    )

    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) == (13 if is_tf_2_2() else 14)
    # The first weight tensor must have been written by every worker at step 0.
    first_weight = tr.tensor_names(collection="weights")[0]
    assert len(tr.tensor(first_weight).workers(0)) == num_workers
def test_save_all(out_dir, tf_eager_mode):
    """Train with ``save_all=True`` and check the number of saved tensors and
    the number of saved steps."""
    strategy = train_model(
        out_dir,
        include_collections=None,
        save_all=True,
        save_config=SaveConfig(save_steps=[5]),
        steps=["train"],
        eager=tf_eager_mode,
    )
    tr = create_trial_fast_refresh(out_dir)
    print(tr.tensor_names())
    if not tf_eager_mode:
        # weights, grads, optimizer_variables, metrics, losses, outputs
        assert (
            len(tr.tensor_names())
            == 6
            + 6
            + 5
            + 3
            + 1
            + 3 * strategy.num_replicas_in_sync
            + 2 * strategy.num_replicas_in_sync
        )
    else:
        # weights, metrics, losses, optimizer variables
        assert len(tr.tensor_names()) == (6 + 2 + 1 + 5 if is_tf_2_2() else 6 + 3 + 1 + 5)
    assert len(tr.steps()) == 3
def exhaustive_check(trial_dir, include_workers="one", eager=True):
    """Train over train/eval/predict/train and check the resulting trial in detail.

    Verifies the worker count, the total tensor count (which depends on
    ``include_workers``, ``eager`` and the ``is_tf_2_2()`` / ``is_tf_2_3()``
    checks), the modes and steps, and the per-mode step counts and values of
    weights, gradients and optimizer variables (the latter two only when not
    eager), losses and metrics.
    """
    include_collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.GRADIENTS,
        CollectionKeys.LOSSES,
        CollectionKeys.OUTPUTS,
        CollectionKeys.METRICS,
        CollectionKeys.OPTIMIZER_VARIABLES,
    ]
    strategy, _ = train_model(
        trial_dir,
        include_collections=include_collections,
        steps=["train", "eval", "predict", "train"],
        include_workers=include_workers,
        eager=eager,
    )

    tr = create_trial_fast_refresh(trial_dir)
    print(tr.tensor_names())

    # Worker count and total tensor count.
    if include_workers == "all":
        assert len(tr.workers()) == strategy.num_replicas_in_sync
        if eager:
            if is_tf_2_2():
                assert len(tr.tensor_names()) == (6 + 1 + 2 + 5 + 1 + 6 + 2)
                # 6 weights, 1 loss, 2 metrics, 5 optimizer variables, 6 gradients, 2 outputs for Tf 2.2, 1 scalar
            else:
                assert len(tr.tensor_names()) == (6 + 1 + 2 + 5 + 1 if (is_tf_2_2() or is_tf_2_3()) else 6 + 1 + 3 + 5 + 1)
                # 6 weights, 1 loss, 2 metrics, 5 optimizer variables for Tf 2.3, 1 scalar
                # 6 weights, 1 loss, 3 metrics, 5 optimizer variables for Tf 2.1, 1 scalar
        else:
            assert len(
                tr.tensor_names()) == (6 + 6 + 1 + 3 + strategy.num_replicas_in_sync * 3 + 5)
    else:
        assert len(tr.workers()) == 1
        if eager:
            assert len(tr.tensor_names()) == (6 + 3 + 1)
        else:
            assert len(tr.tensor_names()) == (6 + 6 + 1 + 3 + 1 * 3 + 5)

    # 6 weights, 6 gradients, 1 loss, 3 metrics, 24 outputs (8 for each mode), 5 optimizer variables
    assert len(tr.modes()) == 3
    assert len(tr.steps()) == 14
    assert len(tr.steps(
        ModeKeys.TRAIN)) == 8  # 0, 3, 6, 9, 12, 15, 18, 19(end of epoch)
    assert len(tr.steps(ModeKeys.EVAL)) == 4
    assert len(tr.steps(ModeKeys.PREDICT)) == 2  # ran 4 steps above

    assert len(tr.tensor_names(collection=CollectionKeys.BIASES)) == 3
    wtnames = tr.tensor_names(collection=CollectionKeys.WEIGHTS)
    assert len(wtnames) == 3

    # Weights: check step counts per mode and that a value exists at every
    # TRAIN/EVAL step, both without a worker argument and for each worker.
    for wtname in wtnames:
        assert len(tr.tensor(wtname).steps()) == 13, wtname
        assert len(tr.tensor(wtname).steps(ModeKeys.TRAIN)) == 7
        for s in tr.tensor(wtname).steps(ModeKeys.TRAIN):
            assert tr.tensor(wtname).value(s, mode=ModeKeys.TRAIN) is not None
            for worker in tr.workers():
                assert tr.tensor(wtname).value(s, mode=ModeKeys.TRAIN, worker=worker) is not None
        assert len(tr.tensor(wtname).steps(ModeKeys.EVAL)) == 4
        for s in tr.tensor(wtname).steps(ModeKeys.EVAL):
            assert tr.tensor(wtname).value(s, mode=ModeKeys.EVAL) is not None
            for worker in tr.workers():
                assert tr.tensor(wtname).value(s, mode=ModeKeys.EVAL, worker=worker) is not None
        assert len(tr.tensor(wtname).steps(ModeKeys.PREDICT)) == 2

    # Gradients and optimizer variables are only checked when not eager; both
    # are expected at TRAIN steps only.
    if not eager:
        gradnames = tr.tensor_names(collection=CollectionKeys.GRADIENTS)
        assert len(gradnames) == 6
        for gradname in gradnames:
            assert len(tr.tensor(gradname).steps(ModeKeys.TRAIN)) == 7
            for s in tr.tensor(gradname).steps(ModeKeys.TRAIN):
                assert tr.tensor(gradname).value(
                    s, mode=ModeKeys.TRAIN) is not None
            assert len(tr.tensor(gradname).steps(ModeKeys.EVAL)) == 0
            assert len(tr.tensor(gradname).steps(ModeKeys.PREDICT)) == 0

        optvarnames = tr.tensor_names(
            collection=CollectionKeys.OPTIMIZER_VARIABLES)
        assert len(optvarnames) == 5
        for optvarname in optvarnames:
            assert len(tr.tensor(optvarname).steps(ModeKeys.TRAIN)) == 7
            for s in tr.tensor(optvarname).steps(ModeKeys.TRAIN):
                assert tr.tensor(optvarname).value(
                    s, mode=ModeKeys.TRAIN) is not None
            assert len(tr.tensor(optvarname).steps(ModeKeys.EVAL)) == 0
            assert len(tr.tensor(optvarname).steps(ModeKeys.PREDICT)) == 0

    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) == 1
    loss_name = tr.tensor_names(collection=CollectionKeys.LOSSES)[0]
    # loss is not in predict mode (so less 2)
    # add one for end of epoch
    assert len(tr.tensor(loss_name).steps(ModeKeys.TRAIN)) == 8
    assert len(tr.tensor(loss_name).steps(ModeKeys.EVAL)) == 4
    assert len(tr.tensor(loss_name).steps(ModeKeys.PREDICT)) == 0
    assert len(tr.tensor(loss_name).steps()) == 12

    metricnames = tr.tensor_names(collection=CollectionKeys.METRICS)
    assert len(metricnames) == (2 if (is_tf_2_2() or is_tf_2_3()) else 3)
# Third Party
import numpy as np
import pytest
from tensorflow.python.framework.dtypes import _NP_TO_TF
from tests.tensorflow2.utils import is_tf_2_2

# First Party
from smdebug.core.tfevent.util import _get_proto_dtype


@pytest.mark.skipif(
    is_tf_2_2() is False, reason="Brain Float Is Unavailable in lower versions of TF"
)
def test_tensorflow2_datatypes():
    """Check that ``_get_proto_dtype`` accepts every numpy type in ``_NP_TO_TF``.

    The bfloat16 numpy type, when TensorFlow exposes it, is skipped. It is
    filtered out locally rather than popped from ``_NP_TO_TF``, so
    TensorFlow's module-level mapping is left untouched for the rest of the
    test session and a missing key cannot raise ``KeyError``.
    """
    # _NP_TO_TF contains all the mappings
    # of numpy to tf types
    skipped_types = []
    try:
        from tensorflow.python import _pywrap_bfloat16

        # TF 2.x.x Implements a Custom Numpy Datatype for Brain Floating Type
        # Which is currently only supported on TPUs
        skipped_types.append(_pywrap_bfloat16.TF_bfloat16_type())
    except (ModuleNotFoundError, ValueError, ImportError):
        pass

    for _type in _NP_TO_TF:
        if _type in skipped_types:
            continue
        try:
            _get_proto_dtype(np.dtype(_type))
        except Exception as err:
            # Raise explicitly (instead of ``assert False``) so the failure
            # keeps its cause and still fires when asserts are disabled.
            raise AssertionError(f"{_type} not supported") from err
# Third Party import numpy as np import pytest import tensorflow as tf from tensorflow.python.framework.dtypes import _NP_TO_TF from tests.tensorflow2.utils import is_tf_2_2 # First Party from smdebug.core.tfevent.util import _get_proto_dtype @pytest.mark.skipif( is_tf_2_2() is False, reason="Brain Float Is Unavailable in these versions of TF" ) def test_tensorflow2_datatypes(): # _NP_TO_TF contains all the mappings # of numpy to tf types try: from tensorflow.python import _pywrap_bfloat16 # TF 2.x.x Implements a Custom Numpy Datatype for Brain Floating Type # Which is currently only supported on TPUs _np_bfloat16 = _pywrap_bfloat16.TF_bfloat16_type() _NP_TO_TF.pop(_np_bfloat16) except (ModuleNotFoundError, ValueError, ImportError): pass for _type in _NP_TO_TF: try: _get_proto_dtype(np.dtype(_type)) except Exception:
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """Fit and evaluate a Keras model inside a ``SagemakerSimulator`` and
    verify the default hook output.

    In script mode a ``KerasHook`` is created explicitly, wraps the optimizer
    and is passed as a callback; otherwise no callback is passed. Afterwards
    the hook returned by ``smd.get_hook()`` must exist, losses and metrics
    must have been saved, and weights and biases must not.
    """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()

    enable_tb = not (tf.__version__ == "2.0.2" or is_tf_2_3())
    run_eagerly = eager_mode if (is_tf_2_2() or is_tf_2_3()) else None

    with SagemakerSimulator(enable_tb=enable_tb) as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        # Only script mode passes an explicit callback; otherwise the
        # ``callbacks`` keyword is omitted entirely.
        callback_kwargs = {}
        if script_mode:
            hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            opt = hook.wrap_optimizer(opt)
            callback_kwargs = {"callbacks": [hook]}

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=opt,
            metrics=["accuracy"],
            run_eagerly=run_eagerly,
        )
        model.fit(
            x_train, y_train, batch_size=64, epochs=1, validation_split=0.2, **callback_kwargs
        )
        model.evaluate(x_test, y_test, verbose=2, **callback_kwargs)

        hook = smd.get_hook()
        assert hook
        hook.close()

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)

        def saved(collection):
            # Number of tensor names stored under ``collection``.
            return len(trial.tensor_names(collection=collection))

        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

        # DEFAULT TENSORS SAVED
        assert saved(CollectionKeys.LOSSES) > 0, "No Losses Saved"
        assert saved(CollectionKeys.METRICS) > 0, "No Metrics Saved"
        assert (
            saved(CollectionKeys.WEIGHTS) == 0
        ), "Weights were not expected to be saved by default"
        assert (
            saved(CollectionKeys.BIASES) == 0
        ), "Biases were not expected to be saved by default"
def helper_test_keras_v2_json_config(json_file_contents,
                                     script_mode: bool = False,
                                     eager_mode: bool = True,
                                     custom_classifier=False):
    """Test ZCC with custom hook configs supplied as JSON.

    Trains a Keras model for two epochs and evaluates it inside a
    ``SagemakerSimulator`` configured with ``json_file_contents``, then opens
    a trial on the simulator's output directory and asserts on the saved
    collections.

    Args:
        json_file_contents: Hook configuration handed to
            ``SagemakerSimulator``. It is also searched for the substring
            ``"dense_layers"`` to decide whether that custom collection is
            expected to contain tensors.
        script_mode: When True, the hook is built with
            ``smd.KerasHook.create_from_json_file()``, wraps the optimizer,
            and is passed as a callback to ``fit`` and ``evaluate``. When
            False, no hook is created here and the test expects
            ``smd.get_hook()`` to return one after training.
        eager_mode: Requested execution mode. When ``is_tf_2_2()`` or
            ``is_tf_2_3()`` holds it is forwarded to ``compile`` as
            ``run_eagerly``; otherwise, if False, eager execution is disabled
            through ``tf.compat.v1.disable_eager_execution()``.
        custom_classifier: When truthy, train a ``CustomClassifierModel``
            built from an explicit layer list instead of
            ``get_keras_model_v2()``.

    Raises:
        AssertionError: If no hook is available after training or an expected
            collection is empty (or ``dense_layers`` is unexpectedly filled).
    """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and not is_tf_2_3() and not is_tf_2_2():
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()
    # `run_eagerly` is only forced on TF 2.2 / 2.3; otherwise it stays None.
    run_eagerly = eager_mode if (is_tf_2_2() or is_tf_2_3()) else None
    enable_tb = not (tf.__version__ == "2.0.2" or is_tf_2_3())

    with SagemakerSimulator(json_file_contents=json_file_contents,
                            enable_tb=enable_tb) as sim:
        if custom_classifier:
            model = CustomClassifierModel([
                tf.keras.layers.Flatten(input_shape=(28, 28)),
                tf.keras.layers.Dense(128, activation="relu"),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(10, activation="softmax"),
            ])
        else:
            model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        # Only script mode passes `callbacks`; keeping the kwargs empty
        # otherwise leaves the non-script `fit`/`evaluate` calls untouched.
        callback_kwargs = {}
        if script_mode:
            hook = smd.KerasHook.create_from_json_file()
            opt = hook.wrap_optimizer(opt)
            callback_kwargs["callbacks"] = [hook]

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=opt,
            metrics=["accuracy"],
            run_eagerly=run_eagerly,
        )
        model.fit(
            x_train,
            y_train,
            batch_size=64,
            epochs=2,
            validation_split=0.2,
            **callback_kwargs,
        )
        model.evaluate(x_test, y_test, verbose=2, **callback_kwargs)

        # In both modes the hook under test is the one smdebug reports.
        hook = smd.get_hook()
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        if not eager_mode and is_tf_2_2():
            assert len(trial.tensor_names(collection="gradients")) > 0
        assert len(trial.tensor_names(collection="weights")) > 0
        assert len(trial.tensor_names(collection="losses")) > 0
        if is_tf_2_2():
            assert len(trial.tensor_names(collection="inputs")) > 0
            assert len(trial.tensor_names(collection="outputs")) > 0
            if "dense_layers" in json_file_contents:
                # Only assert for test_keras_v2_multi_collections
                # which defines this custom collection
                assert len(trial.tensor_names(collection="dense_layers")) > 0
            else:
                assert len(trial.tensor_names(collection="dense_layers")) == 0
hook = smd.KerasHook(trial_dir, save_all=True) return hook def create_model(): input_layer = tf.keras.layers.Input(name="Image_input", shape=(224), dtype="float32") model = tf.keras.layers.Dense(5)(input_layer) model = tf.keras.layers.Activation("softmax", name="output-softmax")(model) model = tf.keras.models.Model(inputs=input_layer, outputs=[model]) return model @pytest.mark.skipif( is_tf_2_2() is False, reason= "Feature to save model inputs and outputs was first added for TF 2.2.0", ) def test_support_dicts(out_dir): model = create_model() optimizer = tf.keras.optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0) model.compile(loss="categorical_crossentropy", optimizer=optimizer) inputs, labels = get_data() smdebug_hook = create_hook(out_dir) model.fit(inputs, labels, batch_size=16,
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}",
                                      default=False):
    """Test what gets saved when training with a custom ``tf.GradientTape`` loop.

    Builds a small Sequential classifier, trains it for one epoch with a
    hand-written loop inside a ``SagemakerSimulator``, then opens a trial on
    the simulator's output directory and asserts on the saved collections.

    Args:
        script_mode: When True, a ``smd.KerasHook`` is created explicitly, the
            tape is wrapped with ``hook.wrap_tape`` and the per-batch accuracy
            is saved through ``hook.save_tensor``. When False, a plain
            persistent ``tf.GradientTape`` is used and the test relies on
            ``smd.get_hook()`` (ZCC support added from smdebug v0.8.0).
        json_file_contents: Hook configuration handed to
            ``SagemakerSimulator``. In script mode, ``"{}"`` selects a hook
            built with ``export_tensorboard=True``; any other value selects
            ``smd.KerasHook.create_from_json_file()``. Outside script mode it
            is searched for the substring ``"dense_layers"``.
        default: Outside script mode, the inputs/outputs/``dense_layers``
            assertions run only when this is exactly ``False`` (and
            ``is_tf_2_2()`` holds), because inputs and outputs are not saved
            with the default collection configuration.

    Raises:
        AssertionError: If the hook is missing (or unexpectedly present on
            unsupported TF versions), nothing was saved, or an expected
            collection does not match.
    """
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # WA for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices((
            tf.cast(x_train[..., tf.newaxis] / 255, tf.float32),
            tf.cast(y_train, tf.int64),
        ))
        dataset = dataset.shuffle(1000).batch(64)

        opt = tf.keras.optimizers.RMSprop()
        # NOTE(review): the model's last layer already applies softmax while
        # the loss is built with from_logits=True; left unchanged so the
        # test's numeric behavior stays the same.
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 1

        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()

        for epoch in range(n_epochs):
            print("Epoch %d/%d" % (epoch + 1, n_epochs))
            for data, labels in dataset:
                one_hot_labels = tf.one_hot(labels, depth=10)
                if script_mode:
                    tape_context = hook.wrap_tape(tf.GradientTape())
                else:
                    tape_context = tf.GradientTape(persistent=True)
                with tape_context as tape:
                    logits = model(data, training=True)  # (batch, 10)
                    loss_value = cce(one_hot_labels, logits)
                grads = tape.gradient(loss_value, model.variables)
                opt.apply_gradients(zip(grads, model.variables))
                # The metric is fed the integer labels, not the one-hot ones.
                acc = train_acc_metric(labels, logits)
                if script_mode:
                    hook.save_tensor(
                        tensor_name="accuracy",
                        tensor_value=acc,
                        collections_to_write="metrics",
                    )
            log = "Epoch %d " % (epoch + 1)
            log += "Accuracy %.4f" % train_acc_metric.result()
            print(log)
            train_acc_metric.reset_states()

        hook = smd.get_hook()
        if not script_mode and not (is_tf_2_2() or is_tf_2_3()):
            assert not hook  # only supported on TF 2.2 and greater
            return
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="losses")) > 0

        # The remaining checks only apply outside script mode.
        if not script_mode and is_tf_2_2() and default is False:
            # Inputs and Outputs are not saved with the default collection configurations.
            assert len(trial.tensor_names(collection="inputs")) > 0
            assert len(trial.tensor_names(collection="outputs")) > 0
            assert trial.tensor_names(collection="outputs") == ["predictions"]
            if "dense_layers" in json_file_contents:
                # Only assert for test_keras_v2_multi_collections
                # which defines this custom collection
                assert len(trial.tensor_names(collection="dense_layers")) > 0
            else:
                assert len(trial.tensor_names(collection="dense_layers")) == 0