def test_lstm_and_generator(out_dir):
    # init hook
    hook = KerasHook(
        out_dir,
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.LOSSES,
            CollectionKeys.GRADIENTS,
        ],
        save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
    )

    # init model
    num_steps = 100
    hidden_size = 100
    vocabulary = 1000
    model = Sequential()
    model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(vocabulary)))
    model.add(Activation("softmax"))
    model.compile(
        loss="categorical_crossentropy",
        optimizer=hook.wrap_optimizer(Adam()),
        metrics=["categorical_accuracy"],
    )

    train(3, 32, model, num_steps, hook)

    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) > 0
    assert len(tr.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
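# The `train` helper invoked above is defined elsewhere in the test module. A minimal
# sketch of what it plausibly looks like, assuming it feeds the model from a Python
# generator (as the test name suggests) and registers the smdebug hook as a Keras
# callback. The generator shapes below are assumptions matching the model's
# embedding/softmax layout, not the repository's actual helper.
def train(num_epochs, batch_size, model, num_steps, hook, vocabulary=1000):
    import numpy as np

    def batch_generator():
        # Random token ids as inputs, one-hot targets per timestep.
        while True:
            x = np.random.randint(0, vocabulary, size=(batch_size, num_steps))
            y = np.eye(vocabulary)[np.random.randint(0, vocabulary, size=(batch_size, num_steps))]
            yield x, y

    # KerasHook doubles as a Keras callback, which is how tensors are captured
    # at the steps configured in SaveConfig.
    model.fit(batch_generator(), epochs=num_epochs, steps_per_epoch=5, callbacks=[hook], verbose=0)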
def test_tf2_profiler_by_time(tf2_profiler_config_parser_by_time, out_dir):
    """
    This test executes a TF2 training script, enables detailed TF profiling by
    time, and verifies the number of events.
    """
    assert tf2_profiler_config_parser_by_time.profiling_enabled

    hook = Hook(out_dir=out_dir)
    helper_keras_fit(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"])
    hook.close()

    # get tensorboard timeline files
    files = []
    for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob(
        f"*{TENSORBOARDTIMELINE_SUFFIX}"
    ):
        files.append(path)

    assert len(files) == 1

    trace_file = str(files[0])
    t_events = TensorboardProfilerEvents()
    t_events.read_events_from_file(trace_file)

    all_trace_events = t_events.get_all_events()
    num_trace_events = len(all_trace_events)

    print(f"Number of events read = {num_trace_events}")

    # The number of events varies slightly between consecutive runs,
    # hence the approximate lower-bound assertion below.
    assert num_trace_events >= 700
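# `helper_keras_fit` is shared by the profiler tests in this section; its definition
# isn't shown here. A hedged sketch, assuming it trains a small tf.keras model on
# random data and runs whichever of the train/eval/predict phases the caller requests,
# with the hook attached as a callback. The exact model and data in the repository
# may differ.
def helper_keras_fit(trial_dir, hook, eager=True, steps=None):
    import numpy as np
    import tensorflow as tf

    # trial_dir is where the hook writes its output; it is not used directly here.
    x = np.random.rand(64, 8).astype(np.float32)
    y = np.random.randint(0, 2, size=(64, 1)).astype(np.float32)
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Dense(16, activation="relu", input_shape=(8,)),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )
    model.compile(optimizer="adam", loss="binary_crossentropy", run_eagerly=eager)

    steps = steps or ["train"]
    if "train" in steps:
        model.fit(x, y, epochs=1, batch_size=16, callbacks=[hook], verbose=0)
    if "eval" in steps:
        model.evaluate(x, y, batch_size=16, callbacks=[hook], verbose=0)
    if "predict" in steps:
        model.predict(x, batch_size=16, callbacks=[hook], verbose=0)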
def helper_tensorflow_tests(use_keras, collection, save_config, with_timestamp):
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_TF_HOOK_TESTS_DIR, run_id)

    if use_keras:
        hook = TF_KerasHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )
        saved_scalars = simple_tf_model(hook, with_timestamp=with_timestamp)
    else:
        hook = TF_SessionHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )
        saved_scalars = tf_session_model(hook, with_timestamp=with_timestamp)
        tf.reset_default_graph()

    hook.close()
    verify_files(trial_dir, save_config, saved_scalars)
    if with_timestamp:
        check_tf_events(trial_dir, saved_scalars)
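# `verify_files` and `check_tf_events` are verification helpers defined elsewhere. A
# plausible sketch of the former, assuming it creates an smdebug trial over the output
# directory and confirms every reported scalar was actually written; the repository's
# version likely also validates the configured save steps and the tensorboard export.
def verify_files(trial_dir, save_config, saved_scalars):
    trial = create_trial(trial_dir)
    for scalar_name in saved_scalars:
        assert scalar_name in trial.tensor_names(), f"{scalar_name} missing from trial"
        # Each saved scalar should have been recorded at one step at minimum.
        assert len(trial.tensor(scalar_name).steps()) > 0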
def test_tf2_profiler_by_time(tf2_profiler_config_parser_by_time, out_dir):
    """
    This test executes a TF2 training script, enables detailed TF profiling by
    time, and verifies the number of events.
    """
    assert tf2_profiler_config_parser_by_time.profiling_enabled

    hook = Hook(out_dir=out_dir)
    helper_keras_fit(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"])
    hook.close()

    verify_detailed_profiling(out_dir, 700)
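# `verify_detailed_profiling` appears to consolidate the inline timeline checks from
# the longer version of this test above. A sketch under that assumption, reusing the
# same TensorboardProfilerEvents reader; whether the helper searches out_dir or the
# profiler config's local_path is an assumption here.
def verify_detailed_profiling(out_dir, expected_event_count):
    files = list(Path(out_dir).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"))
    assert len(files) == 1

    t_events = TensorboardProfilerEvents()
    t_events.read_events_from_file(str(files[0]))

    # Event counts vary slightly between runs, so assert a lower bound only.
    assert len(t_events.get_all_events()) >= expected_event_count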
def main():
    _ = KerasHook(
        out_dir=""
    )  # need this line so that import doesn't get removed by pre-commit

    parser = argparse.ArgumentParser(description="Train resnet50 cifar10")
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--epoch", type=int, default=5)
    parser.add_argument("--data_augmentation", type=bool, default=False)
    parser.add_argument("--model_dir", type=str, default="./model_keras_resnet")
    parser.add_argument("--enable_bottleneck", type=bool, default=True)
    args = parser.parse_args()

    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)
        opt = tf.keras.optimizers.Adam(learning_rate=0.001)
        model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

    # start the training.
    train(args.batch_size, args.epoch, model, args.enable_bottleneck, args.data_augmentation)
def helper_tensorflow_tests(use_keras, collection, save_config):
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_TF_HOOK_TESTS_DIR, run_id)

    if use_keras:
        hook = TF_KerasHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )
        simple_tf_model(hook)
        saved_scalars = [
            "scalar/tf_keras_num_steps",
            "scalar/tf_keras_before_train",
            "scalar/tf_keras_after_train",
        ]
    else:
        hook = TF_SessionHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )
        tf_session_model(hook)
        tf.reset_default_graph()
        saved_scalars = [
            "scalar/tf_session_num_steps",
            "scalar/tf_session_before_train",
            "scalar/tf_session_after_train",
        ]

    hook.close()
    verify_files(trial_dir, save_config, saved_scalars)
def helper_tensorflow_tests(collection, save_config):
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_TF_HOOK_TESTS_DIR, run_id)

    hook = TF_Hook(out_dir=trial_dir, include_collections=[coll_name], export_tensorboard=True)
    coll = hook.get_collection(coll_name)
    coll.save_config = save_config

    # Derive the steps at which tensors are expected: either the explicit
    # save_steps, or every save_interval-th step within the first 10 steps.
    save_steps = save_config.get_save_config(ModeKeys.TRAIN).save_steps
    if not save_steps:
        save_interval = save_config.get_save_config(ModeKeys.TRAIN).save_interval
        save_steps = [i for i in range(0, 10, save_interval)]

    simple_tf_model(hook)
    hook.close()

    saved_scalars = ["loss"]
    check_trials(trial_dir, save_steps, coll_name, saved_scalars)
    check_metrics_file(saved_scalars)
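# To make the derivation above concrete: when the save config only sets an interval,
# the helper expects tensors at every save_interval-th step within the first 10 steps.
# A minimal illustration; the import paths are assumed from smdebug's layout and may
# differ by version.
from smdebug.core.modes import ModeKeys
from smdebug.core.save_config import SaveConfig

save_config = SaveConfig(save_interval=3)
mode_config = save_config.get_save_config(ModeKeys.TRAIN)
assert mode_config.save_steps is None  # no explicit steps were configured
assert [i for i in range(0, 10, mode_config.save_interval)] == [0, 3, 6, 9]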
def test_native_tf2_profiling(
    monkeypatch,
    python_profiler_name,
    model_type,
    use_mirrored_strategy,
    get_model,
    native_tf2_cprofile_profiler_config_parser,
    native_tf2_pyinstrument_profiler_config_parser,
    out_dir,
    mnist_dataset,
    tf_eager_mode,
):
    """
    Enable all types of profiling and validate the output artifacts. Parametrizes on the type
    of Python profiler used for Python profiling as well as the model used for training.

    We cannot test dataloader profiling in pytest, because the resource config needs to be
    configured at /opt/ml/input/config/resourceconfig.json before tensorflow is even imported.
    """
    if python_profiler_name == CPROFILE_NAME:
        profiler_config_parser = native_tf2_cprofile_profiler_config_parser
    else:
        profiler_config_parser = native_tf2_pyinstrument_profiler_config_parser

    assert profiler_config_parser.profiling_enabled
    profiler_config_parser.load_config()

    hook = Hook(out_dir=out_dir, save_all=True)
    # Known issue where logging in a python callback function (i.e. atexit) during pytest
    # causes logging errors. See https://github.com/pytest-dev/pytest/issues/5502 for more
    # information.
    hook.profiler_config_parser = profiler_config_parser
    hook.logger.disabled = True

    if use_mirrored_strategy:
        strategy = tf.distribute.MirroredStrategy()
        num_devices = strategy.num_replicas_in_sync
        with strategy.scope():
            model = get_model(model_type)
            optimizer = tf.optimizers.Adam()
        train_step_func = _distributed_train_step
    else:
        strategy = None
        num_devices = 1
        model = get_model(model_type)
        optimizer = tf.optimizers.Adam()
        train_step_func = _train_step

    optimizer = hook.wrap_optimizer(optimizer)

    _training_loop(
        hook, profiler_config_parser, model, optimizer, mnist_dataset, train_step_func, strategy
    )

    # Sanity check debugger output
    _verify_tensor_names(out_dir)

    # Validate all timeline files
    _verify_timeline_files(out_dir)

    # Validate detailed profiling
    expected_event_count = 90 if use_mirrored_strategy else 230
    verify_detailed_profiling(out_dir, expected_event_count)

    # The expected number of stats directories is ((num_steps * 2) + 2) * num_devices. This
    # includes profiling for both phases of each step, plus pre-step-zero python profiling
    # and post-hook-close python profiling.
    expected_stats_dir_count = (
        (profiler_config_parser.config.python_profiling_config.num_steps * 2) + 2
    ) * num_devices
    python_stats_dir = os.path.join(out_dir, "framework", "tensorflow", python_profiler_name)
    validate_python_profiling_stats(
        python_stats_dir, python_profiler_name, expected_stats_dir_count
    )
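# The per-batch step functions selected above are defined elsewhere in the module. A
# hedged sketch of what they plausibly look like for the native TF2 (non-Keras-fit)
# path: a GradientTape forward/backward pass, with the distributed variant dispatching
# through the strategy. The signatures and the loss function are assumptions here.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

@tf.function
def _train_step(model, optimizer, x, y):
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss = loss_fn(y, logits)
    # Because the optimizer was wrapped by the hook, this apply_gradients call is
    # where smdebug can observe gradients and optimizer variables.
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

@tf.function
def _distributed_train_step(strategy, model, optimizer, x, y):
    # Run the single-replica step on each device and sum the per-replica losses.
    per_replica_loss = strategy.run(_train_step, args=(model, optimizer, x, y))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)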