def test_lstm_and_generator(out_dir):
    # init hook
    hook = KerasHook(
        out_dir,
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.LOSSES,
            CollectionKeys.GRADIENTS,
        ],
        save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
    )

    # init model
    num_steps = 100
    hidden_size = 100
    vocabulary = 1000
    model = Sequential()
    model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(vocabulary)))
    model.add(Activation("softmax"))
    model.compile(
        loss="categorical_crossentropy",
        optimizer=hook.wrap_optimizer(Adam()),
        metrics=["categorical_accuracy"],
    )

    train(3, 32, model, num_steps, hook)

    # verify that losses and weights were saved on the configured steps
    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) > 0
    assert len(tr.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
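
# The `train` helper invoked above is defined elsewhere in the test suite; a minimal sketch of
# what it is assumed to do (fit on random one-hot batches, passing the smdebug hook as a Keras
# callback) is shown below. The name `_example_train_sketch` and the random-data generation are
# assumptions for illustration, not the repository's implementation.
def _example_train_sketch(num_epochs, batch_size, model, num_steps, hook, vocabulary=1000):
    import numpy as np

    # Random integer token inputs and random one-hot targets matching the model's
    # (batch, num_steps, vocabulary) softmax output shape.
    x = np.random.randint(0, vocabulary, size=(batch_size * 4, num_steps))
    y = np.eye(vocabulary)[np.random.randint(0, vocabulary, size=(batch_size * 4, num_steps))]

    # Passing the hook as a callback lets it save tensors on the steps set in SaveConfig.
    model.fit(x, y, batch_size=batch_size, epochs=num_epochs, callbacks=[hook], verbose=0)
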
def test_native_tf2_profiling(
    monkeypatch,
    python_profiler_name,
    model_type,
    use_mirrored_strategy,
    get_model,
    native_tf2_cprofile_profiler_config_parser,
    native_tf2_pyinstrument_profiler_config_parser,
    out_dir,
    mnist_dataset,
    tf_eager_mode,
):
    """
    Enable all types of profiling and validate the output artifacts.

    Parametrized on the type of Python profiler used for Python profiling as well as the model
    used for training.

    We cannot test dataloader profiling in pytest, because the resource config needs to be
    configured at /opt/ml/input/config/resourceconfig.json before tensorflow is even imported.
    """
    if python_profiler_name == CPROFILE_NAME:
        profiler_config_parser = native_tf2_cprofile_profiler_config_parser
    else:
        profiler_config_parser = native_tf2_pyinstrument_profiler_config_parser
    assert profiler_config_parser.profiling_enabled
    profiler_config_parser.load_config()

    hook = Hook(out_dir=out_dir, save_all=True)
    hook.profiler_config_parser = profiler_config_parser

    # Known issue where logging in a python callback function (i.e. atexit) during pytest causes
    # logging errors. See https://github.com/pytest-dev/pytest/issues/5502 for more information.
    hook.logger.disabled = True

    if use_mirrored_strategy:
        strategy = tf.distribute.MirroredStrategy()
        num_devices = strategy.num_replicas_in_sync
        with strategy.scope():
            model = get_model(model_type)
            optimizer = tf.optimizers.Adam()
        train_step_func = _distributed_train_step
    else:
        strategy = None
        num_devices = 1
        model = get_model(model_type)
        optimizer = tf.optimizers.Adam()
        train_step_func = _train_step

    optimizer = hook.wrap_optimizer(optimizer)

    _training_loop(hook, profiler_config_parser, model, optimizer, mnist_dataset, train_step_func, strategy)

    # Sanity check debugger output
    _verify_tensor_names(out_dir)

    # Validate all timeline files
    _verify_timeline_files(out_dir)

    # Validate detailed profiling
    expected_event_count = 90 if use_mirrored_strategy else 230
    verify_detailed_profiling(out_dir, expected_event_count)

    # The expected number of stats directories during training is ((num_steps * 2) + 2) * num_devices.
    # This includes profiling for both phases of each step, plus pre-step-zero python profiling and
    # post-hook-close python profiling.
    expected_stats_dir_count = (
        (profiler_config_parser.config.python_profiling_config.num_steps * 2) + 2
    ) * num_devices
    python_stats_dir = os.path.join(out_dir, "framework", "tensorflow", python_profiler_name)
    validate_python_profiling_stats(python_stats_dir, python_profiler_name, expected_stats_dir_count)
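
# The training helpers referenced above (_train_step, _distributed_train_step, _training_loop)
# are defined elsewhere in this test module. A minimal sketch of assumed single-device variants
# follows; the `_example_*` names, the loss choice, and the step bookkeeping are assumptions for
# illustration, not the repository's implementation.
def _example_train_step_sketch(model, optimizer, x, y):
    # One eager-mode gradient step with a sparse cross-entropy loss on logits.
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(y, logits, from_logits=True)
        )
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss


def _example_training_loop_sketch(hook, model, optimizer, dataset, train_step_func, num_steps=5):
    # Iterate a few MNIST batches and run one optimizer step per batch. The real _training_loop
    # also coordinates with the profiler config parser so the configured python-profiling steps
    # are exercised; that bookkeeping is omitted here.
    for step, (x, y) in enumerate(dataset.take(num_steps)):
        train_step_func(model, optimizer, x, y)
    # Closing the hook flushes timeline files and triggers the post-hook-close python profiling
    # that the assertions above count.
    hook.close()
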