def create_hook():
    # With the following SaveConfig, we will save tensors every 100 steps.
    save_config = SaveConfig(save_interval=100)
    # Create a hook that logs weights, biases and gradients while training the model.
    hook = Hook(save_config=save_config, save_all=True)
    return hook
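# A minimal usage sketch for create_hook() above, assuming the MXNet/Gluon
# flavor of the smdebug Hook (the snippet does not say which framework it
# targets). `net` is a hypothetical Gluon model; register_block is the Gluon
# registration call, while e.g. the PyTorch hook uses register_module instead.
def example_use_create_hook(net):
    hook = create_hook()
    hook.register_block(net)  # tensors are saved once training starts
    return hook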
def test_hook_save_every_step(tmpdir):
    save_config = SaveConfig(save_interval=1)
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    assert trial.steps() == list(range(10))
def test_lstm_and_generator(out_dir):
    # init hook
    hook = KerasHook(
        out_dir,
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.LOSSES,
            CollectionKeys.GRADIENTS,
        ],
        save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
    )

    # init model
    num_steps = 100
    hidden_size = 100
    vocabulary = 1000
    model = Sequential()
    model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(vocabulary)))
    model.add(Activation("softmax"))
    model.compile(
        loss="categorical_crossentropy",
        optimizer=hook.wrap_optimizer(Adam()),
        metrics=["categorical_accuracy"],
    )
    train(3, 32, model, num_steps, hook)

    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) > 0
    assert len(tr.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
def test_loss_collection_with_no_other_collections():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config, include_collections=[])
    assert has_training_ended(out_dir) is False
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, register_to_loss_block=True
    )
    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4
    print(tr.tensor_names())
    tname = tr.tensor_names(regex=".*loss")[0]
    loss_tensor = tr.tensor(tname)
    loss_val = loss_tensor.value(step_num=1)
    assert len(loss_val) > 0
    shutil.rmtree(out_dir)
def helper_test_modes(hook=None, out_dir="/tmp/test_output/test_hook_modes/"):
    prefix = str(uuid.uuid4())
    device = torch.device("cpu")
    save_steps = list(range(5))
    model = Net(to_save=save_steps).to(device)
    # Track whether this helper created the hook itself, so cleanup only
    # removes directories it owns.
    hook_created = hook is None
    if hook_created:
        out_dir = str(Path(out_dir, prefix))
        hook = Hook(
            out_dir=out_dir,
            save_config=SaveConfig({modes.TRAIN: SaveConfigMode(save_steps=save_steps)}),
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.BIASES,
                CollectionKeys.GRADIENTS,
                CollectionKeys.DEFAULT,
                CollectionKeys.LOSSES,
            ],
        )
    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    hook.set_mode(mode=modes.TRAIN)
    train(model, device, optimizer, num_steps=10, save_steps=save_steps)

    trial = create_trial(path=out_dir, name="test output")
    assert len(trial.modes()) == 1
    assert len(trial.steps()) == 5
    assert len(trial.steps(mode=modes.TRAIN)) == 5
    assert len(trial.steps(mode=modes.EVAL)) == 0

    if hook_created:
        shutil.rmtree(out_dir)
def test_hook_all_zero(hook=None, out_dir=None):
    hook_created = False
    if hook is None:
        hook_created = True
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/newlogsRunTest/" + run_id
        print("Registering the hook with out_dir {0}".format(out_dir))
        shutil.rmtree(out_dir, ignore_errors=True)
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=["ReluActivation", "weights", "biases", "gradients"],
        )
        hook.get_collection("ReluActivation").include(["relu*", "input_*"])
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, make_input_zero=True
    )
    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    tnames = tr.tensor_names(regex="conv._input")
    tname = tnames[0]
    conv_tensor_value = tr.tensor(tname).value(step_num=0)
    is_zero = np.all(conv_tensor_value == 0)
    assert is_zero
    if hook_created:
        shutil.rmtree(out_dir)
def test_hook_save_config_collections(tmpdir):
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, include_collections=["metrics", "feature_importance"])
    hook.get_collection("metrics").save_config = SaveConfig(save_interval=2)
    hook.get_collection("feature_importance").save_config = SaveConfig(save_interval=3)
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    metric_steps = trial.tensor("train-rmse").steps()
    assert all(step % 2 == 0 for step in metric_steps[:-1])
    fimps = [t for t in trial.tensor_names() if t.startswith("feature_importance/")]
    fimp_steps = trial.tensor(fimps[0]).steps()
    assert all(step % 3 == 0 for step in fimp_steps[:-1])
def create_hook(output_dir, module=None, hook_type="saveall", save_steps=None):
    if hook_type == "saveall":
        # Create a hook that logs weights, biases, gradients and inputs/outputs of the model.
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=save_steps),
            save_all=True,
        )
    elif hook_type == "module-input-output":
        # The names of a module's input and output tensors follow this format:
        #   Inputs: <module_name>_input_<input_index>
        #   Output: <module_name>_output
        # To log the inputs and output of a module, we create a collection as follows:
        assert module is not None
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=save_steps),
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.GRADIENTS,
                CollectionKeys.BIASES,
                "l_mod",
            ],
        )
        hook.get_collection("l_mod").add_module_tensors(module, inputs=True, outputs=True)
    elif hook_type == "weights-bias-gradients":
        save_config = SaveConfig(save_steps=save_steps)
        # Create a hook that logs ONLY weights, biases, and gradients.
        hook = Hook(
            out_dir=output_dir,
            save_config=save_config,
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.BIASES,
                CollectionKeys.GRADIENTS,
                CollectionKeys.DEFAULT,
                CollectionKeys.LOSSES,
            ],
        )
    return hook
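# A hedged example of calling the create_hook() helper above in its
# "module-input-output" mode. `model` and its `fc1` submodule are hypothetical;
# register_module is the PyTorch registration call used elsewhere in these tests.
def example_module_io_hook(model, output_dir):
    hook = create_hook(
        output_dir,
        module=model.fc1,
        hook_type="module-input-output",
        save_steps=[0, 1, 2],
    )
    hook.register_module(model)  # also capture weights/gradients of the full model
    return hook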
def test_hook():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/newlogsRunTest/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) is False
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, register_to_loss_block=True
    )
    shutil.rmtree(out_dir)
def create_smdebug_hook(
    out_dir, train_data=None, validation_data=None, frequency=1, collections=None
):
    save_config = SaveConfig(save_interval=frequency)
    hook = Hook(
        out_dir=out_dir,
        train_data=train_data,
        validation_data=validation_data,
        save_config=save_config,
        include_collections=collections,
    )
    return hook
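# A minimal sketch of how the hook returned by create_smdebug_hook() can be
# attached to training: smdebug's XGBoost hook is designed to be passed as an
# xgboost callback. `params` and `dtrain` are hypothetical caller-supplied inputs.
def example_train_with_hook(params, dtrain, out_dir):
    import xgboost

    hook = create_smdebug_hook(
        out_dir,
        train_data=dtrain,
        frequency=5,
        collections=["metrics", "feature_importance"],
    )
    return xgboost.train(params, dtrain, num_boost_round=10, callbacks=[hook])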
def create_hook_from_json_config(hook_cls, json_config_path, default_values=None):
    """Returns a SessionHook object corresponding to either TF, PT, or MXNet.

    If json_config_path is None, an environment variable must be set.
    Here we compare HookParameters with CollectionConfiguration and set all the defaults.
    """
    params_dict = get_json_config_as_dict(json_config_path=json_config_path)
    hook_params = collect_hook_config_params(params_dict)

    out_dir = hook_params.get("out_dir")
    dry_run = hook_params.get("dry_run", False)
    reduction_config = hook_params.get(CONFIG_RDN_CFG_KEY, None)
    save_config = SaveConfig.from_dict(hook_params.get("save_config_modes"), default_values)
    include_regex = hook_params.get(CONFIG_INCLUDE_REGEX_KEY)
    include_collections = get_include_collections(params_dict)
    save_all = hook_params.get(CONFIG_SAVE_ALL_KEY, False)
    include_workers = hook_params.get(CONFIG_INCLUDE_WORKERS_KEY, "one")

    # If this is a SageMaker job, emit TensorBoard artifacts only if the JSON file exists.
    if is_sagemaker_job():
        tensorboard_dir = get_tensorboard_dir_from_json_config()
        export_tensorboard = bool(tensorboard_dir is not None)
    # Otherwise, place TensorBoard artifacts in out_dir.
    else:
        tensorboard_dir = hook_params[TENSORBOARD_DIR_KEY]
        export_tensorboard = hook_params[EXPORT_TENSORBOARD_KEY]

    hook = hook_cls(
        out_dir=out_dir,
        export_tensorboard=export_tensorboard,
        tensorboard_dir=tensorboard_dir,
        dry_run=dry_run,
        reduction_config=reduction_config,
        save_config=save_config,
        include_regex=include_regex,
        include_collections=include_collections,
        include_workers=include_workers,
        save_all=save_all,
    )
    add_collections_to_manager(hook.collection_manager, params_dict, hook_params)
    return hook
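# For reference, a plausible shape of the JSON config that
# create_hook_from_json_config() parses via get_json_config_as_dict(). The keys
# below mirror SageMaker Debugger's hook config, but the exact schema and the
# values are illustrative assumptions, not a verified contract.
EXAMPLE_JSON_CONFIG = {
    "LocalPath": "/opt/ml/output/tensors",  # resolved into out_dir
    "HookParameters": {"save_all": "false", "save_interval": "100"},
    "CollectionConfigurations": [
        {"CollectionName": "weights", "CollectionParameters": {"save_interval": "10"}}
    ],
}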
def test_hook():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/newlogsRunTest/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config)
    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
        save_custom_tensor=True,
    )
    trial = create_trial(out_dir)
    custom_tensors = trial.tensor_names(collection=CollectionKeys.DEFAULT)
    all_tensors = trial.tensor_names()
    assert len(custom_tensors) == 2
    assert len(all_tensors) == 4
    shutil.rmtree(out_dir)
def test_hook_save_all(tmpdir):
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config, save_all=True)
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    collections = trial.collections()
    tensors = trial.tensor_names()
    assert len(tensors) > 0
    assert len(trial.steps()) == 4
    assert "all" in collections
    assert "metrics" in collections
    assert "feature_importance" in collections
    assert "train-rmse" in tensors
    assert any(t.startswith("feature_importance/") for t in tensors)
    assert any(t.startswith("trees/") for t in tensors)
    assert len(collections["all"].tensor_names) == len(tensors)
def test_hook(tmpdir):
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook)
def test_spot_hook():
    os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR] = (
        "./tests/mxnet/test_json_configs/checkpointconfig.json"
    )
    checkpoint_path = "/tmp/savedParams"
    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)
    save_config = SaveConfig(save_steps=[10, 11, 12, 13, 14, 40, 50, 60, 70, 80])

    """
    Run the training for 2 epochs and save the parameters after every epoch.
    We expect that steps 0 to 14 will be written.
    """
    run_id_1 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_1 = "/tmp/newlogsRunTest/" + run_id_1
    hook = t_hook(
        out_dir=out_dir_1,
        save_config=save_config,
        include_collections=["weights", "gradients"],
    )
    assert has_training_ended(out_dir_1) is False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=2,
        save_interval=1,
        save_path=checkpoint_path,
    )

    """
    Run the training again for 4 epochs and save the parameters after every epoch.
    We do NOT expect steps 0 to 14 to be written again; we expect to read
    steps 40, 50, 60, 70 and 80.
    """
    run_id_2 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_2 = "/tmp/newlogsRunTest/" + run_id_2
    hook = t_hook(
        out_dir=out_dir_2,
        save_config=save_config,
        include_collections=["weights", "gradients"],
    )
    assert has_training_ended(out_dir_2) is False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=4,
        save_interval=1,
        save_path=checkpoint_path,
    )
    # Unset the environment variable before validation so that it won't affect
    # the other scripts in the pytest environment.
    del os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR]

    # Validation
    print("Created the trial with out_dir {0} for the first training".format(out_dir_1))
    tr = create_trial(out_dir_1)
    assert tr
    available_steps_1 = tr.steps()
    assert 40 not in available_steps_1
    assert 80 not in available_steps_1
    print(available_steps_1)

    print("Created the trial with out_dir {0} for the second training".format(out_dir_2))
    tr = create_trial(out_dir_2)
    assert tr
    available_steps_2 = tr.steps()
    assert 40 in available_steps_2
    assert 50 in available_steps_2
    assert 60 in available_steps_2
    assert 70 in available_steps_2
    assert 80 in available_steps_2
    assert 0 not in available_steps_2
    assert 10 not in available_steps_2
    assert 11 not in available_steps_2
    assert 12 not in available_steps_2
    print(available_steps_2)

    print("Cleaning up.")
    shutil.rmtree(os.path.dirname(out_dir_1))
    shutil.rmtree(checkpoint_path, ignore_errors=True)