def helper_test_weights_bias_gradients(hook=None):
    prefix = str(uuid.uuid4())
    hook_type = "weights-bias-gradients"
    device = torch.device("cpu")
    save_steps = [i * 20 for i in range(5)]
    model = Net(mode=hook_type, to_save=save_steps).to(device)
    json = hook is not None
    if not json:
        hook = create_hook(
            "/tmp/test_output/test_hook_save_weightsbiasgradients/" + prefix,
            model,
            hook_type,
            save_steps=save_steps,
        )
    hook.register_hook(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, device, optimizer, num_steps=101, save_steps=save_steps)
    if not json:
        trial = create_trial(
            path="/tmp/test_output/test_hook_save_weightsbiasgradients/" + prefix,
            name="test output",
        )
    else:
        trial = create_trial(
            path="/tmp/test_output/test_hook_save_weightsbiasgradients/jsonloading",
            name="test output",
        )
    grads = [
        "gradient/Net_fc1.weight",
        "gradient/Net_fc2.weight",
        "gradient/Net_fc3.weight",
        "gradient/Net_fc1.bias",
        "gradient/Net_fc2.bias",
        "gradient/Net_fc3.bias",
    ]
    weights = ["Net_fc1.weight", "Net_fc2.weight", "Net_fc3.weight"]
    bias = ["Net_fc1.bias", "Net_fc2.bias", "Net_fc3.bias"]
    tensors = grads + bias + weights
    assert len(trial.steps()) == len(save_steps)
    for step in trial.steps():
        for tname in tensors:
            assert tname in trial.tensor_names()
            assert step in trial.tensor(tname).steps()
            saved_tensor = trial.tensor(tname).value(step)
            in_memory = model.saved[tname][step]
            assert np.allclose(in_memory, saved_tensor)
    if not json:
        addendum = prefix
    else:
        addendum = "jsonloading"
    hook._cleanup()
    delete_local_trials(
        ["/tmp/test_output/test_hook_save_weightsbiasgradients/" + addendum]
    )
def test_whitespace_handling_in_path_str():
    _id = str(uuid.uuid4())
    path = os.path.join("ts_output/train/", _id)
    dummy_trial_creator(trial_dir=path, num_workers=1, job_ended=True)
    # Test leading whitespace handling
    create_trial(" " + path)
    # Test trailing whitespace handling
    create_trial(path + " ")
def test_load_collection_files_from_completed_job_with_missing_files():
    """
    Number of collection files present: 1446
    Training_has_ended.ts: present

    Some of the collection files have been removed from the test dataset.
    The expected number of collection files is 2001, but since the
    training_has_ended file is present we stop waiting and raise
    MissingCollectionFiles instead.
    """
    path = "s3://smdebug-testing/resources/collection-tests/collection-files-missing/"
    with pytest.raises(MissingCollectionFiles):
        create_trial(path)
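# A minimal sketch of how a caller might guard against the behavior tested
# above: when the end-of-job marker exists but collection files are missing,
# create_trial raises instead of waiting forever. `load_trial_or_none` is a
# hypothetical helper, not part of smdebug.
def load_trial_or_none(path):
    from smdebug.exceptions import MissingCollectionFiles
    from smdebug.trials import create_trial

    try:
        return create_trial(path)
    except MissingCollectionFiles:
        # The job ended, but some collection files never arrived.
        return None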
def helper_test_modes(hook=None, out_dir="/tmp/test_output/test_hook_modes/"):
    prefix = str(uuid.uuid4())
    device = torch.device("cpu")
    save_steps = [i for i in range(5)]
    model = Net(to_save=save_steps).to(device)
    json = hook is not None
    if not json:
        out_dir = str(Path(out_dir, prefix))
        hook = Hook(
            out_dir=out_dir,
            save_config=SaveConfig({modes.TRAIN: SaveConfigMode(save_steps=save_steps)}),
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.BIASES,
                CollectionKeys.GRADIENTS,
                CollectionKeys.DEFAULT,
                CollectionKeys.LOSSES,
            ],
        )
    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    hook.set_mode(mode=modes.TRAIN)
    train(model, device, optimizer, num_steps=10, save_steps=save_steps)
    trial = create_trial(path=out_dir, name="test output")
    assert len(trial.modes()) == 1
    assert len(trial.steps()) == 5
    assert len(trial.steps(mode=modes.TRAIN)) == 5
    assert len(trial.steps(mode=modes.EVAL)) == 0
    # `hook` is reassigned above, so test the flag captured before reassignment:
    # only clean up the output directory when this helper created the hook itself.
    if not json:
        shutil.rmtree(out_dir)
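# Hedged sketch of the per-mode save configuration exercised by
# helper_test_modes: a dict maps each mode to its own SaveConfigMode, and
# steps are recorded against whichever mode the hook is currently in.
# Assumes Hook/SaveConfig/SaveConfigMode/modes are the same names imported
# by this module.
def example_mode_scoped_hook(out_dir):
    hook = Hook(
        out_dir=out_dir,
        save_config=SaveConfig(
            {
                modes.TRAIN: SaveConfigMode(save_steps=[0, 1, 2, 3, 4]),
                modes.EVAL: SaveConfigMode(save_interval=10),
            }
        ),
    )
    hook.set_mode(mode=modes.TRAIN)  # subsequent steps count as TRAIN mode_steps
    return hook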
def test_hook_all_zero(hook=None, out_dir=None):
    hook_created = False
    if hook is None:
        hook_created = True
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/newlogsRunTest/" + run_id
        print("Registering the hook with out_dir {0}".format(out_dir))
        shutil.rmtree(out_dir, ignore_errors=True)
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=["ReluActivation", "weights", "biases", "gradients"],
        )
        hook.get_collection("ReluActivation").include(["relu*", "input_*"])
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, make_input_zero=True
    )
    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4
    tnames = tr.tensor_names(regex="conv._input")
    tname = tnames[0]
    conv_tensor_value = tr.tensor(tname).value(step_num=0)
    is_zero = np.all(conv_tensor_value == 0)
    assert is_zero
    if hook_created:
        shutil.rmtree(out_dir)
def helper_test_reductions(trial_dir, hook, save_raw_tensor):
    simple_model(hook)
    _, files = get_dirs_files(trial_dir)
    from smdebug.trials import create_trial

    tr = create_trial(trial_dir)
    assert len(tr.tensor_names()) == 3, tr.tensor_names()
    for step in tr.steps():
        assert len(tr.tensor_names(step=step)) == 3, tr.tensor_names()
    for tname in tr.tensor_names():
        t = tr.tensor(tname)
        if tname in tr.tensor_names(collection="losses"):
            # no reductions are saved for losses; the raw value is available
            assert t.value(0) is not None
        else:
            if save_raw_tensor:
                assert t.value(0) is not None
            else:
                # the raw value was not saved, so reading it must raise
                try:
                    print(t.value(0))
                    assert False, tname
                except TensorUnavailableForStep:
                    pass
            assert len(t.reduction_values(0)) == 18
            for r in ALLOWED_REDUCTIONS + ALLOWED_NORMS:
                for b in [False, True]:
                    assert t.reduction_value(0, reduction_name=r, abs=b, worker=None) is not None
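# Hedged sketch of reading reductions when raw tensors were not saved, per the
# assertions above: the count of 18 follows from the loop, i.e. every entry in
# ALLOWED_REDUCTIONS + ALLOWED_NORMS crossed with abs in {False, True}. The
# reduction names below are illustrative members of those constants.
def example_read_reductions(tr, tname):
    t = tr.tensor(tname)
    mean = t.reduction_value(0, reduction_name="mean", abs=False, worker=None)
    mean_abs = t.reduction_value(0, reduction_name="mean", abs=True, worker=None)
    l2 = t.reduction_value(0, reduction_name="l2", abs=False, worker=None)
    return mean, mean_abs, l2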
def test_no_failure_with_torch_mp(out_dir):
    shutil.rmtree(out_dir, ignore_errors=True)
    path = build_json(out_dir, save_all=True, save_interval="1")
    path = str(path)
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = path
    device = "cpu"
    dataloader_kwargs = {}
    cpu_count = 2 if mp.cpu_count() > 2 else mp.cpu_count()
    torch.manual_seed(1)
    model = Net().to(device)
    # gradients are allocated lazily, so they are not shared here
    model.share_memory()
    processes = []
    # Train the model across `cpu_count` processes
    for rank in range(cpu_count):
        p = mp.Process(target=train, args=(rank, model, device, dataloader_kwargs))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    trial = create_trial(out_dir)
    assert trial.num_workers == 1  # Ensure only one worker saved data
    assert len(trial.tensor_names()) > 20  # Ensure that data was saved
    assert trial.steps() == [0, 1, 2, 3]  # Ensure that steps were saved
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)
def test_hook_from_json_config_for_losses(tmpdir, monkeypatch, params):
    out_dir = tmpdir.join("test_hook_from_json_config_for_losses")
    config_file = tmpdir.join("config.json")
    config_file.write(get_json_config_for_losses(str(out_dir)))
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(config_file))
    hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook, params=params)
    trial = create_trial(str(out_dir))
    eval_metric = params["eval_metric"]
    test_metric = f"test-{eval_metric}"
    train_metric = f"train-{eval_metric}"
    if eval_metric == "rmse":
        # rmse is a loss, so it appears in both the metrics and losses collections
        assert train_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert train_metric in trial.tensor_names(collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert test_metric in trial.tensor_names(collection=CollectionKeys.LOSSES)
    if eval_metric == "auc" or eval_metric == "map":
        # auc and map are metrics but not losses
        assert train_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert train_metric not in trial.tensor_names(collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(collection=CollectionKeys.METRICS)
        assert test_metric not in trial.tensor_names(collection=CollectionKeys.LOSSES)
def help_test_refresh_with_range(path):
    trial_name = str(uuid.uuid4())
    num_steps = 8
    num_tensors = 10
    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
        )
    tr = create_trial(path + trial_name, range_steps=(0, 5))
    assert len(tr.steps()) == 5
    for i in range(num_steps, num_steps * 2):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            export_colls=False,
        )
    # steps written after creation fall outside the range, so the count is unchanged
    assert len(tr.steps()) == 5
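# Hedged usage sketch: range_steps appears to bound a trial to a half-open
# window of global steps (steps 0-7 exist above, but range_steps=(0, 5) yields
# exactly 5), so refreshes never pull in steps outside it. `path` is a
# hypothetical trial directory.
def example_bounded_trial(path):
    bounded = create_trial(path, range_steps=(0, 5))
    assert all(0 <= s < 5 for s in bounded.steps())
    return bounded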
def test_loss_collection_with_no_other_collections():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config, include_collections=[])
    assert has_training_ended(out_dir) is False
    run_mnist_gluon_model(
        hook=hook, num_steps_train=10, num_steps_eval=10, register_to_loss_block=True
    )
    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4
    print(tr.tensor_names())
    tname = tr.tensor_names(regex=".*loss")[0]
    loss_tensor = tr.tensor(tname)
    loss_val = loss_tensor.value(step_num=1)
    assert len(loss_val) > 0
    shutil.rmtree(out_dir)
def test_data_parallel():
    shutil.rmtree(out_dir, ignore_errors=True)
    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Net().to(device)
    if device == "cuda":
        model = DataParallel(model)
    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)
    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    if device == "cpu":
        assert len(trial.tensor_names()) == 38
    else:
        assert len(trial.tensor_names()) > 37
    shutil.rmtree(out_dir, ignore_errors=True)
def test_new_graph(out_dir):
    # tests that we can correctly interpret an explicitly created graph
    g1 = tf.get_default_graph()
    g = tf.Graph()
    with g.as_default():
        assert g != g1
        assert g == tf.get_default_graph()
        hook = smd.SessionHook(
            out_dir,
            include_collections=["weights", "losses", "scalars"],
            save_config=smd.SaveConfig(save_steps=[0, 1, 2, 3]),
        )
        with tf.name_scope("foobar"):
            x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1")
        with tf.name_scope("foobaz"):
            w0 = [[1], [1.0]]
            y = tf.matmul(x, w0)
        loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
        hook.get_collection("losses").add(loss)
        global_step = tf.Variable(17, name="global_step", trainable=False)
        increment_global_step_op = tf.assign(global_step, global_step + 1)
        optimizer = tf.train.AdamOptimizer(0.1)
        optimizer = hook.wrap_optimizer(optimizer)
        optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op)
        sess = tf.train.MonitoredSession(hooks=[hook])
        for i in range(5):
            x_ = np.random.random((10, 2)) * 0.1
            sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})
        sess.close()
        tr = create_trial(out_dir)
        assert len(tr.tensor_names())
def _run_net_distributed(out_dir, include_workers="one", test_timeline=False):
    """Runs a single linear layer on 2 processes."""
    # torch.distributed is empty on Mac on Torch <= 1.2
    if not hasattr(dist, "is_initialized"):
        return
    multiprocessing.set_start_method("spawn", force=True)
    size = 2
    processes = []
    for rank in range(size):
        p = Process(
            target=init_processes,
            args=(out_dir, rank, size, include_workers, test_timeline, run),
        )
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    # WARNING: assert statements do not cause test failure inside subprocesses
    # https://stackoverflow.com/questions/13400546/py-test-how-to-automatically-detect-an-exception-in-a-child-process
    assert all(
        [not p.exitcode for p in processes]
    ), f"Some processes failed. processes={processes}"
    trial = create_trial(path=out_dir)
    return trial
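# Hedged sketch of the pattern the WARNING above relies on: since asserts in
# child processes cannot fail the test directly, the worker should exit
# non-zero on any error so the parent's exitcode check catches it.
# `_worker_entry` is a hypothetical wrapper around the real worker entry point.
def _worker_entry(*args):
    import sys

    try:
        init_processes(*args)
    except Exception as exc:
        print(f"worker failed: {exc}", file=sys.stderr)
        sys.exit(1)  # surfaces as p.exitcode != 0 in the parent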
def basic_test(out_dir, mode):
    path = build_json(
        out_dir, include_workers="one", include_collections=["weights", "gradients"]
    )
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_MNIST_ARGS) + [
        "--model_dir",
        os.path.join(out_dir, "checkpoint"),
    ]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    print(tr.tensor_names())
    assert len(tr.workers()) == 1
    assert len(tr.tensor_names()) == 13
    assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == 1
def test_three_writers_not_all_steps_written_but_later_step_written_complete_job():
    """Test Scenario Description

    workers: [a, b, c]
    steps: {
        1: [a, b, c],
        2: [a, b, c],
        3: [a, c],
        4: [a, c],
        5: [a, c],
        6: [a, b, c],
    }
    END_OF_JOB.ts --> Present
    """
    path = "s3://smdebug-testing/resources/has_step_scenarios/three-writers-not-all-steps-written-but-later-step-written-complete-job"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 3
    assert trial.loaded_all_steps is True
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 3, 4, 5, 6]
    assert completed_steps == all_steps
    assert trial.has_passed_step(2) == StepState.AVAILABLE
    assert trial.last_complete_step == 6
    assert trial.has_passed_step(4) == StepState.AVAILABLE
    assert trial.has_passed_step(6) == StepState.AVAILABLE
    assert trial.has_passed_step(8) == StepState.UNAVAILABLE
    assert (
        trial.last_index_token
        == "resources/has_step_scenarios/three-writers-not-all-steps-written-but-later-step-written-complete-job/index/000000000/000000000006_worker_2.json"
    )
def test_single_writer_all_steps_written_incomplete_job():
    """Test Scenario Description

    workers: [a]
    steps: {
        1: [a],
        2: [a],
        3: [a],
        4: [a],
        5: [a],
        6: [a],
    }
    END_OF_JOB.ts --> Absent
    """
    path = "s3://smdebug-testing/resources/has_step_scenarios/single-writer-all-steps-written-incomplete-job"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 1
    assert trial.loaded_all_steps is False
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 3, 4, 5, 6]
    assert all_steps == completed_steps
    assert trial.has_passed_step(3) == StepState.AVAILABLE
    assert trial.has_passed_step(8) == StepState.NOT_YET_AVAILABLE
    assert (
        trial.last_index_token
        == "resources/has_step_scenarios/single-writer-all-steps-written-incomplete-job/index/000000000/000000000006_worker_0.json"
    )
    assert trial.last_complete_step == 6
    try:
        trial.wait_for_steps([0, 1, 2, 3, 4, 5, 6])
    except Exception:
        # All the requested steps are available; this should not raise
        assert False
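# Hedged sketch built on the StepState semantics asserted above: poll a live
# trial until a step becomes AVAILABLE, give up if it becomes UNAVAILABLE, and
# keep waiting while it is NOT_YET_AVAILABLE. `wait_for_step_available` is a
# hypothetical helper; StepState is the same enum these tests compare against.
def wait_for_step_available(trial, step, timeout=60.0, poll=2.0):
    import time

    deadline = time.time() + timeout
    while time.time() < deadline:
        trial.refresh()
        state = trial.has_passed_step(step)
        if state == StepState.AVAILABLE:
            return True
        if state == StepState.UNAVAILABLE:
            # The job ended (or a later step completed) without this step.
            return False
        time.sleep(poll)  # NOT_YET_AVAILABLE: the job may still write it
    raise TimeoutError("step {} not available after {}s".format(step, timeout))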
def test_single_writer_not_all_steps_written_incomplete_job():
    """Test Scenario Description

    workers: [a]
    steps: {
        1: [a],
        2: [a],
        3: [a],
        4: [],
        5: [a],
        6: [a],
    }
    END_OF_JOB.ts --> Absent
    """
    path = "s3://smdebug-testing/resources/has_step_scenarios/single-writer-not-all-steps-written-incomplete"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 1
    assert trial.loaded_all_steps is False
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 3, 5, 6]  # step 4 is missing
    assert completed_steps == all_steps
    assert trial.has_passed_step(3) == StepState.AVAILABLE
    assert trial.has_passed_step(4) == StepState.UNAVAILABLE
    assert trial.has_passed_step(8) == StepState.NOT_YET_AVAILABLE
    assert (
        trial.last_index_token
        == "resources/has_step_scenarios/single-writer-not-all-steps-written-incomplete/index/000000000/000000000006_worker_0.json"
    )
    assert trial.last_complete_step == 6
def test_three_writers_one_step_missing_but_later_steps_written_partially_incomplete_job():
    """Test Scenario Description

    workers: [a, b, c]
    steps: {
        1: [a, b, c],
        2: [a, b, c],
        3: [],
        4: [a, c],
        5: [a, c],
        6: [a, c],
    }
    END_OF_JOB.ts --> Absent
    """
    path = "s3://smdebug-testing/resources/has_step_scenarios/three_writers_one_step_missing_but_later_steps_written_partially_incomplete_job"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 3
    assert trial.loaded_all_steps is False
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 4, 5, 6]
    assert completed_steps == [0, 1, 2]
    assert trial.has_passed_step(2) == StepState.AVAILABLE
    assert trial.has_passed_step(3) == StepState.NOT_YET_AVAILABLE
    assert trial.last_complete_step == 2
    assert trial.has_passed_step(4) == StepState.NOT_YET_AVAILABLE
    assert trial.has_passed_step(6) == StepState.NOT_YET_AVAILABLE
    assert trial.has_passed_step(8) == StepState.NOT_YET_AVAILABLE
    assert (
        trial.last_index_token
        == "resources/has_step_scenarios/three_writers_one_step_missing_but_later_steps_written_partially_incomplete_job/index/000000000/000000000002_worker_2.json"
    )
def helper_save_config_modes(trial_dir, hook):
    help_test_mnist(trial_dir, hook=hook, num_steps=2, num_eval_steps=3)
    tr = create_trial(trial_dir)
    for tname in tr.tensor_names(collection="weights"):
        t = tr.tensor(tname)
        assert len(t.steps(mode=modes.TRAIN)) == 2
        assert len(t.steps(mode=modes.EVAL)) == 1
def validate():
    try:
        from smdebug.trials import create_trial
        from smdebug.mxnet import get_hook

        hook = get_hook()
        out_dir = hook.out_dir
        print("Created the trial with out_dir {0}".format(out_dir))
        tr = create_trial(out_dir)
        global_steps = tr.steps()
        print("Global steps: " + str(global_steps))
        loss_tensor_name = tr.tensor_names(regex="softmaxcrossentropyloss._output_.")[0]
        print("Obtained the loss tensor " + loss_tensor_name)
        assert loss_tensor_name == "softmaxcrossentropyloss0_output_0"
        mean_loss_tensor_value_first_step = tr.tensor(loss_tensor_name).reduction_value(
            step_num=global_steps[0], reduction_name="mean", abs=False
        )
        mean_loss_tensor_value_last_step = tr.tensor(loss_tensor_name).reduction_value(
            step_num=global_steps[-1], reduction_name="mean", abs=False
        )
        print("Mean validation loss first step = " + str(mean_loss_tensor_value_first_step))
        print("Mean validation loss last step = " + str(mean_loss_tensor_value_last_step))
        # the mean loss should not increase over the course of training
        assert mean_loss_tensor_value_first_step >= mean_loss_tensor_value_last_step
    except ImportError:
        print("smdebug libraries do not exist. Skipped validation.")
    print("Validation Complete")
def mode_allworkers_saveall(out_dir, mode):
    path = build_json(
        out_dir,
        include_workers="all",
        save_all=True,
        include_collections=["weights", "gradients"],
    )
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS) + [
        "--model_dir",
        os.path.join(out_dir, "checkpoint"),
    ]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH}",
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) > 99
    assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
    assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
def test_pytorch(script_mode, use_loss_module):
    smd.del_hook()
    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        helper_torch_train(sim=sim, script_mode=script_mode, use_loss_module=use_loss_module)
        print("Finished Training")
        hook = smd.get_hook()
        print(f"hook = {hook}")
        # Check if the hook was executed with the default hook configuration
        assert hook.has_default_hook_configuration()
        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")
        print(f"collection_manager = {hook.collection_manager}")
        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0
        assert all(
            [
                name in trial.tensor_names()
                for name in hook.collection_manager.get("losses").tensor_names
            ]
        )
def test_hook_save_every_step(tmpdir):
    save_config = SaveConfig(save_interval=1)
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    assert trial.steps() == list(range(10))
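# Hedged sketch of the two SaveConfig knobs used across these tests:
# save_interval saves every N-th step (1 == every step, hence steps 0..9
# above), while save_steps pins an explicit list of global steps. The step
# values below are illustrative.
def example_save_configs():
    every_step = SaveConfig(save_interval=1)  # steps 0, 1, 2, ...
    fixed_steps = SaveConfig(save_steps=[0, 4, 9])  # only these steps
    return every_step, fixed_steps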
def helper_test_multi_collections(hook, out_dir):
    device = torch.device("cpu")
    hook_type = "saveall"
    save_steps = [i for i in range(10)]
    model = Net(mode=hook_type, to_save=save_steps).to(device)
    hook.register_hook(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, device, optimizer, num_steps=101, save_steps=save_steps)
    trial = create_trial(path=out_dir, name="test output")
    grads = [
        "gradient/Net_fc1.weight",
        "gradient/Net_fc2.weight",
        "gradient/Net_fc3.weight",
        "gradient/Net_fc1.bias",
        "gradient/Net_fc2.bias",
        "gradient/Net_fc3.bias",
    ]
    weights = ["Net_fc1.weight", "Net_fc2.weight", "Net_fc3.weight"]
    bias = ["Net_fc1.bias", "Net_fc2.bias", "Net_fc3.bias"]
    inputs = ["fc1_input_0", "relu1_input_0", "relu2_input_0"]
    outputs = ["fc1_output_0", "relu1_output_0", "relu2_output_0"]
    tensors = grads + bias + weights + inputs + outputs
    assert len(trial.steps()) == len(save_steps)
    for tname in tensors:
        assert tname in trial.tensor_names()
def test_hook_validation(tmpdir):
    np.random.seed(42)
    train_data = np.random.rand(5, 10)
    train_label = np.random.randint(2, size=5)
    dtrain = xgboost.DMatrix(train_data, label=train_label)
    valid_data = np.random.rand(5, 10)
    valid_label = np.random.randint(2, size=5)
    dvalid = xgboost.DMatrix(valid_data, label=valid_label)
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(
        out_dir=out_dir,
        include_collections=["labels", "predictions"],
        train_data=dtrain,
        validation_data=dvalid,
    )
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    tensors = trial.tensor_names()
    assert len(tensors) > 0
    assert "labels" in trial.collections()
    assert "predictions" in trial.collections()
    assert "labels" in tensors
    assert "predictions" in tensors
def test_hook_shap(tmpdir):
    np.random.seed(42)
    train_data = np.random.rand(10, 10)
    train_label = np.random.randint(2, size=10)
    dtrain = xgboost.DMatrix(train_data, label=train_label)
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(
        out_dir=out_dir,
        include_collections=["average_shap", "full_shap"],
        train_data=dtrain,
    )
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    tensors = trial.tensor_names()
    assert len(tensors) > 0
    assert "average_shap" in trial.collections()
    assert "full_shap" in trial.collections()
    assert any(t.startswith("average_shap/") for t in tensors)
    assert any(t.startswith("full_shap/") for t in tensors)
    assert not any(t.endswith("/bias") for t in tensors)
    average_shap_tensors = [t for t in tensors if t.startswith("average_shap/")]
    average_shap_tensor_name = average_shap_tensors.pop()
    assert trial.tensor(average_shap_tensor_name).value(0).shape == (1,)
    full_shap_tensors = [t for t in tensors if t.startswith("full_shap/")]
    full_shap_tensor_name = full_shap_tensors.pop()
    # full shap values should have 10 rows with 10 features + 1 bias
    assert trial.tensor(full_shap_tensor_name).value(0).shape == (10, 11)
def test_lstm_and_generator(out_dir):
    # init hook
    hook = KerasHook(
        out_dir,
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.LOSSES,
            CollectionKeys.GRADIENTS,
        ],
        save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
    )
    # init model
    num_steps = 100
    hidden_size = 100
    vocabulary = 1000
    model = Sequential()
    model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(vocabulary)))
    model.add(Activation("softmax"))
    model.compile(
        loss="categorical_crossentropy",
        optimizer=hook.wrap_optimizer(Adam()),
        metrics=["categorical_accuracy"],
    )
    train(3, 32, model, num_steps, hook)
    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) > 0
    assert len(tr.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
def test_pytorch(script_mode, use_loss_module):
    smd.del_hook()
    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
        if script_mode:
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
                if script_mode:
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()
            if i == 499:  # stop after 500 mini-batches
                break
        print("Finished Training")
        hook = smd.get_hook()
        print(f"hook = {hook}")
        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")
        print(f"collection_manager = {hook.collection_manager}")
        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0
        assert all(
            [
                name in trial.tensor_names()
                for name in hook.collection_manager.get("losses").tensor_names
            ]
        )
def test_single_writer_all_steps_written_complete_job_two_modes():
    """Test Scenario Description

    workers: [a]
    modes: TRAIN, EVAL
    steps: {
        0:  [worker: a, mode: TRAIN, mode_step: 0],
        10: [worker: a, mode: TRAIN, mode_step: 10],
        20: [worker: a, mode: TRAIN, mode_step: 20],
        30: [worker: a, mode: TRAIN, mode_step: 30],
        40: [worker: a, mode: EVAL, mode_step: 0],
        50: [worker: a, mode: EVAL, mode_step: 10],
        60: [worker: a, mode: EVAL, mode_step: 20],
        70: [worker: a, mode: EVAL, mode_step: 30],
    }
    END_OF_JOB.ts --> Present
    """
    path = os.path.join("ts_output/train/", str(uuid.uuid4()))
    dummy_trial_creator(trial_dir=path, num_workers=1, job_ended=True)
    for i in range(0, 31, 10):
        dummy_step_creator(
            trial_dir=path, global_step=i, mode="TRAIN", mode_step=i, worker_name="worker_0"
        )
    for i in range(0, 31, 10):
        dummy_step_creator(
            trial_dir=path, global_step=i + 40, mode="EVAL", mode_step=i, worker_name="worker_0"
        )
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 1
    assert trial.loaded_all_steps is True
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 10, 20, 30, 40, 50, 60, 70]
    assert completed_steps == all_steps
    assert trial.has_passed_step(30) == StepState.AVAILABLE
    assert trial.has_passed_step(23, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(40, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(30, mode=ModeKeys.EVAL) == StepState.AVAILABLE
    assert trial.has_passed_step(23, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
    assert trial.last_index_token == os.path.join(
        path, "index/000000000/000000000070_worker_0.json"
    )
    assert trial.last_complete_step == 70
    shutil.rmtree(path, ignore_errors=True)
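# Hedged sketch of the global-step vs mode-step distinction asserted above:
# a query with mode= interprets the step number within that mode's own
# counter, while a bare query uses global steps. The example values in the
# comments follow the scenario in the docstring above.
def example_mode_scoped_queries(trial):
    global_steps = trial.steps()  # e.g. [0, 10, ..., 70]
    train_steps = trial.steps(mode=ModeKeys.TRAIN)  # mode_steps, e.g. [0, 10, 20, 30]
    eval_steps = trial.steps(mode=ModeKeys.EVAL)  # mode_steps, e.g. [0, 10, 20, 30]
    return global_steps, train_steps, eval_steps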
def help_test_refresh(path):
    trial_name = str(uuid.uuid4())
    num_steps = 8
    num_tensors = 10
    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
        )
    tr = create_trial(path + trial_name)
    assert "foo_" + str(num_tensors + 1) not in tr.tensor_names()
    assert "foo_1" in tr.tensor_names()
    assert len(tr.steps()) == num_steps
    assert len(tr.tensor("foo_1").steps()) == num_steps
    for i in range(num_steps, num_steps * 2):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            export_colls=False,
        )
    assert len(tr.tensor("foo_1").steps()) == num_steps * 2
    assert len(tr.steps()) == num_steps * 2
    generate_data(
        path=path,
        trial=trial_name,
        num_tensors=num_tensors,
        step=num_steps * 2 + 1,
        tname_prefix="foo",
        worker="algo-1",
        shape=(3, 3, 3),
        export_colls=False,
    )
    assert len(tr.steps()) == num_steps * 2 + 1
    generate_data(
        path=path,
        trial=trial_name,
        num_tensors=num_tensors + 3,
        step=num_steps * 2 + 2,
        tname_prefix="foo",
        worker="algo-1",
        shape=(3, 3, 3),
        export_colls=False,
    )
    assert tr.tensor("foo_" + str(num_tensors + 1)) is not None