def helper_test_weights_bias_gradients(hook=None):
    prefix = str(uuid.uuid4())
    hook_type = "weights-bias-gradients"
    device = torch.device("cpu")
    save_steps = [i * 20 for i in range(5)]
    model = Net(mode=hook_type, to_save=save_steps).to(device)
    hook_from_json = hook is not None
    if not hook_from_json:
        hook = create_hook(
            "/tmp/test_output/test_hook_save_weightsbiasgradients/" + prefix,
            model,
            hook_type,
            save_steps=save_steps,
        )

    hook.register_hook(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, device, optimizer, num_steps=101, save_steps=save_steps)
    if not hook_from_json:
        trial_path = "/tmp/test_output/test_hook_save_weightsbiasgradients/" + prefix
    else:
        trial_path = "/tmp/test_output/test_hook_save_weightsbiasgradients/jsonloading"
    trial = create_trial(path=trial_path, name="test output")
    grads = [
        "gradient/Net_fc1.weight",
        "gradient/Net_fc2.weight",
        "gradient/Net_fc3.weight",
        "gradient/Net_fc1.bias",
        "gradient/Net_fc2.bias",
        "gradient/Net_fc3.bias",
    ]
    weights = ["Net_fc1.weight", "Net_fc2.weight", "Net_fc3.weight"]
    bias = ["Net_fc1.bias", "Net_fc2.bias", "Net_fc3.bias"]

    tensors = grads + bias + weights

    assert len(trial.steps()) == len(save_steps)
    for step in trial.steps():
        for tname in tensors:
            assert tname in trial.tensor_names()
            assert step in trial.tensor(tname).steps()
            saved_tensor = trial.tensor(tname).value(step)
            in_memory = model.saved[tname][step]
            assert np.allclose(in_memory, saved_tensor)
    addendum = prefix if not hook_from_json else "jsonloading"
    hook._cleanup()
    delete_local_trials(
        ["/tmp/test_output/test_hook_save_weightsbiasgradients/" + addendum])
Example #2
def test_whitespace_handling_in_path_str():
    _id = str(uuid.uuid4())
    path = os.path.join("ts_output/train/", _id)
    dummy_trial_creator(trial_dir=path, num_workers=1, job_ended=True)

    # Test Leading Whitespace Handling
    create_trial("   " + path)

    # Test Trailing Whitespace Handling
    create_trial(path + "  ")
Example #3
def test_load_collection_files_from_completed_job_with_missing_files():
    """
    Number of collection files : 1446
    Training_has_ended.ts : Present

    Some of the collection files have been removed in the test dataset.
    The number of expected collection files is supposed to 2001
    but the training_has_ended file is present so we stop waiting
    :return:
    """
    path = "s3://smdebug-testing/resources/collection-tests/collection-files-missing/"
    with pytest.raises(MissingCollectionFiles):
        create_trial(path)
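
A caller that wants to treat this as a recoverable condition can catch the exception. A minimal sketch, assuming MissingCollectionFiles lives in smdebug.exceptions (the test above uses it, but its import is not shown here):

from smdebug.exceptions import MissingCollectionFiles  # assumed import path
from smdebug.trials import create_trial

def load_trial_or_none(path):
    """Return the trial, or None if the job ended with collection files missing."""
    try:
        return create_trial(path)
    except MissingCollectionFiles:
        return None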
Example #4
def helper_test_modes(hook=None, out_dir="/tmp/test_output/test_hook_modes/"):
    prefix = str(uuid.uuid4())
    device = torch.device("cpu")
    save_steps = list(range(5))
    model = Net(to_save=save_steps).to(device)
    hook_from_json = hook is not None
    if not hook_from_json:
        out_dir = str(Path(out_dir, prefix))
        hook = Hook(
            out_dir=out_dir,
            save_config=SaveConfig({modes.TRAIN: SaveConfigMode(save_steps=save_steps)}),
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.BIASES,
                CollectionKeys.GRADIENTS,
                CollectionKeys.DEFAULT,
                CollectionKeys.LOSSES,
            ],
        )

    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    hook.set_mode(mode=modes.TRAIN)
    train(model, device, optimizer, num_steps=10, save_steps=save_steps)

    trial = create_trial(path=out_dir, name="test output")

    assert len(trial.modes()) == 1
    assert len(trial.steps()) == 5
    assert len(trial.steps(mode=modes.TRAIN)) == 5
    assert len(trial.steps(mode=modes.EVAL)) == 0

    # Clean up only if this helper created the hook (and its out_dir).
    if not hook_from_json:
        shutil.rmtree(out_dir)
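
The helper above builds its SaveConfig from per-mode SaveConfigMode objects, while other examples on this page pass a global save_interval or save_steps. A short sketch contrasting the three forms (step values and the output directory are illustrative; import paths assume the PyTorch flavor of smdebug used here):

from smdebug import modes
from smdebug.pytorch import Hook, SaveConfig, SaveConfigMode

every_step = SaveConfig(save_interval=1)           # save at every step
fixed_steps = SaveConfig(save_steps=[0, 1, 2, 3])  # save only these steps
per_mode = SaveConfig({
    # Different rules for TRAIN and EVAL, as in the helper above.
    modes.TRAIN: SaveConfigMode(save_steps=[0, 5, 10]),
    modes.EVAL: SaveConfigMode(save_interval=2),
})
hook = Hook(out_dir="/tmp/saveconfig_demo", save_config=per_mode)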
Example #5
def test_hook_all_zero(hook=None, out_dir=None):
    hook_created = False
    if hook is None:
        hook_created = True
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/newlogsRunTest/" + run_id
        print("Registering the hook with out_dir {0}".format(out_dir))
        shutil.rmtree(out_dir, ignore_errors=True)
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=[
                "ReluActivation", "weights", "biases", "gradients"
            ],
        )
        hook.get_collection("ReluActivation").include(["relu*", "input_*"])
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          make_input_zero=True)

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    tnames = tr.tensor_names(regex="conv._input")
    tname = tnames[0]
    conv_tensor_value = tr.tensor(tname).value(step_num=0)
    assert np.all(conv_tensor_value == 0)
    if hook_created:
        shutil.rmtree(out_dir)
Example #6
def helper_test_reductions(trial_dir, hook, save_raw_tensor):
    simple_model(hook)
    _, files = get_dirs_files(trial_dir)
    from smdebug.trials import create_trial

    tr = create_trial(trial_dir)
    assert len(tr.tensor_names()) == 3, tr.tensor_names()
    for step in tr.steps():
        assert len(tr.tensor_names(step=step)) == 3, tr.tensor_names()
    for tname in tr.tensor_names():
        t = tr.tensor(tname)
        if tname in tr.tensor_names(collection="losses"):
            # no reductions
            assert t.value(0) is not None
        else:
            if save_raw_tensor:
                assert t.value(0) is not None
            else:
                # Only reductions were saved, so reading the raw value must fail.
                try:
                    t.value(0)
                    assert False, f"expected TensorUnavailableForStep for {tname}"
                except TensorUnavailableForStep:
                    pass
            # Each reduction and norm is saved twice, once per abs=False/True.
            assert len(t.reduction_values(0)) == 18
            for r in ALLOWED_REDUCTIONS + ALLOWED_NORMS:
                for b in [False, True]:
                    assert t.reduction_value(0, reduction_name=r, abs=b, worker=None) is not None
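Example #7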
def test_no_failure_with_torch_mp(out_dir):
    shutil.rmtree(out_dir, ignore_errors=True)
    path = build_json(out_dir, save_all=True, save_interval="1")
    path = str(path)
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = path
    device = "cpu"
    dataloader_kwargs = {}
    cpu_count = 2 if mp.cpu_count() > 2 else mp.cpu_count()

    torch.manual_seed(1)

    model = Net().to(device)
    # Gradients are allocated lazily, so they are not shared here.
    model.share_memory()

    processes = []
    for rank in range(cpu_count):
        p = mp.Process(target=train,
                       args=(rank, model, device, dataloader_kwargs))
        # We first train the model across cpu_count processes
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    trial = create_trial(out_dir)

    assert trial.num_workers == 1  # Ensure only one worker saved data
    assert len(trial.tensor_names()) > 20  # Ensure that data was saved
    assert trial.steps() == [0, 1, 2, 3]  # Ensure that steps were saved
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)  # data_dir is assumed to be a module-level constant
Example #8
def test_hook_from_json_config_for_losses(tmpdir, monkeypatch, params):
    out_dir = tmpdir.join("test_hook_from_json_config_for_losses")
    config_file = tmpdir.join("config.json")
    config_file.write(get_json_config_for_losses(str(out_dir)))
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(config_file))
    hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook, params=params)
    trial = create_trial(str(out_dir))
    eval_metric = params["eval_metric"]
    test_metric = f"test-{eval_metric}"
    train_metric = f"train-{eval_metric}"
    if eval_metric == "rmse":
        assert train_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert train_metric in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert test_metric in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
    if eval_metric == "auc" or eval_metric == "map":
        assert train_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert train_metric not in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert test_metric not in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
Example #9
def help_test_refresh_with_range(path):
    trial_name = str(uuid.uuid4())
    num_steps = 8
    num_tensors = 10
    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
        )
    tr = create_trial(path + trial_name, range_steps=(0, 5))
    assert len(tr.steps()) == 5
    for i in range(num_steps, num_steps * 2):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            export_colls=False,
        )
    # range_steps=(0, 5) caps the trial at steps 0-4, even after new data arrives.
    assert len(tr.steps()) == 5
Example #10
def test_loss_collection_with_no_other_collections():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + run_id
    hook = t_hook(out_dir=out_dir,
                  save_config=save_config,
                  include_collections=[])
    assert has_training_ended(out_dir) is False
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          register_to_loss_block=True)

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    print(tr.tensor_names())
    tname = tr.tensor_names(regex=".*loss")[0]
    loss_tensor = tr.tensor(tname)
    loss_val = loss_tensor.value(step_num=1)
    assert len(loss_val) > 0

    shutil.rmtree(out_dir)
Example #11
def test_data_parallel():
    shutil.rmtree(out_dir, ignore_errors=True)

    hook = smd.Hook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
        include_workers="one",
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Net().to(device)
    if device == "cuda":
        model = DataParallel(model)

    hook.register_module(model)

    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, hook, torch.device(device), optimizer, num_steps=10)

    trial = create_trial(out_dir)
    assert trial.steps() == [0, 1, 5]
    if device == "cpu":
        assert len(trial.tensor_names()) == 38
    else:
        assert len(trial.tensor_names()) > 37

    shutil.rmtree(out_dir, ignore_errors=True)
Example #12
def test_new_graph(out_dir):
    # tests that we can correctly interpret an explicitly created graph
    g1 = tf.get_default_graph()
    g = tf.Graph()
    with g.as_default():
        assert g != g1
        assert g == tf.get_default_graph()
        hook = smd.SessionHook(
            out_dir,
            include_collections=["weights", "losses", "scalars"],
            save_config=smd.SaveConfig(save_steps=[0, 1, 2, 3]),
        )
        with tf.name_scope("foobar"):
            x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1")
        with tf.name_scope("foobaz"):
            w0 = [[1], [1.0]]
            y = tf.matmul(x, w0)
        loss = tf.reduce_mean((tf.matmul(x, w) - y)**2, name="loss")
        hook.get_collection("losses").add(loss)
        global_step = tf.Variable(17, name="global_step", trainable=False)
        increment_global_step_op = tf.assign(global_step, global_step + 1)

        optimizer = tf.train.AdamOptimizer(0.1)
        optimizer = hook.wrap_optimizer(optimizer)
        optimizer_op = optimizer.minimize(loss,
                                          global_step=increment_global_step_op)
        sess = tf.train.MonitoredSession(hooks=[hook])
        for i in range(5):
            x_ = np.random.random((10, 2)) * 0.1
            sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})
        sess.close()
        tr = create_trial(out_dir)
        assert len(tr.tensor_names())
Example #13
def _run_net_distributed(out_dir, include_workers="one", test_timeline=False):
    """Runs a single linear layer on 2 processes."""
    # torch.distributed is empty on Mac on Torch <= 1.2
    if not hasattr(dist, "is_initialized"):
        return
    multiprocessing.set_start_method("spawn", force=True)
    size = 2
    processes = []
    for rank in range(size):
        p = Process(target=init_processes,
                    args=(out_dir, rank, size, include_workers, test_timeline,
                          run))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # WARNING: assert statements do not cause test failure inside subprocesses
    # https://stackoverflow.com/questions/13400546/py-test-how-to-automatically-detect-an-exception-in-a-child-process
    assert all(not p.exitcode for p in processes), f"Some processes failed. processes={processes}"

    trial = create_trial(path=out_dir)
    return trial
Example #14
def basic_test(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="one",
                      include_collections=["weights", "gradients"])
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_MNIST_ARGS) + [
        "--model_dir", os.path.join(out_dir, "checkpoint")
    ]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )

    tr = create_trial(out_dir)
    print(tr.tensor_names())
    assert len(tr.workers()) == 1
    assert len(tr.tensor_names()) == 13
    assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == 1
Example #15
def test_three_writers_not_all_steps_written_but_later_step_written_complete_job():
    """Test scenario description:
     workers : [a,b,c]
     steps :{
        1: [a,b,c], 2: [a,b,c], 3: [a,c], 4: [a,c], 5: [a,c], 6: [a,b,c]
        }
    END_OF_JOB.ts --> Present
    """
    path = "s3://smdebug-testing/resources/has_step_scenarios/three-writers-not-all-steps-written-but-later-step-written-complete-job"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 3
    assert trial.loaded_all_steps is True
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 3, 4, 5, 6]
    assert completed_steps == all_steps
    assert trial.has_passed_step(2) == StepState.AVAILABLE
    assert trial.last_complete_step == 6
    assert trial.has_passed_step(4) == StepState.AVAILABLE
    assert trial.has_passed_step(6) == StepState.AVAILABLE
    assert trial.has_passed_step(8) == StepState.UNAVAILABLE
    assert (
        trial.last_index_token ==
        "resources/has_step_scenarios/three-writers-not-all-steps-written-but-later-step-written-complete-job/index/000000000/000000000006_worker_2.json"
    )
Example #16
def test_single_writer_all_steps_written_incomplete_job():
    """Test Scenario Description"
     workers : [a]
     steps :{
        1: [a], 2: [a], 3: [a], 4: [a], 5: [a], 6: [a]
        }
    END_OF_JOB.ts --> Absent
    """

    path = "s3://smdebug-testing/resources/has_step_scenarios/single-writer-all-steps-written-incomplete-job"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 1
    assert trial.loaded_all_steps is False
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 3, 4, 5, 6]
    assert all_steps == completed_steps
    assert trial.has_passed_step(3) == StepState.AVAILABLE
    assert trial.has_passed_step(8) == StepState.NOT_YET_AVAILABLE
    assert (
        trial.last_index_token ==
        "resources/has_step_scenarios/single-writer-all-steps-written-incomplete-job/index/000000000/000000000006_worker_0.json"
    )
    assert trial.last_complete_step == 6

    # All the requested steps are available, so this must not raise.
    try:
        trial.wait_for_steps([0, 1, 2, 3, 4, 5, 6])
    except Exception:
        assert False
Example #17
def test_single_writer_not_all_steps_written_incomplete_job():
    """Test Scenario Description"
     workers : [a]
     steps :{
        1: [a], 2: [a], 3: [a], 4: [], 5: [a], 6: [a]
        }
    END_OF_JOB.ts --> Absent
    """

    path = "s3://smdebug-testing/resources/has_step_scenarios/single-writer-not-all-steps-written-incomplete"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 1
    assert trial.loaded_all_steps is False
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 3, 5, 6]  # step 4 is missing
    assert completed_steps == all_steps
    assert trial.has_passed_step(3) == StepState.AVAILABLE
    assert trial.has_passed_step(4) == StepState.UNAVAILABLE
    assert trial.has_passed_step(8) == StepState.NOT_YET_AVAILABLE
    assert (
        trial.last_index_token ==
        "resources/has_step_scenarios/single-writer-not-all-steps-written-incomplete/index/000000000/000000000006_worker_0.json"
    )
    assert trial.last_complete_step == 6
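
Across these scenario tests, has_passed_step distinguishes three outcomes: AVAILABLE (the step is fully written), UNAVAILABLE (the job ended and the step will never appear), and NOT_YET_AVAILABLE (the job is still running, so the step may yet arrive). A minimal polling sketch built on that contract; it is illustrative only, assumes the trial refreshes its index on each query, and compares states by enum name to avoid guessing the StepState import path:

import time

def wait_until_available(trial, step, timeout_s=60, poll_s=5):
    """Poll until `step` is AVAILABLE; give up early if it becomes UNAVAILABLE."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        state = trial.has_passed_step(step)
        if state.name == "AVAILABLE":
            return True
        if state.name == "UNAVAILABLE":
            return False  # the job ended without writing this step
        time.sleep(poll_s)  # NOT_YET_AVAILABLE: the step may still arrive
    return False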
Example #18
def test_three_writers_one_step_missing_but_later_steps_written_partially_incomplete_job():
    """Test scenario description:
     workers : [a,b,c]
     steps :{
        1: [a,b,c], 2: [a,b,c], 3: [], 4: [a,c], 5: [a,c], 6: [a,c]
        }
    END_OF_JOB.ts --> Absent
    """
    path = "s3://smdebug-testing/resources/has_step_scenarios/three_writers_one_step_missing_but_later_steps_written_partially_incomplete_job"
    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 3
    assert trial.loaded_all_steps is False
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 1, 2, 4, 5, 6]
    assert completed_steps == [0, 1, 2]
    assert trial.has_passed_step(2) == StepState.AVAILABLE
    assert trial.has_passed_step(3) == StepState.NOT_YET_AVAILABLE
    assert trial.last_complete_step == 2
    assert trial.has_passed_step(4) == StepState.NOT_YET_AVAILABLE
    assert trial.has_passed_step(6) == StepState.NOT_YET_AVAILABLE
    assert trial.has_passed_step(8) == StepState.NOT_YET_AVAILABLE
    assert (
        trial.last_index_token ==
        "resources/has_step_scenarios/three_writers_one_step_missing_but_later_steps_written_partially_incomplete_job/index/000000000/000000000002_worker_2.json"
    )
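Example #19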
def helper_save_config_modes(trial_dir, hook):
    help_test_mnist(trial_dir, hook=hook, num_steps=2, num_eval_steps=3)
    tr = create_trial(trial_dir)
    for tname in tr.tensor_names(collection="weights"):
        t = tr.tensor(tname)
        assert len(t.steps(mode=modes.TRAIN)) == 2
        assert len(t.steps(mode=modes.EVAL)) == 1
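Example #20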
def validate():
    try:
        from smdebug.trials import create_trial
        from smdebug.mxnet import get_hook

        hook = get_hook()
        out_dir = hook.out_dir
        print("Created the trial with out_dir {0}".format(out_dir))
        tr = create_trial(out_dir)
        global_steps = tr.steps()
        print("Global steps: " + str(global_steps))

        loss_tensor_name = tr.tensor_names(regex="softmaxcrossentropyloss._output_.")[0]
        print("Obtained the loss tensor " + loss_tensor_name)
        assert loss_tensor_name == "softmaxcrossentropyloss0_output_0"

        mean_loss_tensor_value_first_step = tr.tensor(loss_tensor_name).reduction_value(
            step_num=global_steps[0], reduction_name="mean", abs=False
        )

        mean_loss_tensor_value_last_step = tr.tensor(loss_tensor_name).reduction_value(
            step_num=global_steps[-1], reduction_name="mean", abs=False
        )

        print("Mean validation loss first step = " + str(mean_loss_tensor_value_first_step))
        print("Mean validation loss last step = " + str(mean_loss_tensor_value_last_step))
        assert mean_loss_tensor_value_first_step >= mean_loss_tensor_value_last_step

    except ImportError:
        print("smdebug libraries are not installed. Skipping validation.")

    print("Validation Complete")
Example #21
def mode_allworkers_saveall(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="all",
                      save_all=True,
                      include_collections=["weights", "gradients"])
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS) + [
        "--model_dir",
        os.path.join(out_dir, "checkpoint"),
    ]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH}",
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) > 99
    assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
    assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
Example #22
def test_pytorch(script_mode, use_loss_module):
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        helper_torch_train(sim=sim,
                           script_mode=script_mode,
                           use_loss_module=use_loss_module)

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")
        # Check if the hook was executed with the default
        # hook configuration
        assert hook.has_default_hook_configuration()

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        assert all(name in trial.tensor_names() for name in losses_tensors)
Example #23
def test_hook_save_every_step(tmpdir):
    save_config = SaveConfig(save_interval=1)
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    assert trial.steps() == list(range(10))
Example #24
def helper_test_multi_collections(hook, out_dir):
    device = torch.device("cpu")
    hook_type = "saveall"
    save_steps = list(range(10))
    model = Net(mode=hook_type, to_save=save_steps).to(device)
    hook.register_hook(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    train(model, device, optimizer, num_steps=101, save_steps=save_steps)
    trial = create_trial(path=out_dir, name="test output")
    grads = [
        "gradient/Net_fc1.weight",
        "gradient/Net_fc2.weight",
        "gradient/Net_fc3.weight",
        "gradient/Net_fc1.bias",
        "gradient/Net_fc2.bias",
        "gradient/Net_fc3.bias",
    ]
    weights = ["Net_fc1.weight", "Net_fc2.weight", "Net_fc3.weight"]
    bias = ["Net_fc1.bias", "Net_fc2.bias", "Net_fc3.bias"]
    inputs = ["fc1_input_0", "relu1_input_0", "relu2_input_0"]
    outputs = ["fc1_output_0", "relu1_output_0", "relu2_output_0"]
    tensors = grads + bias + weights + inputs + outputs

    assert len(trial.steps()) == len(save_steps)

    for tname in tensors:
        assert tname in trial.tensor_names()
Example #25
def test_hook_validation(tmpdir):
    np.random.seed(42)
    train_data = np.random.rand(5, 10)
    train_label = np.random.randint(2, size=5)
    dtrain = xgboost.DMatrix(train_data, label=train_label)
    valid_data = np.random.rand(5, 10)
    valid_label = np.random.randint(2, size=5)
    dvalid = xgboost.DMatrix(valid_data, label=valid_label)

    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(
        out_dir=out_dir,
        include_collections=["labels", "predictions"],
        train_data=dtrain,
        validation_data=dvalid,
    )
    run_xgboost_model(hook=hook)

    trial = create_trial(out_dir)
    tensors = trial.tensor_names()
    assert len(tensors) > 0
    assert "labels" in trial.collections()
    assert "predictions" in trial.collections()
    assert "labels" in tensors
    assert "predictions" in tensors
Example #26
def test_hook_shap(tmpdir):
    np.random.seed(42)
    train_data = np.random.rand(10, 10)
    train_label = np.random.randint(2, size=10)
    dtrain = xgboost.DMatrix(train_data, label=train_label)

    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir,
                include_collections=["average_shap", "full_shap"],
                train_data=dtrain)
    run_xgboost_model(hook=hook)

    trial = create_trial(out_dir)
    tensors = trial.tensor_names()
    assert len(tensors) > 0
    assert "average_shap" in trial.collections()
    assert "full_shap" in trial.collections()
    assert any(t.startswith("average_shap/") for t in tensors)
    assert any(t.startswith("full_shap/") for t in tensors)
    assert not any(t.endswith("/bias") for t in tensors)
    average_shap_tensors = [t for t in tensors if t.startswith("average_shap/")]
    average_shap_tensor_name = average_shap_tensors.pop()
    assert trial.tensor(average_shap_tensor_name).value(0).shape == (1, )
    full_shap_tensors = [t for t in tensors if t.startswith("full_shap/")]
    full_shap_tensor_name = full_shap_tensors.pop()
    # full shap values should have 10 rows with 10 features + 1 bias
    assert trial.tensor(full_shap_tensor_name).value(0).shape == (10, 11)
Example #27
def test_lstm_and_generator(out_dir):
    # init hook
    hook = KerasHook(
        out_dir,
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.LOSSES,
            CollectionKeys.GRADIENTS,
        ],
        save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
    )

    # init model
    num_steps = 100
    hidden_size = 100
    vocabulary = 1000
    model = Sequential()
    model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(vocabulary)))
    model.add(Activation("softmax"))

    model.compile(
        loss="categorical_crossentropy",
        optimizer=hook.wrap_optimizer(Adam()),
        metrics=["categorical_accuracy"],
    )

    train(3, 32, model, num_steps, hook)

    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) > 0
    assert len(tr.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
Example #28
def test_pytorch(script_mode, use_loss_module):
    smd.del_hook()

    sim_class = ScriptSimulator if script_mode else SagemakerSimulator
    with sim_class() as sim:
        trainloader, testloader = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        if script_mode:
            hook = smd.Hook(out_dir=sim.out_dir)
            hook.register_module(net)
            hook.register_loss(criterion)

        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            if use_loss_module:
                loss = criterion(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
                if script_mode:
                    hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
            loss.backward()
            optimizer.step()

            if i == 499:  # stop after 500 mini-batches
                break

        print("Finished Training")

        hook = smd.get_hook()
        print(f"hook = {hook}")

        from smdebug.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        print(f"trial.steps() = {trial.steps()}")
        print(f"trial.tensor_names() = {trial.tensor_names()}")

        print(f"collection_manager = {hook.collection_manager}")

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensor_names = {losses_tensors}")
        assert len(losses_tensors) > 0

        assert all(name in trial.tensor_names() for name in losses_tensors)
Example #29
def test_single_writer_all_steps_written_complete_job_two_modes():
    """Test Scenario Description"
     workers : [a]
     modes: TRAIN, EVAL
     steps :{
        0: [worker:a, mode: TRAIN, mode_step: 0],
        10: [worker:a, mode: TRAIN, mode_step: 10],
        20: [worker:a, mode: TRAIN, mode_step: 20],
        30: [worker:a, mode: TRAIN, mode_step: 30],
        40: [worker:a, mode: EVAL, mode_step: 0],
        50: [worker:a, mode: EVAL, mode_step: 10],
        60: [worker:a, mode: EVAL, mode_step: 20],
        70: [worker:a, mode: EVAL, mode_step: 30]
        }
    END_OF_JOB.ts --> Present
    """

    path = os.path.join("ts_output/train/", str(uuid.uuid4()))
    dummy_trial_creator(trial_dir=path, num_workers=1, job_ended=True)
    for i in range(0, 31, 10):
        dummy_step_creator(trial_dir=path,
                           global_step=i,
                           mode="TRAIN",
                           mode_step=i,
                           worker_name="worker_0")

    for i in range(0, 31, 10):
        dummy_step_creator(trial_dir=path,
                           global_step=i + 40,
                           mode="EVAL",
                           mode_step=i,
                           worker_name="worker_0")

    trial = create_trial(path)
    num_workers = len(trial.workers())
    assert num_workers == 1
    assert trial.loaded_all_steps is True
    all_steps = trial.steps(show_incomplete_steps=True)
    completed_steps = trial.steps()
    assert all_steps == [0, 10, 20, 30, 40, 50, 60, 70]
    assert completed_steps == all_steps
    assert trial.has_passed_step(30) == StepState.AVAILABLE
    assert trial.has_passed_step(23, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(40, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(30, mode=ModeKeys.EVAL) == StepState.AVAILABLE
    assert trial.has_passed_step(23, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
    assert trial.has_passed_step(80, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
    assert trial.last_index_token == os.path.join(
        path, "index/000000000/000000000070_worker_0.json")
    assert trial.last_complete_step == 70
    shutil.rmtree(path, ignore_errors=True)
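
The scenario above also illustrates how global steps relate to mode steps: global steps interleave all modes, while each mode counts its own steps from 0. A small helper sketch for querying both views (illustrative; modes is the same enum used throughout these examples):

from smdebug import modes

def print_step_views(trial):
    print(trial.steps())                  # global: [0, 10, 20, ..., 70] above
    print(trial.steps(mode=modes.TRAIN))  # mode steps: [0, 10, 20, 30]
    print(trial.steps(mode=modes.EVAL))   # mode steps: [0, 10, 20, 30]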
Example #30
def help_test_refresh(path):
    trial_name = str(uuid.uuid4())
    num_steps = 8
    num_tensors = 10
    for i in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
        )
    tr = create_trial(path + trial_name)

    assert "foo_" + str(num_tensors + 1) not in tr.tensor_names()
    assert "foo_1" in tr.tensor_names()
    assert len(tr.steps()) == num_steps
    assert len(tr.tensor("foo_1").steps()) == num_steps

    for i in range(num_steps, num_steps * 2):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=i,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            export_colls=False,
        )
    assert len(tr.tensor("foo_1").steps()) == num_steps * 2
    assert len(tr.steps()) == num_steps * 2

    generate_data(
        path=path,
        trial=trial_name,
        num_tensors=num_tensors,
        step=num_steps * 2 + 1,
        tname_prefix="foo",
        worker="algo-1",
        shape=(3, 3, 3),
        export_colls=False,
    )
    assert len(tr.steps()) == num_steps * 2 + 1

    generate_data(
        path=path,
        trial=trial_name,
        num_tensors=num_tensors + 3,
        step=num_steps * 2 + 2,
        tname_prefix="foo",
        worker="algo-1",
        shape=(3, 3, 3),
        export_colls=False,
    )
    assert tr.tensor("foo_" + str(num_tensors + 1)) is not None