Python t_hook 예제들, smdebug.mxnet.hook.t_hook Python 예제들

예제 #1

0

파일 보기

def test_hook_timeline_file_write(set_up_smprofiler_config_path, out_dir):
    """
    Exercise TimelineFileWriter by recording trace events through a MXNet hook.

    Ten events are recorded, the hook is closed, and the test then expects
    exactly one JSON file below ``out_dir/DEFAULT_PREFIX`` whose parsed
    content is non-empty.
    """
    hook = t_hook(out_dir=out_dir)

    for step in range(1, 11):
        hook.record_trace_events(
            training_phase="MXNet_TimelineFileWriteTest",
            op_name="event" + str(step),
            step_num=step,
            timestamp=time.time(),
        )

    # The hook is closed explicitly so that the JSON file is written before it
    # is read back below. Training scripts do not need this because _cleanup
    # closes the trace file for them.
    hook.close()

    timeline_root = Path(out_dir + "/" + DEFAULT_PREFIX)
    json_files = list(timeline_root.rglob("*.json"))

    assert len(json_files) == 1

    with open(json_files[0]) as timeline_file:
        events_dict = json.load(timeline_file)

    assert events_dict

예제 #2

0

파일 보기

def test_loss_collection_with_no_other_collections():
    """
    Train with an empty ``include_collections`` list and a registered loss block.

    After the run, the trial must expose four steps and a tensor matching
    ``.*loss`` whose value at step 1 is non-empty. The output directory is
    removed at the end.
    """
    step_config = SaveConfig(save_steps=[0, 1, 2, 3])
    trial_dir = "/tmp/" + "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    hook = t_hook(
        out_dir=trial_dir,
        save_config=step_config,
        include_collections=[],
    )
    assert has_training_ended(trial_dir) == False
    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
    )

    print("Created the trial with out_dir {0}".format(trial_dir))
    tr = create_trial(trial_dir)
    assert tr
    assert len(tr.steps()) == 4

    print(tr.tensor_names())
    loss_name = tr.tensor_names(regex=".*loss")[0]
    loss_at_step_one = tr.tensor(loss_name).value(step_num=1)
    assert len(loss_at_step_one) > 0

    shutil.rmtree(trial_dir)

예제 #3

0

파일 보기

def test_hook_all_zero(hook=None, out_dir=None):
    """
    Check that an all-zero model input is recorded as an all-zero conv input.

    Args:
        hook: An already configured hook. When ``None``, a hook is created
            here that saves steps 0-3 for the "ReluActivation", "weights",
            "biases" and "gradients" collections.
        out_dir: Directory the trial is read from. It is overwritten with a
            fresh ``/tmp/newlogsRunTest/trial_*`` path when the hook is
            created here.

    The output directory is removed only when this function created the hook.
    """
    hook_created = False
    if hook is None:
        hook_created = True
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/newlogsRunTest/" + run_id
        print("Registering the hook with out_dir {0}".format(out_dir))
        # Start from a clean directory; a missing directory is not an error.
        shutil.rmtree(out_dir, ignore_errors=True)
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=[
                "ReluActivation", "weights", "biases", "gradients"
            ],
        )
        hook.get_collection("ReluActivation").include(["relu*", "input_*"])
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          make_input_zero=True)

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    # The first tensor matching the conv input pattern is inspected at step 0.
    tname = tr.tensor_names(regex="conv._input")[0]
    conv_tensor_value = tr.tensor(tname).value(step_num=0)
    assert np.all(conv_tensor_value == 0)
    if hook_created:
        shutil.rmtree(out_dir)

예제 #4

0

파일 보기

def test_hook_custom_collection():
    """
    Run the MNIST gluon model with only the custom "ReluActivation" collection.

    The collection is set to include the ``relu*`` and ``input_*`` patterns.
    The test makes no assertions and removes the output directory after the
    run.
    """
    step_config = SaveConfig(save_steps=[0, 1, 2, 3])
    trial_dir = "/tmp/" + "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    hook = t_hook(
        out_dir=trial_dir,
        save_config=step_config,
        include_collections=["ReluActivation"],
    )
    relu_collection = hook.get_collection("ReluActivation")
    relu_collection.include(["relu*", "input_*"])
    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
    )
    shutil.rmtree(trial_dir)

예제 #5

0

파일 보기

def test_hook():
    """
    Smoke test: run the MNIST gluon model with a hook saving steps 0-3.

    Before training, ``has_training_ended`` must report ``False`` for the
    fresh output directory. The directory is removed after the run.
    """
    step_config = SaveConfig(save_steps=[0, 1, 2, 3])
    trial_dir = "/tmp/newlogsRunTest/" + "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    hook = t_hook(out_dir=trial_dir, save_config=step_config)
    assert has_training_ended(trial_dir) == False
    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
    )
    shutil.rmtree(trial_dir)

예제 #6

0

파일 보기

def test_save_shapes(out_dir):
    """
    Run training with ``save_shape=True`` and verify the result for steps 0 and 1.

    The hook uses ``save_all=True``, saves steps 0 and 1, and trains for five
    steps. ``verify_shapes`` is then called with 0 and with 1 (presumably the
    step numbers - confirm against the helper), and ``out_dir`` is removed.
    """
    shape_reduction = ReductionConfig(save_shape=True)
    first_two_steps = SaveConfig(save_steps=[0, 1])

    hook = t_hook(
        out_dir=out_dir,
        save_config=first_two_steps,
        save_all=True,
        reduction_config=shape_reduction,
    )
    run_mnist_gluon_model(hook=hook, num_steps_train=5)
    for step in (0, 1):
        verify_shapes(out_dir, step)
    shutil.rmtree(out_dir)

예제 #7

0

파일 보기

파일: test_custom_tensor.py 프로젝트: thaisep/sagemaker-debugger

def test_hook():
    """
    Train with ``save_custom_tensor=True`` and count the tensors that were saved.

    The trial must report two tensor names in the ``CollectionKeys.DEFAULT``
    collection and four tensor names overall. The output directory is removed
    at the end.
    """
    step_config = SaveConfig(save_steps=[0, 1, 2, 3])
    trial_dir = "/tmp/newlogsRunTest/" + "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    hook = t_hook(out_dir=trial_dir, save_config=step_config)
    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
        save_custom_tensor=True,
    )
    trial = create_trial(trial_dir)
    default_names = trial.tensor_names(collection=CollectionKeys.DEFAULT)
    every_name = trial.tensor_names()
    assert len(default_names) == 2
    assert len(every_name) == 4
    shutil.rmtree(trial_dir)

예제 #8

0

파일 보기

파일: test_hook_save_config.py 프로젝트: vandanavk/sagemaker-debugger

def test_save_config(hook=None):
    """
    Run training with a global save config and a per-collection override.

    Args:
        hook: An already configured hook. When ``None``, a hook is created
            here that saves steps 0-3 globally, while the "ReluActivation"
            collection gets its own save config for steps 4-6.

    The output directory is removed only when this function created the hook.
    A ``hook_created`` flag records that, because ``hook`` itself is no longer
    ``None`` by the time cleanup runs.
    """
    hook_created = False
    out_dir = None
    if hook is None:
        hook_created = True
        save_config_collection = SaveConfig(save_steps=[4, 5, 6])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/" + run_id
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=[
                "ReluActivation", "weights", "biases", "gradients", "default"
            ],
        )
        custom_collect = hook.get_collection("ReluActivation")
        custom_collect.save_config = save_config_collection
        custom_collect.include(["relu*", "input_*", "output*"])

    run_mnist_gluon_model(hook=hook, num_steps_train=10, num_steps_eval=10)
    if hook_created:
        shutil.rmtree(out_dir)

예제 #9

0

파일 보기

def test_save_all(hook=None, out_dir=None):
    """
    Train with ``save_all=True`` and check how many steps and tensors were saved.

    Args:
        hook: An already configured hook. When ``None``, a hook saving steps
            0-3 with ``save_all=True`` is created here.
        out_dir: Directory the trial is read from. It is replaced by a fresh
            ``/tmp/trial_*`` path when the hook is created here.

    The output directory is removed only when this function created the hook.
    """
    hook_created = hook is None
    if hook_created:
        step_config = SaveConfig(save_steps=[0, 1, 2, 3])
        out_dir = "/tmp/" + "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        print("Registering the hook with out_dir {}".format(out_dir))
        hook = t_hook(out_dir=out_dir, save_config=step_config, save_all=True)
    run_mnist_gluon_model(hook=hook, num_steps_train=7, num_steps_eval=5)

    # Check the saved steps and the tensor names.
    print("Created the trial with out_dir {}".format(out_dir))
    tr = create_trial(out_dir)
    saved_names = tr.tensor_names()
    assert tr
    assert len(tr.steps()) == 4
    # Some tensor names, such as inputs and outputs, cannot be retrieved from
    # the training session, so only the number of tensors is asserted.
    # The expected count of 46 was taken from the index file; if the assertion
    # holds, the script was able to save all tensors.
    assert len(saved_names) == 46
    if hook_created:
        shutil.rmtree(out_dir)

예제 #10

0

파일 보기

def test_modes(hook=None, path=None):
    """
    Check that per-mode save intervals produce the expected steps per mode.

    Args:
        hook: An already configured hook. When ``None``, a hook is created
            here with a save interval of 2 in TRAIN mode and 3 in EVAL mode,
            including the "gradients" and "weights" collections.
        path: Directory the trial is read from. It is replaced by a fresh
            ``/tmp/trial_*`` path when the hook is created here.

    With six train and six eval steps the trial must report two modes, three
    TRAIN steps and two EVAL steps (five in total). Gradients must exist for
    TRAIN steps only; weights for both modes.
    """
    if hook is None:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        path = "/tmp/" + run_id
        hook = t_hook(
            out_dir=path,
            save_config=SaveConfig({
                modes.TRAIN: SaveConfigMode(save_interval=2),
                modes.EVAL: SaveConfigMode(save_interval=3),
            }),
            include_collections=["gradients", "weights"],
        )
    run_mnist_gluon_model(hook=hook,
                          set_modes=True,
                          register_to_loss_block=True,
                          num_steps_train=6,
                          num_steps_eval=6)

    tr = create_trial(path)
    assert len(tr.modes()) == 2
    assert len(tr.steps()) == 5, tr.steps()
    assert len(tr.steps(mode=modes.TRAIN)) == 3
    assert len(tr.steps(mode=modes.EVAL)) == 2, tr.steps()

    # Ensure that the gradients are available in TRAIN mode only.
    grad_tns_name = tr.tensor_names(regex="^gradient.")[0]
    grad_tns = tr.tensor(grad_tns_name)
    grad_train_steps = grad_tns.steps(mode=modes.TRAIN)
    grad_eval_steps = grad_tns.steps(mode=modes.EVAL)
    assert len(grad_train_steps) == 3
    assert grad_eval_steps == []

    # Ensure that the weights are available in both TRAIN and EVAL modes.
    # The pattern is a raw string so that "\d" reaches the regex engine
    # unchanged instead of being an invalid string escape sequence.
    wt_tns_name = tr.tensor_names(regex=r"conv\d+_weight")[0]
    wt_tns = tr.tensor(wt_tns_name)
    wt_train_steps = wt_tns.steps(mode=modes.TRAIN)
    wt_eval_steps = wt_tns.steps(mode=modes.EVAL)
    assert len(wt_train_steps) == 3
    assert len(wt_eval_steps) == 2

예제 #11

0

파일 보기

파일: test_spot_training.py 프로젝트: vandanavk/sagemaker-debugger

def test_spot_hook():
    """
    Run two trainings that share a checkpoint directory and compare saved steps.

    The checkpoint config environment variable points at a JSON config for the
    duration of both runs. The first run trains for 2 epochs, the second for
    4 epochs, both saving parameters to the same checkpoint path. Afterwards
    the first trial must not contain steps 40 or 80, while the second trial
    must contain steps 40, 50, 60, 70 and 80 but none of 0, 10, 11 or 12.

    The environment variable is removed in a ``finally`` block so that a
    failing training run cannot leak it into other tests in the same session.
    """
    os.environ[
        CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR] = "./tests/mxnet/test_json_configs/checkpointconfig.json"
    try:
        checkpoint_path = "/tmp/savedParams"
        os.makedirs(checkpoint_path, exist_ok=True)
        save_config = SaveConfig(
            save_steps=[10, 11, 12, 13, 14, 40, 50, 60, 70, 80])

        # Run the training for 2 epochs and save the parameters after every
        # epoch. Original author's expectation: only steps up to 14 are
        # written by this run.
        run_id_1 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir_1 = "/tmp/newlogsRunTest/" + run_id_1
        hook = t_hook(out_dir=out_dir_1,
                      save_config=save_config,
                      include_collections=["weights", "gradients"])
        assert has_training_ended(out_dir_1) == False
        run_mnist(
            hook=hook,
            num_steps_train=10,
            num_steps_eval=10,
            epochs=2,
            save_interval=1,
            save_path=checkpoint_path,
        )

        # Run the training again for 4 epochs and save the parameters after
        # every epoch. Steps 0 to 14 are NOT expected to be written this time;
        # steps 40, 50, 60, 70 and 80 are expected to be readable.
        run_id_2 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir_2 = "/tmp/newlogsRunTest/" + run_id_2
        hook = t_hook(out_dir=out_dir_2,
                      save_config=save_config,
                      include_collections=["weights", "gradients"])
        assert has_training_ended(out_dir_2) == False
        run_mnist(
            hook=hook,
            num_steps_train=10,
            num_steps_eval=10,
            epochs=4,
            save_interval=1,
            save_path=checkpoint_path,
        )
    finally:
        # Unset the environment variable before validation, even on failure,
        # so that it won't affect the other scripts in the pytest environment.
        os.environ.pop(CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR, None)

    # Validation
    print("Created the trial with out_dir {0} for the first training".format(
        out_dir_1))
    tr = create_trial(out_dir_1)
    assert tr
    available_steps_1 = tr.steps()
    assert 40 not in available_steps_1
    assert 80 not in available_steps_1
    print(available_steps_1)

    print("Created the trial with out_dir {0} for the second training".format(
        out_dir_2))
    tr = create_trial(out_dir_2)
    assert tr
    available_steps_2 = tr.steps()
    assert 40 in available_steps_2
    assert 50 in available_steps_2
    assert 60 in available_steps_2
    assert 70 in available_steps_2
    assert 80 in available_steps_2
    assert 0 not in available_steps_2
    assert 10 not in available_steps_2
    assert 11 not in available_steps_2
    assert 12 not in available_steps_2
    print(available_steps_2)

    print("Cleaning up.")
    # Removing the parent of out_dir_1 deletes the output of both runs, since
    # both trial directories live under the same parent directory.
    shutil.rmtree(os.path.dirname(out_dir_1))
    shutil.rmtree(checkpoint_path, ignore_errors=True)

예제 #12

0

파일 보기

def test_save_config(hook=None, out_dir=None):
    """
    Check global and per-collection reduction configs after a training run.

    Args:
        hook: An already configured hook. When ``None``, a hook is created
            here with ``save_all=True``, a global save config for steps 0-3
            and global "max"/"mean" reductions. The "ReluActivation" and
            "flatten" collections get their own save config (steps 4-6) and
            their own reduction configs.
        out_dir: Directory the trial is read from. It is replaced by a fresh
            ``/tmp/newlogsRunTest/trial_*`` path when the hook is created
            here.

    The trial must contain seven steps, and each configured reduction value
    must be retrievable. The output directory is removed only when this
    function created the hook.
    """
    hook_created = hook is None
    if hook_created:
        default_reductions = ReductionConfig(reductions=["max", "mean"])
        default_steps = SaveConfig(save_steps=[0, 1, 2, 3])

        out_dir = "/tmp/newlogsRunTest/" + "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        print("Registering the hook with out_dir {0}".format(out_dir))
        hook = t_hook(
            out_dir=out_dir,
            save_config=default_steps,
            save_all=True,
            include_collections=[
                "weights",
                "biases",
                "gradients",
                "default",
                "ReluActivation",
                "flatten",
            ],
            reduction_config=default_reductions,
        )

        # ReluActivation: saved at steps 4-6, reduced with min and abs-max.
        hook.get_collection("ReluActivation").include(["relu*"])
        hook.get_collection("ReluActivation").save_config = SaveConfig(save_steps=[4, 5, 6])
        hook.get_collection("ReluActivation").reduction_config = ReductionConfig(
            reductions=["min"],
            abs_reductions=["max"],
        )

        # flatten: saved at steps 4-6, reduced with the l1 norm and abs l2 norm.
        hook.get_collection("flatten").include(["flatten*"])
        hook.get_collection("flatten").save_config = SaveConfig(save_steps=[4, 5, 6])
        hook.get_collection("flatten").reduction_config = ReductionConfig(
            norms=["l1"],
            abs_norms=["l2"],
        )

    run_mnist_gluon_model(hook=hook, num_steps_train=10, num_steps_eval=10)

    # Testing
    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 7

    print(tr.tensor_names())

    # Global reductions (max and mean) on a conv weight at step 1.
    weight_name = tr.tensor_names(regex=r"conv\d+_weight")[0]
    weight_tensor = tr.tensor(weight_name)
    for reduction in ("max", "mean"):
        reduced = weight_tensor.reduction_value(
            step_num=1, abs=False, reduction_name=reduction)
        assert reduced is not None

    # Custom reductions at step 4: reduction 'min' and abs reduction 'max'.
    relu_name = tr.tensor_names(regex=r"conv\d+_relu_input_0")[0]
    relu_input = tr.tensor(relu_name)
    for reduction, use_abs in (("min", False), ("max", True)):
        reduced = relu_input.reduction_value(
            step_num=4, abs=use_abs, reduction_name=reduction)
        assert reduced is not None

    # Custom norm reductions at step 4: l1 norm and abs l2 norm.
    flatten_name = tr.tensor_names(regex=r"flatten\d+_input_0")[0]
    flatten_input = tr.tensor(flatten_name)
    for reduction, use_abs in (("l1", False), ("l2", True)):
        reduced = flatten_input.reduction_value(
            step_num=4, abs=use_abs, reduction_name=reduction)
        assert reduced is not None

    if hook_created:
        shutil.rmtree(out_dir)