def test_no_failure_with_torch_mp(out_dir):
    shutil.rmtree(out_dir, ignore_errors=True)
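    # build_json is a helper defined elsewhere in this test suite; it is assumed to write an
    # smdebug (SageMaker Debugger) JSON hook configuration with the given options to out_dir
    # and return its path.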
    path = build_json(out_dir, save_all=True, save_interval="1")
    path = str(path)
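    # The smdebug hook reads its configuration from the SMDEBUG_CONFIG_FILE_PATH environment variable.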
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = path
    device = "cpu"
    dataloader_kwargs = {}
    cpu_count = 2 if mp.cpu_count() > 2 else mp.cpu_count()

    torch.manual_seed(1)

    model = Net().to(device)
    model.share_memory()  # gradients are allocated lazily, so they are not shared here

    processes = []
    for rank in range(cpu_count):
        p = mp.Process(target=train,
                       args=(rank, model, device, dataloader_kwargs))
        # We first train the model across `num_processes` processes
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    trial = create_trial(out_dir)
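    # create_trial (from smdebug.trials) loads the tensors the hook wrote to out_dir for offline inspection.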

    assert trial.num_workers == 1  # Ensure only one worker saved data
    assert len(trial.tensor_names()) > 20  # Ensure that data was saved
    assert trial.steps() == [0, 1, 2, 3]  # Ensure that steps were saved
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)
Example #2
def mode_allworkers_saveall(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="all",
                      save_all=True,
                      include_collections=["weights", "gradients"])
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS) + [
        "--model_dir",
        os.path.join(out_dir, "checkpoint"),
    ]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=
        f"examples/tensorflow/sagemaker_official_container/{HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH}",
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) > 99
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
    assert len(tr.tensor(
        tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
Example #3
def basic_test(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="one",
                      include_collections=["weights", "gradients"])
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_MNIST_ARGS) + [
        "--model_dir", os.path.join(out_dir, "checkpoint")
    ]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=
        f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )

    tr = create_trial(out_dir)
    print(tr.tensor_names())
    assert len(tr.workers()) == 1
    assert len(tr.tensor_names()) == 13
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == 1
Example #4
def test_training_with_no_grad_updates():
    temp_dir = TemporaryDirectory().name
    path = build_json(temp_dir,
                      include_collections=["losses"],
                      save_interval="1")
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = str(path)
    do_training()
    trial = create_trial(temp_dir)
    assert len(trial.steps()) == 99
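Example #5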
def mode_allworkers_saveall(out_dir, mode):
    path = build_json(out_dir, include_workers="all", save_all=True)
    num_workers = len(get_available_gpus())
    mode_args = ["--model_dir", out_dir]
    launch_smdataparallel_job(
        script_file_path=SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) == 35
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
    assert len(tr.tensor("loss").workers(0)) == num_workers
def mode_allworkers_saveall(out_dir, mode):
    path = build_json(out_dir, include_workers="all", save_all=True)
    # device_count() is assumed to return the number of available CUDA devices; fall back to one CPU worker.
    num_workers = device_count() or 1
    mode_args = list(SMDATAPARALLEL_PYTORCH_TEST_MNIST_ARGS)
    launch_smdataparallel_job(
        script_file_path=SMDATAPARALLEL_PYTORCH_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) > 25
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
    assert len(tr.tensor(
        tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
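Example #7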
def mode_allworkers_default_collections(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="all",
                      include_collections=TF_DEFAULT_SAVED_COLLECTIONS)
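    # Judging by the assertions below, TF_DEFAULT_SAVED_COLLECTIONS is expected to resolve
    # to a single collection (losses) for this script.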
    num_workers = len(get_available_gpus())
    mode_args = ["--model_dir", out_dir]
    launch_smdataparallel_job(
        script_file_path=SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) == 1
    assert len(tr.tensor(
        tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
def mode_allworkers(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="all",
                      include_collections=["weights", "optimizer_variables"])
    num_workers = len(get_available_gpus())
    mode_args = ["--model_dir", out_dir]
    launch_smdataparallel_job(
        script_file_path=SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    print("tensor names: ", tr.tensor_names())
    assert len(tr.tensor_names()) == 5
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
Example #9
def test_no_failure_with_torch_mp(out_dir):
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)

    print("Downloading the MNIST dataset")
    os.system(f"mkdir {data_dir}")
    s3_client = boto3.client("s3")
    s3_client.download_file("smdebug-testing", "datasets/MNIST_pytorch.tar.gz",
                            f"{data_dir}/MNIST_pytorch.tar.gz")
    os.system(f"tar -zxf {data_dir}/MNIST_pytorch.tar.gz")
    os.system(f"mv MNIST {data_dir}")
    path = build_json(out_dir, save_all=True, save_interval="1")
    path = str(path)
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = path
    device = "cpu"
    dataloader_kwargs = {}
    cpu_count = 2 if mp.cpu_count() > 2 else mp.cpu_count()

    torch.manual_seed(1)
    model = Net().to(device)
    model.share_memory()  # gradients are allocated lazily, so they are not shared here

    processes = []
    print(f"Starting the training for {cpu_count} ")
    for rank in range(cpu_count):
        p = mp.Process(target=train,
                       args=(rank, model, device, dataloader_kwargs))
        # We first train the model across `num_processes` processes
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    print("Finished the training..")

    trial = create_trial(out_dir)

    assert trial.num_workers == 1  # Ensure only one worker saved data
    assert len(trial.tensor_names()) > 20  # Ensure that data was saved
    assert trial.steps() == [0, 1, 2, 3]  # Ensure that steps were saved
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)
Example #10
def mode_allworkers(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="all",
                      include_collections=["weights", "optimizer_variables"])
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_KERAS_TEST_SCRIPT_ARGS) + ["--model_dir", out_dir]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=HOROVOD_TF2_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) == (13 if is_tf_2_2() else 14)
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
Example #11
def mode_allworkers_saveall(out_dir, mode):
    path = build_json(out_dir, include_workers="all", save_all=True)
    # device_count() is assumed to return the number of available CUDA devices; fall back to one CPU worker.
    num_workers = device_count() or 1
    mode_args = []
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=HOROVOD_PYTORCH_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) > 25
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
    assert len(tr.tensor(
        tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
Example #12
def mode_one_worker(out_dir, mode):
    path = build_json(out_dir,
                      include_workers="one",
                      include_collections=["weights", "gradients"])
    num_workers = device_count()
    mode_args = []
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=HOROVOD_PYTORCH_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )

    tr = create_trial(out_dir)
    assert len(tr.workers()) == 1  # We expect only one worker because the config
    # above was built with include_workers="one"
    assert len(tr.tensor_names()) == 13
    assert len(tr.tensor(
        tr.tensor_names(collection="weights")[0]).workers(0)) == 1
    assert len(tr.tensor(
        tr.tensor_names(collection="losses")[0]).workers(0)) == 1