예제 #1
0
def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 1
    first_checkpoint_id = first_trial["steps"][0]["checkpoint"]["id"]

    config_obj = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))

    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(
        config_obj)

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0]["warm_start_checkpoint_id"] == first_checkpoint_id
def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 3
    checkpoint_workloads = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoint_workloads[0].uuid

    config_obj = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))

    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(
        config_obj)

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0].trial.warmStartCheckpointUuid == first_checkpoint_uuid
예제 #3
0
def test_tf_keras_const_warm_start(tf2: bool) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 2
    first_checkpoint_id = first_trial["steps"][1]["checkpoint"]["id"]

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)

    # The new  trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for trial in trials:
        assert trial["warm_start_checkpoint_id"] == first_checkpoint_id
예제 #4
0
def test_cifar10_pytorch_distributed() -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1)
예제 #5
0
def test_unets_tf_keras_distributed() -> None:
    config = conf.load_config(
        conf.cv_examples_path("unets_tf_keras/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("unets_tf_keras"), 1)
예제 #6
0
def test_mnist_estimator_distributed() -> None:
    config = conf.load_config(
        conf.cv_examples_path("mnist_estimator/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)
예제 #7
0
def test_mnist_pytorch_multi_output() -> None:
    config = conf.load_config(
        conf.cv_examples_path("mnist_multi_output_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_multi_output_pytorch"), 1)
def test_pl_mnist() -> None:
    exp_dir = "mnist_pl"
    config = conf.load_config(conf.cv_examples_path(exp_dir + "/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config)

    exp.run_basic_test_with_temp_config(config, conf.cv_examples_path(exp_dir), 1)
예제 #9
0
def test_tf_keras_const_warm_start(
        tf2: bool, collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 4
    checkpoints = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoints[0].uuid

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)

    # The new  trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for t in trials:
        assert t.trial.warmStartCheckpointUuid != ""
        assert t.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    trial_id = trials[0].trial.id
    collect_trial_profiles(trial_id)
예제 #10
0
def test_deformabledetr_coco_pytorch_const() -> None:
    config = conf.load_config(
        conf.cv_examples_path("deformabledetr_coco_pytorch/const_fake.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("deformabledetr_coco_pytorch"), 1)
def test_cifar10_byol_pytorch_accuracy() -> None:
    config = conf.load_config(
        conf.cv_examples_path("byol_pytorch/const-cifar10.yaml"))
    # Limit convergence time, since was running over 30 minute limit.
    config["searcher"]["max_length"]["epochs"] = 20
    config["hyperparameters"]["classifier"]["train_epochs"] = 1
    config = conf.set_random_seed(config, 1591280374)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("byol_pytorch"), 1)

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0].trial.id)

    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]["test_accuracy"]
        for step in trial_metrics["steps"] if step.get("validation")
    ]

    # Accuracy reachable within limited convergence time -- goes higher given full training.
    target_accuracy = 0.40
    assert max(validation_accuracies) > target_accuracy, (
        "cifar10_byol_pytorch did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]),
            validation_accuracies))
예제 #12
0
def test_launch_layer_cifar(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config,
        "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    (Determined(conf.make_master_url()).get_trial(
        trials[0].trial.id).select_checkpoint(latest=True).load(
            map_location="cpu"))

    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )
예제 #13
0
def test_mmdetection_pytorch_const() -> None:
    config = conf.load_config(
        conf.cv_examples_path("mmdetection_pytorch/const_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mmdetection_pytorch"), 1)
예제 #14
0
def test_detr_coco_pytorch_distributed() -> None:
    config = conf.load_config(
        conf.cv_examples_path("detr_coco_pytorch/const_fake.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 2)

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("detr_coco_pytorch"), 1)
예제 #15
0
def test_tf_keras_single_gpu(tf2: bool) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
예제 #16
0
def test_pytorch_cifar10_parallel() -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    (Determined(conf.make_master_url()).get_trial(
        trials[0]["id"]).select_checkpoint(latest=True).load(
            map_location="cpu"))
예제 #17
0
def test_tf_keras_parallel(aggregation_frequency: int, tf2: bool) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
def test_unets_tf_keras_distributed() -> None:
    config = conf.load_config(conf.cv_examples_path("unets_tf_keras/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    download_dir = "/tmp/data"
    url = "https://s3-us-west-2.amazonaws.com/determined-ai-datasets/oxford_iiit_pet/oxford_iiit_pet.tar.gz"  # noqa

    with tempfile.TemporaryDirectory() as tmpdir:
        copy_destination = os.path.join(tmpdir, "example")
        shutil.copytree(conf.cv_examples_path("unets_tf_keras"), copy_destination)
        with open(os.path.join(copy_destination, "startup-hook.sh"), "a") as f:
            f.write("\n")
            f.write(f"wget -O /tmp/data.tar.gz {url}\n")
            f.write(f"mkdir {download_dir}\n")
            f.write(f"tar -xzvf /tmp/data.tar.gz -C {download_dir}\n")
        exp.run_basic_test_with_temp_config(config, copy_destination, 1)
예제 #19
0
def test_launch_layer_exit(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_entrypoint(
        config, "python3 -m nonexistent_launch_module model_def:CIFARTrial")

    experiment_id = exp.run_failure_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"))
    trials = exp.experiment_trials(experiment_id)
    Determined(conf.make_master_url()).get_trial(trials[0].trial.id)

    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id, "container failed with non-zero exit code: 1")
예제 #20
0
def test_tf_keras_single_gpu(
        tf2: bool, collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Test exporting a checkpoint.
    export_and_load_model(experiment_id)
    collect_trial_profiles(trials[0].trial.id)
예제 #21
0
def test_mnist_tf_layers_accuracy() -> None:
    config = conf.load_config(
        conf.cv_examples_path("mnist_tf_layers/const.yaml"))
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_tf_layers"), 1)

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0]["id"])

    validation_errors = [
        step["validation"]["metrics"]["validation_metrics"]["error"]
        for step in trial_metrics["steps"] if step.get("validation")
    ]

    target_error = 0.04
    assert min(validation_errors) < target_error, (
        "mnist_estimator did not reach minimum target error {} in {} steps."
        " full validation error history: {}".format(
            target_error, len(trial_metrics["steps"]), validation_errors))
예제 #22
0
def test_mnist_estimator_accuracy() -> None:
    config = conf.load_config(
        conf.cv_examples_path("mnist_estimator/const.yaml"))
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0]["id"])

    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        for step in trial_metrics["steps"] if step.get("validation")
    ]

    target_accuracy = 0.95
    assert max(validation_accuracies) > target_accuracy, (
        "mnist_estimator did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]),
            validation_accuracies))
예제 #23
0
def test_fasterrcnn_coco_pytorch_accuracy() -> None:
    config = conf.load_config(
        conf.cv_examples_path("fasterrcnn_coco_pytorch/const.yaml"))
    config = conf.set_random_seed(config, 1590497309)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("fasterrcnn_coco_pytorch"), 1)

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0]["id"])

    validation_iou = [
        step["validation"]["metrics"]["validation_metrics"]["val_avg_iou"]
        for step in trial_metrics["steps"] if step.get("validation")
    ]

    target_iou = 0.42
    assert max(validation_iou) > target_iou, (
        "fasterrcnn_coco_pytorch did not reach minimum target accuracy {} in {} steps."
        " full validation avg_iou history: {}".format(
            target_iou, len(trial_metrics["steps"]), validation_iou))
def test_mnist_estimator_const_parallel(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)
    exp.assert_performed_initial_validation(exp_id)
예제 #25
0
def test_mnist_estimator_load() -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(
        trials[0]["id"]).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)
예제 #26
0
def test_unets_tf_keras_accuracy() -> None:
    config = conf.load_config(
        conf.cv_examples_path("unets_tf_keras/const.yaml"))
    config = conf.set_random_seed(config, 1591280374)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("unets_tf_keras"), 1)

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0]["id"])

    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]["val_accuracy"]
        for step in trial_metrics["steps"] if step.get("validation")
    ]

    target_accuracy = 0.85
    assert max(validation_accuracies) > target_accuracy, (
        "unets_tf_keras did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]),
            validation_accuracies))
def test_cifar10_tf_keras_accuracy() -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_random_seed(config, 1591110586)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1, None, 6000)
    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0].trial.id)

    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]
        ["val_categorical_accuracy"] for step in trial_metrics["steps"]
        if step.get("validation")
    ]

    target_accuracy = 0.73
    assert max(validation_accuracies) > target_accuracy, (
        "cifar10_pytorch did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]),
            validation_accuracies))
예제 #28
0
def test_tf_keras_parallel(
        aggregation_frequency: int, tf2: bool,
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Test exporting a checkpoint.
    export_and_load_model(experiment_id)
    collect_trial_profiles(trials[0].trial.id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    exp_val_batches = (validation_size +
                       (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    exp.assert_patterns_in_trial_logs(trials[0].trial.id, patterns)
예제 #29
0
def test_mnist_estimmator_const_parallel(native_parallel: bool,
                                         tf2: bool) -> None:
    if tf2 and native_parallel:
        pytest.skip("TF2 native parallel training is not currently supported.")

    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, native_parallel)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config,
        conf.cv_examples_path("mnist_estimator"),
        1,
        has_zeroth_step=True)
    exp.assert_performed_initial_validation(exp_id)
예제 #30
0
def test_invalid_experiment() -> None:
    completed_process = exp.maybe_create_experiment(
        conf.fixtures_path("invalid_experiment/const.yaml"),
        conf.cv_examples_path("mnist_tf"))
    assert completed_process.returncode != 0