Example #1
def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")

    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
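    # Dump the modified config to a temporary file and create the experiment from it.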
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

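    # Exercise the profiling metric APIs; GPU and timing series are only requested when enabled.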
    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")
Example #2
def test_tf_keras_const_warm_start(
        tf2: bool, collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

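    # Run a short baseline experiment whose checkpoint will seed the warm start below.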
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 4
    checkpoints = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoints[0].uuid

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)

    # The new trial should have a warm-start checkpoint UUID that matches the first checkpoint.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for t in trials:
        assert t.trial.warmStartCheckpointUuid != ""
        assert t.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    trial_id = trials[0].trial.id
    collect_trial_profiles(trial_id)
Example #3
def test_launch_layer_cifar(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config,
        "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )

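    # Run with the explicit launch-layer entrypoint, then load the latest checkpoint on CPU to verify it.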
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    (Determined(conf.make_master_url()).get_trial(
        trials[0].trial.id).select_checkpoint(latest=True).load(
            map_location="cpu"))

    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )

def test_pytorch_const_with_amp(
        api_style: str, collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.fixtures_path("pytorch_amp/" + api_style + "_amp.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_profiling_enabled(config)

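    # Run the AMP fixture end to end and collect its trial profile.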
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("pytorch_amp"), 1)
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id
    collect_trial_profiles(trial_id)

def test_pytorch_load(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1)

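    # Load the experiment's top checkpoint onto the CPU to confirm it can be restored.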
    (Determined(conf.make_master_url()).get_experiment(
        experiment_id).top_checkpoint().load(map_location="cpu"))
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id
    collect_trial_profiles(trial_id)
Example #6
def test_tf_keras_mnist_parallel(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.tutorials_path("fashion_mnist_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_profiling_enabled(config)

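    # Run the 8-slot Fashion MNIST experiment and verify it produced a single trial.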
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("fashion_mnist_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
    collect_trial_profiles(trials[0].trial.id)
Example #7
def run_tf_keras_dcgan_example(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.gan_examples_path("dcgan_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config)
    config = conf.set_profiling_enabled(config)

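    # Run the DCGAN example on 8 slots and collect the first trial's profile.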
    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.gan_examples_path("dcgan_tf_keras"), 1)
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    collect_trial_profiles(trial_id)
Example #8
def run_tf_keras_mnist_data_layer_test(tf2: bool, storage_type: str) -> int:
    config = conf.load_config(
        conf.fixtures_path("data_layer_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

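    # Select the data layer storage backend that matches the storage type under test.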
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    return exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("data_layer_tf_keras"), 1)
Example #9
def test_tf_keras_tf2_disabled(
        collect_trial_profiles: Callable[[int], None]) -> None:
    """Keras on tf2 with tf2 and eager execution disabled."""
    config = conf.load_config(
        conf.fixtures_path("keras_tf2_disabled_no_op/const.yaml"))
    config = conf.set_max_length(config, {"batches": 1})
    config = conf.set_tf2_image(config)
    config = conf.set_profiling_enabled(config)

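    # Run the no-op fixture, then make sure the resulting checkpoint can be exported and reloaded.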
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("keras_tf2_disabled_no_op"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
    export_and_load_model(experiment_id)
    collect_trial_profiles(trials[0].trial.id)

def test_pytorch_gan_parallel(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_profiling_enabled(config)

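    # Run the GAN example on 8 slots and load its latest checkpoint on CPU.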
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.gan_examples_path("gan_mnist_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    (Determined(conf.make_master_url()).get_trial(
        trials[0].trial.id).select_checkpoint(latest=True).load(
            map_location="cpu"))
    collect_trial_profiles(trials[0].trial.id)
Example #11
def test_tf_keras_single_gpu(
        tf2: bool, collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Test exporting a checkpoint.
    export_and_load_model(experiment_id)
    collect_trial_profiles(trials[0].trial.id)
Example #12
def test_tf_keras_mnist_data_layer_parallel(
    tf2: bool,
    storage_type: str,
    secrets: Dict[str, str],
    collect_trial_profiles: Callable[[int], None],
) -> None:
    config = conf.load_config(
        conf.fixtures_path("data_layer_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

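    # Point the data layer cache at shared storage or S3, depending on the parametrization.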
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("data_layer_tf_keras"), 1)

    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    collect_trial_profiles(trial_id)
Example #13
def test_tf_keras_parallel(
        aggregation_frequency: int, tf2: bool,
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Test exporting a checkpoint.
    export_and_load_model(experiment_id)
    collect_trial_profiles(trials[0].trial.id)

    # Check the record/batch counts emitted in the trial logs.
    validation_size = 10000
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
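    # Ceiling division: the final validation batch may hold fewer than per_slot_batch_size records.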
    exp_val_batches = (validation_size +
                       (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # 200 training batches at a scheduling unit of 100 yields two identical training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    exp.assert_patterns_in_trial_logs(trials[0].trial.id, patterns)

def test_pytorch_11_const(
        aggregation_frequency: int, using_k8s: bool,
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_profiling_enabled(config)

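    # On Kubernetes, attach a pod spec that mounts an emptyDir scratch volume into the task container.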
    if using_k8s:
        pod_spec = {
            "metadata": {"labels": {"ci": "testing"}},
            "spec": {
                "containers": [
                    {
                        "name": "determined-container",
                        "volumeMounts": [{"name": "temp1", "mountPath": "/random"}],
                    }
                ],
                "volumes": [{"name": "temp1", "emptyDir": {}}],
            },
        }
        config = conf.set_pod_spec(config, pod_spec)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1)
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id
    collect_trial_profiles(trial_id)