Пример #1
0
def test_launch_layer_cifar(
        collect_trial_profiles: Callable[[int], None]) -> None:
    """Run cifar10_pytorch through the horovod launch-layer entrypoint.

    The experiment is configured for 200 batches on one slot with profiling
    enabled.  After it finishes, the latest checkpoint of the first trial is
    loaded on CPU, the trial id is handed to ``collect_trial_profiles``, and
    the trial logs are checked for the clean-exit message.

    Args:
        collect_trial_profiles: callable invoked with the first trial's id.
    """
    yaml_path = conf.cv_examples_path("cifar10_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 1)
    cfg = conf.set_profiling_enabled(cfg)
    launch_cmd = "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    cfg = conf.set_entrypoint(cfg, launch_cmd)

    model_dir = conf.cv_examples_path("cifar10_pytorch")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    trial_list = exp.experiment_trials(exp_id)

    # Loading the checkpoint is done only for its side effect: it must not raise.
    client = Determined(conf.make_master_url())
    trial_ref = client.get_trial(trial_list[0].trial.id)
    latest_checkpoint = trial_ref.select_checkpoint(latest=True)
    latest_checkpoint.load(map_location="cpu")

    collect_trial_profiles(trial_list[0].trial.id)

    expected_log = "allocation stopped after resources exited successfully with a zero exit code"
    assert exp.check_if_string_present_in_trial_logs(
        trial_list[0].trial.id,
        expected_log,
    )
def test_pytorch_parallel() -> None:
    """Run mnist_pytorch on 8 slots and check the counts reported in the logs.

    The experiment runs for 200 batches with tensor auto-tuning and an
    initial validation enabled.  After the run, the test asserts that the
    initial validation was performed and that the first trial's logs match
    the expected "trained"/"validated" record and batch counts.
    """
    config = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tensor_auto_tuning(config, True)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1)
    exp.assert_performed_initial_validation(exp_id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    # Falls back to 100 when the config does not set scheduling_unit.
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    # Ceiling division of validation_size by per_slot_batch_size.
    exp_val_batches = (validation_size +
                       (per_slot_batch_size - 1)) // per_slot_batch_size

    train_report = (
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches"
    )
    patterns = [
        # Expect two copies of matching training reports.
        train_report,
        train_report,
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    exp.assert_patterns_in_trial_logs(trial_id, patterns)
Пример #3
0
def test_epoch_sync(num_workers: int, global_batch_size: int,
                    dataset_len: int) -> None:
    """
    Verify that every rank logs the same epoch index for each batch.

    The pytorch_no_op fixture is run for 10 batches with the given slot count,
    dataset length and global batch size.  For each batch index the expected
    epoch is derived from the number of batches per epoch (rounded up), and the
    trial logs must contain a matching "finished batch" line for every rank,
    whether or not the batch count divides evenly among the workers.
    """
    total_batches = 10

    cfg = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, num_workers)
    cfg = conf.set_max_length(cfg, {"batches": total_batches})
    cfg = conf.set_hparam(cfg, "dataset_len", dataset_len)
    cfg = conf.set_global_batch_size(cfg, global_batch_size)

    fixture_dir = conf.fixtures_path("pytorch_no_op")
    experiment_id = exp.run_basic_test_with_temp_config(cfg, fixture_dir, 1)
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id

    # Ceiling division: a partial final batch still counts as a batch.
    epoch_len = (dataset_len + global_batch_size - 1) // global_batch_size

    for step in range(total_batches):
        expected_epoch = step // epoch_len
        for worker_rank in range(cfg["resources"]["slots_per_trial"]):
            log_line = f"rank {worker_rank} finished batch {step} in epoch {expected_epoch}"
            assert exp.check_if_string_present_in_trial_logs(trial_id, log_line)
Пример #4
0
def test_detr_coco_pytorch_distributed() -> None:
    """Run detr_coco_pytorch (const_fake.yaml) for 200 batches on 2 slots."""
    yaml_path = conf.cv_examples_path("detr_coco_pytorch/const_fake.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 2)

    model_dir = conf.cv_examples_path("detr_coco_pytorch")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
Пример #5
0
def test_faster_rcnn() -> None:
    """Run the FasterRCNN_tp trial for 128 batches on a single slot.

    The 16-gpus.yaml config is loaded and then overridden to one slot per
    trial; the run is started with ``max_wait_secs=4800``.
    """
    yaml_path = conf.experimental_path("trial/FasterRCNN_tp/16-gpus.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 128})
    cfg = conf.set_slots_per_trial(cfg, 1)

    model_dir = conf.experimental_path("trial/FasterRCNN_tp")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1, max_wait_secs=4800)
Пример #6
0
def test_pytorch_const_native_parallel() -> None:
    """Run mnist_pytorch on 8 slots with native parallel enabled for 200 batches."""
    yaml_path = conf.tutorials_path("mnist_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_length(cfg, {"batches": 200})

    model_dir = conf.tutorials_path("mnist_pytorch")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
Пример #7
0
def test_language_modeling_mlm() -> None:
    """Run the huggingface language-modeling example with mlm_config.yaml.

    Uses 8 slots, a global batch size of 16 and 200 batches; the config is
    also passed through ``set_docker_image`` before the run.
    """
    example_dir = conf.model_hub_examples_path("huggingface/language-modeling")
    yaml_path = os.path.join(example_dir, "mlm_config.yaml")

    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 16)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)

    exp.run_basic_test_with_temp_config(cfg, example_dir, 1)
Пример #8
0
def test_multiple_choice_swag() -> None:
    """Run the huggingface multiple-choice example with swag_config.yaml.

    Uses 8 slots, a global batch size of 64 and 200 batches; the config is
    also passed through ``set_docker_image`` before the run.
    """
    example_dir = conf.model_hub_examples_path("huggingface/multiple-choice")
    yaml_path = os.path.join(example_dir, "swag_config.yaml")

    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 64)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)

    exp.run_basic_test_with_temp_config(cfg, example_dir, 1)
Пример #9
0
def test_pytorch_const_native_parallel() -> None:
    """Run mnist_pytorch on 8 slots with native parallel enabled for 2 steps."""
    yaml_path = conf.official_examples_path("mnist_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_steps(cfg, 2)

    model_dir = conf.official_examples_path("mnist_pytorch")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
Пример #10
0
def test_squad_v2_with_beam_search() -> None:
    """Run the huggingface question-answering example with squad_v2_beam_search.yaml.

    Uses 8 slots, a global batch size of 16 and 200 batches; the config is
    also passed through ``set_docker_image`` before the run.
    """
    example_dir = conf.model_hub_examples_path("huggingface/question-answering")
    yaml_path = os.path.join(example_dir, "squad_v2_beam_search.yaml")

    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 16)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)

    exp.run_basic_test_with_temp_config(cfg, example_dir, 1)
Пример #11
0
def test_token_classification_ner() -> None:
    """Run the huggingface token-classification example with ner_config.yaml.

    Uses 8 slots, a global batch size of 32 and 200 batches; the config is
    also passed through ``set_docker_image`` before the run.
    """
    example_dir = conf.model_hub_examples_path("huggingface/token-classification")
    yaml_path = os.path.join(example_dir, "ner_config.yaml")

    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 32)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)

    exp.run_basic_test_with_temp_config(cfg, example_dir, 1)
Пример #12
0
def run_tf_keras_dcgan_example() -> None:
    """Run the dcgan_tf_keras example on 8 slots with the TF2 image.

    Both the max length and the minimum validation period are set to
    200 batches.
    """
    yaml_path = conf.gan_examples_path("dcgan_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_tf2_image(cfg)

    model_dir = conf.gan_examples_path("dcgan_tf_keras")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
Пример #13
0
def test_tf_keras_mnist_parallel() -> None:
    """Run fashion_mnist_tf_keras on 8 slots for 200 batches; expect one trial."""
    yaml_path = conf.tutorials_path("fashion_mnist_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})

    model_dir = conf.tutorials_path("fashion_mnist_tf_keras")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    trial_list = exp.experiment_trials(exp_id)
    assert len(trial_list) == 1
Пример #14
0
def test_squad_amp() -> None:
    """Run the huggingface question-answering example (squad.yaml) with apex AMP.

    Uses 8 slots, a global batch size of 64 and 200 batches, sets the
    ``use_apex_amp`` hyperparameter to True, and passes the config through
    ``set_docker_image`` before the run.
    """
    example_dir = conf.model_hub_examples_path("huggingface/question-answering")
    yaml_path = os.path.join(example_dir, "squad.yaml")

    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 64)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_hparam(cfg, "use_apex_amp", True)
    cfg = set_docker_image(cfg)

    exp.run_basic_test_with_temp_config(cfg, example_dir, 1)
Пример #15
0
def test_text_classification_xnli_amp() -> None:
    """Run the huggingface text-classification example (xnli_config.yaml) with apex AMP.

    Uses 8 slots, a global batch size of 128 and 200 batches, sets the
    ``use_apex_amp`` hyperparameter to True, and passes the config through
    ``set_docker_image`` before the run.
    """
    example_dir = conf.model_hub_examples_path("huggingface/text-classification")
    yaml_path = os.path.join(example_dir, "xnli_config.yaml")

    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 128)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_hparam(cfg, "use_apex_amp", True)
    cfg = set_docker_image(cfg)

    exp.run_basic_test_with_temp_config(cfg, example_dir, 1)
Пример #16
0
def test_pytorch_parallel() -> None:
    """Run mnist_pytorch on 8 slots and assert the initial validation happened.

    The run lasts 200 batches with tensor auto-tuning and initial validation
    enabled, and is started with ``has_zeroth_step=True``.
    """
    yaml_path = conf.tutorials_path("mnist_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_tensor_auto_tuning(cfg, True)
    cfg = conf.set_perform_initial_validation(cfg, True)

    model_dir = conf.tutorials_path("mnist_pytorch")
    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, model_dir, 1, has_zeroth_step=True)
    exp.assert_performed_initial_validation(experiment_id)
Пример #17
0
def test_pytorch_parallel() -> None:
    """Run trial/mnist_pytorch on 8 slots for 200 batches.

    Native parallel is explicitly disabled and tensor auto-tuning is enabled.
    """
    yaml_path = conf.official_examples_path("trial/mnist_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, False)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_tensor_auto_tuning(cfg, True)

    model_dir = conf.official_examples_path("trial/mnist_pytorch")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
Пример #18
0
def test_tensorpack_native_parallel() -> None:
    """Run trial/mnist_tp with native parallel on 8 slots; expect one trial.

    The run is limited to 32 batches.
    """
    yaml_path = conf.official_examples_path("trial/mnist_tp/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_length(cfg, {"batches": 32})

    model_dir = conf.official_examples_path("trial/mnist_tp")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    trial_list = exp.experiment_trials(exp_id)
    assert len(trial_list) == 1
Пример #19
0
def test_estimator_when_detecting_gpus() -> None:
    """Run the estimator_gpu_detection fixture on 8 slots for 200 batches.

    The run is started with ``has_zeroth_step=False``.
    """
    yaml_path = conf.fixtures_path("estimator_gpu_detection/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})

    fixture_dir = conf.fixtures_path("estimator_gpu_detection/")
    exp.run_basic_test_with_temp_config(
        cfg, fixture_dir, 1, has_zeroth_step=False)
Пример #20
0
def test_tf_keras_mnist_parallel() -> None:
    """Run fashion_mnist_tf_keras on 8 slots for 2 steps; expect one trial.

    Native parallel is explicitly disabled.
    """
    yaml_path = conf.official_examples_path("fashion_mnist_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, False)
    cfg = conf.set_max_steps(cfg, 2)

    model_dir = conf.official_examples_path("fashion_mnist_tf_keras")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    trial_list = exp.experiment_trials(exp_id)
    assert len(trial_list) == 1
Пример #21
0
def test_tf_keras_single_gpu(tf2: bool) -> None:
    """Run cifar10_cnn_tf_keras on one slot for 2 steps; expect one trial.

    Args:
        tf2: when true the TF2 image is applied to the config, otherwise the
            TF1 image.
    """
    yaml_path = conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 1)
    cfg = conf.set_max_steps(cfg, 2)
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)

    model_dir = conf.official_examples_path("cifar10_cnn_tf_keras")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    trial_list = exp.experiment_trials(exp_id)
    assert len(trial_list) == 1
def test_on_trial_close_callback() -> None:
    """Run estimator_no_op on 8 slots for 3 batches and check the close log line.

    The first trial's logs must contain "rank 0 has completed on_trial_close".
    """
    yaml_path = conf.fixtures_path("estimator_no_op/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 3})

    fixture_dir = conf.fixtures_path("estimator_no_op")
    experiment_id = exp.run_basic_test_with_temp_config(cfg, fixture_dir, 1)

    first_trial_id = exp.experiment_trials(experiment_id)[0].trial.id
    assert exp.check_if_string_present_in_trial_logs(
        first_trial_id, "rank 0 has completed on_trial_close")
def test_mnist_estimator_const_parallel(tf2: bool) -> None:
    """Run mnist_estimator on 8 slots and assert the initial validation happened.

    The config comes from the fixtures tree (single-multi-slot.yaml) while the
    model directory comes from the cv examples tree.  The run lasts 200 batches
    with initial validation enabled.

    Args:
        tf2: when true the TF2 image is applied to the config, otherwise the
            TF1 image.
    """
    yaml_path = conf.fixtures_path("mnist_estimator/single-multi-slot.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    cfg = conf.set_perform_initial_validation(cfg, True)

    model_dir = conf.cv_examples_path("mnist_estimator")
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    exp.assert_performed_initial_validation(experiment_id)
Пример #24
0
def test_pytorch_gan_parallel() -> None:
    """Run gan_mnist_pytorch on 8 slots for 200 batches, then load its checkpoint.

    The latest checkpoint of the first trial is loaded on CPU; the loaded
    object is discarded, so the load only has to succeed.
    """
    yaml_path = conf.gan_examples_path("gan_mnist_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)

    model_dir = conf.gan_examples_path("gan_mnist_pytorch")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    trial_list = exp.experiment_trials(exp_id)

    client = Determined(conf.make_master_url())
    trial_ref = client.get_trial(trial_list[0]["id"])
    latest_checkpoint = trial_ref.select_checkpoint(latest=True)
    latest_checkpoint.load(map_location="cpu")
Пример #25
0
def test_tensorpack_parallel(aggregation_frequency: int) -> None:
    """Run mnist_tp on 8 slots for 2 steps; expect one trial.

    Native parallel is explicitly disabled.

    Args:
        aggregation_frequency: value written into the config through
            ``conf.set_aggregation_frequency``.
    """
    yaml_path = conf.official_examples_path("mnist_tp/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, False)
    cfg = conf.set_max_steps(cfg, 2)
    cfg = conf.set_aggregation_frequency(cfg, aggregation_frequency)

    model_dir = conf.official_examples_path("mnist_tp")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    trial_list = exp.experiment_trials(exp_id)
    assert len(trial_list) == 1
Пример #26
0
def test_pytorch_const_parallel(aggregation_frequency: int, use_amp: bool) -> None:
    """Run mnist_pytorch on 8 slots for 200 batches.

    The test is skipped when AMP is requested together with an aggregation
    frequency greater than 1.

    Args:
        aggregation_frequency: value written into the config through
            ``conf.set_aggregation_frequency``.
        use_amp: when true, the AMP level is set to "O1".
    """
    amp_with_aggregation = use_amp and aggregation_frequency > 1
    if amp_with_aggregation:
        pytest.skip("Mixed precision is not support with aggregation frequency > 1.")

    yaml_path = conf.tutorials_path("mnist_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_aggregation_frequency(cfg, aggregation_frequency)
    if use_amp:
        cfg = conf.set_amp_level(cfg, "O1")

    model_dir = conf.tutorials_path("mnist_pytorch")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
Пример #27
0
def test_tf_keras_parallel(aggregation_frequency: int, tf2: bool) -> None:
    """Run cifar10_tf_keras on 8 slots for 200 batches; expect one trial.

    Args:
        aggregation_frequency: value written into the config through
            ``conf.set_aggregation_frequency``.
        tf2: when true the TF2 image is applied to the config, otherwise the
            TF1 image.
    """
    yaml_path = conf.cv_examples_path("cifar10_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_aggregation_frequency(cfg, aggregation_frequency)
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)

    model_dir = conf.cv_examples_path("cifar10_tf_keras")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    trial_list = exp.experiment_trials(exp_id)
    assert len(trial_list) == 1
Пример #28
0
def test_pytorch_cifar10_parallel() -> None:
    """Run trial/cifar10_cnn_pytorch on 8 slots for 2 steps and load the result.

    The latest checkpoint of the first trial is loaded and must be a
    ``torch.nn.Module``.
    """
    yaml_path = conf.official_examples_path("trial/cifar10_cnn_pytorch/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_steps(cfg, 2)
    cfg = conf.set_slots_per_trial(cfg, 8)

    model_dir = conf.official_examples_path("trial/cifar10_cnn_pytorch")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    trial_list = exp.experiment_trials(exp_id)

    client = Determined(conf.make_master_url())
    trial_ref = client.get_trial(trial_list[0]["id"])
    loaded_model = trial_ref.select_checkpoint(latest=True).load()
    assert isinstance(loaded_model, torch.nn.Module)
Пример #29
0
def test_distributed_logging() -> None:
    """Run pytorch_no_op on 8 slots for one batch and check per-rank log lines.

    For every rank in ``range(slots_per_trial)`` the trial logs must contain
    "finished train_batch for rank <rank>".
    """
    yaml_path = conf.fixtures_path("pytorch_no_op/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 1})

    fixture_dir = conf.fixtures_path("pytorch_no_op")
    experiment_id = exp.run_basic_test_with_temp_config(cfg, fixture_dir, 1)
    trial_id = exp.experiment_trials(experiment_id)[0]["id"]

    for rank in range(cfg["resources"]["slots_per_trial"]):
        expected_line = "finished train_batch for rank {}".format(rank)
        assert exp.check_if_string_present_in_trial_logs(trial_id, expected_line)
Пример #30
0
def test_tf_keras_native_parallel(tf2: bool) -> None:
    """Run trial/cifar10_cnn_tf_keras with native parallel on 8 slots; expect one trial.

    The run lasts 200 batches.

    Args:
        tf2: when true the TF2 image is applied to the config, otherwise the
            TF1 image.
    """
    yaml_path = conf.official_examples_path("trial/cifar10_cnn_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)

    model_dir = conf.official_examples_path("trial/cifar10_cnn_tf_keras")
    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    trial_list = exp.experiment_trials(exp_id)
    assert len(trial_list) == 1