예제 #1
0
def test_mask_rcnn_64_slots() -> None:
    """Run the FasterRCNN_tp 64-GPU config and check accuracy and timings.

    The experiment is launched through ``exp.run_basic_test`` with a wait
    limit of five hours (``5 * 60 * 60`` seconds). Afterwards the
    ``mAP(bbox)/IoU=0.5:0.95`` validation metric of the last step and the
    experiment durations are read back, printed, and asserted against fixed
    thresholds.
    """
    config_file = conf.experimental_path("FasterRCNN_tp/64-gpus.yaml")
    model_dir = conf.experimental_path("FasterRCNN_tp/")
    experiment_id = exp.run_basic_test(
        config_file, model_dir, 1, max_wait_secs=5 * 60 * 60)

    metric_name = "mAP(bbox)/IoU=0.5:0.95"
    metric_value = exp.get_validation_metric_from_last_step(
        experiment_id, 0, metric_name)
    timings = exp.get_experiment_durations(experiment_id, 0)

    # Whatever part of the experiment duration is not accounted for by
    # training, validation or checkpointing is reported as agent wait time.
    unaccounted = timings.experiment_duration - timings.training_duration
    unaccounted = unaccounted - timings.validation_duration
    wait_for_agents_time = unaccounted - timings.checkpoint_duration

    print(metric_name, metric_value)
    print(timings)
    print(f"wait for agents duration: {wait_for_agents_time}")

    assert metric_value > 0.375

    training_limit = datetime.timedelta(hours=2, minutes=45)
    assert timings.training_duration < training_limit

    validation_limit = datetime.timedelta(hours=1, minutes=15)
    assert timings.validation_duration < validation_limit
예제 #2
0
def test_pytorch_const_multi_output() -> None:
    """Run the mnist_pytorch_multi_output const.yaml config for 2 max steps.

    NOTE(review): the trailing ``1`` passed to the runner is presumably the
    expected number of trials -- confirm against
    ``exp.run_basic_test_with_temp_config``.
    """
    config_file = conf.experimental_path(
        "mnist_pytorch_multi_output/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("mnist_pytorch_multi_output")

    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
예제 #3
0
def test_resnet50() -> None:
    """Run the resnet50_tf_keras const.yaml config for 2 max steps.

    NOTE(review): the trailing ``1`` passed to the runner is presumably the
    expected number of trials -- confirm against
    ``exp.run_basic_test_with_temp_config``.
    """
    config_file = conf.experimental_path("resnet50_tf_keras/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("resnet50_tf_keras")

    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
예제 #4
0
def test_bert_glue() -> None:
    """Run the bert_glue_pytorch const.yaml config for 2 max steps.

    NOTE(review): the trailing ``1`` passed to the runner is presumably the
    expected number of trials -- confirm against
    ``exp.run_basic_test_with_temp_config``.
    """
    config_file = conf.experimental_path("bert_glue_pytorch/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    # The model directory is spelled with a trailing slash, as in the
    # original call.
    model_dir = conf.experimental_path("bert_glue_pytorch/")

    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
예제 #5
0
def test_mnist_tp_to_estimator() -> None:
    """Run the mnist_tp_to_estimator const.yaml config for 2 max steps.

    NOTE(review): the trailing ``1`` passed to the runner is presumably the
    expected number of trials -- confirm against
    ``exp.run_basic_test_with_temp_config``.
    """
    config_file = conf.experimental_path("mnist_tp_to_estimator/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("mnist_tp_to_estimator")

    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
예제 #6
0
def test_nas_search() -> None:
    """Run the nas_search train_one_arch.yaml config for 2 max steps.

    NOTE(review): the trailing ``1`` passed to the runner is presumably the
    expected number of trials -- confirm against
    ``exp.run_basic_test_with_temp_config``.
    """
    config_file = conf.experimental_path("nas_search/train_one_arch.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("nas_search")

    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
예제 #7
0
def test_faster_rcnn() -> None:
    """Run the FasterRCNN_tp 16-GPU config shrunk to 2 max steps and 1 slot.

    The config is loaded from ``FasterRCNN_tp/16-gpus.yaml``, then its max
    steps are set to 2 and its slots per trial to 1 before it is handed to
    the runner with ``max_wait_secs=4800``.
    """
    config_file = conf.experimental_path("FasterRCNN_tp/16-gpus.yaml")
    shrunk_config = conf.load_config(config_file)
    shrunk_config = conf.set_max_steps(shrunk_config, 2)
    shrunk_config = conf.set_slots_per_trial(shrunk_config, 1)
    model_dir = conf.experimental_path("FasterRCNN_tp")

    exp.run_basic_test_with_temp_config(
        shrunk_config, model_dir, 1, max_wait_secs=4800)
예제 #8
0
def run_tf_keras_mnist_data_layer_test(tf2: bool, storage_type: str) -> None:
    """Run the data_layer_mnist_tf_keras example for 2 max steps.

    Args:
        tf2: When truthy the config is passed through ``conf.set_tf2_image``,
            otherwise through ``conf.set_tf1_image``.
        storage_type: ``"lfs"`` selects ``conf.set_shared_fs_data_layer``;
            any other value selects ``conf.set_s3_data_layer``.
    """
    config_file = conf.experimental_path(
        "data_layer_mnist_tf_keras/const.yaml")
    config = conf.set_max_steps(conf.load_config(config_file), 2)

    if tf2:
        config = conf.set_tf2_image(config)
    else:
        config = conf.set_tf1_image(config)

    # Only "lfs" gets the shared-filesystem data layer; everything else
    # falls through to S3.
    apply_data_layer = (conf.set_shared_fs_data_layer
                        if storage_type == "lfs" else conf.set_s3_data_layer)
    config = apply_data_layer(config)

    model_dir = conf.experimental_path("data_layer_mnist_tf_keras")
    exp.run_basic_test_with_temp_config(config, model_dir, 1)
예제 #9
0
def test_mnist_estimator_data_layer_parallel(storage_type: str) -> None:
    """Run the data_layer_mnist_estimator example on 8 slots for 2 max steps.

    The config is also passed through ``conf.set_tf1_image``.

    Args:
        storage_type: ``"lfs"`` selects ``conf.set_shared_fs_data_layer``;
            any other value selects ``conf.set_s3_data_layer``.
    """
    config_file = conf.experimental_path(
        "data_layer_mnist_estimator/const.yaml")
    config = conf.load_config(config_file)
    config = conf.set_max_steps(config, 2)
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf1_image(config)

    # Only "lfs" gets the shared-filesystem data layer; everything else
    # falls through to S3.
    apply_data_layer = (conf.set_shared_fs_data_layer
                        if storage_type == "lfs" else conf.set_s3_data_layer)
    config = apply_data_layer(config)

    model_dir = conf.experimental_path("data_layer_mnist_estimator")
    exp.run_basic_test_with_temp_config(config, model_dir, 1)
예제 #10
0
def test_mnist_estimator_adaptive_with_data_layer() -> None:
    """Run the mnist_estimator adaptive fixture config with a data layer.

    The config comes from the fixtures directory
    (``mnist_estimator/adaptive.yaml``), is passed through
    ``conf.set_tf2_image`` and ``conf.set_shared_fs_data_layer``, and is run
    against the ``data_layer_mnist_estimator`` example directory. ``None``
    is passed as the runner's third argument.
    """
    config_file = conf.fixtures_path("mnist_estimator/adaptive.yaml")
    adaptive_config = conf.set_shared_fs_data_layer(
        conf.set_tf2_image(conf.load_config(config_file)))
    model_dir = conf.experimental_path("data_layer_mnist_estimator")

    exp.run_basic_test_with_temp_config(adaptive_config, model_dir, None)
예제 #11
0
class NativeImplementations:
    """Named ``NativeImplementation`` entries for the native example scripts.

    Each class attribute bundles, for one example under
    ``conf.experimental_path``: the working directory, the command line to
    run, the experiment configuration, the expected number of steps per
    trial and of trials, and the minimum number of GPUs required. Every
    entry visible here uses the shared-filesystem checkpoint config and a
    ``single`` searcher with ``max_steps`` of 1.

    NOTE(review): ``NativeImplementation`` is defined outside this view;
    confirm there how these fields are consumed.
    """

    # PyTorch MNIST: runs trial_impl.py from native_mnist_pytorch; searcher
    # metric is "validation_error", no restarts allowed.
    PytorchMNISTCNNSingleGeneric = NativeImplementation(
        cwd=conf.experimental_path("native_mnist_pytorch"),
        command=[
            "python",
            conf.experimental_path("native_mnist_pytorch/trial_impl.py")
        ],
        configuration={
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "validation_error"
            },
            "max_restarts": 0,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )
    # TF Estimator MNIST: runs native_impl.py from native_mnist_estimator;
    # searcher metric is "accuracy", no restarts allowed.
    TFEstimatorMNISTCNNSingle = NativeImplementation(
        cwd=conf.experimental_path("native_mnist_estimator"),
        command=[
            "python",
            conf.experimental_path("native_mnist_estimator/native_impl.py")
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "accuracy"
            },
            "max_restarts": 0,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    # Same configuration as TFEstimatorMNISTCNNSingle, but runs trial_impl.py
    # instead of native_impl.py.
    TFEstimatorMNISTCNNSingleGeneric = NativeImplementation(
        cwd=conf.experimental_path("native_mnist_estimator"),
        command=[
            "python",
            conf.experimental_path("native_mnist_estimator/trial_impl.py")
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "accuracy"
            },
            "max_restarts": 0,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    # Train a single tf.keras model using fit().
    # Runs native_impl.py with the "--use-fit" flag; up to 2 restarts.
    TFKerasMNISTCNNSingleFit = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path(
                "native_fashion_mnist_tf_keras/native_impl.py"),
            "--use-fit",
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy"
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    # Train a single tf.keras model using fit() on multiple GPUs.
    # Differs from TFKerasMNISTCNNSingleFit by requesting 2 slots per trial
    # and requiring at least 2 GPUs.
    TFKerasMNISTCNNSingleFitParallel = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path(
                "native_fashion_mnist_tf_keras/native_impl.py"),
            "--use-fit",
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy"
            },
            "resources": {
                "slots_per_trial": 2
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=2,
    )

    # Train a single tf.keras model using fit_generator().
    # Same script as the fit() entries, but invoked without "--use-fit".
    TFKerasMNISTCNNSingleFitGenerator = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path(
                "native_fashion_mnist_tf_keras/native_impl.py")
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy"
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    # tf.keras Fashion-MNIST entry that runs trial_impl.py instead of
    # native_impl.py; otherwise configured like the single-GPU entries above.
    TFKerasMNISTCNNSingleGeneric = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path(
                "native_fashion_mnist_tf_keras/trial_impl.py")
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy"
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )