Exemplo n.º 1
0
def test_sdk_e2e():
    """Run an XGBoostJob end to end through the SDK client.

    Builds a job with one Master and one Worker replica (both using the same
    container definition), submits it with ``XGBOOST_CLIENT``, waits for it,
    checks that the client reports success and fetches its logs.  The job is
    deleted afterwards whether or not those steps passed.

    Raises:
        RuntimeError: If the client reports that the job did not succeed.
    """
    job_name = "xgboostjob-iris-ci-test"

    container = V1Container(
        name="xgboost",
        image="docker.io/merlintang/xgboost-dist-iris:1.1",
        args=[
            "--job_type=Train",
            "--xgboost_parameter=objective:multi:softprob,num_class:3",
            "--n_estimators=10",
            "--learning_rate=0.1",
            "--model_path=/tmp/xgboost-model",
            "--model_storage_type=local",
        ],
    )

    master = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(spec=V1PodSpec(containers=[container])),
    )

    worker = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(spec=V1PodSpec(containers=[container])),
    )

    xgboostjob = KubeflowOrgV1XGBoostJob(
        api_version="kubeflow.org/v1",
        kind="XGBoostJob",
        metadata=V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE),
        spec=KubeflowOrgV1XGBoostJobSpec(
            run_policy=V1RunPolicy(clean_pod_policy="None"),
            xgb_replica_specs={"Master": master, "Worker": worker},
        ),
    )

    XGBOOST_CLIENT.create(xgboostjob)

    try:
        XGBOOST_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)
        if not XGBOOST_CLIENT.is_job_succeeded(job_name,
                                               namespace=SDK_TEST_NAMESPACE):
            raise RuntimeError("The XGBoostJob is not succeeded.")

        XGBOOST_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE)
    finally:
        # Clean up even when a step above raises, so a failed run does not
        # leave a job with this fixed name behind in the test namespace.
        XGBOOST_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
Exemplo n.º 2
0
def test_sdk_e2e():
    """Run a PyTorchJob end to end through the SDK client.

    Builds a job with one Master and one Worker replica (both using the same
    container definition), submits it with ``PYTORCH_CLIENT``, waits for it,
    checks that the client reports success and fetches its logs.  The job is
    deleted afterwards whether or not those steps passed.

    Raises:
        RuntimeError: If the client reports that the job did not succeed.
    """
    job_name = "pytorchjob-mnist-ci-test"

    container = V1Container(
        name="pytorch",
        image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
        args=["--backend", "gloo"],
    )

    master = V1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(spec=V1PodSpec(containers=[container])),
    )

    worker = V1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(spec=V1PodSpec(containers=[container])),
    )

    pytorchjob = KubeflowOrgV1PyTorchJob(
        api_version="kubeflow.org/v1",
        kind="PyTorchJob",
        metadata=V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE),
        spec=KubeflowOrgV1PyTorchJobSpec(
            run_policy=V1RunPolicy(clean_pod_policy="None"),
            pytorch_replica_specs={"Master": master, "Worker": worker},
        ),
    )

    PYTORCH_CLIENT.create(pytorchjob)

    try:
        PYTORCH_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)
        if not PYTORCH_CLIENT.is_job_succeeded(job_name,
                                               namespace=SDK_TEST_NAMESPACE):
            raise RuntimeError("The PyTorchJob is not succeeded.")

        PYTORCH_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE)
    finally:
        # Clean up even when a step above raises, so a failed run does not
        # leave a job with this fixed name behind in the test namespace.
        PYTORCH_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
Exemplo n.º 3
0
def test_sdk_e2e():
    """Run a TFJob end to end through the SDK client.

    Builds a job with a single Worker replica, submits it with
    ``TFJOB_CLIENT``, waits for it, checks that the client reports success
    and fetches the logs (with ``master=False``).  The job is deleted
    afterwards whether or not those steps passed.

    Raises:
        RuntimeError: If the client reports that the job did not succeed.
    """
    job_name = "mnist-ci-test"

    container = V1Container(
        name="tensorflow",
        image="gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
        command=[
            "python",
            "/var/tf_mnist/mnist_with_summaries.py",
            "--log_dir=/train/logs",
            "--learning_rate=0.01",
            "--batch_size=150",
        ],
    )

    worker = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(spec=V1PodSpec(containers=[container])),
    )

    tfjob = V1TFJob(
        api_version="kubeflow.org/v1",
        kind="TFJob",
        metadata=V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE),
        spec=V1TFJobSpec(
            run_policy=V1RunPolicy(clean_pod_policy="None"),
            tf_replica_specs={"Worker": worker},
        ),
    )

    TFJOB_CLIENT.create(tfjob, namespace=SDK_TEST_NAMESPACE)

    try:
        TFJOB_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)
        if not TFJOB_CLIENT.is_job_succeeded(job_name,
                                             namespace=SDK_TEST_NAMESPACE):
            raise RuntimeError("The TFJob is not succeeded.")

        TFJOB_CLIENT.get_logs(job_name, master=False,
                              namespace=SDK_TEST_NAMESPACE)
    finally:
        # Clean up even when a step above raises, so a failed run does not
        # leave a job with this fixed name behind in the test namespace.
        TFJOB_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
Exemplo n.º 4
0
def test_sdk_e2e():
    """Run an MPIJob end to end through the SDK client.

    Builds a job with one Launcher replica, whose container runs ``mpirun``
    on ``/examples/pytorch_mnist.py`` for one epoch, and one Worker replica
    using the same image without a command override.  The job is submitted
    with ``MPI_CLIENT``, waited for, checked for success and its logs are
    fetched.  The job is deleted afterwards whether or not those steps
    passed.

    Raises:
        RuntimeError: If the client reports that the job did not succeed.
    """
    job_name = "mpijob-mxnet-ci-test"
    image = "horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu"

    master_container = V1Container(
        name="mpi",
        image=image,
        command=["mpirun"],
        args=[
            "-np", "1",
            "--allow-run-as-root",
            "-bind-to", "none",
            "-map-by", "slot",
            "-x", "LD_LIBRARY_PATH",
            "-x", "PATH",
            "-mca", "pml", "ob1",
            "-mca", "btl", "^openib",
            "python", "/examples/pytorch_mnist.py",
            "--epochs", "1",
        ],
    )

    worker_container = V1Container(name="mpi", image=image)

    master = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(
            spec=V1PodSpec(containers=[master_container])),
    )

    worker = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(
            spec=V1PodSpec(containers=[worker_container])),
    )

    mpijob = KubeflowOrgV1MPIJob(
        api_version="kubeflow.org/v1",
        kind="MPIJob",
        metadata=V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE),
        spec=KubeflowOrgV1MPIJobSpec(
            slots_per_worker=1,
            run_policy=V1RunPolicy(clean_pod_policy="None"),
            mpi_replica_specs={"Launcher": master, "Worker": worker},
        ),
    )

    MPI_CLIENT.create(mpijob)

    try:
        MPI_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)
        if not MPI_CLIENT.is_job_succeeded(job_name,
                                           namespace=SDK_TEST_NAMESPACE):
            raise RuntimeError("The MPIJob is not succeeded.")

        MPI_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE)
    finally:
        # Clean up even when a step above raises, so a failed run does not
        # leave a job with this fixed name behind in the test namespace.
        MPI_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)
Exemplo n.º 5
0
def test_sdk_e2e():
    """Run an MXJob end to end through the SDK client.

    Builds a job in ``MXTrain`` mode with one Scheduler, one Server and one
    Worker replica.  All three use the same image and expose port 9991; only
    the Worker container overrides the command to run ``train_mnist.py``.
    The job is submitted with ``MX_CLIENT``, waited for, checked for success
    and its logs are fetched (with ``master=False``).  The job is deleted
    afterwards whether or not those steps passed.

    Raises:
        RuntimeError: If the client reports that the job did not succeed.
    """
    job_name = "mxjob-mnist-ci-test"
    image = "docker.io/johnugeorge/mxnet:1.9.1_cpu_py3"

    worker_container = V1Container(
        name="mxnet",
        image=image,
        command=["/usr/local/bin/python3"],
        args=[
            "incubator-mxnet/example/image-classification/train_mnist.py",
            "--num-epochs", "5",
            "--num-examples", "1000",
            "--kv-store", "dist_sync",
        ],
        ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
    )

    server_container = V1Container(
        name="mxnet",
        image=image,
        ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
    )

    scheduler_container = V1Container(
        name="mxnet",
        image=image,
        ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
    )

    worker = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(
            spec=V1PodSpec(containers=[worker_container])),
    )

    server = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(
            spec=V1PodSpec(containers=[server_container])),
    )

    scheduler = V1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(
            spec=V1PodSpec(containers=[scheduler_container])),
    )

    mxjob = KubeflowOrgV1MXJob(
        api_version="kubeflow.org/v1",
        kind="MXJob",
        metadata=V1ObjectMeta(name=job_name, namespace=SDK_TEST_NAMESPACE),
        spec=KubeflowOrgV1MXJobSpec(
            job_mode="MXTrain",
            run_policy=V1RunPolicy(clean_pod_policy="None"),
            mx_replica_specs={
                "Scheduler": scheduler,
                "Server": server,
                "Worker": worker,
            },
        ),
    )

    MX_CLIENT.create(mxjob)

    try:
        MX_CLIENT.wait_for_job(job_name, namespace=SDK_TEST_NAMESPACE)
        if not MX_CLIENT.is_job_succeeded(job_name,
                                          namespace=SDK_TEST_NAMESPACE):
            raise RuntimeError("The MXJob is not succeeded.")

        MX_CLIENT.get_logs(job_name, namespace=SDK_TEST_NAMESPACE,
                           master=False)
    finally:
        # Clean up even when a step above raises, so a failed run does not
        # leave a job with this fixed name behind in the test namespace.
        MX_CLIENT.delete(job_name, namespace=SDK_TEST_NAMESPACE)