Exemplo n.º 1
0
def horovod_mnist_hpo(
    experiment_name: str = "mpi-horovod-mnist",
    experiment_namespace: str = "anonymous",
):
    """Launch a Katib Experiment whose Trials are Horovod MPIJobs.

    Assembles a Katib Experiment specification (Bayesian optimization that
    minimizes the ``loss`` metric over ``lr`` and ``num-steps``), hands it to
    the Katib launcher component, and adds a container step that echoes the
    launcher's output.

    Args:
        experiment_name: Experiment name forwarded to the Katib launcher.
        experiment_namespace: Namespace forwarded to the Katib launcher.
    """
    training_image = "docker.io/kubeflow/mpi-horovod-mnist"

    def pod_metadata():
        # Built fresh on every call so the Launcher and Worker templates do
        # not share one dict object.
        return {"annotations": {"sidecar.istio.io/inject": "false"}}

    # --- Objective and search algorithm -----------------------------------
    objective_spec = V1beta1ObjectiveSpec(
        type="minimize",
        goal=0.01,
        objective_metric_name="loss",
    )
    random_state_setting = V1beta1AlgorithmSetting(name="random_state",
                                                   value="10")
    algorithm_spec = V1beta1AlgorithmSpec(
        algorithm_name="bayesianoptimization",
        algorithm_settings=[random_state_setting],
    )

    # --- Search space: learning rate and number of training steps ---------
    lr_space = V1beta1FeasibleSpace(min="0.001", max="0.003")
    steps_space = V1beta1FeasibleSpace(min="50", max="150", step="10")
    search_space = [
        V1beta1ParameterSpec(name="lr",
                             parameter_type="double",
                             feasible_space=lr_space),
        V1beta1ParameterSpec(name="num-steps",
                             parameter_type="int",
                             feasible_space=steps_space),
    ]

    # --- MPIJob used as the Trial's worker ---------------------------------
    # The ${trialParameters.*} placeholders refer to the trial parameter
    # names declared on the Trial template further down.
    mpirun_args = [
        "-np", "2",
        "--allow-run-as-root",
        "-bind-to", "none",
        "-map-by", "slot",
        "-x", "LD_LIBRARY_PATH",
        "-x", "PATH",
        "-mca", "pml", "ob1",
        "-mca", "btl", "^openib",
        "python", "/examples/tensorflow_mnist.py",
        "--lr", "${trialParameters.learningRate}",
        "--num-steps", "${trialParameters.numberSteps}",
    ]
    launcher_container = {
        "image": training_image,
        "name": "mpi-launcher",
        "command": ["mpirun"],
        "args": mpirun_args,
        "resources": {"limits": {"cpu": "500m", "memory": "2Gi"}},
    }
    worker_container = {
        "image": training_image,
        "name": "mpi-worker",
        "resources": {"limits": {"cpu": "500m", "memory": "4Gi"}},
    }
    mpi_job = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "MPIJob",
        "spec": {
            "slotsPerWorker": 1,
            "cleanPodPolicy": "Running",
            "mpiReplicaSpecs": {
                "Launcher": {
                    "replicas": 1,
                    "template": {
                        "metadata": pod_metadata(),
                        "spec": {"containers": [launcher_container]},
                    },
                },
                "Worker": {
                    "replicas": 2,
                    "template": {
                        "metadata": pod_metadata(),
                        "spec": {"containers": [worker_container]},
                    },
                },
            },
        },
    }

    # --- Trial template ------------------------------------------------------
    # Maps each search-space parameter (reference) onto the placeholder name
    # used inside the MPIJob spec above.
    template_parameters = [
        V1beta1TrialParameterSpec(
            name="learningRate",
            description="Learning rate for the training model",
            reference="lr",
        ),
        V1beta1TrialParameterSpec(
            name="numberSteps",
            description="Number of training steps",
            reference="num-steps",
        ),
    ]
    template = V1beta1TrialTemplate(
        primary_pod_labels={"mpi-job-role": "launcher"},
        primary_container_name="mpi-launcher",
        success_condition=(
            'status.conditions.#(type=="Succeeded")#|#(status=="True")#'),
        failure_condition=(
            'status.conditions.#(type=="Failed")#|#(status=="True")#'),
        trial_parameters=template_parameters,
        trial_spec=mpi_job,
    )

    # --- Experiment specification (trial budget is set here) -----------------
    spec = V1beta1ExperimentSpec(
        max_trial_count=6,
        max_failed_trial_count=3,
        parallel_trial_count=2,
        objective=objective_spec,
        algorithm=algorithm_spec,
        parameters=search_space,
        trial_template=template,
    )

    # --- Katib launcher component --------------------------------------------
    # Loaded from the URL here; components.load_component_from_file with a
    # local component.yaml path is the alternative.
    launcher_url = "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml"
    launch_experiment = components.load_component_from_url(launcher_url)

    # The spec object is converted with sanitize_for_serialization before it
    # is passed on. delete_finished_experiment is not passed, so the launcher
    # component's own default applies.
    serialized_spec = ApiClient().sanitize_for_serialization(spec)
    launch_task = launch_experiment(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=60,
    )

    # --- Final step: print the launcher output -------------------------------
    echo_command = "echo Best HyperParameters: %s" % launch_task.output
    dsl.ContainerOp(
        name="best-hp",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=[echo_command],
    )
Exemplo n.º 2
0
def create_katib_experiment_task(experiment_name, experiment_namespace,
                                 training_steps):
    """Create the pipeline task that launches a Katib Experiment over a TFJob.

    The Experiment uses random search to minimize the ``loss`` metric while
    tuning ``learning_rate`` and ``batch_size``; each Trial is a TFJob with
    one Chief and one Worker replica running the same training command.

    Args:
        experiment_name: Experiment name forwarded to the Katib launcher.
        experiment_namespace: Namespace forwarded to the Katib launcher.
        training_steps: Value rendered with ``str()`` into the
            ``--tf-train-steps=`` flag of the training command.

    Returns:
        The task produced by calling the loaded Katib launcher component.
    """

    # TODO (andreyvelich): Use community image for the mnist example.
    def replica_spec():
        # Chief and Worker use the same settings; a new dict is built per
        # call so the two replicas do not share one object.
        return {
            "replicas": 1,
            "restartPolicy": "OnFailure",
            "template": {
                "metadata": {
                    "annotations": {"sidecar.istio.io/inject": "false"},
                },
                "spec": {
                    "containers": [{
                        "name": "tensorflow",
                        "image": "docker.io/liuhougangxa/tf-estimator-mnist",
                        "command": [
                            "python",
                            "/opt/model.py",
                            "--tf-train-steps=" + str(training_steps),
                            "--tf-learning-rate=${trialParameters.learningRate}",
                            "--tf-batch-size=${trialParameters.batchSize}",
                        ],
                    }],
                },
            },
        }

    # --- Objective and search algorithm -----------------------------------
    objective_spec = V1beta1ObjectiveSpec(
        type="minimize",
        goal=0.001,
        objective_metric_name="loss",
    )
    algorithm_spec = V1beta1AlgorithmSpec(algorithm_name="random")

    # --- Search space: learning rate and batch size -----------------------
    search_space = [
        V1beta1ParameterSpec(
            name="learning_rate",
            parameter_type="double",
            feasible_space=V1beta1FeasibleSpace(min="0.01", max="0.05"),
        ),
        V1beta1ParameterSpec(
            name="batch_size",
            parameter_type="int",
            feasible_space=V1beta1FeasibleSpace(min="80", max="100"),
        ),
    ]

    # --- TFJob used as the Trial's worker ----------------------------------
    tf_job = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "TFJob",
        "spec": {
            "tfReplicaSpecs": {
                role: replica_spec() for role in ("Chief", "Worker")
            },
        },
    }

    # --- Trial template ------------------------------------------------------
    # Maps each search-space parameter (reference) onto the placeholder name
    # used inside the TFJob command above.
    template = V1beta1TrialTemplate(
        primary_container_name="tensorflow",
        trial_parameters=[
            V1beta1TrialParameterSpec(
                name="learningRate",
                description="Learning rate for the training model",
                reference="learning_rate",
            ),
            V1beta1TrialParameterSpec(
                name="batchSize",
                description="Batch size for the model",
                reference="batch_size",
            ),
        ],
        trial_spec=tf_job,
    )

    # --- Experiment specification (trial budget is set here) -----------------
    spec = V1beta1ExperimentSpec(
        max_trial_count=5,
        max_failed_trial_count=3,
        parallel_trial_count=2,
        objective=objective_spec,
        algorithm=algorithm_spec,
        parameters=search_space,
        trial_template=template,
    )

    # --- Katib launcher task --------------------------------------------------
    # The spec object is converted with sanitize_for_serialization before it
    # is passed on; delete_finished_experiment is explicitly set to False.
    launcher_url = "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml"
    launch_experiment = components.load_component_from_url(launcher_url)
    serialized_spec = ApiClient().sanitize_for_serialization(spec)
    return launch_experiment(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=60,
        delete_finished_experiment=False,
    )