def horovod_mnist_hpo(
    experiment_name: str = "mpi-horovod-mnist",
    experiment_namespace: str = "anonymous",
):
    """Run a Katib hyperparameter search over a Horovod MNIST MPIJob.

    Assembles a ``V1beta1ExperimentSpec`` (bayesianoptimization, minimizing
    the ``loss`` metric with a goal of 0.01) whose Trials are Kubeflow
    MPIJobs with one launcher and two workers, hands the serialized spec to
    the Katib launcher component, and adds a step that echoes the launcher
    output.

    Args:
        experiment_name: Forwarded to the Katib launcher as ``experiment_name``.
        experiment_namespace: Forwarded to the Katib launcher as
            ``experiment_namespace``.
    """
    training_image = "docker.io/kubeflow/mpi-horovod-mnist"
    # Annotation placed on both pod templates; copied per use so the launcher
    # and worker templates never share one dict object.
    sidecar_annotations = {"sidecar.istio.io/inject": "false"}

    # mpirun options followed by the training script. The ${trialParameters.*}
    # placeholders use the names declared in the Trial template below.
    launcher_args = [
        "-np", "2",
        "--allow-run-as-root",
        "-bind-to", "none",
        "-map-by", "slot",
        "-x", "LD_LIBRARY_PATH",
        "-x", "PATH",
        "-mca", "pml", "ob1",
        "-mca", "btl", "^openib",
        "python", "/examples/tensorflow_mnist.py",
        "--lr", "${trialParameters.learningRate}",
        "--num-steps", "${trialParameters.numberSteps}",
    ]

    launcher_replica = {
        "replicas": 1,
        "template": {
            "metadata": {"annotations": dict(sidecar_annotations)},
            "spec": {
                "containers": [{
                    "image": training_image,
                    "name": "mpi-launcher",
                    "command": ["mpirun"],
                    "args": launcher_args,
                    "resources": {
                        "limits": {"cpu": "500m", "memory": "2Gi"},
                    },
                }],
            },
        },
    }

    worker_replica = {
        "replicas": 2,
        "template": {
            "metadata": {"annotations": dict(sidecar_annotations)},
            "spec": {
                "containers": [{
                    "image": training_image,
                    "name": "mpi-worker",
                    "resources": {
                        "limits": {"cpu": "500m", "memory": "4Gi"},
                    },
                }],
            },
        },
    }

    # JSON template for the Trial's Kubeflow MPIJob.
    mpi_job = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "MPIJob",
        "spec": {
            "slotsPerWorker": 1,
            "cleanPodPolicy": "Running",
            "mpiReplicaSpecs": {
                "Launcher": launcher_replica,
                "Worker": worker_replica,
            },
        },
    }

    # Trial template: the launcher pod/container is marked as primary, and
    # each placeholder name is mapped to a search-space parameter.
    template = V1beta1TrialTemplate(
        primary_pod_labels={"mpi-job-role": "launcher"},
        primary_container_name="mpi-launcher",
        success_condition='status.conditions.#(type=="Succeeded")#|#(status=="True")#',
        failure_condition='status.conditions.#(type=="Failed")#|#(status=="True")#',
        trial_parameters=[
            V1beta1TrialParameterSpec(
                name="learningRate",
                description="Learning rate for the training model",
                reference="lr",
            ),
            V1beta1TrialParameterSpec(
                name="numberSteps",
                description="Number of training steps",
                reference="num-steps",
            ),
        ],
        trial_spec=mpi_job,
    )

    # Search space: learning rate and number of training steps.
    search_space = [
        V1beta1ParameterSpec(
            name="lr",
            parameter_type="double",
            feasible_space=V1beta1FeasibleSpace(min="0.001", max="0.003"),
        ),
        V1beta1ParameterSpec(
            name="num-steps",
            parameter_type="int",
            feasible_space=V1beta1FeasibleSpace(min="50", max="150", step="10"),
        ),
    ]

    # Full Experiment: trial budget, objective, algorithm, search space and
    # Trial template.
    spec = V1beta1ExperimentSpec(
        max_trial_count=6,
        max_failed_trial_count=3,
        parallel_trial_count=2,
        objective=V1beta1ObjectiveSpec(
            type="minimize",
            goal=0.01,
            objective_metric_name="loss",
        ),
        algorithm=V1beta1AlgorithmSpec(
            algorithm_name="bayesianoptimization",
            algorithm_settings=[
                V1beta1AlgorithmSetting(name="random_state", value="10"),
            ],
        ),
        parameters=search_space,
        trial_template=template,
    )

    # The Katib launcher component is fetched from the URL below;
    # components.load_component_from_file can be used instead with a local
    # copy of the katib-launcher component.yaml.
    launcher_factory = components.load_component_from_url(
        "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml"
    )

    # The spec must be serialized to a plain Kubernetes object before it is
    # handed to the component. delete_finished_experiment is not passed here,
    # so the launcher component's own default applies.
    serialized_spec = ApiClient().sanitize_for_serialization(spec)
    op = launcher_factory(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=60,
    )

    # Final step: print the launcher's output.
    echo_best = "echo Best HyperParameters: %s" % op.output
    dsl.ContainerOp(
        name="best-hp",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=[echo_best],
    )
def create_katib_experiment_task(experiment_name, experiment_namespace, training_steps):
    """Create the KFP task that runs a Katib Experiment over a TFJob MNIST trial.

    The Experiment uses the ``random`` algorithm to minimize the ``loss``
    metric (goal 0.001), tuning learning rate and batch size. Each Trial is
    a TFJob with one Chief and one Worker replica running the same command.

    Args:
        experiment_name: Forwarded to the Katib launcher as ``experiment_name``.
        experiment_namespace: Forwarded to the Katib launcher as
            ``experiment_namespace``.
        training_steps: Converted with ``str()`` and embedded in the
            ``--tf-train-steps=`` flag of every replica's command.

    Returns:
        The task object produced by calling the Katib launcher component.
    """

    def replica_spec():
        # Chief and Worker are configured identically; build a fresh dict on
        # every call so the two replicas do not share nested objects.
        # TODO (andreyvelich): Use community image for the mnist example.
        return {
            "replicas": 1,
            "restartPolicy": "OnFailure",
            "template": {
                "metadata": {
                    "annotations": {"sidecar.istio.io/inject": "false"},
                },
                "spec": {
                    "containers": [{
                        "name": "tensorflow",
                        "image": "docker.io/liuhougangxa/tf-estimator-mnist",
                        "command": [
                            "python",
                            "/opt/model.py",
                            f"--tf-train-steps={training_steps!s}",
                            "--tf-learning-rate=${trialParameters.learningRate}",
                            "--tf-batch-size=${trialParameters.batchSize}",
                        ],
                    }],
                },
            },
        }

    # Experiment Trial template: a Kubeflow TFJob.
    tf_job = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "TFJob",
        "spec": {
            "tfReplicaSpecs": {
                "Chief": replica_spec(),
                "Worker": replica_spec(),
            },
        },
    }

    # Map the ${trialParameters.*} placeholders to search-space parameters.
    template = V1beta1TrialTemplate(
        primary_container_name="tensorflow",
        trial_parameters=[
            V1beta1TrialParameterSpec(
                name="learningRate",
                description="Learning rate for the training model",
                reference="learning_rate",
            ),
            V1beta1TrialParameterSpec(
                name="batchSize",
                description="Batch size for the model",
                reference="batch_size",
            ),
        ],
        trial_spec=tf_job,
    )

    # Search space: learning rate and batch size.
    search_space = [
        V1beta1ParameterSpec(
            name="learning_rate",
            parameter_type="double",
            feasible_space=V1beta1FeasibleSpace(min="0.01", max="0.05"),
        ),
        V1beta1ParameterSpec(
            name="batch_size",
            parameter_type="int",
            feasible_space=V1beta1FeasibleSpace(min="80", max="100"),
        ),
    ]

    # Assemble the Experiment: trial budget, objective, algorithm, search
    # space and Trial template.
    spec = V1beta1ExperimentSpec(
        max_trial_count=5,
        max_failed_trial_count=3,
        parallel_trial_count=2,
        objective=V1beta1ObjectiveSpec(
            type="minimize",
            goal=0.001,
            objective_metric_name="loss",
        ),
        algorithm=V1beta1AlgorithmSpec(algorithm_name="random"),
        parameters=search_space,
        trial_template=template,
    )

    # Create the KFP task for the Katib Experiment. The spec must be
    # serialized to a plain Kubernetes object first; the Experiment is kept
    # after it finishes (delete_finished_experiment=False).
    launcher_factory = components.load_component_from_url(
        "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml"
    )
    serialized_spec = ApiClient().sanitize_for_serialization(spec)
    return launcher_factory(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=60,
        delete_finished_experiment=False,
    )