Пример #1
0
def my_pipeline(
    minio_endpoint='minio-service:9000',
    log_bucket='mlpipeline',
    log_dir=f'tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}',
    # Pin to tensorflow 2.3, because in 2.4+ tensorboard cannot load in KFP:
    # refer to https://github.com/kubeflow/pipelines/issues/5521.
    tf_image='gcr.io/deeplearning-platform-release/tf2-cpu.2-3:latest'
):
    """Prepare a tensorboard task, then run training against the same logs.

    Args:
        minio_endpoint: Endpoint handed to the training task and exported to
            the tensorboard pod as ``S3_ENDPOINT``.
        log_bucket: Bucket that holds the logs.
        log_dir: Path of the logs inside ``log_bucket``.
        tf_image: Image used for tensorboard and also set on the training
            container.
    """
    minio_secret_name = 'mlpipeline-minio-artifact'

    def secret_env(env_name, secret_key):
        # Env var whose value is read from the minio artifact secret.
        return {
            'name': env_name,
            'valueFrom': {
                'secretKeyRef': {
                    'name': minio_secret_name,
                    'key': secret_key
                }
            }
        }

    def literal_env(env_name, env_value):
        # Env var with a fixed value.
        return {'name': env_name, 'value': env_value}

    # These env vars make tensorboard access KFP in-cluster minio
    # using s3 protocol.
    # Reference: https://blog.min.io/hyper-scale-machine-learning-with-minio-and-tensorflow/
    tensorboard_env = [
        secret_env('AWS_ACCESS_KEY_ID', 'accesskey'),
        secret_env('AWS_SECRET_ACCESS_KEY', 'secretkey'),
        literal_env('AWS_REGION', 'minio'),
        literal_env('S3_ENDPOINT', f'{minio_endpoint}'),
        literal_env('S3_USE_HTTPS', '0'),
        literal_env('S3_VERIFY_SSL', '0'),
    ]
    pod_template = {'spec': {'containers': [{'env': tensorboard_env}]}}

    # tensorboard uses s3 protocol to access minio
    prepare_tb_task = prepare_tensorboard(
        log_dir_uri=f's3://{log_bucket}/{log_dir}',
        image=tf_image,
        pod_template_spec=json.dumps(pod_template),
    )

    train_task = train_op(
        minio_endpoint=minio_endpoint,
        log_bucket=log_bucket,
        log_dir=log_dir,
    )
    # Expose the minio credentials to the training container as env vars.
    attach_minio_credentials = use_k8s_secret(
        secret_name=minio_secret_name,
        k8s_secret_key_to_env={
            'secretkey': 'MINIO_SECRET_KEY',
            'accesskey': 'MINIO_ACCESS_KEY'
        },
    )
    train_task.apply(attach_minio_credentials)
    # optional, let training task use the same tensorflow image as specified tensorboard
    train_task.container.image = tf_image
    train_task.after(prepare_tb_task)
Пример #2
0
def pytorch_bert(  # pylint: disable=too-many-arguments
        minio_endpoint=MINIO_ENDPOINT,
        log_bucket=LOG_BUCKET,
        log_dir=f"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}",
        mar_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/model-store",
        config_prop_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/config",
        model_uri=f"s3://mlpipeline/mar/{dsl.RUN_ID_PLACEHOLDER}",
        tf_image=TENSORBOARD_IMAGE,
        deploy=DEPLOY_NAME,
        namespace=NAMESPACE,
        confusion_matrix_log_dir=f"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/",
        num_samples=1000,
        max_epochs=1):
    """Define the tasks and operations of the BERT pipeline.

    Task order: tensorboard preparation -> preprocessing -> training ->
    three MinIO uploads (tensorboard events, model archive, config) ->
    InferenceService deployment, which waits for the model archive upload.

    Args:
        minio_endpoint: Exported to the tensorboard pod as ``S3_ENDPOINT``.
        log_bucket: Bucket used for the tensorboard log URI, the tensorboard
            events upload and the confusion matrix URL.
        log_dir: Path of the tensorboard logs inside ``log_bucket``.
        mar_path: Folder that receives ``bert_test.mar``.
        config_prop_path: Folder that receives ``config.properties``.
        model_uri: ``storageUri`` written into the InferenceService spec.
        tf_image: Image passed to the tensorboard preparation task.
        deploy: ``metadata.name`` of the InferenceService.
        namespace: ``metadata.namespace`` of the InferenceService.
        confusion_matrix_log_dir: Path appended to ``minio://{log_bucket}/``
            to form the confusion matrix URL given to the training script.
        num_samples: Forwarded to the training script arguments.
        max_epochs: Forwarded to the trainer arguments.
    """
    prepare_tb_task = prepare_tensorboard_op(
        log_dir_uri=f"s3://{log_bucket}/{log_dir}",
        image=tf_image,
        pod_template_spec=json.dumps({
            "spec": {
                "containers": [{
                    "env": [
                        {
                            "name": "AWS_ACCESS_KEY_ID",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "accesskey",
                                }
                            },
                        },
                        {
                            "name": "AWS_SECRET_ACCESS_KEY",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "secretkey",
                                }
                            },
                        },
                        {
                            "name": "AWS_REGION",
                            "value": "minio"
                        },
                        {
                            "name": "S3_ENDPOINT",
                            "value": f"{minio_endpoint}",
                        },
                        {
                            "name": "S3_USE_HTTPS",
                            "value": "0"
                        },
                        {
                            "name": "S3_VERIFY_SSL",
                            "value": "0"
                        },
                    ]
                }]
            }
        }),
    ).set_display_name("Visualization")

    prep_task = (prep_op().after(prepare_tb_task).set_display_name(
        "Preprocess & Transform"))
    confusion_matrix_url = f"minio://{log_bucket}/{confusion_matrix_log_dir}"
    script_args = f"model_name=bert.pth," \
                  f"num_samples={num_samples}," \
                  f"confusion_matrix_url={confusion_matrix_url}"
    # For gpus, set number of gpus and accelerator type
    ptl_args = f"max_epochs={max_epochs}," \
               "profiler=pytorch," \
               "gpus=0," \
               "accelerator=None"
    train_task = (train_op(
        input_data=prep_task.outputs["output_data"],
        script_args=script_args,
        ptl_arguments=ptl_args).after(prep_task).set_display_name("Training"))
    # For GPU uncomment below line and set GPU limit and node selector
    # ).set_gpu_limit(1).add_node_selector_constraint
    # ('cloud.google.com/gke-accelerator','nvidia-tesla-p4')

    # Push the events into ``log_bucket``: that is the bucket the tensorboard
    # task above reads from (s3://{log_bucket}/{log_dir}). A hard-coded bucket
    # here would leave tensorboard empty for any non-default ``log_bucket``.
    (minio_op(
        bucket_name=log_bucket,
        folder_name=log_dir,
        input_path=train_task.outputs["tensorboard_root"],
        filename="",
    ).after(train_task).set_display_name("Tensorboard Events Pusher"))
    # The model archive and config stay on the literal bucket because the
    # default ``model_uri`` points at s3://mlpipeline/mar/...
    minio_mar_upload = (minio_op(
        bucket_name="mlpipeline",
        folder_name=mar_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="bert_test.mar",
    ).after(train_task).set_display_name("Mar Pusher"))
    (minio_op(
        bucket_name="mlpipeline",
        folder_name=config_prop_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="config.properties",
    ).after(train_task).set_display_name("Conifg Pusher"))

    model_uri = str(model_uri)
    # pylint: disable=unused-variable
    isvc_yaml = """
    apiVersion: "serving.kubeflow.org/v1beta1"
    kind: "InferenceService"
    metadata:
      name: {}
      namespace: {}
    spec:
      predictor:
        serviceAccountName: sa
        pytorch:
          storageUri: {}
          resources:
            limits:
              memory: 4Gi   
    """.format(deploy, namespace, model_uri)

    # For GPU inference use below yaml with gpu count and accelerator.
    # It is intentionally unused by default; pass it to deploy_op instead of
    # isvc_yaml to deploy on GPU.
    gpu_count = "1"
    accelerator = "nvidia-tesla-p4"
    isvc_gpu_yaml = """
    apiVersion: "serving.kubeflow.org/v1beta1"
    kind: "InferenceService"
    metadata:
      name: {}
      namespace: {}
    spec:
      predictor:
        serviceAccountName: sa
        pytorch:
          storageUri: {}
          resources:
            limits:
              memory: 4Gi   
              nvidia.com/gpu: {}
          nodeSelector:
            cloud.google.com/gke-accelerator: {}
""".format(deploy, namespace, model_uri, gpu_count, accelerator)
    # Update inferenceservice_yaml for GPU inference
    deploy_task = (deploy_op(
        action="apply", inferenceservice_yaml=isvc_yaml).after(
            minio_mar_upload).set_display_name("Deployer"))

    # Register the minio credentials transformer on the pipeline
    # configuration rather than on individual tasks.
    dsl.get_pipeline_conf().add_op_transformer(
        use_k8s_secret(
            secret_name="mlpipeline-minio-artifact",
            k8s_secret_key_to_env={
                "secretkey": "MINIO_SECRET_KEY",
                "accesskey": "MINIO_ACCESS_KEY",
            },
        ))
Пример #3
0
def pytorch_bert(
        minio_endpoint="http://minio-service.kubeflow:9000",
        log_bucket="mlpipeline",
        log_dir=f"tensorboard/logs/{dsl.RUN_ID_PLACEHOLDER}",
        mar_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/model-store",
        config_prop_path=f"mar/{dsl.RUN_ID_PLACEHOLDER}/config",
        model_uri=f"s3://mlpipeline/mar/{dsl.RUN_ID_PLACEHOLDER}",
        tf_image="jagadeeshj/tb_plugin:v1.8",
        deploy="bertserve",
        namespace="kubeflow-user-example-com",
        confusion_matrix_log_dir=f"confusion_matrix/{dsl.RUN_ID_PLACEHOLDER}/",
        num_samples=1000):
    """Define the tasks and operations of the BERT pipeline.

    Task order: tensorboard preparation -> preprocessing -> training ->
    three MinIO uploads (tensorboard events, model archive, config) ->
    InferenceService deployment, which waits for the model archive upload.

    Args:
        minio_endpoint: Exported to the tensorboard pod as ``S3_ENDPOINT``.
        log_bucket: Bucket used for the tensorboard log URI, the tensorboard
            events upload and the confusion matrix URL.
        log_dir: Path of the tensorboard logs inside ``log_bucket``.
        mar_path: Folder that receives ``bert_test.mar``.
        config_prop_path: Folder that receives ``config.properties``.
        model_uri: ``storageUri`` written into the InferenceService spec.
        tf_image: Image passed to the tensorboard preparation task.
        deploy: ``metadata.name`` of the InferenceService.
        namespace: ``metadata.namespace`` of the InferenceService.
        confusion_matrix_log_dir: Path appended to ``minio://{log_bucket}/``
            to form the confusion matrix URL given to the training task.
        num_samples: Forwarded to the training task.
    """

    def minio_credentials():
        # Build a fresh transformer per task that maps the minio artifact
        # secret onto the MINIO_* env vars.
        return use_k8s_secret(
            secret_name="mlpipeline-minio-artifact",
            k8s_secret_key_to_env={
                "secretkey": "MINIO_SECRET_KEY",
                "accesskey": "MINIO_ACCESS_KEY",
            },
        )

    prepare_tb_task = prepare_tensorboard_op(
        log_dir_uri=f"s3://{log_bucket}/{log_dir}",
        image=tf_image,
        pod_template_spec=json.dumps({
            "spec": {
                "containers": [{
                    "env": [
                        {
                            "name": "AWS_ACCESS_KEY_ID",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "accesskey",
                                }
                            },
                        },
                        {
                            "name": "AWS_SECRET_ACCESS_KEY",
                            "valueFrom": {
                                "secretKeyRef": {
                                    "name": "mlpipeline-minio-artifact",
                                    "key": "secretkey",
                                }
                            },
                        },
                        {
                            "name": "AWS_REGION",
                            "value": "minio"
                        },
                        {
                            "name": "S3_ENDPOINT",
                            "value": f"{minio_endpoint}"
                        },
                        {
                            "name": "S3_USE_HTTPS",
                            "value": "0"
                        },
                        {
                            "name": "S3_VERIFY_SSL",
                            "value": "0"
                        },
                    ]
                }]
            }
        }),
    ).set_display_name("Visualization")

    prep_task = prep_op().after(prepare_tb_task).set_display_name(
        "Preprocess & Transform")
    train_task = (train_op(
        input_data=prep_task.outputs["output_data"],
        profiler="pytorch",
        confusion_matrix_url=f"minio://{log_bucket}/{confusion_matrix_log_dir}",
        num_samples=num_samples).apply(
            minio_credentials()).after(prep_task).set_display_name("Training"))

    # Push the events into ``log_bucket``: that is the bucket the tensorboard
    # task above reads from (s3://{log_bucket}/{log_dir}). A hard-coded bucket
    # here would leave tensorboard empty for any non-default ``log_bucket``.
    (minio_op(
        bucket_name=log_bucket,
        folder_name=log_dir,
        input_path=train_task.outputs["tensorboard_root"],
        filename="",
    ).apply(minio_credentials()).after(train_task).set_display_name(
        "Tensorboard Events Pusher"))
    # The model archive and config stay on the literal bucket because the
    # default ``model_uri`` points at s3://mlpipeline/mar/...
    minio_mar_upload = (minio_op(
        bucket_name="mlpipeline",
        folder_name=mar_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="bert_test.mar",
    ).apply(minio_credentials()).after(train_task).set_display_name(
        "Mar Pusher"))
    (minio_op(
        bucket_name="mlpipeline",
        folder_name=config_prop_path,
        input_path=train_task.outputs["checkpoint_dir"],
        filename="config.properties",
    ).apply(minio_credentials()).after(train_task).set_display_name(
        "Conifg Pusher"))

    model_uri = str(model_uri)
    isvc_yaml = """
    apiVersion: "serving.kubeflow.org/v1beta1"
    kind: "InferenceService"
    metadata:
      name: {}
      namespace: {}
    spec:
      predictor:
        serviceAccountName: sa
        pytorch:
          storageUri: {}
          resources:
            limits:
              memory: 4Gi   
    """.format(deploy, namespace, model_uri)
    # Deploy only once the model archive is available in MinIO.
    (deploy_op(
        action="apply", inferenceservice_yaml=isvc_yaml).after(
            minio_mar_upload).set_display_name("Deployer"))