Example #1
from kubernetes import client

# `constants` is assumed to be the surrounding package's settings module
# (e.g. Kubeflow Fairing's constants), providing the secret-name prefix
# and the Azure Files share name used below.


def add_azure_files(kube_manager, pod_spec, namespace):
    # The build-context hash follows the last ':' of the first container's
    # second argument; it identifies the per-context credentials secret.
    context_hash = pod_spec.containers[0].args[1].split(':')[-1]
    secret_name = (constants.AZURE_STORAGE_CREDS_SECRET_NAME_PREFIX +
                   context_hash.lower())
    if not kube_manager.secret_exists(secret_name, namespace):
        raise Exception("Secret '{}' not found in namespace '{}'".format(
            secret_name, namespace))

    volume_mount = client.V1VolumeMount(name='azure-files',
                                        mount_path='/mnt/azure/',
                                        read_only=True)

    if pod_spec.containers[0].volume_mounts:
        pod_spec.containers[0].volume_mounts.append(volume_mount)
    else:
        pod_spec.containers[0].volume_mounts = [volume_mount]

    volume = client.V1Volume(
        name='azure-files',
        azure_file=client.V1AzureFileVolumeSource(
            secret_name=secret_name,
            share_name=constants.AZURE_FILES_SHARED_FOLDER))

    if pod_spec.volumes:
        pod_spec.volumes.append(volume)
    else:
        pod_spec.volumes = [volume]
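
For context, a hedged sketch of invoking this mutator. The pod spec below and the `kube_manager` object are illustrative stand-ins, not part of the example above; the only real requirement is that the first container's second argument ends in `:<context-hash>`.

from kubernetes import client

# Hypothetical pod spec shaped the way add_azure_files expects:
# containers[0].args[1] carries a value ending in ':<context-hash>'.
pod_spec = client.V1PodSpec(containers=[
    client.V1Container(name='builder',
                       args=['--context', 'ctx:0a1b2c3d'])
])
# kube_manager is assumed to expose secret_exists(name, namespace).
add_azure_files(kube_manager, pod_spec, namespace='kubeflow')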
Example #2
    def prepare_azure_volumes(self, volume_sub_path: str, afs_volume_name: str,
                              azure_mount_path: str):
        """Build matching volume and mount lists for an Azure Files share."""
        assert afs_volume_name, f"Check afs_volume_name {afs_volume_name}"
        assert azure_mount_path, f"Check azure_mount_path {azure_mount_path}"
        volume_mounts = [
            client.V1VolumeMount(name=afs_volume_name,
                                 mount_path=azure_mount_path,
                                 sub_path=volume_sub_path)
        ]
        azure_volume = client.V1AzureFileVolumeSource(
            secret_name=self.azure_secret, share_name=self.afs_share)
        volumes = [
            client.V1Volume(name=afs_volume_name, azure_file=azure_volume)
        ]

        return volumes, volume_mounts
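
A short usage sketch, as it might appear inside another method of the same class. Here `client` is `kubernetes.client`, and the attribute values (`self.azure_secret`, `self.afs_share`) and paths are assumptions for illustration.

# inside a method of the same class:
volumes, volume_mounts = self.prepare_azure_volumes(
    volume_sub_path='runs/2024-01-01',   # hypothetical sub-folder on the share
    afs_volume_name='afs-data',
    azure_mount_path='/mnt/data')
pod_spec = client.V1PodSpec(
    containers=[client.V1Container(name='worker', image='python:3.10',
                                   volume_mounts=volume_mounts)],
    volumes=volumes)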
Example #3
from kfp import dsl
from kubernetes import client as k8s_client


def got_image_pipeline(
    trainingsteps=4000,
    learningrate=0.01,
    trainbatchsize=100,
):

    persistent_volume_name = 'azure-files'
    persistent_volume_path = '/tf-output'
    azure_file_secret_name = 'azure-file-secret'
    azure_file_share_name = 'aksshare'
    field_path = 'metadata.name'

    operations = {}

    # preprocess images
    operations['preprocess'] = dsl.ContainerOp(
        name='preprocess',
        image='briaracr.azurecr.io/chzbrgr71/got-image-preprocess:1.63',
        arguments=[
            '--bottleneck_dir', "/tf-output/bottlenecks", '--image_dir',
            '/images'
        ])

    # train
    operations['train'] = dsl.ContainerOp(
        name='train',
        image='briaracr.azurecr.io/chzbrgr71/got-image-training:1.63',
        arguments=[
            '--bottleneck_dir', "/tmp/tensorflow/bottlenecks", '--model_dir',
            "/tmp/tensorflow/inception", '--summaries_dir', '/tf-output',
            '--output_graph', '/tf-output', '--output_labels', '/tf-output',
            '--image_dir', "/images", '--saved_model_dir', '/tf-output',
            '--how_many_training_steps', trainingsteps, '--learning_rate',
            learningrate, '--train_batch_size', trainbatchsize
        ])
    operations['train'].after(operations['preprocess'])

    # score model
    operations['score'] = dsl.ContainerOp(
        name='score',
        image='briaracr.azurecr.io/chzbrgr71/got-model-scoring:1.0',
        arguments=['/tf-output/latest_model'])
    operations['score'].after(operations['train'])

    # convert onnx
    operations['onnx'] = dsl.ContainerOp(
        name='onnx',
        image='briaracr.azurecr.io/chzbrgr71/onnx-convert:1.1',
        arguments=[
            'show', '--dir', '/tf-output/latest_model/exported_model/1/',
            '--tag_set', 'serve', '--signature_def', 'serving_default'
        ])
    operations['onnx'].after(operations['score'])

    # convert tflite
    operations['convert-tflite'] = dsl.ContainerOp(
        name='convert-tflite',
        image='briaracr.azurecr.io/chzbrgr71/tflite-convert:1.0',
        arguments=[
            '--graph_def_file',
            '/tf-output/latest_model/got_retrained_graph.pb', '--output_file',
            '/tf-output/latest_model/optimized_graph.lite', '--input_format',
            'TENSORFLOW_GRAPHDEF', '--output_format', 'TFLITE',
            '--input_shape', '1,299,299,3', '--input_array', 'Mul',
            '--output_array', 'final_result', '--inference_type', 'FLOAT',
            '--input_data_type', 'FLOAT'
        ])
    operations['convert-tflite'].after(operations['score'])

    # copy models to external storage
    operations['export-to-cloud'] = dsl.ContainerOp(
        name='export-to-cloud',
        image='alpine',
        command=['cp'],
        arguments=[
            '/tf-output/latest_model/got_retrained_graph.pb',
            '/tf-output/latest_model/got_retrained_graph-latest.pb'
        ])
    operations['export-to-cloud'].after(operations['onnx']).after(
        operations['convert-tflite'])

    # Attach the shared Azure Files volume, its mount, and two env vars
    # (a literal and a downward-API pod name) to every pipeline step.
    for _, op in operations.items():
        op.add_volume(
            k8s_client.V1Volume(
                name=persistent_volume_name,
                azure_file=k8s_client.V1AzureFileVolumeSource(
                    secret_name=azure_file_secret_name,
                    share_name=azure_file_share_name,
                    read_only=False))) \
          .add_volume_mount(
              k8s_client.V1VolumeMount(
                  mount_path=persistent_volume_path,
                  name=persistent_volume_name)) \
          .add_env_variable(
              k8s_client.V1EnvVar(name='MSG', value='HELLO!')) \
          .add_env_variable(
              k8s_client.V1EnvVar(
                  name='KUBE_POD_NAME',
                  value_from=k8s_client.V1EnvVarSource(
                      field_ref=k8s_client.V1ObjectFieldSelector(
                          field_path=field_path))))
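
For reference, a minimal sketch of compiling this pipeline with the KFP v1 SDK. The output filename is an arbitrary choice, and on some SDK versions the function must first be registered with the `@dsl.pipeline` decorator.

import kfp.compiler as compiler

# Compile the Python pipeline definition into an Argo workflow package
# that can be uploaded to a Kubeflow Pipelines instance.
compiler.Compiler().compile(got_image_pipeline, 'got-image-pipeline.tar.gz')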
Example #4
import logging

from kubernetes import client
from kubernetes.client.rest import ApiException


def create_job(MODEL):
    """Spawn a worker Job for MODEL.

    Assumes module-level configuration (NAMESPACE, IMAGE, the AZURE_* and
    MODELS_* settings) and an id_generator() helper that returns a short
    random job-name suffix.
    """
    assert MODEL is not None, "model name is None, cannot spawn a new worker"

    api = client.BatchV1Api()

    body = client.V1Job(api_version="batch/v1", kind="Job")
    name = 'speechlab-worker-job-{}-{}'.format(MODEL.lower().replace("_", "-"),
                                               id_generator())
    body.metadata = client.V1ObjectMeta(namespace=NAMESPACE, name=name)
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    template.template.metadata = client.V1ObjectMeta(
        annotations={
            "prometheus.io/scrape": "true",
            "prometheus.io/port": "8081"
        })
    azure_file_volume = client.V1AzureFileVolumeSource(
        read_only=True,
        secret_name=MODELS_FILESHARE_SECRET,
        share_name=MODELS_SHARE_NAME)
    volume = client.V1Volume(name="models-azurefiles",
                             azure_file=azure_file_volume)
    env_vars = {
        "AZURE_STORAGE_ACCOUNT": AZURE_STORAGE_ACCOUNT,
        "AZURE_STORAGE_ACCESS_KEY": AZURE_STORAGE_ACCESS_KEY,
        "AZURE_CONTAINER": AZURE_CONTAINER,
        "MASTER": MASTER,
        "NAMESPACE": NAMESPACE,
        "RUN_FREQ": "ONCE",
        "MODEL_DIR": MODEL,  # important
        "MODELS_FILESHARE_SECRET": MODELS_FILESHARE_SECRET,
        "MODELS_SHARE_NAME": MODELS_SHARE_NAME
    }

    env_list = []
    if env_vars:
        for env_name, env_value in env_vars.items():
            env_list.append(client.V1EnvVar(name=env_name, value=env_value))

    container = client.V1Container(
        name='{}-c'.format(name),
        image=IMAGE,
        image_pull_policy="IfNotPresent",
        command=[
            "/home/appuser/opt/tini", "--", "/home/appuser/opt/start_worker.sh"
        ],
        env=env_list,
        ports=[client.V1ContainerPort(container_port=8081, name="prometheus")],
        security_context=client.V1SecurityContext(
            privileged=True,
            capabilities=client.V1Capabilities(add=["SYS_ADMIN"])),
        resources=client.V1ResourceRequirements(
            limits={"memory": "5G", "cpu": "1"},
            requests={"memory": "5G", "cpu": "1"}),
        volume_mounts=[
            client.V1VolumeMount(mount_path="/home/appuser/opt/models",
                                 name="models-azurefiles",
                                 read_only=True)
        ])
    template.template.spec = client.V1PodSpec(
        containers=[container],
        image_pull_secrets=[
            client.V1LocalObjectReference(name="azure-cr-secret")
        ],
        # reason to use OnFailure https://github.com/kubernetes/kubernetes/issues/20255
        restart_policy="OnFailure",
        volumes=[volume])

    # And finally we can create the V1JobSpec.
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=100,
                                 template=template.template)

    try:
        api_response = api.create_namespaced_job(NAMESPACE, body)
        print("api_response=" + str(api_response))
        return True
    except ApiException as e:
        logging.exception('error spawning new job')
        print("Exception when creating a job: %s\n" % e)
        return False