예제 #1
0
def run_as_aiplatform_custom_job(
    op: dsl.ContainerOp,
    display_name: Optional[str] = None,
    replica_count: Optional[int] = None,
    machine_type: Optional[str] = None,
    accelerator_type: Optional[str] = None,
    accelerator_count: Optional[int] = None,
    boot_disk_type: Optional[str] = None,
    boot_disk_size_gb: Optional[int] = None,
    timeout: Optional[str] = None,
    restart_job_on_worker_restart: Optional[bool] = None,
    service_account: Optional[str] = None,
    network: Optional[str] = None,
    output_uri_prefix: Optional[str] = None,
    worker_pool_specs: Optional[List[Mapping[str, Any]]] = None,
) -> None:
    """Run a pipeline task using AI Platform (Unified) custom training job.

    The resulting job description is stored on ``op.custom_job_spec``; nothing
    is returned.

    For detailed doc of the service, please refer to
    https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job

    Args:
      op: The task (ContainerOp) object to run as aiplatform custom job.
      display_name: Optional. The name of the custom job. Falls back to
        ``op.name`` when not provided.
      replica_count: Optional. The number of replicas to be split between master
        workerPoolSpec and worker workerPoolSpec (master always has 1 replica).
        Ignored when ``worker_pool_specs`` is given.
      machine_type: Optional. The type of the machine to run the custom job.
        Falls back to the module default machine type. Ignored when
        ``worker_pool_specs`` is given.
      accelerator_type: Optional. The type of accelerator(s) that may be attached
        to the machine as per accelerator_count. Ignored when
        ``worker_pool_specs`` is given.
      accelerator_count: Optional. The number of accelerators to attach to the
        machine. Ignored when ``worker_pool_specs`` is given.
      boot_disk_type: Optional. Type of the boot disk (default is "pd-ssd"). Valid
        values: "pd-ssd" (Persistent Disk Solid State Drive) or "pd-standard"
        (Persistent Disk Hard Disk Drive). Ignored when ``worker_pool_specs`` is
        given.
      boot_disk_size_gb: Optional. Size in GB of the boot disk (default is 100GB).
        Ignored when ``worker_pool_specs`` is given.
      timeout: Optional. The maximum job running time. The default is 7 days. A
        duration in seconds with up to nine fractional digits, terminated by 's'.
        Example: "3.5s"
      restart_job_on_worker_restart: Optional. Restarts the entire CustomJob if a
        worker gets restarted. This feature can be used by distributed training
        jobs that are not resilient to workers leaving and joining a job.
      service_account: Optional. Specifies the service account for workload run-as
        account.
      network: Optional. The full name of the Compute Engine network to which the
        job should be peered. For example, projects/12345/global/networks/myVPC.
      output_uri_prefix: Optional. Google Cloud Storage URI to output directory.
      worker_pool_specs: Optional. Explicit workerPoolSpecs for distributed
        training. When given, they are used instead of a spec derived from
        ``op.container``. For details, please see:
        https://cloud.google.com/ai-platform-unified/docs/training/distributed-training

    Raises:
      ValueError: If an entry of ``worker_pool_specs`` contains neither
        "containerSpec" nor "pythonPackageSpec".
    """
    job_spec = {}

    if worker_pool_specs is not None:
        # Deep-copy so that placeholder resolution below never touches the
        # caller's own spec objects.
        worker_pool_specs = copy.deepcopy(worker_pool_specs)

        def _is_output_parameter(output_key: str) -> bool:
            return output_key in (
                op.component_spec.output_definitions.parameters.keys())

        for worker_pool_spec in worker_pool_specs:
            if 'containerSpec' in worker_pool_spec:
                container_spec = worker_pool_spec['containerSpec']
                # resolve_cmd_lines is invoked for its effect on the list; its
                # return value is not used.
                for field in ('command', 'args'):
                    if field in container_spec:
                        dsl_utils.resolve_cmd_lines(container_spec[field],
                                                    _is_output_parameter)

            elif 'pythonPackageSpec' in worker_pool_spec:
                # For custom Python training, resolve placeholders in args only.
                python_spec = worker_pool_spec['pythonPackageSpec']
                if 'args' in python_spec:
                    dsl_utils.resolve_cmd_lines(python_spec['args'],
                                                _is_output_parameter)

            else:
                # Report the offending spec itself (the previous code referenced
                # an undefined name here and raised NameError instead).
                raise ValueError(
                    'Expect either "containerSpec" or "pythonPackageSpec" in each '
                    'workerPoolSpec. Got: {}'.format(worker_pool_spec))

        job_spec['workerPoolSpecs'] = worker_pool_specs

    else:
        # No explicit specs: derive a single master spec from the op's container.
        worker_pool_spec = {
            'machineSpec': {
                'machineType': machine_type or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
            },
            'replicaCount': '1',
            'containerSpec': {
                'imageUri': op.container.image,
            }
        }
        if op.container.command:
            worker_pool_spec['containerSpec']['command'] = op.container.command
        if op.container.args:
            worker_pool_spec['containerSpec']['args'] = op.container.args
        if accelerator_type is not None:
            worker_pool_spec['machineSpec'][
                'acceleratorType'] = accelerator_type
        if accelerator_count is not None:
            worker_pool_spec['machineSpec'][
                'acceleratorCount'] = accelerator_count
        if boot_disk_type is not None:
            worker_pool_spec.setdefault('diskSpec',
                                        {})['bootDiskType'] = boot_disk_type
        if boot_disk_size_gb is not None:
            worker_pool_spec.setdefault(
                'diskSpec', {})['bootDiskSizeGb'] = boot_disk_size_gb

        job_spec['workerPoolSpecs'] = [worker_pool_spec]
        if replica_count is not None and replica_count > 1:
            # The master takes one replica; the rest go to an identical
            # worker pool.
            additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
            additional_worker_pool_spec['replicaCount'] = str(replica_count -
                                                              1)
            job_spec['workerPoolSpecs'].append(additional_worker_pool_spec)

    if timeout is not None:
        job_spec.setdefault('scheduling', {})['timeout'] = timeout
    if restart_job_on_worker_restart is not None:
        job_spec.setdefault('scheduling', {})[
            'restartJobOnWorkerRestart'] = restart_job_on_worker_restart
    if service_account is not None:
        job_spec['serviceAccount'] = service_account
    if network is not None:
        job_spec['network'] = network
    if output_uri_prefix is not None:
        job_spec['baseOutputDirectory'] = {
            'outputUriPrefix': output_uri_prefix
        }

    op.custom_job_spec = {
        'displayName': display_name or op.name,
        'jobSpec': job_spec
    }
예제 #2
0
def create_custom_training_job_op_from_component(
    component_spec: Callable,  # pylint: disable=g-bare-generic
    display_name: Optional[str] = '',
    replica_count: Optional[int] = 1,
    machine_type: Optional[str] = 'n1-standard-4',
    accelerator_type: Optional[str] = '',
    accelerator_count: Optional[int] = 1,
    boot_disk_type: Optional[str] = 'pd-ssd',
    boot_disk_size_gb: Optional[int] = 100,
    timeout: Optional[str] = '',
    restart_job_on_worker_restart: Optional[bool] = False,
    service_account: Optional[str] = '',
    network: Optional[str] = '',
    encryption_spec_key_name: Optional[str] = '',
    tensorboard: Optional[str] = '',
    enable_web_access: Optional[bool] = False,
    base_output_directory: Optional[str] = '',
    labels: Optional[Dict[str, str]] = None,
) -> Callable:  # pylint: disable=g-bare-generic
    """Create a component spec that runs a custom training in Vertex AI.

    This utility converts a given component to a CustomTrainingJobOp that runs a
    custom training in Vertex AI. This simplifies the creation of custom training
    jobs. All Inputs and Outputs of the supplied component will be copied over to
    the constructed training job. The supplied component itself is left
    unmodified.

    Note that this utility constructs a ClusterSpec where the master and all the
    workers use the same spec, meaning all disk/machine spec related parameters
    will apply to all replicas. This is suitable for use cases such as training
    with MultiWorkerMirroredStrategy or Mirrored Strategy.

    This component does not support Vertex AI Python training application.

    For more details on Vertex AI Training service, please refer to
    https://cloud.google.com/vertex-ai/docs/training/create-custom-job

    Args:
      component_spec: The task (ContainerOp) object to run as Vertex AI custom
        job.
      display_name (Optional[str]): The name of the custom job. If not provided
        the component_spec.name will be used instead.
      replica_count (Optional[int]): The count of instances in the cluster. One
        replica always counts towards the master in worker_pool_spec[0] and the
        remaining replicas will be allocated in worker_pool_spec[1]. For more
        details see
        https://cloud.google.com/vertex-ai/docs/training/distributed-training#configure_a_distributed_training_job.
      machine_type (Optional[str]): The type of the machine to run the custom
        job. The default value is "n1-standard-4". For more details about this
        input config, see
        https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types.
      accelerator_type (Optional[str]): The type of accelerator(s) that may be
        attached to the machine as per accelerator_count. For more details about
        this input config, see
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#acceleratortype.
      accelerator_count (Optional[int]): The number of accelerators to attach to
        the machine. Defaults to 1 if accelerator_type is set.
      boot_disk_type (Optional[str]): Type of the boot disk (default is
        "pd-ssd"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
        "pd-standard" (Persistent Disk Hard Disk Drive).
      boot_disk_size_gb (Optional[int]): Size in GB of the boot disk (default is
        100GB). Only applied when boot_disk_type is set.
      timeout (Optional[str]): The maximum job running time. The default is 7
        days. A duration in seconds with up to nine fractional digits,
        terminated by 's', for example: "3.5s".
      restart_job_on_worker_restart (Optional[bool]): Restarts the entire
        CustomJob if a worker gets restarted. This feature can be used by
        distributed training jobs that are not resilient to workers leaving and
        joining a job.
      service_account (Optional[str]): Sets the default service account for
        workload run-as account. The service account running the pipeline
        (https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account)
        submitting jobs must have act-as permission on this run-as account. If
        unspecified, the Vertex AI Custom Code Service Agent
        (https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
        for the CustomJob's project is used.
      network (Optional[str]): The full name of the Compute Engine network to
        which the job should be peered. For example,
        projects/12345/global/networks/myVPC. Format is of the form
        projects/{project}/global/networks/{network}. Where {project} is a
        project number, as in 12345, and {network} is a network name. Private
        services access must already be configured for the network. If left
        unspecified, the job is not peered with any network.
      encryption_spec_key_name (Optional[str]): Customer-managed encryption key
        options for the CustomJob. If this is set, then all resources created by
        the CustomJob will be encrypted with the provided encryption key.
      tensorboard (Optional[str]): The name of a Vertex AI Tensorboard resource
        to which this CustomJob will upload Tensorboard logs.
      enable_web_access (Optional[bool]): Whether you want Vertex AI to enable
        [interactive shell access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
        to training containers. If set to `true`, you can access interactive
        shells at the URIs given by [CustomJob.web_access_uris][].
      base_output_directory (Optional[str]): The Cloud Storage location to store
        the output of this CustomJob or HyperparameterTuningJob. See below for
        more details:
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination
      labels (Optional[Dict[str, str]]): The labels with user-defined metadata
        to organize CustomJobs. See https://goo.gl/xmQnxf for more information.

    Returns:
      A Custom Job component operator corresponding to the input component
      operator.
    """
    job_spec = {}

    # pytype: disable=attribute-error

    source_spec = component_spec.component_spec
    container = source_spec.implementation.container

    # Work on shallow copies: the lists are extended and filtered below, and
    # doing that on the originals would silently rewrite the wrapped
    # component's own input list (and accumulate duplicates across calls).
    input_specs = list(source_spec.inputs or [])
    output_specs = list(source_spec.outputs or [])

    def _is_output_parameter(output_key: str) -> bool:
        for output in output_specs:
            if output.name == output_key:
                return type_utils.is_parameter_type(output.type)
        return False

    def _resolve_cmd_lines(cmd_lines):
        """Resolve placeholders on a copy and escape the executor placeholder."""
        resolved = cmd_lines.copy()
        dsl_utils.resolve_cmd_lines(resolved, _is_output_parameter)
        # Replace executor place holder with the json escaped placeholder.
        return [
            _EXECUTOR_PLACE_HOLDER_REPLACEMENT if val == '{{{{$}}}}' else val
            for val in resolved
        ]

    worker_pool_spec = {
        'machine_spec': {
            'machine_type': machine_type
        },
        'replica_count': 1,
        'container_spec': {
            'image_uri': container.image,
        }
    }
    if container.command:
        worker_pool_spec['container_spec']['command'] = _resolve_cmd_lines(
            container.command)
    if container.args:
        worker_pool_spec['container_spec']['args'] = _resolve_cmd_lines(
            container.args)
    if accelerator_type:
        worker_pool_spec['machine_spec']['accelerator_type'] = accelerator_type
        worker_pool_spec['machine_spec'][
            'accelerator_count'] = accelerator_count
    if boot_disk_type:
        # NOTE(review): the disk size is only emitted together with a disk
        # type; a size passed with an empty boot_disk_type is ignored.
        worker_pool_spec['disk_spec'] = {
            'boot_disk_type': boot_disk_type,
            'boot_disk_size_gb': boot_disk_size_gb,
        }

    job_spec['worker_pool_specs'] = [worker_pool_spec]
    # Convert once so the comparison and the subtraction agree on the value.
    total_replicas = int(replica_count)
    if total_replicas > 1:
        additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
        additional_worker_pool_spec['replica_count'] = str(total_replicas - 1)
        job_spec['worker_pool_specs'].append(additional_worker_pool_spec)

    # TODO(chavoshi): Use input parameter instead of hard coded string label.
    # This requires Dictionary input type to be supported in V2.
    if labels is not None:
        job_spec['labels'] = labels

    if timeout:
        job_spec.setdefault('scheduling', {})['timeout'] = timeout
    if restart_job_on_worker_restart:
        job_spec.setdefault('scheduling', {})[
            'restart_job_on_worker_restart'] = restart_job_on_worker_restart
    if enable_web_access:
        job_spec['enable_web_access'] = enable_web_access

    if encryption_spec_key_name:
        job_spec['encryption_spec'] = {
            'kms_key_name':
            "{{$.inputs.parameters['encryption_spec_key_name']}}"
        }
        input_specs.append(
            structures.InputSpec(name='encryption_spec_key_name',
                                 type='String',
                                 optional=True,
                                 default=encryption_spec_key_name))

    # Drop inputs of the wrapped component that clash with the job-level
    # inputs added below.
    input_specs = [
        input_spec for input_spec in input_specs
        if input_spec.name not in ('service_account', 'network', 'tensorboard',
                                   'base_output_directory')
    ]
    job_spec['service_account'] = "{{$.inputs.parameters['service_account']}}"
    job_spec['network'] = "{{$.inputs.parameters['network']}}"

    job_spec['tensorboard'] = "{{$.inputs.parameters['tensorboard']}}"
    job_spec['base_output_directory'] = {
        'output_uri_prefix':
        "{{$.inputs.parameters['base_output_directory']}}"
    }
    custom_job_payload = {
        'display_name': display_name or source_spec.name,
        'job_spec': job_spec
    }

    custom_job_component_spec = structures.ComponentSpec(
        name=source_spec.name,
        inputs=input_specs + [
            structures.InputSpec(name='base_output_directory',
                                 type='String',
                                 optional=True,
                                 default=base_output_directory),
            structures.InputSpec(name='tensorboard',
                                 type='String',
                                 optional=True,
                                 default=tensorboard),
            structures.InputSpec(
                name='network', type='String', optional=True, default=network),
            structures.InputSpec(name='service_account',
                                 type='String',
                                 optional=True,
                                 default=service_account),
            structures.InputSpec(name='project', type='String'),
            structures.InputSpec(name='location', type='String')
        ],
        outputs=output_specs +
        [structures.OutputSpec(name='gcp_resources', type='String')],
        implementation=structures.
        ContainerImplementation(container=structures.ContainerSpec(
            image=_DEFAULT_CUSTOM_JOB_CONTAINER_IMAGE,
            command=[
                'python3', '-u', '-m',
                'google_cloud_pipeline_components.container.v1.gcp_launcher.launcher'
            ],
            args=[
                '--type',
                'CustomJob',
                '--payload',
                json.dumps(custom_job_payload),
                '--project',
                structures.InputValuePlaceholder(input_name='project'),
                '--location',
                structures.InputValuePlaceholder(input_name='location'),
                '--gcp_resources',
                structures.OutputPathPlaceholder(output_name='gcp_resources'),
            ],
        )))

    # pytype: enable=attribute-error

    # tempfile.mktemp() only returns a name and is open to races; reserve the
    # file atomically instead. The file is kept on disk (delete=False) so it
    # can be written and re-read by path.
    with tempfile.NamedTemporaryFile(delete=False) as component_file:
        component_path = component_file.name
    custom_job_component_spec.save(component_path)
    return components.load_component_from_file(component_path)
예제 #3
0
def create_custom_training_job_op_from_component(
    component_spec: Callable,  # pylint: disable=g-bare-generic
    display_name: Optional[str] = '',
    replica_count: Optional[int] = 1,
    machine_type: Optional[str] = 'n1-standard-4',
    accelerator_type: Optional[str] = '',
    accelerator_count: Optional[int] = 1,
    boot_disk_type: Optional[str] = 'pd-ssd',
    boot_disk_size_gb: Optional[int] = 100,
    timeout: Optional[str] = '604800s',
    restart_job_on_worker_restart: Optional[bool] = False,
    service_account: Optional[str] = '',
    network: Optional[str] = '',
    encryption_spec_key_name: Optional[str] = '',
    tensorboard: Optional[str] = '',
    enable_web_access: Optional[bool] = False,
    reserved_ip_ranges: Optional[Sequence[str]] = None,
    nfs_mounts: Optional[Sequence[Dict[str, str]]] = None,
    base_output_directory: Optional[str] = '',
    labels: Optional[Dict[str, str]] = None,
) -> Callable:  # pylint: disable=g-bare-generic
    """Create a component spec that runs a custom training in Vertex AI.

    This utility converts a given component to a CustomTrainingJobOp that runs a
    custom training in Vertex AI. This simplifies the creation of custom training
    jobs. All Inputs and Outputs of the supplied component will be copied over to
    the constructed training job. The supplied component itself is left
    unmodified.

    The job component is built from the ``component.yaml`` template located next
    to this module: the arguments below become the defaults of the template's
    inputs of the same name.

    Note that this utility constructs a ClusterSpec where the master and all the
    workers use the same spec, meaning all disk/machine spec related parameters
    will apply to all replicas. This is suitable for use cases such as training
    with MultiWorkerMirroredStrategy or Mirrored Strategy.

    This component does not support Vertex AI Python training application.

    For more details on Vertex AI Training service, please refer to
    https://cloud.google.com/vertex-ai/docs/training/create-custom-job

    Args:
      component_spec: The task (ContainerOp) object to run as Vertex AI custom
        job.
      display_name (Optional[str]): The name of the custom job. If not provided
        the component_spec.name will be used instead.
      replica_count (Optional[int]): The count of instances in the cluster. One
        replica always counts towards the master in worker_pool_spec[0] and the
        remaining replicas will be allocated in worker_pool_spec[1]. For more
        details see
        https://cloud.google.com/vertex-ai/docs/training/distributed-training#configure_a_distributed_training_job.
      machine_type (Optional[str]): The type of the machine to run the custom
        job. The default value is "n1-standard-4". For more details about this
        input config, see
        https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types.
      accelerator_type (Optional[str]): The type of accelerator(s) that may be
        attached to the machine as per accelerator_count. For more details about
        this input config, see
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#acceleratortype.
      accelerator_count (Optional[int]): The number of accelerators to attach to
        the machine. Defaults to 1 if accelerator_type is set.
      boot_disk_type (Optional[str]): Type of the boot disk (default is
        "pd-ssd"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
        "pd-standard" (Persistent Disk Hard Disk Drive). boot_disk_type is set
        as a static value and cannot be changed as a pipeline parameter.
      boot_disk_size_gb (Optional[int]): Size in GB of the boot disk (default is
        100GB). Only applied when boot_disk_type is set. boot_disk_size_gb is
        set as a static value and cannot be changed as a pipeline parameter.
      timeout (Optional[str]): The maximum job running time. The default is 7
        days. A duration in seconds with up to nine fractional digits,
        terminated by 's', for example: "3.5s".
      restart_job_on_worker_restart (Optional[bool]): Restarts the entire
        CustomJob if a worker gets restarted. This feature can be used by
        distributed training jobs that are not resilient to workers leaving and
        joining a job.
      service_account (Optional[str]): Sets the default service account for
        workload run-as account. The service account running the pipeline
        (https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account)
        submitting jobs must have act-as permission on this run-as account. If
        unspecified, the Vertex AI Custom Code Service Agent
        (https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
        for the CustomJob's project is used.
      network (Optional[str]): The full name of the Compute Engine network to
        which the job should be peered. For example,
        projects/12345/global/networks/myVPC. Format is of the form
        projects/{project}/global/networks/{network}. Where {project} is a
        project number, as in 12345, and {network} is a network name. Private
        services access must already be configured for the network. If left
        unspecified, the job is not peered with any network.
      encryption_spec_key_name (Optional[str]): Customer-managed encryption key
        options for the CustomJob. If this is set, then all resources created by
        the CustomJob will be encrypted with the provided encryption key.
      tensorboard (Optional[str]): The name of a Vertex AI Tensorboard resource
        to which this CustomJob will upload Tensorboard logs.
      enable_web_access (Optional[bool]): Whether you want Vertex AI to enable
        [interactive shell access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
        to training containers. If set to `true`, you can access interactive
        shells at the URIs given by [CustomJob.web_access_uris][].
      reserved_ip_ranges (Optional[Sequence[str]]): A list of names for the
        reserved ip ranges under the VPC network that can be used for this job.
        If set, we will deploy the job within the provided ip ranges. Otherwise,
        the job will be deployed to any ip ranges under the provided VPC
        network.
      nfs_mounts (Optional[Sequence[Dict]]): A list of NFS mount specs in Json
        dict format. nfs_mounts is set as a static value and cannot be changed
        as a pipeline parameter. For API spec, see
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#NfsMount
        For more details about mounting NFS for CustomJob, see
        https://cloud.google.com/vertex-ai/docs/training/train-nfs-share
      base_output_directory (Optional[str]): The Cloud Storage location to store
        the output of this CustomJob or HyperparameterTuningJob. See below for
        more details:
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination
      labels (Optional[Dict[str, str]]): The labels with user-defined metadata
        to organize CustomJobs. See https://goo.gl/xmQnxf for more information.

    Returns:
      A Custom Job component operator corresponding to the input component
      operator.
    """
    # pytype: disable=attribute-error

    source_spec = component_spec.component_spec
    container = source_spec.implementation.container

    # Work on shallow copies: the input list is filtered below, and doing that
    # on the original would silently strip inputs from the wrapped component.
    input_specs = list(source_spec.inputs or [])
    output_specs = list(source_spec.outputs or [])

    def _is_output_parameter(output_key: str) -> bool:
        for output in output_specs:
            if output.name == output_key:
                return type_utils.is_parameter_type(output.type)
        return False

    def _resolve_cmd_lines(cmd_lines):
        """Resolve placeholders on a copy and escape the executor placeholder."""
        resolved = cmd_lines.copy()
        dsl_utils.resolve_cmd_lines(resolved, _is_output_parameter)
        # Replace executor place holder with the json escaped placeholder.
        return [
            _EXECUTOR_PLACE_HOLDER_REPLACEMENT if val == '{{{{$}}}}' else val
            for val in resolved
        ]

    worker_pool_spec = {
        'machine_spec': {
            'machine_type': machine_type
        },
        'replica_count': 1,
        'container_spec': {
            'image_uri': container.image,
        }
    }
    if container.command:
        worker_pool_spec['container_spec']['command'] = _resolve_cmd_lines(
            container.command)

    if container.env:
        worker_pool_spec['container_spec']['env'] = container.env.copy()

    if container.args:
        worker_pool_spec['container_spec']['args'] = _resolve_cmd_lines(
            container.args)
    if accelerator_type:
        worker_pool_spec['machine_spec']['accelerator_type'] = accelerator_type
        worker_pool_spec['machine_spec'][
            'accelerator_count'] = accelerator_count
    if boot_disk_type:
        # NOTE(review): the disk size is only emitted together with a disk
        # type; a size passed with an empty boot_disk_type is ignored.
        worker_pool_spec['disk_spec'] = {
            'boot_disk_type': boot_disk_type,
            'boot_disk_size_gb': boot_disk_size_gb,
        }
    if nfs_mounts:
        worker_pool_spec.setdefault('nfs_mounts', []).extend(nfs_mounts)

    worker_pool_specs = [worker_pool_spec]
    # Convert once so the comparison and the subtraction agree on the value.
    total_replicas = int(replica_count)
    if total_replicas > 1:
        additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
        additional_worker_pool_spec['replica_count'] = str(total_replicas - 1)
        worker_pool_specs.append(additional_worker_pool_spec)

    # Remove any Vertex Training duplicate input_spec from component input list.
    # NOTE(review): 'enable_web_access' receives a default below but is not
    # filtered here - confirm whether that omission is intentional.
    input_specs = [
        input_spec for input_spec in input_specs
        if input_spec.name not in ('project', 'location', 'display_name',
                                   'worker_pool_specs', 'timeout',
                                   'restart_job_on_worker_restart',
                                   'service_account', 'tensorboard', 'network',
                                   'reserved_ip_ranges', 'nfs_mounts',
                                   'base_output_directory', 'labels',
                                   'encryption_spec_key_name')
    ]

    with open(os.path.join(os.path.dirname(__file__),
                           'component.yaml')) as file:
        custom_training_job_json = yaml.load(file, Loader=yaml.FullLoader)

    # Defaults to inject into the template, keyed by template input name.
    # Non-string values are JSON-encoded.
    input_defaults = {
        'display_name':
        display_name if display_name else source_spec.name,
        'worker_pool_specs':
        json.dumps(worker_pool_specs),
        'timeout':
        timeout,
        'restart_job_on_worker_restart':
        json.dumps(restart_job_on_worker_restart),
        'service_account':
        service_account,
        'tensorboard':
        tensorboard,
        'enable_web_access':
        json.dumps(enable_web_access),
        'network':
        network,
        'reserved_ip_ranges':
        json.dumps(reserved_ip_ranges) if reserved_ip_ranges else '[]',
        'base_output_directory':
        base_output_directory,
        'labels':
        json.dumps(labels) if labels else '{}',
        'encryption_spec_key_name':
        encryption_spec_key_name,
    }

    for input_item in custom_training_job_json['inputs']:
        # Match on the input's name only; matching against every field value
        # would also hit an input whose default or description happens to
        # equal one of these names.
        input_name = input_item.get('name')
        if input_name in input_defaults:
            input_item['default'] = input_defaults[input_name]
            input_item['optional'] = True

    # Copying over the input and output spec from the given component.
    custom_training_job_json['inputs'].extend(
        input_spec.to_dict() for input_spec in input_specs)
    custom_training_job_json['outputs'].extend(
        output_spec.to_dict() for output_spec in output_specs)

    # Copy the component name and description
    custom_training_job_json['name'] = source_spec.name

    if source_spec.description:
        # TODO(chavoshi) Add support for docstring parsing.
        component_description = 'A custom job that wraps '
        component_description += f'{source_spec.name}.\n\nOrigional component'
        component_description += f' description:\n{source_spec.description}\n\nCustom'
        component_description += ' Job wrapper description:\n'
        component_description += custom_training_job_json['description']
        custom_training_job_json['description'] = component_description

    # pytype: enable=attribute-error

    # tempfile.mktemp() only returns a name and is open to races; create the
    # file atomically instead. It is kept on disk (delete=False) so it can be
    # re-read by path below.
    with tempfile.NamedTemporaryFile('w', delete=False) as out_file:
        yaml.dump(custom_training_job_json, out_file)
        component_path = out_file.name

    return components.load_component_from_file(component_path)
예제 #4
0
def run_as_vertex_ai_custom_job(
    component_spec: Callable,
    display_name: Optional[str] = None,
    replica_count: Optional[int] = None,
    machine_type: Optional[str] = None,
    accelerator_type: Optional[str] = None,
    accelerator_count: Optional[int] = None,
    boot_disk_type: Optional[str] = None,
    boot_disk_size_gb: Optional[int] = None,
    timeout: Optional[str] = None,
    restart_job_on_worker_restart: Optional[bool] = None,
    service_account: Optional[str] = None,
    network: Optional[str] = None,
    worker_pool_specs: Optional[List[Mapping[str, Any]]] = None,
) -> Callable:
    """Wrap a component so that it runs as an AI Platform (Unified) custom job.

    A custom job payload is derived from the given component and embedded in
    a new launcher component, which is what this function returns.

    For detailed doc of the service, please refer to
    https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job

    Args:
      component_spec: The component factory to run as a custom job. Its
        ``component_spec`` attribute (name, inputs, outputs and container
        implementation) is read to build the job.
      display_name: Optional. The name of the custom job. If not provided the
        name of the wrapped component is used instead.
      replica_count: Optional. The total number of replicas. The first worker
        pool always has exactly 1 replica; when this value is greater than 1
        a second worker pool with ``replica_count - 1`` replicas is added.
      machine_type: Optional. The type of the machine to run the custom job.
        Falls back to ``_DEFAULT_CUSTOM_JOB_MACHINE_TYPE``.
      accelerator_type: Optional. The type of accelerator(s) that may be
        attached to the machine as per accelerator_count.
      accelerator_count: Optional. The number of accelerators to attach to the
        machine.
      boot_disk_type: Optional. Type of the boot disk, e.g. "pd-ssd"
        (Persistent Disk Solid State Drive) or "pd-standard" (Persistent Disk
        Hard Disk Drive).
      boot_disk_size_gb: Optional. Size in GB of the boot disk.
      timeout: Optional. The maximum job running time. A duration in seconds
        with up to nine fractional digits, terminated by 's'.
        Example: "3.5s"
      restart_job_on_worker_restart: Optional. Restarts the entire CustomJob
        if a worker gets restarted. This feature can be used by distributed
        training jobs that are not resilient to workers leaving and joining a
        job.
      service_account: Optional. Specifies the service account for workload
        run-as account.
      network: Optional. The full name of the Compute Engine network to which
        the job should be peered. For example,
        projects/12345/global/networks/myVPC.
      worker_pool_specs: Optional. Worker pool specs for distributed training.
        When given, this overrides all other cluster configuration
        (replica_count, machine_type, accelerator_* and boot_disk_*). The
        value is deep-copied, so the caller's object is not modified. For
        details, please see:
        https://cloud.google.com/ai-platform-unified/docs/training/distributed-training

    Returns:
      A custom job component factory corresponding to the input component.

    Raises:
      ValueError: If an entry of ``worker_pool_specs`` contains neither
        "container_spec" nor "python_package_spec".
    """
    # Local import: only needed to build the temporary component file path.
    import os

    original_spec = component_spec.component_spec
    # NOTE(review): `inputs` / `outputs` are assumed to be possibly None for a
    # component that declares none; `or []` is a no-op when they are lists.
    original_inputs = original_spec.inputs or []
    original_outputs = original_spec.outputs or []

    def _is_output_parameter(output_key: str) -> bool:
        """Tell whether the named output of the wrapped component is a parameter."""
        for output in original_outputs:
            if output.name == output_key:
                return type_utils.is_parameter_type(output.type)
        return False

    job_spec = {}

    # As a temporary workaround for an issue with the kfp v2 based compiler,
    # which expects placeholders in their original form in args instead of
    # using fields from outputs, the args of the original component are added
    # back to the custom job component. These args will be ignored by the
    # remote launcher.
    copy_of_original_args = []

    if worker_pool_specs is not None:
        # Placeholders are resolved in place below; work on a private copy.
        worker_pool_specs = copy.deepcopy(worker_pool_specs)

        for worker_pool_spec in worker_pool_specs:
            if 'container_spec' in worker_pool_spec:
                container_spec = worker_pool_spec['container_spec']
                if 'command' in container_spec:
                    dsl_utils.resolve_cmd_lines(container_spec['command'],
                                                _is_output_parameter)
                if 'args' in container_spec:
                    # Snapshot the unresolved args first. When several pools
                    # carry args, the last pool's args are the ones kept.
                    copy_of_original_args = container_spec['args'].copy()
                    dsl_utils.resolve_cmd_lines(container_spec['args'],
                                                _is_output_parameter)

            elif 'python_package_spec' in worker_pool_spec:
                # For custom Python training, resolve placeholders in args only.
                python_spec = worker_pool_spec['python_package_spec']
                if 'args' in python_spec:
                    dsl_utils.resolve_cmd_lines(python_spec['args'],
                                                _is_output_parameter)

            else:
                raise ValueError(
                    'Expect either "container_spec" or "python_package_spec" in each '
                    'workerPoolSpec. Got: {}'.format(worker_pool_spec))

        job_spec['worker_pool_specs'] = worker_pool_specs

    else:
        container = original_spec.implementation.container

        worker_pool_spec = {
            'machine_spec': {
                'machine_type': machine_type
                or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
            },
            'replica_count': 1,
            'container_spec': {
                'image_uri': container.image,
            }
        }
        if container.command:
            # Resolve on a copy so the wrapped component stays untouched.
            container_command_copy = container.command.copy()
            dsl_utils.resolve_cmd_lines(container_command_copy,
                                        _is_output_parameter)
            worker_pool_spec['container_spec'][
                'command'] = container_command_copy

        if container.args:
            container_args_copy = container.args.copy()
            copy_of_original_args = container.args.copy()
            dsl_utils.resolve_cmd_lines(container_args_copy,
                                        _is_output_parameter)
            worker_pool_spec['container_spec']['args'] = container_args_copy

        if accelerator_type is not None:
            worker_pool_spec['machine_spec'][
                'accelerator_type'] = accelerator_type
        if accelerator_count is not None:
            worker_pool_spec['machine_spec'][
                'accelerator_count'] = accelerator_count
        if boot_disk_type is not None:
            worker_pool_spec.setdefault(
                'disk_spec', {})['boot_disk_type'] = boot_disk_type
        if boot_disk_size_gb is not None:
            worker_pool_spec.setdefault(
                'disk_spec', {})['boot_disk_size_gb'] = boot_disk_size_gb

        job_spec['worker_pool_specs'] = [worker_pool_spec]
        if replica_count is not None and replica_count > 1:
            additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
            # NOTE(review): serialized as a string while the first pool uses
            # an int; kept as-is so the emitted payload does not change.
            additional_worker_pool_spec['replica_count'] = str(replica_count -
                                                               1)
            job_spec['worker_pool_specs'].append(additional_worker_pool_spec)

    if timeout is not None:
        job_spec.setdefault('scheduling', {})['timeout'] = timeout
    if restart_job_on_worker_restart is not None:
        job_spec.setdefault('scheduling', {})[
            'restart_job_on_worker_restart'] = restart_job_on_worker_restart
    if service_account is not None:
        job_spec['service_account'] = service_account
    if network is not None:
        job_spec['network'] = network

    custom_job_payload = {
        'display_name': display_name or original_spec.name,
        'job_spec': job_spec
    }

    custom_job_component_spec = structures.ComponentSpec(
        name=original_spec.name,
        inputs=original_inputs + [
            structures.InputSpec(name='gcp_project', type='String'),
            structures.InputSpec(name='gcp_region', type='String')
        ],
        outputs=original_outputs +
        [structures.OutputSpec(name='GCP_RESOURCES', type='String')],
        implementation=structures.ContainerImplementation(
            container=structures.ContainerSpec(
                image=_DEFAULT_CUSTOM_JOB_CONTAINER_IMAGE,
                command=["python", "-u", "-m", "launcher"],
                args=[
                    '--type',
                    'CustomJob',
                    '--gcp_project',
                    structures.InputValuePlaceholder(input_name='gcp_project'),
                    '--gcp_region',
                    structures.InputValuePlaceholder(input_name='gcp_region'),
                    '--payload',
                    json.dumps(custom_job_payload),
                    '--gcp_resources',
                    structures.OutputPathPlaceholder(
                        output_name='GCP_RESOURCES'),
                ] + copy_of_original_args,
            )))

    # tempfile.mktemp() is deprecated and race-prone (the name can be claimed
    # by another process before the file is created). A private temporary
    # directory avoids that and is removed once the component is loaded.
    # NOTE(review): assumes load_component_from_file reads the file before
    # returning — confirm against the kfp components API.
    with tempfile.TemporaryDirectory() as temp_dir:
        component_path = os.path.join(temp_dir, 'component.yaml')
        custom_job_component_spec.save(component_path)
        return components.load_component_from_file(component_path)
예제 #5
0
def custom_job(
    name: str,
    input_artifacts: Optional[Dict[str, dsl.PipelineParam]] = None,
    input_parameters: Optional[Dict[str, _ValueOrPipelineParam]] = None,
    output_artifacts: Optional[Dict[str, Type[io_types.Artifact]]] = None,
    output_parameters: Optional[Dict[str, Type[Union[str, float,
                                                     int]]]] = None,
    # Custom container training specs.
    image_uri: Optional[str] = None,
    commands: Optional[List[str]] = None,
    # Custom Python training spec.
    executor_image_uri: Optional[str] = None,
    package_uris: Optional[List[str]] = None,
    python_module: Optional[str] = None,
    # Command line args of the user program.
    args: Optional[List[Any]] = None,
    machine_type: Optional[str] = None,
    # Full-fledged custom job API spec. For details please see:
    # https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1beta1/CustomJobSpec
    additional_job_spec: Optional[Dict[str, Any]] = None
) -> AiPlatformCustomJobOp:
    """DSL representation of an AI Platform (Unified) custom training job.

    For detailed doc of the service, please refer to
    https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job

    The caller's ``additional_job_spec``, ``args``, ``commands`` and
    ``package_uris`` are copied before use and are never modified.

    Args:
      name: The name of this task.
      input_artifacts: The input artifact specification. Should be a mapping
        from input name to output from upstream tasks.
      input_parameters: The input parameter specification. Should be a mapping
        from input name to one of the following three: output from upstream
        tasks, pipeline parameter, or constant value.
      output_artifacts: The output artifact declaration. Should be a mapping
        from output name to a type subclassing artifact.Artifact.
      output_parameters: The output parameter declaration. Should be a mapping
        from output name to one of 1) str, 2) float, or 3) int.
      image_uri: The URI of the container image containing the user training
        program. Applicable for custom container training.
      commands: The container command/entrypoint. Applicable for custom
        container training.
      executor_image_uri: The URI of the container image containing the
        dependencies of user training program. Applicable for custom Python
        training.
      package_uris: The Python packages that are expected to be running on the
        executor container. Applicable for custom Python training.
      python_module: The entrypoint of user training program. Applicable for
        custom Python training.
      args: The command line arguments of user training program. This is
        expected to be a list of either 1) constant string, or 2) KFP DSL
        placeholders, to connect the user program with the declared component
        I/O.
      machine_type: The machine type used when the worker pool is derived from
        the top-level parameters (i.e. when additional_job_spec carries no
        'workerPoolSpecs'). Falls back to ``_DEFAULT_CUSTOM_JOB_MACHINE_TYPE``.
      additional_job_spec: Full-fledged custom job API spec. Values already
        present in its worker pool specs take precedence over the top-level
        parameters, which only fill in missing fields.
        For details please see:
        https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1beta1/CustomJobSpec

    Returns:
      The result of ``_get_custom_job_op`` for the assembled job spec: an op
      representing the launcher container job, from which the user training
      program will be submitted to AI Platform (Unified) Custom Job service.

    Raises:
      KeyError: On name collision between parameter and artifact I/O
        declaration.
      ValueError: When
        1. no additional_job_spec is given and neither or both of image_uri
           and executor_image_uri are provided; or
        2. no valid package_uris and python_module is provided for custom
           Python training; or
        3. the resulting job spec has no 'workerPoolSpecs'; or
        4. a worker pool spec has neither 'containerSpec' nor
           'pythonPackageSpec'.
    """
    # Local import keeps this function independent of the file's import header.
    import copy

    # Check the sanity of the provided parameters.
    input_artifacts = input_artifacts or {}
    input_parameters = input_parameters or {}
    output_artifacts = output_artifacts or {}
    output_parameters = output_parameters or {}
    if set(input_artifacts) & set(input_parameters):
        raise KeyError(
            'Input key conflict between input parameters and artifacts.')
    if set(output_artifacts) & set(output_parameters):
        raise KeyError('Output key conflict between output parameters and '
                       'artifacts.')

    if not additional_job_spec and bool(image_uri) == bool(executor_image_uri):
        raise ValueError(
            'The user program needs to be either a custom container '
            'training job, or a custom Python training job')

    # For Python custom training job, package URIs and modules are also required.
    if executor_image_uri:
        if not package_uris or not python_module or len(
                package_uris) > _MAX_PACKAGE_URIS:
            raise ValueError(
                'For custom Python training, package_uris with length < '
                '100 and python_module are expected.')

    def _private_list(values):
        """Return a shallow copy of a non-empty list; pass None/empty through."""
        return list(values) if values else values

    # Check and scaffold the parameters to form the custom job request spec.
    # The spec is patched and its command lines are resolved in place below,
    # so work on a deep copy instead of the caller's dictionary.
    custom_job_spec = (copy.deepcopy(additional_job_spec)
                       if additional_job_spec else {})
    if not custom_job_spec.get('workerPoolSpecs'):
        # Single node training, deriving job spec from top-level parameters.
        if image_uri:
            # Single node custom container training
            worker_pool_spec = {
                'machineSpec': {
                    'machineType': machine_type
                    or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
                },
                'replicaCount': '1',
                'containerSpec': {
                    'imageUri': image_uri,
                }
            }
            if commands:
                worker_pool_spec['containerSpec']['command'] = list(commands)
            if args:
                worker_pool_spec['containerSpec']['args'] = list(args)
            custom_job_spec['workerPoolSpecs'] = [worker_pool_spec]
        if executor_image_uri:
            # Single node custom Python training
            worker_pool_spec = {
                'machineSpec': {
                    'machineType': machine_type
                    or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
                },
                'replicaCount': '1',
                'pythonPackageSpec': {
                    'executorImageUri': executor_image_uri,
                    'packageUris': _private_list(package_uris),
                    'pythonModule': python_module,
                    'args': _private_list(args)
                }
            }
            custom_job_spec['workerPoolSpecs'] = [worker_pool_spec]
    else:
        # If the full-fledged job spec is provided. We'll use it as much as
        # possible, and patch some top-level parameters.
        for spec in custom_job_spec['workerPoolSpecs']:
            is_python_pool = bool(spec.get('pythonPackageSpec'))
            is_container_pool = bool(spec.get('containerSpec'))

            # Container-training fields only go to pools that are not already
            # Python-package pools, and only when the field is missing.
            if not is_python_pool:
                if image_uri and not spec.get('containerSpec',
                                              {}).get('imageUri'):
                    spec.setdefault('containerSpec', {})['imageUri'] = image_uri
                if commands and not spec.get('containerSpec',
                                             {}).get('command'):
                    spec.setdefault('containerSpec',
                                    {})['command'] = list(commands)

            # Python-training fields only go to pools that were not container
            # pools before patching, and only when the field is missing.
            if not is_container_pool:
                if executor_image_uri and not spec.get(
                        'pythonPackageSpec', {}).get('executorImageUri'):
                    spec.setdefault('pythonPackageSpec', {})[
                        'executorImageUri'] = executor_image_uri
                if package_uris and not spec.get('pythonPackageSpec',
                                                 {}).get('packageUris'):
                    spec.setdefault('pythonPackageSpec',
                                    {})['packageUris'] = list(package_uris)
                if python_module and not spec.get('pythonPackageSpec',
                                                  {}).get('pythonModule'):
                    spec.setdefault('pythonPackageSpec',
                                    {})['pythonModule'] = python_module

            if args:
                # Each pool gets its own list because the command lines are
                # resolved in place further down.
                if spec.get('containerSpec'
                            ) and not spec['containerSpec'].get('args'):
                    spec['containerSpec']['args'] = list(args)
                if (spec.get('pythonPackageSpec')
                        and not spec['pythonPackageSpec'].get('args')):
                    spec['pythonPackageSpec']['args'] = list(args)

    # Resolve the custom job spec by wiring it with the I/O spec.
    def _is_output_parameter(output_key: str) -> bool:
        return output_key in output_parameters

    worker_pool_specs = custom_job_spec.get('workerPoolSpecs')
    if worker_pool_specs is None:
        # Reached when additional_job_spec carries no worker pools and no
        # image URI was given to derive one from.
        raise ValueError(
            'No "workerPoolSpecs" found in additional_job_spec, and neither '
            'image_uri nor executor_image_uri is provided to derive one.')

    for wp_spec in worker_pool_specs:
        if 'containerSpec' in wp_spec:
            # For custom container training, resolve placeholders in commands and
            # program args.
            container_spec = wp_spec['containerSpec']
            if 'command' in container_spec:
                dsl_utils.resolve_cmd_lines(container_spec['command'],
                                            _is_output_parameter)
            if 'args' in container_spec:
                dsl_utils.resolve_cmd_lines(container_spec['args'],
                                            _is_output_parameter)
        elif 'pythonPackageSpec' in wp_spec:
            # For custom Python training, resolve placeholders in args only.
            python_spec = wp_spec['pythonPackageSpec']
            if 'args' in python_spec:
                dsl_utils.resolve_cmd_lines(python_spec['args'],
                                            _is_output_parameter)
        else:
            # Explicit error instead of `assert`, which is stripped under -O.
            raise ValueError(
                'Expect either "containerSpec" or "pythonPackageSpec" in each '
                'workerPoolSpec. Got: {}'.format(wp_spec))

    job_spec = {'name': name, 'jobSpec': custom_job_spec}

    return _get_custom_job_op(task_name=name,
                              job_spec=job_spec,
                              input_artifacts=input_artifacts,
                              input_parameters=input_parameters,
                              output_artifacts=output_artifacts,
                              output_parameters=output_parameters)