예제 #1
0
    def _run_executor(self, execution_id: int,
                      input_dict: Dict[Text, List[types.Artifact]],
                      output_dict: Dict[Text, List[types.Artifact]],
                      exec_properties: Dict[Text, Any]) -> None:
        """Run the component's executor container through Docker.

        The launcher's container spec is resolved against the inputs, outputs
        and execution properties, started in detached mode with the Docker
        client built from the environment, and its log stream is forwarded to
        the TensorFlow logger before the exit status is checked.

        Args:
          execution_id: The ID of the execution (not used by this method).
          input_dict: Input dict from input key to a list of Artifacts.
          output_dict: Output dict from output key to a list of Artifacts.
          exec_properties: A dict of execution properties.

        Raises:
          RuntimeError: if the container exits with a non-zero status code.
        """
        # Render the templated fields of the spec with the concrete values.
        resolved_spec = container_common.resolve_container_template(
            cast(executor_spec.ExecutorContainerSpec,
                 self._component_executor_spec),
            input_dict, output_dict, exec_properties)

        # ExecutorContainerSpec uses the k8s container spec field names, which
        # differ from Docker's: the spec's `command` is deliberately passed as
        # Docker's entrypoint and the spec's `args` as Docker's command.
        docker_client = docker.from_env()
        running_container = docker_client.containers.run(
            image=resolved_spec.image,
            entrypoint=resolved_spec.command,
            command=resolved_spec.args,
            detach=True)

        # Forward every chunk of the container's log stream to the logger.
        for raw_chunk in running_container.logs(stream=True):
            decoded_chunk = raw_chunk.decode('utf-8')
            tf.logging.info('Docker: ' + decoded_chunk)

        status_code = running_container.wait()['StatusCode']
        if status_code == 0:
            return
        raise RuntimeError(
            'Container exited with error code "{}"'.format(status_code))
예제 #2
0
  def testResolveContainerTemplate(self):
    """Checks placeholder resolution for image, command and args of a spec."""
    # Two example artifacts so the `join` filter in the args has several URIs.
    example_artifacts = []
    for example_uri in ('gcs://examples/1', 'gcs://examples/2'):
      example = standard_artifacts.Examples()
      example.uri = example_uri
      example_artifacts.append(example)
    model_artifact = standard_artifacts.Model()
    model_artifact.uri = 'gcs://model'

    templated_spec = executor_spec.ExecutorContainerSpec(
        image='gcr.io/my/trainer:{{exec_properties.version}}',
        command=['{{exec_properties.model}}_trainer'],
        args=[
            '--steps',
            '{{exec_properties.train_args.num_steps}}',
            '--examples',
            '{{input_dict["examples"]|join(",",attribute="uri")}}',
            '--model-path',
            '{{output_dict["model"][0].uri}}',
        ])

    resolved_spec = container_common.resolve_container_template(
        templated_spec,
        {'examples': example_artifacts},
        {'model': [model_artifact]},
        {
            'version': 'v1',
            'model': 'cnn',
            'train_args': trainer_pb2.TrainArgs(num_steps=10000),
        })

    expected_args = [
        '--steps',
        '10000',
        '--examples',
        'gcs://examples/1,gcs://examples/2',
        '--model-path',
        'gcs://model',
    ]
    self.assertEqual('gcr.io/my/trainer:v1', resolved_spec.image)
    self.assertListEqual(['cnn_trainer'], resolved_spec.command)
    self.assertListEqual(expected_args, resolved_spec.args)
예제 #3
0
  def _run_executor(self, execution_id: int,
                    input_dict: Dict[Text, List[types.Artifact]],
                    output_dict: Dict[Text, List[types.Artifact]],
                    exec_properties: Dict[Text, Any]) -> None:
    """Runs the executor container in a Kubernetes Pod and waits for it.

    A pod with the name built for this execution is looked up and created
    when it does not exist. The method then waits for the pod to leave the
    pending state, forwards the logs of the container named by
    `kube_utils.ARGO_MAIN_CONTAINER_NAME` to the logger, and finally waits
    for the pod to reach a done state.

    Args:
      execution_id: The ID of the execution.
      input_dict: Input dict from input key to a list of Artifacts. These are
        often outputs of another component in the pipeline and passed to the
        component by the orchestration system.
      output_dict: Output dict from output key to a list of Artifacts. These are
        often consumed by a dependent component.
      exec_properties: A dict of execution properties. These are inputs to
        pipeline with primitive types (int, string, float) and fully
        materialized when a pipeline is constructed. No dependency to other
        component or later injection from orchestration systems is necessary or
        possible on these values.

    Raises:
      RuntimeError: when the pod is in `Failed` state or unexpected failure from
        Kubernetes API.
    """
    # Render the templated fields of the container spec.
    resolved_spec = container_common.resolve_container_template(
        cast(executor_spec.ExecutorContainerSpec,
             self._component_executor_spec),
        input_dict, output_dict, exec_properties)
    pod_name = self._build_pod_name(execution_id)

    # TODO(hongyes): replace the default value from component config.
    try:
      namespace = kube_utils.get_kfp_namespace()
    except RuntimeError:
      namespace = 'kubeflow'

    manifest = self._build_pod_manifest(pod_name, resolved_spec)
    core_api = kube_utils.make_core_v1_api()

    # Pick the service account: copy it (and the owner references) from the
    # launcher pod when kube_utils reports we are inside KFP, otherwise use
    # the TFX service account constant.
    if kube_utils.is_inside_kfp():
      launcher_pod = kube_utils.get_current_kfp_pod(core_api)
      service_account = launcher_pod.spec.service_account
      service_account_name = launcher_pod.spec.service_account_name
      manifest['metadata'][
          'ownerReferences'] = container_common.to_swagger_dict(
              launcher_pod.metadata.owner_references)
    else:
      service_account = kube_utils.TFX_SERVICE_ACCOUNT
      service_account_name = kube_utils.TFX_SERVICE_ACCOUNT
    manifest['spec']['serviceAccount'] = service_account
    manifest['spec']['serviceAccountName'] = service_account_name

    logging.info('Looking for pod "%s:%s".', namespace, pod_name)
    existing_pod = kube_utils.get_pod(core_api, pod_name, namespace)
    if not existing_pod:
      logging.info('Pod "%s:%s" does not exist. Creating it...',
                   namespace, pod_name)
      logging.info('Pod manifest: %s', manifest)
      try:
        existing_pod = core_api.create_namespaced_pod(
            namespace=namespace, body=manifest)
      except client.rest.ApiException as api_error:
        raise RuntimeError(
            'Failed to created container executor pod!\nReason: %s\nBody: %s' %
            (api_error.reason, api_error.body))

    # Wait up to 300 seconds for the pod to move from pending to another status.
    logging.info('Waiting for pod "%s:%s" to start.', namespace, pod_name)
    kube_utils.wait_pod(
        core_api,
        pod_name,
        namespace,
        exit_condition_lambda=kube_utils.pod_is_not_pending,
        condition_description='non-pending status',
        timeout_sec=300)

    logging.info('Start log streaming for pod "%s:%s".', namespace, pod_name)
    try:
      log_response = core_api.read_namespaced_pod_log(
          name=pod_name,
          namespace=namespace,
          container=kube_utils.ARGO_MAIN_CONTAINER_NAME,
          follow=True,
          _preload_content=False)
      log_stream = log_response.stream()
    except client.rest.ApiException as api_error:
      raise RuntimeError(
          'Failed to stream the logs from the pod!\nReason: %s\nBody: %s' %
          (api_error.reason, api_error.body))

    for raw_chunk in log_stream:
      logging.info(raw_chunk.decode().rstrip('\n'))

    # No timeout is passed for the completion wait.
    finished_pod = kube_utils.wait_pod(
        core_api,
        pod_name,
        namespace,
        exit_condition_lambda=kube_utils.pod_is_done,
        condition_description='done state')

    pod_failed = (
        finished_pod.status.phase == kube_utils.PodPhase.FAILED.value)
    if pod_failed:
      raise RuntimeError('Pod "%s:%s" failed with status "%s".' %
                         (namespace, pod_name, finished_pod.status))

    logging.info('Pod "%s:%s" is done.', namespace, pod_name)
예제 #4
0
    def _run_executor(self, execution_id: int,
                      input_dict: Dict[Text, List[types.Artifact]],
                      output_dict: Dict[Text, List[types.Artifact]],
                      exec_properties: Dict[Text, Any]) -> None:
        """Execute underlying component implementation.

        Runs executor container in a Kubernetes Pod and wait until it goes into
        `Succeeded` or `Failed` state.

        Args:
          execution_id: The ID of the execution.
          input_dict: Input dict from input key to a list of Artifacts. These
            are often outputs of another component in the pipeline and passed
            to the component by the orchestration system.
          output_dict: Output dict from output key to a list of Artifacts.
            These are often consumed by a dependent component.
          exec_properties: A dict of execution properties. These are inputs to
            pipeline with primitive types (int, string, float) and fully
            materialized when a pipeline is constructed. No dependency to other
            component or later injection from orchestration systems is
            necessary or possible on these values.

        Raises:
          RuntimeError: when the pod is in `Failed` state, when the launcher
            pod cannot be found while running in-cluster, or on unexpected
            failure from Kubernetes API.
        """

        container_spec = cast(executor_spec.ExecutorContainerSpec,
                              self._component_executor_spec)

        # Replace container spec with jinja2 template.
        container_spec = container_common.resolve_container_template(
            container_spec, input_dict, output_dict, exec_properties)
        pod_name = self._build_pod_name(execution_id)
        # TODO(hongyes): replace the default value from component config.
        namespace = os.getenv(_KFP_NAMESPACE_ENV, 'kubeflow')

        pod_manifest = self._build_pod_manifest(pod_name, container_spec)

        # Try the in-cluster configuration first and fall back to the local
        # kube config. Only the call that can raise stays inside the `try`.
        try:
            config.load_incluster_config()
        except config.config_exception.ConfigException:
            is_in_cluster = False
            config.load_kube_config()
            absl.logging.info('Loaded kube config.')
        else:
            is_in_cluster = True
            absl.logging.info('Loaded in cluster config.')

        core_api = client.CoreV1Api()

        if is_in_cluster:
            launcher_pod_name = os.getenv(_KFP_POD_NAME_ENV)
            launcher_pod = self._get_pod(core_api, launcher_pod_name,
                                         namespace)
            # Fail with a clear message instead of an AttributeError on
            # `None.spec` when the launcher pod lookup comes back empty.
            if not launcher_pod:
                raise RuntimeError(
                    'Launcher pod "%s:%s" was not found.' %
                    (namespace, launcher_pod_name))
            # Run the executor pod under the launcher's service account and
            # give it the launcher's owner references.
            pod_manifest['spec'][
                'serviceAccount'] = launcher_pod.spec.service_account
            pod_manifest['spec'][
                'serviceAccountName'] = launcher_pod.spec.service_account_name
            pod_manifest['metadata'][
                'ownerReferences'] = container_common.to_swagger_dict(
                    launcher_pod.metadata.owner_references)

        absl.logging.info('Looking for pod "%s:%s".', namespace, pod_name)
        resp = self._get_pod(core_api, pod_name, namespace)
        if not resp:
            absl.logging.info('Pod "%s:%s" does not exist. Creating it...',
                              namespace, pod_name)
            absl.logging.info('Pod manifest: %s', pod_manifest)
            try:
                resp = core_api.create_namespaced_pod(namespace=namespace,
                                                      body=pod_manifest)
            except client.rest.ApiException as e:
                raise RuntimeError(
                    'Failed to created container executor pod!\nReason: %s\nBody: %s'
                    % (e.reason, e.body)) from e

        absl.logging.info('Waiting for pod "%s:%s" to start.', namespace,
                          pod_name)
        self._wait_pod(core_api,
                       pod_name,
                       namespace,
                       exit_condition_lambda=_pod_is_not_pending,
                       condition_description='non-pending status')

        absl.logging.info('Start log streaming for pod "%s:%s".', namespace,
                          pod_name)
        try:
            logs = core_api.read_namespaced_pod_log(
                name=pod_name,
                namespace=namespace,
                container='main',
                follow=True,
                _preload_content=False).stream()
        except client.rest.ApiException as e:
            raise RuntimeError(
                'Failed to stream the logs from the pod!\nReason: %s\nBody: %s'
                % (e.reason, e.body)) from e

        # Forward each streamed chunk to the logger without its trailing
        # newline.
        for log in logs:
            absl.logging.info(log.decode().rstrip('\n'))

        resp = self._wait_pod(core_api,
                              pod_name,
                              namespace,
                              exit_condition_lambda=_pod_is_done,
                              condition_description='done state')

        if resp.status.phase == _POD_FAILED_PHASE:
            raise RuntimeError('Pod "%s:%s" failed with status "%s".' %
                               (namespace, pod_name, resp.status))

        absl.logging.info('Pod "%s:%s" is done.', namespace, pod_name)
    def run_executor(
        self, execution_info: data_types.ExecutionInfo
    ) -> execution_result_pb2.ExecutorOutput:
        """Run the executor container in a Kubernetes Pod and wait for it.

        The container spec is built by resolving the placeholder expressions
        of the launcher's executor spec and then rendering its templated
        fields. A pod with the name built for this execution is looked up and
        created when missing; its logs are forwarded to the logger and the
        method waits until the pod reaches a done state.

        Args:
          execution_info: All the information that the launcher provides.

        Raises:
          RuntimeError: when the pod is in `Failed` state or unexpected failure
            from Kubernetes API.

        Returns:
          A default-constructed ExecutorOutput instance.
        """
        resolution_context = placeholder_utils.ResolutionContext(
            exec_info=execution_info,
            executor_spec=self._executor_spec,
            platform_config=self._platform_config)

        # Resolve placeholder expressions; an empty list is passed as None.
        resolved_commands = [
            placeholder_utils.resolve_placeholder_expression(
                command, resolution_context)
            for command in self._container_executor_spec.commands
        ]
        resolved_args = [
            placeholder_utils.resolve_placeholder_expression(
                argument, resolution_context)
            for argument in self._container_executor_spec.args
        ]
        placeholder_spec = executor_spec_lib.ExecutorContainerSpec(
            image=self._container_executor_spec.image,
            command=resolved_commands or None,
            args=resolved_args or None,
        )

        # Render the templated fields of the container spec.
        final_spec = container_common.resolve_container_template(
            placeholder_spec,
            execution_info.input_dict,
            execution_info.output_dict,
            execution_info.exec_properties)
        pod_name = self._build_pod_name(execution_info)

        # TODO(hongyes): replace the default value from component config.
        try:
            namespace = kube_utils.get_kfp_namespace()
        except RuntimeError:
            namespace = 'kubeflow'

        pod_manifest = self._build_pod_manifest(pod_name, final_spec)
        core_api = kube_utils.make_core_v1_api()

        if kube_utils.is_inside_kfp():
            # Reuse the launcher pod's service account and owner references.
            launcher_pod = kube_utils.get_current_kfp_pod(core_api)
            launcher_spec = launcher_pod.spec
            pod_manifest['spec'][
                'serviceAccount'] = launcher_spec.service_account
            pod_manifest['spec'][
                'serviceAccountName'] = launcher_spec.service_account_name
            owner_references = container_common.to_swagger_dict(
                launcher_pod.metadata.owner_references)
            pod_manifest['metadata']['ownerReferences'] = owner_references
        else:
            tfx_account = kube_utils.TFX_SERVICE_ACCOUNT
            pod_manifest['spec']['serviceAccount'] = tfx_account
            pod_manifest['spec']['serviceAccountName'] = tfx_account

        logging.info('Looking for pod "%s:%s".', namespace, pod_name)
        found_pod = kube_utils.get_pod(core_api, pod_name, namespace)
        if not found_pod:
            logging.info('Pod "%s:%s" does not exist. Creating it...',
                         namespace, pod_name)
            logging.info('Pod manifest: %s', pod_manifest)
            try:
                found_pod = core_api.create_namespaced_pod(
                    namespace=namespace, body=pod_manifest)
            except client.rest.ApiException as api_error:
                raise RuntimeError(
                    'Failed to created container executor pod!\nReason: %s\nBody: %s'
                    % (api_error.reason, api_error.body))

        # Wait up to 300 seconds for the pod to move from pending to another
        # status.
        logging.info('Waiting for pod "%s:%s" to start.', namespace, pod_name)
        kube_utils.wait_pod(
            core_api,
            pod_name,
            namespace,
            exit_condition_lambda=kube_utils.pod_is_not_pending,
            condition_description='non-pending status',
            timeout_sec=300)

        logging.info('Start log streaming for pod "%s:%s".', namespace,
                     pod_name)
        try:
            log_response = core_api.read_namespaced_pod_log(
                name=pod_name,
                namespace=namespace,
                container=kube_utils.ARGO_MAIN_CONTAINER_NAME,
                follow=True,
                _preload_content=False)
            log_stream = log_response.stream()
        except client.rest.ApiException as api_error:
            raise RuntimeError(
                'Failed to stream the logs from the pod!\nReason: %s\nBody: %s'
                % (api_error.reason, api_error.body))

        for raw_chunk in log_stream:
            logging.info(raw_chunk.decode().rstrip('\n'))

        # No timeout is passed for the completion wait.
        finished_pod = kube_utils.wait_pod(
            core_api,
            pod_name,
            namespace,
            exit_condition_lambda=kube_utils.pod_is_done,
            condition_description='done state')

        final_phase = finished_pod.status.phase
        if final_phase == kube_utils.PodPhase.FAILED.value:
            raise RuntimeError('Pod "%s:%s" failed with status "%s".' %
                               (namespace, pod_name, finished_pod.status))

        logging.info('Pod "%s:%s" is done.', namespace, pod_name)

        return execution_result_pb2.ExecutorOutput()