Example #1
  def testBuildParameterTypeSpec(self):
    type_enum = pipeline_pb2.PrimitiveType.PrimitiveTypeEnum
    testdata = {
        42: type_enum.INT,
        42.1: type_enum.DOUBLE,
        '42': type_enum.STRING,
        data_types.RuntimeParameter(name='_', ptype=int): type_enum.INT,
        data_types.RuntimeParameter(name='_', ptype=float): type_enum.DOUBLE,
        data_types.RuntimeParameter(name='_', ptype=str): type_enum.STRING,
    }
    for value, expected_type_enum in testdata.items():
      self.assertEqual(
          compiler_utils.build_parameter_type_spec(value).type,
          expected_type_enum)
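
A minimal, self-contained sketch of the dispatch this test exercises, assuming only that build_parameter_type_spec chooses INT, DOUBLE, or STRING based on either the Python value's type or a RuntimeParameter's ptype. The stand-in class and the string return values are illustrative; the real helper returns a parameter spec proto, not a string.

from typing import Any


class _FakeRuntimeParameter:
  """Illustrative stand-in for data_types.RuntimeParameter."""

  def __init__(self, name: str, ptype: type):
    self.name = name
    self.ptype = ptype


def _infer_parameter_type(value: Any) -> str:
  """Returns the enum name the compiler is expected to choose for `value`."""
  ptype = value.ptype if isinstance(value, _FakeRuntimeParameter) else type(value)
  if ptype is int:
    return 'INT'
  if ptype is float:
    return 'DOUBLE'
  if ptype is str:
    return 'STRING'
  raise TypeError('Unsupported parameter type: {}'.format(ptype))


assert _infer_parameter_type(42) == 'INT'
assert _infer_parameter_type(42.1) == 'DOUBLE'
assert _infer_parameter_type(_FakeRuntimeParameter('_', str)) == 'STRING'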
Example #2
    def build(self) -> pipeline_pb2.PipelineSpec:
        """Build a pipeline PipelineSpec."""

        _check_name(self._pipeline_info.pipeline_name)

        deployment_config = pipeline_pb2.PipelineDeploymentConfig()
        pipeline_info = pipeline_pb2.PipelineInfo(
            name=self._pipeline_info.pipeline_name)

        tasks = {}
        component_defs = {}
        # Map from (producer component id, output key) to (new producer component
        # id, output key)
        channel_redirect_map = {}
        with parameter_utils.ParameterContext() as pc:
            for component in self._pipeline.components:
                # Components must be visited in topological order here: if a
                # channel redirection is needed, the redirect mapping is
                # already available because the upstream node (the cause of
                # the redirect) is processed before its downstream consumers.
                built_tasks = step_builder.StepBuilder(
                    node=component,
                    deployment_config=deployment_config,
                    component_defs=component_defs,
                    image=self._default_image,
                    image_cmds=self._default_commands,
                    beam_pipeline_args=self._pipeline.beam_pipeline_args,
                    enable_cache=self._pipeline.enable_cache,
                    pipeline_info=self._pipeline_info,
                    channel_redirect_map=channel_redirect_map).build()
                tasks.update(built_tasks)

        result = pipeline_pb2.PipelineSpec(pipeline_info=pipeline_info)
        result.deployment_spec.update(
            json_format.MessageToDict(deployment_config))
        for name, component_def in component_defs.items():
            result.components[name].CopyFrom(component_def)
        for name, task_spec in tasks.items():
            result.root.dag.tasks[name].CopyFrom(task_spec)

        # Attach runtime parameters to the root's input parameters.
        for param in pc.parameters:
            result.root.input_definitions.parameters[param.name].CopyFrom(
                compiler_utils.build_parameter_type_spec(param))

        return result
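
The loop above relies on self._pipeline.components already being listed in topological order, so any redirect created while building an upstream node is visible when its downstream consumers are built. Below is a self-contained sketch of such an ordering (Kahn's algorithm) over a plain dependency dict; the node ids are made up for illustration, not taken from a real pipeline.

from collections import deque


def topological_order(deps):
  """Orders nodes so every node appears after all of its upstream nodes.

  Args:
    deps: dict mapping a node id to the set of node ids it depends on.
  """
  remaining = {node: set(upstream) for node, upstream in deps.items()}
  ready = deque(node for node, upstream in remaining.items() if not upstream)
  order = []
  while ready:
    node = ready.popleft()
    order.append(node)
    for other, upstream in remaining.items():
      if node in upstream:
        upstream.remove(node)
        if not upstream:
          ready.append(other)
  if len(order) != len(remaining):
    raise ValueError('The dependency graph contains a cycle.')
  return order


# Example: trainer reads from example_gen; pusher reads from trainer.
print(topological_order({
    'example_gen': set(),
    'trainer': {'example_gen'},
    'pusher': {'trainer'},
}))  # ['example_gen', 'trainer', 'pusher']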
Example #3
  def _build_latest_artifact_resolver(
      self) -> Dict[str, pipeline_pb2.PipelineTaskSpec]:
    """Builds a resolver spec for a latest artifact resolver.

    Returns:
      A Dict with a single entry, mapping this resolver node's id to the
      PipelineTaskSpec whose resolver queries the latest live artifact for
      each input channel.
    Raises:
      ValueError: when desired_num_of_artifacts != 1. 1 is the only supported
        value currently.
    """
    # Fetch the init kwargs for the resolver.
    resolver_config = self._exec_properties[resolver.RESOLVER_CONFIG]
    if (isinstance(resolver_config, dict) and
        resolver_config.get('desired_num_of_artifacts', 0) > 1):
      raise ValueError('Only desired_num_of_artifacts=1 is supported currently.'
                       ' Got {}'.format(
                           resolver_config.get('desired_num_of_artifacts')))

    component_def = pipeline_pb2.ComponentSpec()
    executor_label = _EXECUTOR_LABEL_PATTERN.format(self._name)
    component_def.executor_label = executor_label
    task_spec = pipeline_pb2.PipelineTaskSpec()
    task_spec.task_info.name = self._name

    for name, output_channel in self._outputs.items():
      output_artifact_spec = compiler_utils.build_output_artifact_spec(
          output_channel)
      component_def.output_definitions.artifacts[name].CopyFrom(
          output_artifact_spec)
    for name, value in self._exec_properties.items():
      if value is None:
        continue
      parameter_type_spec = compiler_utils.build_parameter_type_spec(value)
      component_def.input_definitions.parameters[name].CopyFrom(
          parameter_type_spec)
      if isinstance(value, data_types.RuntimeParameter):
        parameter_utils.attach_parameter(value)
        task_spec.inputs.parameters[name].component_input_parameter = value.name
      else:
        task_spec.inputs.parameters[name].CopyFrom(
            pipeline_pb2.TaskInputsSpec.InputParameterSpec(
                runtime_value=compiler_utils.value_converter(value)))
    self._component_defs[self._name] = component_def
    task_spec.component_ref.name = self._name

    artifact_queries = {}
    # Build the artifact query for each channel in the input dict.
    for name, c in self._inputs.items():
      query_filter = ('artifact_type="{type}" and state={state}').format(
          type=compiler_utils.get_artifact_title(c.type),
          state=metadata_store_pb2.Artifact.State.Name(
              metadata_store_pb2.Artifact.LIVE))
      # Resolver's output dict has the same set of keys as its input dict.
      artifact_queries[name] = ResolverSpec.ArtifactQuerySpec(
          filter=query_filter)

    resolver_spec = ResolverSpec(output_artifact_queries=artifact_queries)
    executor = pipeline_pb2.PipelineDeploymentConfig.ExecutorSpec()
    executor.resolver.CopyFrom(resolver_spec)
    self._deployment_config.executors[executor_label].CopyFrom(executor)
    return {self._name: task_spec}
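
For illustration, the query_filter string built above for a channel of, say, Model artifacts would look as follows; the type name ('Model') and state name ('LIVE') are filled in by compiler_utils.get_artifact_title and the MLMD Artifact.State enum in the real code.

_FILTER_TEMPLATE = 'artifact_type="{type}" and state={state}'

query_filter = _FILTER_TEMPLATE.format(type='Model', state='LIVE')
print(query_filter)  # artifact_type="Model" and state=LIVE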
Example #4
  def build(self) -> Dict[str, pipeline_pb2.PipelineTaskSpec]:
    """Builds a pipeline PipelineTaskSpec given the node information.

    Each TFX node maps to one task spec and, usually, one component definition
    and one executor spec (resolver nodes are an exception; see the explanation
    in the Returns section).

     - Component definition: describes the node's interface, for example the
       names and types of its inputs, outputs and execution properties.
     - Task spec: captures the topology around the node, for example its
       dependency nodes and where the inputs and exec_properties are read from
       (another task, the parent component, or a constant value). The task spec
       carries the name of the component definition it references; a task spec
       may reference a component definition that was built previously.
     - Executor spec: encodes how the node is actually executed, for example
       the args used to start a container, or the query strings for resolvers.
       All executor specs are packed into the deployment config proto.

    During the build, all three parts mentioned above will be updated.

    Returns:
      A Dict mapping node ids to the PipelineTaskSpec messages corresponding to
      the node. In most cases the dict contains a single element. The only
      exception is the latest blessed model resolver, where one DSL node is
      split into two resolver specs to reflect the two-phased query execution.

    Raises:
      NotImplementedError: When the node being built is an InfraValidator.
    """
    # 1. Resolver tasks won't have input artifacts in the API proto. First we
    #    special-case the two resolver types we support.
    if isinstance(self._node, resolver.Resolver):
      return self._build_resolver_spec()

    # 2. Build component spec.
    component_def = pipeline_pb2.ComponentSpec()
    executor_label = _EXECUTOR_LABEL_PATTERN.format(self._name)
    component_def.executor_label = executor_label
    # Inputs
    for name, input_channel in self._inputs.items():
      input_artifact_spec = compiler_utils.build_input_artifact_spec(
          input_channel)
      component_def.input_definitions.artifacts[name].CopyFrom(
          input_artifact_spec)
    # Outputs
    for name, output_channel in self._outputs.items():
      # Currently, we're working under the assumption that for tasks
      # (those generated by BaseComponent), each channel contains a single
      # artifact.
      output_artifact_spec = compiler_utils.build_output_artifact_spec(
          output_channel)
      component_def.output_definitions.artifacts[name].CopyFrom(
          output_artifact_spec)
    # Exec properties
    for name, value in self._exec_properties.items():
      # value can be None for unprovided optional exec properties.
      if value is None:
        continue
      parameter_type_spec = compiler_utils.build_parameter_type_spec(value)
      component_def.input_definitions.parameters[name].CopyFrom(
          parameter_type_spec)
    if self._name not in self._component_defs:
      self._component_defs[self._name] = component_def
    else:
      raise ValueError(f'Found duplicate component ids {self._name} while '
                       'building component definitions.')

    # 3. Build task spec.
    task_spec = pipeline_pb2.PipelineTaskSpec()
    task_spec.task_info.name = self._name
    dependency_ids = [node.id for node in self._node.upstream_nodes]
    for name, input_channel in self._inputs.items():
      # If the redirect map is provided (usually for the latest blessed model
      # resolver), we'll need to redirect accordingly. Also, the upstream node
      # list is updated to reference the new producer id.
      producer_id = input_channel.producer_component_id
      output_key = input_channel.output_key
      for k, v in self._channel_redirect_map.items():
        if k[0] == producer_id and producer_id in dependency_ids:
          dependency_ids.remove(producer_id)
          dependency_ids.append(v[0])
      # Resolve the redirected (producer id, output key) pair in a single
      # lookup, keyed by the original pair.
      producer_id, output_key = self._channel_redirect_map.get(
          (producer_id, output_key), (producer_id, output_key))
      input_artifact_spec = pipeline_pb2.TaskInputsSpec.InputArtifactSpec()
      input_artifact_spec.task_output_artifact.producer_task = producer_id
      input_artifact_spec.task_output_artifact.output_artifact_key = output_key
      task_spec.inputs.artifacts[name].CopyFrom(input_artifact_spec)
    for name, value in self._exec_properties.items():
      if value is None:
        continue
      if isinstance(value, data_types.RuntimeParameter):
        parameter_utils.attach_parameter(value)
        task_spec.inputs.parameters[name].component_input_parameter = value.name
      else:
        task_spec.inputs.parameters[name].CopyFrom(
            pipeline_pb2.TaskInputsSpec.InputParameterSpec(
                runtime_value=compiler_utils.value_converter(value)))

    task_spec.component_ref.name = self._name

    dependency_ids = sorted(dependency_ids)
    for dependency in dependency_ids:
      task_spec.dependent_tasks.append(dependency)

    if self._enable_cache:
      task_spec.caching_options.CopyFrom(
          pipeline_pb2.PipelineTaskSpec.CachingOptions(
              enable_cache=self._enable_cache))

    # 4. Build the executor body for other common tasks.
    executor = pipeline_pb2.PipelineDeploymentConfig.ExecutorSpec()
    if isinstance(self._node, importer.Importer):
      executor.importer.CopyFrom(self._build_importer_spec())
    elif isinstance(self._node, components.FileBasedExampleGen):
      executor.container.CopyFrom(self._build_file_based_example_gen_spec())
    elif isinstance(self._node, components.InfraValidator):
      raise NotImplementedError(
          'The component type "{}" is not supported'.format(type(self._node)))
    else:
      executor.container.CopyFrom(self._build_container_spec())
    self._deployment_config.executors[executor_label].CopyFrom(executor)

    return {self._name: task_spec}
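
A self-contained sketch of the channel redirection applied in step 3, using made-up node ids: an input that originally points at a redirected producer is rewired to the new producer, and the dependency list is updated to match.

# Map from (producer component id, output key) to (new producer component id,
# output key); the entries are illustrative only.
channel_redirect_map = {
    ('latest_blessed_model_resolver', 'model'):
        ('latest_blessed_model_resolver-model-resolver', 'model'),
}
dependency_ids = ['example_gen', 'latest_blessed_model_resolver']

producer_id, output_key = 'latest_blessed_model_resolver', 'model'
new_producer_id, new_output_key = channel_redirect_map.get(
    (producer_id, output_key), (producer_id, output_key))
if new_producer_id != producer_id and producer_id in dependency_ids:
  dependency_ids.remove(producer_id)
  dependency_ids.append(new_producer_id)
producer_id, output_key = new_producer_id, new_output_key

print(producer_id)             # latest_blessed_model_resolver-model-resolver
print(sorted(dependency_ids))  # ['example_gen', 'latest_blessed_model_resolver-model-resolver']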
Example #5
    def build(self) -> pipeline_pb2.PipelineSpec:
        """Build a pipeline PipelineSpec."""

        _check_name(self._pipeline_info.pipeline_name)

        deployment_config = pipeline_pb2.PipelineDeploymentConfig()
        pipeline_info = pipeline_pb2.PipelineInfo(
            name=self._pipeline_info.pipeline_name)

        tfx_tasks = {}
        component_defs = {}
        # Map from (producer component id, output key) to (new producer component
        # id, output key)
        channel_redirect_map = {}
        with parameter_utils.ParameterContext() as pc:
            for component in self._pipeline.components:
                if self._exit_handler and component.id == compiler_utils.TFX_DAG_NAME:
                    component.with_id(component.id +
                                      _generate_component_name_suffix())
                    logging.warning(
                        '_tfx_dag is a system-reserved name for pipelines '
                        'with an exit handler; added a suffix to your '
                        'component name: %s', component.id)
                # Components must be visited in topological order here: if a
                # channel redirection is needed, the redirect mapping is
                # already available because the upstream node (the cause of
                # the redirect) is processed before its downstream consumers.
                built_tasks = step_builder.StepBuilder(
                    node=component,
                    deployment_config=deployment_config,
                    component_defs=component_defs,
                    image=self._default_image,
                    image_cmds=self._default_commands,
                    beam_pipeline_args=self._pipeline.beam_pipeline_args,
                    enable_cache=self._pipeline.enable_cache,
                    pipeline_info=self._pipeline_info,
                    channel_redirect_map=channel_redirect_map).build()
                tfx_tasks.update(built_tasks)

        result = pipeline_pb2.PipelineSpec(pipeline_info=pipeline_info)

        # If an exit handler is defined, put all the TFX tasks under tfx_dag;
        # the exit handler is a separate component triggered after tfx_dag.
        if self._exit_handler:
            for name, task_spec in tfx_tasks.items():
                result.components[compiler_utils.TFX_DAG_NAME].dag.tasks[
                    name].CopyFrom(task_spec)
            # Construct the root DAG with the exit handler task.
            exit_handler_task = step_builder.StepBuilder(
                node=self._exit_handler,
                deployment_config=deployment_config,
                component_defs=component_defs,
                image=self._default_image,
                image_cmds=self._default_commands,
                beam_pipeline_args=self._pipeline.beam_pipeline_args,
                enable_cache=False,
                pipeline_info=self._pipeline_info,
                channel_redirect_map=channel_redirect_map,
                is_exit_handler=True).build()
            tfx_dag_task = result.root.dag.tasks[compiler_utils.TFX_DAG_NAME]
            tfx_dag_task.component_ref.name = compiler_utils.TFX_DAG_NAME
            tfx_dag_task.task_info.name = compiler_utils.TFX_DAG_NAME
            result.root.dag.tasks[self._exit_handler.id].CopyFrom(
                exit_handler_task[self._exit_handler.id])
        else:
            for name, task_spec in tfx_tasks.items():
                result.root.dag.tasks[name].CopyFrom(task_spec)

        result.deployment_spec.update(
            json_format.MessageToDict(deployment_config))
        for name, component_def in component_defs.items():
            result.components[name].CopyFrom(component_def)

        # Attach runtime parameters to the root's input parameters.
        for param in pc.parameters:
            result.root.input_definitions.parameters[param.name].CopyFrom(
                compiler_utils.build_parameter_type_spec(param))

        return result
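
Roughly the shape of the root DAG this produces when an exit handler is set, written out as a plain dict for illustration. The '_tfx_dag' name matches the reserved compiler_utils.TFX_DAG_NAME mentioned in the warning above; 'my_exit_handler' and the field names are placeholders for the corresponding PipelineSpec proto fields.

root_dag_tasks = {
    '_tfx_dag': {
        # Sub-DAG component that holds every regular TFX task.
        'component_ref': '_tfx_dag',
        'task_info': {'name': '_tfx_dag'},
    },
    'my_exit_handler': {
        # Built with is_exit_handler=True; runs once the whole TFX DAG has
        # finished, whether it succeeded or failed.
        'trigger_strategy': 'ALL_UPSTREAM_TASKS_COMPLETED',
        'dependent_tasks': ['_tfx_dag'],
    },
}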
Example #6
  def build(self) -> Dict[str, pipeline_pb2.PipelineTaskSpec]:
    """Builds a pipeline PipelineTaskSpec given the node information.

    Each TFX node maps to one task spec and, usually, one component definition
    and one executor spec (resolver nodes are an exception; see the explanation
    in the Returns section).

     - Component definition: describes the node's interface, for example the
       names and types of its inputs, outputs and execution properties.
     - Task spec: captures the topology around the node, for example its
       dependency nodes and where the inputs and exec_properties are read from
       (another task, the parent component, or a constant value). The task spec
       carries the name of the component definition it references; a task spec
       may reference a component definition that was built previously.
     - Executor spec: encodes how the node is actually executed, for example
       the args used to start a container, or the query strings for resolvers.
       All executor specs are packed into the deployment config proto.

    During the build, all three parts mentioned above will be updated.

    Returns:
      A Dict mapping node ids to the PipelineTaskSpec messages corresponding to
      the node. In most cases the dict contains a single element. The only
      exception is the latest blessed model resolver, where one DSL node is
      split into two resolver specs to reflect the two-phased query execution.

    Raises:
      NotImplementedError: When the node being built is an InfraValidator.
    """
    # 1. Resolver tasks won't have input artifacts in the API proto. First we
    #    special-case the two resolver types we support.
    if isinstance(self._node, resolver.Resolver):
      return self._build_resolver_spec()

    # 2. Build component spec.
    component_def = pipeline_pb2.ComponentSpec()
    task_spec = pipeline_pb2.PipelineTaskSpec()
    executor_label = _EXECUTOR_LABEL_PATTERN.format(self._name)
    component_def.executor_label = executor_label

    # Conditionals
    implicit_input_channels = {}
    implicit_upstream_node_ids = set()
    predicates = conditional.get_predicates(self._node)
    if predicates:
      implicit_keys_map = {
          tfx_compiler_utils.implicit_channel_key(channel): key
          for key, channel in self._inputs.items()
      }
      cel_predicates = []
      for predicate in predicates:
        for channel in predicate.dependent_channels():
          implicit_key = tfx_compiler_utils.implicit_channel_key(channel)
          if implicit_key not in implicit_keys_map:
            # Store this channel and add it to the node inputs later.
            implicit_input_channels[implicit_key] = channel
            # Store the producer node and add it to the upstream nodes later.
            implicit_upstream_node_ids.add(channel.producer_component_id)
        placeholder_pb = predicate.encode_with_keys(
            tfx_compiler_utils.build_channel_to_key_fn(implicit_keys_map))
        cel_predicates.append(compiler_utils.placeholder_to_cel(placeholder_pb))
      task_spec.trigger_policy.condition = ' && '.join(cel_predicates)

    # Inputs
    for name, input_channel in itertools.chain(self._inputs.items(),
                                               implicit_input_channels.items()):
      input_artifact_spec = compiler_utils.build_input_artifact_spec(
          input_channel)
      component_def.input_definitions.artifacts[name].CopyFrom(
          input_artifact_spec)
    # Outputs
    for name, output_channel in self._outputs.items():
      # Currently, we're working under the assumption that for tasks
      # (those generated by BaseComponent), each channel contains a single
      # artifact.
      output_artifact_spec = compiler_utils.build_output_artifact_spec(
          output_channel)
      component_def.output_definitions.artifacts[name].CopyFrom(
          output_artifact_spec)
    # Exec properties
    for name, value in self._exec_properties.items():
      # value can be None for unprovided optional exec properties.
      if value is None:
        continue
      parameter_type_spec = compiler_utils.build_parameter_type_spec(value)
      component_def.input_definitions.parameters[name].CopyFrom(
          parameter_type_spec)
    if self._name not in self._component_defs:
      self._component_defs[self._name] = component_def
    else:
      raise ValueError(f'Found duplicate component ids {self._name} while '
                       'building component definitions.')

    # 3. Build task spec.
    task_spec.task_info.name = self._name
    dependency_ids = sorted({node.id for node in self._node.upstream_nodes}
                            | implicit_upstream_node_ids)

    for name, input_channel in itertools.chain(self._inputs.items(),
                                               implicit_input_channels.items()):
      # TODO(b/169573945): Add support for vertex if requested.
      if not isinstance(input_channel, Channel):
        raise TypeError('Only single Channel is supported.')
      if self._is_exit_handler:
        logging.error('The exit handler component does not take input '
                      'artifacts; the input will be ignored.')
        continue
      # If the redirect map is provided (usually for the latest blessed model
      # resolver), we'll need to redirect accordingly. Also, the upstream node
      # list is updated to reference the new producer id.
      producer_id = input_channel.producer_component_id
      output_key = input_channel.output_key
      for k, v in self._channel_redirect_map.items():
        if k[0] == producer_id and producer_id in dependency_ids:
          dependency_ids.remove(producer_id)
          dependency_ids.append(v[0])
      # Resolve the redirected (producer id, output key) pair in a single
      # lookup, keyed by the original pair.
      producer_id, output_key = self._channel_redirect_map.get(
          (producer_id, output_key), (producer_id, output_key))
      input_artifact_spec = pipeline_pb2.TaskInputsSpec.InputArtifactSpec()
      input_artifact_spec.task_output_artifact.producer_task = producer_id
      input_artifact_spec.task_output_artifact.output_artifact_key = output_key
      task_spec.inputs.artifacts[name].CopyFrom(input_artifact_spec)
    for name, value in self._exec_properties.items():
      if value is None:
        continue
      if isinstance(value, data_types.RuntimeParameter):
        parameter_utils.attach_parameter(value)
        task_spec.inputs.parameters[name].component_input_parameter = value.name
      elif isinstance(value, decorators.FinalStatusStr):
        if not self._is_exit_handler:
          logging.error('FinalStatusStr is only allowed in an exit handler; '
                        'the parameter is ignored.')
        else:
          task_spec.inputs.parameters[name].task_final_status.producer_task = (
              compiler_utils.TFX_DAG_NAME)
      else:
        task_spec.inputs.parameters[name].CopyFrom(
            pipeline_pb2.TaskInputsSpec.InputParameterSpec(
                runtime_value=compiler_utils.value_converter(value)))

    task_spec.component_ref.name = self._name

    dependency_ids = sorted(dependency_ids)
    for dependency in dependency_ids:
      task_spec.dependent_tasks.append(dependency)

    if self._enable_cache:
      task_spec.caching_options.CopyFrom(
          pipeline_pb2.PipelineTaskSpec.CachingOptions(
              enable_cache=self._enable_cache))

    if self._is_exit_handler:
      task_spec.trigger_policy.strategy = (
          pipeline_pb2.PipelineTaskSpec.TriggerPolicy
          .ALL_UPSTREAM_TASKS_COMPLETED)
      task_spec.dependent_tasks.append(compiler_utils.TFX_DAG_NAME)

    # 4. Build the executor body for other common tasks.
    executor = pipeline_pb2.PipelineDeploymentConfig.ExecutorSpec()
    if isinstance(self._node, importer.Importer):
      executor.importer.CopyFrom(self._build_importer_spec())
    elif isinstance(self._node, components.FileBasedExampleGen):
      executor.container.CopyFrom(self._build_file_based_example_gen_spec())
    elif isinstance(self._node, components.InfraValidator):
      raise NotImplementedError(
          'The component type "{}" is not supported'.format(type(self._node)))
    else:
      executor.container.CopyFrom(self._build_container_spec())
    self._deployment_config.executors[executor_label].CopyFrom(executor)

    return {self._name: task_spec}
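
For illustration, the trigger-policy condition assembled in the "Conditionals" block above is simply the AND of the per-predicate CEL expressions; the strings below are made-up placeholders for whatever compiler_utils.placeholder_to_cel actually emits.

cel_predicates = [
    "(inputs.artifacts['_evaluator.blessing'].artifacts[0]"
    ".custom_properties['blessed'].int_value == 1)",
    "(inputs.parameters['eval_accuracy_threshold'].double_value > 0.5)",
]
condition = ' && '.join(cel_predicates)
print(condition)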