Exemplo n.º 1
0
  def setUp(self):
    """Prepares a CsvExampleGen -> StatisticsGen pipeline wrapped for KFP.

    The split's `hash_buckets` value is a RuntimeParameter and the pipeline
    root handed to every BaseComponent is a dsl.PipelineParam. Both wrapped
    components, plus the underlying TFX components, are stored on the test
    instance.
    """
    super(BaseComponentWithPipelineParamTest, self).setUp()

    root_param = dsl.PipelineParam(name='pipeline-root-param')
    buckets_param = data_types.RuntimeParameter(
        name='example-gen-buckets', ptype=int, default=10)

    external_examples = standard_artifacts.ExternalArtifact()
    input_channel = channel_utils.as_channel([external_examples])
    single_split = {'name': 'examples', 'hash_buckets': buckets_param}
    example_gen = csv_example_gen_component.CsvExampleGen(
        input=input_channel,
        output_config={'split_config': {'splits': [single_split]}})
    statistics_gen = statistics_gen_component.StatisticsGen(
        examples=example_gen.outputs['examples'], instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name=self._test_pipeline_name,
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    metadata_config.mysql_db_service_host.environment_variable = (
        'MYSQL_SERVICE_HOST')
    self._metadata_config = metadata_config
    self._tfx_ir = pipeline_pb2.Pipeline()

    def wrap(node):
      # Every wrapper receives the same settings; only the node differs.
      # A fresh `depends_on` set is created on each call.
      return base_component.BaseComponent(
          component=node,
          component_launcher_class=in_process_component_launcher
          .InProcessComponentLauncher,
          depends_on=set(),
          pipeline=pipeline,
          pipeline_name=self._test_pipeline_name,
          pipeline_root=root_param,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
          component_config=None,
          tfx_ir=self._tfx_ir)

    with dsl.Pipeline('test_pipeline'):
      self.example_gen = wrap(example_gen)
      self.statistics_gen = wrap(statistics_gen)

    self.tfx_example_gen = example_gen
    self.tfx_statistics_gen = statistics_gen
Exemplo n.º 2
0
    def setUp(self):
        """Prepares a CsvExampleGen -> StatisticsGen pipeline wrapped for KFP.

        The split name given to CsvExampleGen is a RuntimeStringParameter and
        the pipeline root handed to every BaseComponent is a
        dsl.PipelineParam. Both wrapped components, plus the underlying TFX
        components, are stored on the test instance.
        """
        super(BaseComponentWithPipelineParamTest, self).setUp()

        root_param = dsl.PipelineParam(name='pipeline-root-param')
        split_name_param = runtime_string_parameter.RuntimeStringParameter(
            name='example-gen-output-name', default='default-to-be-discarded')

        external_examples = standard_artifacts.ExternalArtifact()
        input_channel = channel_utils.as_channel([external_examples])
        single_split = example_gen_pb2.SplitConfig.Split(
            name=split_name_param, hash_buckets=10)
        split_config = example_gen_pb2.SplitConfig(splits=[single_split])
        example_gen = csv_example_gen_component.CsvExampleGen(
            input=input_channel,
            output_config=example_gen_pb2.Output(split_config=split_config))
        statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples'], instance_name='foo')

        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        metadata_config.mysql_db_service_host.environment_variable = (
            'MYSQL_SERVICE_HOST')
        self._metadata_config = metadata_config

        def wrap(node):
            # Every wrapper receives the same settings; only the node differs.
            # A fresh `depends_on` set is created on each call.
            return base_component.BaseComponent(
                component=node,
                component_launcher_class=in_process_component_launcher.
                InProcessComponentLauncher,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_name=self._test_pipeline_name,
                pipeline_root=root_param,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                component_config=None)

        with dsl.Pipeline('test_pipeline'):
            self.example_gen = wrap(example_gen)
            self.statistics_gen = wrap(statistics_gen)

        self.tfx_example_gen = example_gen
        self.tfx_statistics_gen = statistics_gen
Exemplo n.º 3
0
  def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline):
    """Creates one BaseComponent per TFX component, linked to upstream ops.

    Args:
      pipeline: The logical TFX pipeline to base the construction on.
    """
    # Container op created so far for each TFX component.
    ops_by_component = {}

    # The lookup below relies on pipeline.components being ordered so that a
    # component always appears after every component it depends on.
    for node in pipeline.components:
      # Container ops of the components this node depends on.
      upstream_ops = {
          ops_by_component[upstream] for upstream in node.upstream_nodes
      }

      wrapped = base_component.BaseComponent(
          component=node,
          depends_on=upstream_ops,
          pipeline=pipeline,
          tfx_image=self._config.tfx_image,
          kubeflow_metadata_config=self._config.kubeflow_metadata_config)

      for operator_func in self._config.pipeline_operator_funcs:
        wrapped.container_op.apply(operator_func)

      ops_by_component[node] = wrapped.container_op
Exemplo n.º 4
0
  def setUp(self):
    """Wraps a StatisticsGen node of a two-component pipeline for KFP.

    Stores the BaseComponent wrapper as `self.component` and the wrapped TFX
    component as `self.tfx_component`.
    """
    super(BaseComponentTest, self).setUp()

    external_examples = standard_artifacts.ExternalArtifact()
    input_channel = channel_utils.as_channel([external_examples])
    example_gen = csv_example_gen_component.CsvExampleGen(
        input_base=input_channel)
    statistics_gen = statistics_gen_component.StatisticsGen(
        input_data=example_gen.outputs.examples, instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name='test_pipeline',
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    metadata_config.mysql_db_service_host.environment_variable = (
        'MYSQL_SERVICE_HOST')
    self._metadata_config = metadata_config

    # The wrapper is created while a dsl.Pipeline context is active.
    with dsl.Pipeline('test_pipeline'):
      wrapped = base_component.BaseComponent(
          component=statistics_gen,
          depends_on=set(),
          pipeline=pipeline,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
      )
      self.component = wrapped
    self.tfx_component = statistics_gen
Exemplo n.º 5
0
  def setUp(self):
    """Wraps a StatisticsGen node using a PipelineParam as pipeline root.

    Stores the BaseComponent wrapper as `self.component` and the wrapped TFX
    component as `self.tfx_component`.
    """
    super(BaseComponentTest, self).setUp()

    external_examples = standard_artifacts.ExternalArtifact()
    input_channel = channel_utils.as_channel([external_examples])
    example_gen = csv_example_gen_component.CsvExampleGen(input=input_channel)
    statistics_gen = statistics_gen_component.StatisticsGen(
        examples=example_gen.outputs['examples'], instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name=self._test_pipeline_name,
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    root_param = dsl.PipelineParam(name='pipeline-root-param')

    metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    metadata_config.mysql_db_service_host.environment_variable = (
        'MYSQL_SERVICE_HOST')
    self._metadata_config = metadata_config

    # The wrapper is created while a dsl.Pipeline context is active.
    with dsl.Pipeline('test_pipeline'):
      wrapped = base_component.BaseComponent(
          component=statistics_gen,
          component_launcher_class=in_process_component_launcher
          .InProcessComponentLauncher,
          depends_on=set(),
          pipeline=pipeline,
          pipeline_name=self._test_pipeline_name,
          pipeline_root=root_param,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
          component_config=None,
      )
      self.component = wrapped
    self.tfx_component = statistics_gen
Exemplo n.º 6
0
    def setUp(self):
        """Wraps a StatisticsGen node (id 'foo') together with an empty IR.

        Stores the BaseComponent wrapper as `self.component` and the wrapped
        TFX component as `self.tfx_component`.
        """
        super(BaseComponentTest, self).setUp()

        example_gen = csv_example_gen_component.CsvExampleGen(
            input_base='data_input')
        unnamed_statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples'])
        statistics_gen = unnamed_statistics_gen.with_id('foo')

        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        root_param = dsl.PipelineParam(name='pipeline-root-param')

        metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        metadata_config.mysql_db_service_host.environment_variable = (
            'MYSQL_SERVICE_HOST')
        self._metadata_config = metadata_config
        self._tfx_ir = pipeline_pb2.Pipeline()

        # The wrapper is created while a dsl.Pipeline context is active.
        with dsl.Pipeline('test_pipeline'):
            wrapped = base_component.BaseComponent(
                component=statistics_gen,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_root=root_param,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                tfx_ir=self._tfx_ir,
            )
            self.component = wrapped
        self.tfx_component = statistics_gen
Exemplo n.º 7
0
    def setUp(self):
        """Prepares two wrapped components, one carrying a runtime parameter.

        CsvExampleGen takes its `output_config` from a RuntimeParameter, which
        is also passed to its BaseComponent wrapper; the StatisticsGen wrapper
        receives an empty runtime-parameter list.
        """
        super().setUp()

        output_config_param = data_types.RuntimeParameter(
            name='example-gen-output-config', ptype=str)

        example_gen = csv_example_gen_component.CsvExampleGen(
            input_base='data_root', output_config=output_config_param)
        unnamed_statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples'])
        statistics_gen = unnamed_statistics_gen.with_id('foo')

        root_param = dsl.PipelineParam(name='pipeline-root-param')
        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        metadata_config.mysql_db_service_host.environment_variable = (
            'MYSQL_SERVICE_HOST')
        self._metadata_config = metadata_config
        self._tfx_ir = pipeline_pb2.Pipeline()

        def wrap(node, runtime_parameters):
            # Only the node and its runtime parameters differ per wrapper.
            # Fresh `depends_on` and label containers are created per call.
            return base_component.BaseComponent(
                component=node,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_root=root_param,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                tfx_ir=self._tfx_ir,
                pod_labels_to_attach={},
                runtime_parameters=runtime_parameters)

        with dsl.Pipeline('test_pipeline'):
            self.example_gen = wrap(example_gen, [output_config_param])
            self.statistics_gen = wrap(statistics_gen, [])

        self.tfx_example_gen = example_gen
        self.tfx_statistics_gen = statistics_gen
Exemplo n.º 8
0
    def setUp(self):
        """Creates `self.component` from literal inputs, outputs and properties."""
        examples_path_type = types.TfxType(type_name='ExamplesPath')
        outputs = {'output_name': [examples_path_type]}
        inputs = {
            'input_data': 'input-data-contents',
            'train_steps': 300,
            'accuracy_threshold': 0.3,
        }
        properties = {'module_file': '/path/to/module.py'}

        # The component is created while a dsl.Pipeline context is active.
        with dsl.Pipeline('test_pipeline'):
            self.component = base_component.BaseComponent(
                component_name='TFXComponent',
                input_dict=inputs,
                output_dict=outputs,
                exec_properties=properties,
            )
Exemplo n.º 9
0
    def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline,
                                  pipeline_root: dsl.PipelineParam):
        """Constructs a Kubeflow Pipeline graph.

        For every TFX component a BaseComponent is created, the configured
        operator functions are applied to its container op, and the SDK
        environment and pipeline UUID pod labels are attached.

        Args:
          pipeline: The logical TFX pipeline to base the construction on.
          pipeline_root: dsl.PipelineParam representing the pipeline root.

        Raises:
          AssertionError: If `self._pipeline_id` is falsy when a component's
            pod is about to be labelled.
        """
        component_to_kfp_op = {}

        # Assumption: There is a partial ordering of components in the list,
        # i.e., if component A depends on component B and C, then A appears
        # after B and C in the list.
        for component in pipeline.components:
            # Keep track of the set of upstream dsl.ContainerOps for this
            # component.
            depends_on = {
                component_to_kfp_op[upstream_component]
                for upstream_component in component.upstream_nodes
            }

            (component_launcher_class,
             component_config) = config_utils.find_component_launch_info(
                 self._config, component)

            kfp_component = base_component.BaseComponent(
                component=component,
                component_launcher_class=component_launcher_class,
                depends_on=depends_on,
                pipeline=pipeline,
                pipeline_name=pipeline.pipeline_info.pipeline_name,
                pipeline_root=pipeline_root,
                tfx_image=self._config.tfx_image,
                kubeflow_metadata_config=self._config.kubeflow_metadata_config,
                component_config=component_config)

            for operator in self._config.pipeline_operator_funcs:
                kfp_component.container_op.apply(operator)

            kfp_component.container_op.add_pod_label(SDK_ENV_LABEL,
                                                     self._sdk_env)
            # Raised explicitly instead of using `assert`, which is removed
            # when Python runs with -O and would let a missing ID through.
            # The exception type and message are unchanged.
            if not self._pipeline_id:
                raise AssertionError('Failed to generate pipeline ID.')
            kfp_component.container_op.add_pod_label(PIPELINE_UUID_LABEL,
                                                     self._pipeline_id)

            component_to_kfp_op[component] = kfp_component.container_op
Exemplo n.º 10
0
    def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline,
                                  pipeline_root: dsl.PipelineParam):
        """Creates one BaseComponent per TFX component, linked to upstream ops.

        Each wrapper receives a per-node IR obtained by passing the IR
        generated for the whole pipeline through `_dehydrate_tfx_ir`.

        Args:
          pipeline: The logical TFX pipeline to base the construction on.
          pipeline_root: dsl.PipelineParam representing the pipeline root.
        """
        # Container op created so far for each TFX component.
        ops_by_component = {}
        full_ir = self._generate_tfx_ir(pipeline)

        # The lookup below relies on pipeline.components being ordered so that
        # a component always appears after every component it depends on.
        for node in pipeline.components:
            # Container ops of the components this node depends on.
            upstream_ops = {
                ops_by_component[upstream] for upstream in node.upstream_nodes
            }

            # Strip the extra pipeline node information for this node.
            node_ir = self._dehydrate_tfx_ir(full_ir, node.id)

            wrapped = base_component.BaseComponent(
                component=node,
                depends_on=upstream_ops,
                pipeline=pipeline,
                pipeline_root=pipeline_root,
                tfx_image=self._config.tfx_image,
                kubeflow_metadata_config=self._config.kubeflow_metadata_config,
                pod_labels_to_attach=self._pod_labels_to_attach,
                tfx_ir=node_ir,
                metadata_ui_path=self._config.metadata_ui_path,
                runtime_parameters=(
                    self._params_by_component_id[node.id] +
                    [tfx_pipeline.ROOT_PARAMETER]))

            for operator_func in self._config.pipeline_operator_funcs:
                wrapped.container_op.apply(operator_func)

            ops_by_component[node] = wrapped.container_op
Exemplo n.º 11
0
    def _construct_pipeline_graph(self, pipeline):
        """Creates one BaseComponent per TFX component, resolving its inputs.

        An input whose channel was produced by an earlier component is
        replaced by that component's matching output; any other input is
        serialized to JSON from the channel's contents.

        Args:
          pipeline: The logical TFX pipeline to base the construction on.

        Raises:
          ValueError: If a resolved upstream output is not a
            dsl.PipelineParam.
        """
        # Maps an output Channel to the Kubeflow component that produces it
        # and the name of that output. Channels are assumed to be unique
        # within a pipeline.
        producers = {}

        # The lookup below relies on pipeline.components being ordered so that
        # a component always appears after every component it depends on.
        for node in pipeline.components:
            resolved_inputs = {}
            for key, channel in node.input_dict.items():
                if channel not in producers:
                    resolved_inputs[key] = json.dumps(
                        [item.json_dict() for item in channel.get()])
                    continue

                upstream_output = getattr(
                    producers[channel]['component'].outputs,
                    producers[channel]['channel_name'])
                if not isinstance(upstream_output, dsl.PipelineParam):
                    raise ValueError(
                        'Component outputs should be of type dsl.PipelineParam.'
                        ' Got type {} for output {}'.format(
                            type(upstream_output), upstream_output))
                resolved_inputs[key] = upstream_output

            wrapped = base_component.BaseComponent(
                component_name=node.component_name,
                input_dict=resolved_inputs,
                output_dict=self._prepare_output_dict(node.outputs),
                exec_properties=node.exec_properties)

            for output_name, output_channel in node.outputs.get_all().items():
                producers[output_channel] = {
                    'component': wrapped,
                    'channel_name': output_name,
                }
Exemplo n.º 12
0
    def setUp(self):
        """Creates `self.component` from ordered literal inputs and properties.

        The output dict and pipeline properties are kept on the instance as
        `self._output_dict` and `self._pipeline_properties`.
        """
        self._output_dict = {'output_name': [standard_artifacts.Examples()]}
        self._pipeline_properties = base_component.PipelineProperties(
            output_dir='output_dir',
            log_root='log_root',
        )

        # The component is created while a dsl.Pipeline context is active.
        with dsl.Pipeline('test_pipeline'):
            ordered_inputs = collections.OrderedDict([
                ('input_data', 'input-data-contents'),
                ('train_steps', 300),
                ('accuracy_threshold', 0.3),
            ])
            ordered_properties = collections.OrderedDict([
                ('module_file', '/path/to/module.py')
            ])
            self.component = base_component.BaseComponent(
                component_name='TFXComponent',
                input_dict=ordered_inputs,
                output_dict=self._output_dict,
                exec_properties=ordered_properties,
                executor_class_path='some.executor.Class',
                pipeline_properties=self._pipeline_properties,
            )
Exemplo n.º 13
0
    def _construct_pipeline_graph(self, pipeline):
        """Creates one BaseComponent per TFX component, resolving its inputs.

        Pipeline-wide settings are read from `pipeline.pipeline_args` and
        shared by all components through a single PipelineProperties object.
        An input whose channel was produced by an earlier component is
        replaced by that component's matching output; any other input is
        serialized to JSON from the channel's contents.

        Args:
          pipeline: The logical TFX pipeline to base the construction on.

        Raises:
          ValueError: If a resolved upstream output is not a
            dsl.PipelineParam.
        """
        output_dir = os.path.join(pipeline.pipeline_args['pipeline_root'],
                                  pipeline.pipeline_args['pipeline_name'])

        # Optional settings stay at these defaults unless
        # 'additional_pipeline_args' is present.
        beam_pipeline_args, tfx_image = [], None
        if 'additional_pipeline_args' in pipeline.pipeline_args:
            extra_args = pipeline.pipeline_args['additional_pipeline_args']
            beam_pipeline_args = extra_args.get('beam_pipeline_args', [])
            tfx_image = extra_args.get('tfx_image')

        shared_properties = base_component.PipelineProperties(
            output_dir=output_dir,
            log_root=pipeline.pipeline_args['log_root'],
            beam_pipeline_args=beam_pipeline_args,
            tfx_image=tfx_image,
        )

        # Maps an output Channel to the Kubeflow component that produces it
        # and the name of that output. Channels are assumed to be unique
        # within a pipeline.
        producers = {}

        # The lookup below relies on pipeline.components being ordered so that
        # a component always appears after every component it depends on.
        for node in pipeline.components:
            resolved_inputs = {}
            for key, channel in node.input_dict.items():
                if channel not in producers:
                    resolved_inputs[key] = json.dumps(
                        [item.json_dict() for item in channel.get()])
                    continue

                upstream_output = getattr(
                    producers[channel]['component'].outputs,
                    producers[channel]['channel_name'])
                if not isinstance(upstream_output, dsl.PipelineParam):
                    raise ValueError(
                        'Component outputs should be of type dsl.PipelineParam.'
                        ' Got type {} for output {}'.format(
                            type(upstream_output), upstream_output))
                resolved_inputs[key] = upstream_output

            # Dotted import path of the component's executor class.
            executor_path = '.'.join(
                (node.executor.__module__, node.executor.__name__))
            wrapped = base_component.BaseComponent(
                component_name=node.component_name,
                input_dict=resolved_inputs,
                output_dict=self._prepare_output_dict(node.outputs),
                exec_properties=node.exec_properties,
                executor_class_path=executor_path,
                pipeline_properties=shared_properties)

            for output_name, output_channel in node.outputs.get_all().items():
                producers[output_channel] = {
                    'component': wrapped,
                    'channel_name': output_name,
                }