Example #1
 def testConstructWithRuntimeParam(self):
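     # eval_args and custom_config are passed as RuntimeParameters; the
     # component should keep them as RuntimeParameter objects in exec_properties.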
     eval_args = data_types.RuntimeParameter(
         name='eval-args',
         default='{"num_steps": 50}',
         ptype=str,
     )
     custom_config = data_types.RuntimeParameter(
         name='custom-config',
         default='{"test": 10}',
         ptype=str,
     )
     trainer = component.Trainer(trainer_fn='path.to.my_trainer_fn',
                                 examples=self.examples,
                                 train_args=self.train_args,
                                 eval_args=eval_args,
                                 custom_config=custom_config)
     self._verify_outputs(trainer)
     self.assertIsInstance(
         trainer.spec.exec_properties[
             standard_component_specs.EVAL_ARGS_KEY],
         data_types.RuntimeParameter)
     self.assertIsInstance(
         trainer.spec.exec_properties[
             standard_component_specs.CUSTOM_CONFIG_KEY],
         data_types.RuntimeParameter)
Example #2
    def testTypeCheckWithRuntimeParameter(self):
        class SimpleComponentSpec(ComponentSpec):
            INPUTS = {}
            OUTPUTS = {}
            PARAMETERS = {
                'x': ExecutionParameter(type=int),
                'y': ExecutionParameter(type=int, optional=True),
            }

        parameter_int = data_types.RuntimeParameter(name='int', ptype=int)
        parameter_str = data_types.RuntimeParameter(name='str', ptype=Text)
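        # A RuntimeParameter should pass type checking when its ptype matches
        # the declared ExecutionParameter type, and fail otherwise.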

        _ = SimpleComponentSpec(x=parameter_int)
        with self.assertRaisesRegexp(TypeError, 'Expected type'):
            _ = SimpleComponentSpec(x=42, y=parameter_str)

        class ComponentSpecWithContainer(ComponentSpec):
            INPUTS = {}
            OUTPUTS = {}
            PARAMETERS = {
                'x': ExecutionParameter(type=Dict[Text, Text]),
                'y': ExecutionParameter(type=List[int]),
            }

        _ = ComponentSpecWithContainer(x={u'key': parameter_str},
                                       y=[parameter_int])
        with self.assertRaisesRegexp(TypeError, 'Expecting value type'):
            _ = ComponentSpecWithContainer(x={u'key': parameter_int}, y=[])
Example #3
    def from_config(cls, config: Config):
        data_root_runtime = data_types.RuntimeParameter(
            'data_root', ptype=str, default=config.DATA_ROOT_URI)
        train_steps_runtime = data_types.RuntimeParameter(
            name='train-steps', ptype=int, default=int(config.TRAIN_STEPS))
        eval_steps_runtime = data_types.RuntimeParameter(
            name='eval-steps', default=int(config.EVAL_STEPS), ptype=int)

        return cls(data_root_runtime=data_root_runtime,
                   train_steps_runtime=train_steps_runtime,
                   eval_steps_runtime=eval_steps_runtime)
Example #4
 def _testAttachParametersInSingleThread(self, suffix: Text):
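     # Parameters attached while the ParameterContext is active should be
     # recorded on the context object in attachment order.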
     with parameter_utils.ParameterContext() as pc:
         parameter_utils.attach_parameter(
             data_types.RuntimeParameter(name='param1_in_{}'.format(suffix),
                                         ptype=int))
         parameter_utils.attach_parameter(
             data_types.RuntimeParameter(name='param2_in_{}'.format(suffix),
                                         ptype=int))
     self.assertLen(pc.parameters, 2)
     self.assertEqual(pc.parameters[0].name, 'param1_in_{}'.format(suffix))
     self.assertEqual(pc.parameters[1].name, 'param2_in_{}'.format(suffix))
Example #5
 def testConstructWithParameter(self):
   module_file = data_types.RuntimeParameter(name='module-file', ptype=Text)
   n_steps = data_types.RuntimeParameter(name='n-steps', ptype=int)
   trainer = component.Trainer(
       module_file=module_file,
       transformed_examples=self.examples,
       transform_graph=self.transform_output,
       schema=self.schema,
       train_args=dict(num_steps=n_steps),
       eval_args=dict(num_steps=n_steps))
   self._verify_outputs(trainer)
   self.assertJsonEqual(
       str(module_file), str(trainer.spec.exec_properties['module_file']))
Example #6
    def testAttachParameters(self):
        with parameter_utils.ParameterContext() as pc:
            param1 = data_types.RuntimeParameter(name='test_param_1',
                                                 ptype=int)
            parameter_utils.attach_parameter(param1)
            param2 = data_types.RuntimeParameter(name='test_param_2',
                                                 ptype=Text)
            parameter_utils.attach_parameter(param2)
            param3 = data_types.RuntimeParameter(name='test_param_3',
                                                 ptype=float)
            parameter_utils.attach_parameter(param3)

        self.assertListEqual([param1, param2, param3], pc.parameters)
Example #7
 def testConstructWithParameter(self):
   column_name = data_types.RuntimeParameter(name='column-name', ptype=Text)
   threshold = data_types.RuntimeParameter(name='threshold', ptype=float)
   examples = standard_artifacts.Examples()
   model_exports = standard_artifacts.Model()
   evaluator = component.Evaluator(
       examples=channel_utils.as_channel([examples]),
       model=channel_utils.as_channel([model_exports]),
       feature_slicing_spec={'specs': [{
           'column_for_slicing': [column_name]
       }]},
       fairness_indicator_thresholds=[threshold])
   self.assertEqual(standard_artifacts.ModelEvaluation.TYPE_NAME,
                    evaluator.outputs['evaluation'].type_name)
Example #8
 def testBuildParameterTypeSpec(self):
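     # Raw primitive values and RuntimeParameters should both map to the
     # matching primitive type enum in the built parameter type spec.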
     type_enum = pipeline_pb2.PrimitiveType.PrimitiveTypeEnum
     testdata = {
         42: type_enum.INT,
         42.1: type_enum.DOUBLE,
         '42': type_enum.STRING,
         data_types.RuntimeParameter(name='_', ptype=int): type_enum.INT,
         data_types.RuntimeParameter(name='_', ptype=float):
         type_enum.DOUBLE,
         data_types.RuntimeParameter(name='_', ptype=str): type_enum.STRING,
     }
     for value, expected_type_enum in testdata.items():
         self.assertEqual(
             compiler_utils.build_parameter_type_spec(value).type,
             expected_type_enum)
Example #9
  def setUp(self):
    super(BaseComponentWithPipelineParamTest, self).setUp()

    test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
    example_gen_buckets = data_types.RuntimeParameter(
        name='example-gen-buckets', ptype=int, default=10)

    examples = standard_artifacts.ExternalArtifact()
    example_gen = csv_example_gen_component.CsvExampleGen(
        input=channel_utils.as_channel([examples]),
        output_config={
            'split_config': {
                'splits': [{
                    'name': 'examples',
                    'hash_buckets': example_gen_buckets
                }]
            }
        })
    statistics_gen = statistics_gen_component.StatisticsGen(
        examples=example_gen.outputs['examples'], instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name=self._test_pipeline_name,
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
    self._tfx_ir = pipeline_pb2.Pipeline()
    with dsl.Pipeline('test_pipeline'):
      self.example_gen = base_component.BaseComponent(
          component=example_gen,
          component_launcher_class=in_process_component_launcher
          .InProcessComponentLauncher,
          depends_on=set(),
          pipeline=pipeline,
          pipeline_name=self._test_pipeline_name,
          pipeline_root=test_pipeline_root,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
          component_config=None,
          tfx_ir=self._tfx_ir)
      self.statistics_gen = base_component.BaseComponent(
          component=statistics_gen,
          component_launcher_class=in_process_component_launcher
          .InProcessComponentLauncher,
          depends_on=set(),
          pipeline=pipeline,
          pipeline_name=self._test_pipeline_name,
          pipeline_root=test_pipeline_root,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
          component_config=None,
          tfx_ir=self._tfx_ir
      )

    self.tfx_example_gen = example_gen
    self.tfx_statistics_gen = statistics_gen
Example #10
    def testBuildImporterWithRuntimeParam(self):
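        # source_uri is given as a RuntimeParameter; it should be collected by
        # the ParameterContext and reflected in the compiled importer protos.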
        param = data_types.RuntimeParameter(name='runtime_flag', ptype=str)
        impt = importer.Importer(
            source_uri=param,
            artifact_type=standard_artifacts.Examples).with_id('my_importer')
        deployment_config = pipeline_pb2.PipelineDeploymentConfig()
        component_defs = {}
        with parameter_utils.ParameterContext() as pc:
            my_builder = step_builder.StepBuilder(
                node=impt,
                deployment_config=deployment_config,
                component_defs=component_defs)
            actual_step_spec = self._sole(my_builder.build())
        actual_component_def = self._sole(component_defs)

        self.assertProtoEquals(
            test_utils.get_proto_from_test_data(
                'expected_importer_component_with_runtime_param.pbtxt',
                pipeline_pb2.ComponentSpec()), actual_component_def)
        self.assertProtoEquals(
            test_utils.get_proto_from_test_data(
                'expected_importer_task_with_runtime_param.pbtxt',
                pipeline_pb2.PipelineTaskSpec()), actual_step_spec)
        self.assertProtoEquals(
            test_utils.get_proto_from_test_data(
                'expected_importer_executor_with_runtime_param.pbtxt',
                pipeline_pb2.PipelineDeploymentConfig()), deployment_config)
        self.assertListEqual([param], pc.parameters)
Example #11
 def testComponentSpecWithRuntimeParam(self):
     param = data_types.RuntimeParameter(name='split-1', ptype=Text)
     serialized_param = str(param)
     # Dict representation of an example_gen_pb2.Input proto message.
     proto = dict(splits=[
         dict(name=param, pattern='pattern1'),
         dict(name='name2', pattern='pattern2'),
         dict(name='name3', pattern='pattern3'),
     ])
     input_channel = Channel(type=_InputArtifact)
     output_channel = Channel(type=_OutputArtifact)
     spec = _BasicComponentSpec(folds=10,
                                proto=proto,
                                input=input_channel,
                                output=output_channel)
     # Verify proto property.
     self.assertIsInstance(spec.exec_properties['proto'], str)
     decoded_proto = json.loads(spec.exec_properties['proto'])
     self.assertCountEqual(['splits'], decoded_proto.keys())
     self.assertEqual(3, len(decoded_proto['splits']))
     self.assertCountEqual([serialized_param, 'name2', 'name3'],
                           list(s['name'] for s in decoded_proto['splits']))
     self.assertCountEqual(['pattern1', 'pattern2', 'pattern3'],
                           list(s['pattern']
                                for s in decoded_proto['splits']))
Example #12
 def testConstructWithParameter(self):
     module_file = data_types.RuntimeParameter(name='module-file',
                                               ptype=Text)
     n_steps = data_types.RuntimeParameter(name='n-steps', ptype=int)
     trainer = component.Trainer(module_file=module_file,
                                 examples=self.examples,
                                 transform_graph=self.transform_graph,
                                 schema=self.schema,
                                 train_args=dict(splits=['train'],
                                                 num_steps=n_steps),
                                 eval_args=dict(splits=['eval'],
                                                num_steps=n_steps))
     self._verify_outputs(trainer)
     self.assertJsonEqual(
         str(module_file),
         str(trainer.spec.exec_properties[
             standard_component_specs.MODULE_FILE_KEY]))
Example #13
 def testComponentSpecWithRuntimeParam(self):
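     # Both 'folds' and 'proto' should remain RuntimeParameter instances (with
     # the default preserved) in the resulting exec_properties.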
     proto_str = '{"splits": [{"name": "name1", "pattern": "pattern1"}]}'
     param_proto = data_types.RuntimeParameter(name='proto',
                                               ptype=str,
                                               default=proto_str)
     param_int = data_types.RuntimeParameter(name='int', ptype=int)
     input_channel = Channel(type=_InputArtifact)
     output_channel = Channel(type=_OutputArtifact)
     spec = _BasicComponentSpec(folds=param_int,
                                proto=param_proto,
                                input=input_channel,
                                output=output_channel)
     self.assertIsInstance(spec.exec_properties['folds'],
                           data_types.RuntimeParameter)
     self.assertIsInstance(spec.exec_properties['proto'],
                           data_types.RuntimeParameter)
     self.assertEqual(spec.exec_properties['proto'].default, proto_str)
Example #14
 def testConstructWithParameter(self):
     push_dir = data_types.RuntimeParameter(name='push-dir', ptype=Text)
     pusher = component.Pusher(
         model=self.model,
         model_blessing=self.model_blessing,
         push_destination={'filesystem': {
             'base_directory': push_dir
         }})
     self.assertEqual('ModelPushPath',
                      pusher.outputs['model_push'].type_name)
Example #15
 def testConstructWithParameter(self):
     push_dir = data_types.RuntimeParameter(name='push-dir', ptype=Text)
     pusher = component.Pusher(
         model=self.model,
         model_blessing=self.model_blessing,
         push_destination={'filesystem': {
             'base_directory': push_dir
         }})
     self.assertEqual(standard_artifacts.PushedModel.TYPE_NAME,
                      pusher.outputs['pushed_model'].type_name)
Example #16
 def testConstructWithParameter(self):
   infer_shape = data_types.RuntimeParameter(name='infer-shape', ptype=bool)
   schema_gen = component.SchemaGen(
       statistics=channel_utils.as_channel(
           [standard_artifacts.ExampleStatistics(split='train')]),
       infer_feature_shape=infer_shape)
   self.assertEqual('SchemaPath', schema_gen.outputs['schema'].type_name)
   self.assertJsonEqual(
       str(schema_gen.spec.exec_properties['infer_feature_shape']),
       str(infer_shape))
Example #17
 def test_construct_with_parameter(self):
   module_file = data_types.RuntimeParameter(name='module-file', ptype=Text)
   transform = component.Transform(
       examples=self.examples,
       schema=self.schema,
       module_file=module_file,
   )
   self._verify_outputs(transform)
   self.assertJsonEqual(
       str(module_file), str(transform.exec_properties['module_file']))
Example #18
 def testConstructWithParameter(self):
   statistics_artifact = standard_artifacts.ExampleStatistics()
   statistics_artifact.split_names = artifact_utils.encode_split_names(
       ['train'])
   infer_shape = data_types.RuntimeParameter(name='infer-shape', ptype=bool)
   schema_gen = component.SchemaGen(
       statistics=channel_utils.as_channel([statistics_artifact]),
       infer_feature_shape=infer_shape)
   self.assertEqual(standard_artifacts.Schema.TYPE_NAME,
                    schema_gen.outputs['schema'].type_name)
   self.assertJsonEqual(
       str(schema_gen.spec.exec_properties['infer_feature_shape']),
       str(infer_shape))
Example #19
 def testMakeDefaultOutputConfigWithParameter(self):
   split_name_param = data_types.RuntimeParameter(
       name='split-name', ptype=str, default=u'train')
   output_config = utils.make_default_output_config({
       'splits': [{
           'name': split_name_param,
           'pattern': 'train/*'
       }, {
           'name': 'eval',
           'pattern': 'eval/*'
       }]
   })
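    # For this parameterized, multi-split input, no default output split config
    # is expected.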
   self.assertEqual(0, len(output_config.split_config.splits))
Example #20
def _two_step_pipeline() -> tfx_pipeline.Pipeline:
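  # The BigQuery table name is a RuntimeParameter so it can be overridden at
  # pipeline run time.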
  table_name = data_types.RuntimeParameter(
      name='table-name', ptype=Text, default='default-table')
  example_gen = big_query_example_gen_component.BigQueryExampleGen(
      query='SELECT * FROM %s' % str(table_name))
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples'])
  return tfx_pipeline.Pipeline(
      pipeline_name='two_step_pipeline',
      pipeline_root='pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )
Example #21
    def setUp(self):
        super().setUp()

        example_gen_output_config = data_types.RuntimeParameter(
            name='example-gen-output-config', ptype=str)

        example_gen = csv_example_gen_component.CsvExampleGen(
            input_base='data_root', output_config=example_gen_output_config)
        statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples']).with_id('foo')

        test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
        self._tfx_ir = pipeline_pb2.Pipeline()
        with dsl.Pipeline('test_pipeline'):
            self.example_gen = base_component.BaseComponent(
                component=example_gen,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                tfx_ir=self._tfx_ir,
                pod_labels_to_attach={},
                runtime_parameters=[example_gen_output_config])
            self.statistics_gen = base_component.BaseComponent(
                component=statistics_gen,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                tfx_ir=self._tfx_ir,
                pod_labels_to_attach={},
                runtime_parameters=[])

        self.tfx_example_gen = example_gen
        self.tfx_statistics_gen = statistics_gen
Example #22
  def testProtoTypeCheck(self):
    param = data_types.RuntimeParameter(name='split-1', ptype=Text)
    # Dict representation of an example_gen_pb2.Input proto message.
    # The second split has an int-typed pattern, which is wrong.
    proto = dict(splits=[
        dict(name=param, pattern='pattern1'),
        dict(name='name2', pattern=42),
        dict(name='name3', pattern='pattern3'),
    ])
    input_channel = Channel(type_name='InputType')
    output_channel = Channel(type_name='OutputType')

    with self.assertRaisesRegexp(
        ParseError,
        'Failed to parse .* field: expected string or '
        '(bytes-like object|buffer)'):
      spec = _BasicComponentSpec(  # pylint: disable=unused-variable
          folds=10, proto=proto, input=input_channel, output=output_channel)
Example #23
def build_query_seed():
    """
    Do the elaborate proto packing necessary to feed
    a QueryExampleGen custom_config.
    """

    seed_runtime = data_types.RuntimeParameter(
        name='seed_pattern',
        default="'%meni%','%avw3%'",
        ptype=str
    )

    bigquery_seed_proto = bigquery_example_gen_pb2.BigQuerySeed()
    bigquery_seed_proto.seed = json_utils.dumps(seed_runtime)

    any_proto = any_pb2.Any()
    any_proto.Pack(bigquery_seed_proto, 'bigqueryseed.dstillery.com')

    return example_gen_pb2.CustomConfig(custom_config=any_proto)
Example #24
def _two_step_pipeline() -> tfx_pipeline.Pipeline:
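    # The entire input_config is exposed as a string-typed RuntimeParameter
    # whose default is a JSON-serialized single-split query.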
    default_input_config = json.dumps({
        'splits': [{
            'name': 'single_split',
            'pattern': 'SELECT * FROM default-table'
        }]
    })
    input_config = data_types.RuntimeParameter(name='input_config',
                                               ptype=str,
                                               default=default_input_config)
    example_gen = big_query_example_gen_component.BigQueryExampleGen(
        input_config=input_config, output_config=example_gen_pb2.Output())
    statistics_gen = statistics_gen_component.StatisticsGen(
        examples=example_gen.outputs['examples'])
    return tfx_pipeline.Pipeline(
        pipeline_name='two_step_pipeline',
        pipeline_root='pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )
Example #25
def pipeline_with_runtime_parameter() -> tfx_pipeline.Pipeline:
  """Pipeline which contains a runtime parameter."""

  producer_task = primitive_producer_component()

  consumer_task = primitive_consumer_component(
      input_string=producer_task.outputs['output_string'],
      input_int=producer_task.outputs['output_int'],
      input_float=producer_task.outputs['output_float'],
      param_string=data_types.RuntimeParameter(
          ptype=Text, name='string_param', default='string value'),
      param_int=42,
      param_float=3.14,
  )

  return tfx_pipeline.Pipeline(
      pipeline_name='pipeline-with-runtime-parameter',
      pipeline_root=_TEST_PIPELINE_ROOT,
      components=[producer_task, consumer_task],
  )
Example #26
  def testMakeOutputSplitNamesWithParameter(self):
    split_name_param = data_types.RuntimeParameter(
        name='split-name', ptype=str, default=u'train')
    split_names = utils.generate_output_split_names(
        input_config={
            'splits': [{
                'name': split_name_param,
                'pattern': 'train/*'
            }, {
                'name': 'eval',
                'pattern': 'eval/*'
            }]
        },
        output_config=example_gen_pb2.Output())
    # Assert the json serialized version because RuntimeParameters only get
    # serialized after that.
    self.assertEqual(
        json_utils.dumps([split_name_param, 'eval']),
        json_utils.dumps(split_names))

    split_names = utils.generate_output_split_names(
        input_config=example_gen_pb2.Input(splits=[
            example_gen_pb2.Input.Split(name='single', pattern='single/*')
        ]),
        output_config={
            'split_config': {
                'splits': [{
                    'name': split_name_param,
                    'hash_buckets': 2
                }, {
                    'name': 'eval',
                    'hash_buckets': 1
                }]
            }
        })
    # Assert the json serialized version because RuntimeParameters only get
    # serialized after that.
    self.assertEqual(
        json_utils.dumps([split_name_param, 'eval']),
        json_utils.dumps(split_names))
Example #27
class KubeflowGcpPerfTest(kubeflow_test_utils.BaseKubeflowTest):

    # The endpoint of the KFP instance.
    # This test fixture assumes an established KFP instance authenticated via
    # inverse proxy.
    _KFP_ENDPOINT = os.environ['KFP_E2E_ENDPOINT']

    # The namespace where KFP is deployed.
    _KFP_NAMESPACE = 'kubeflow'

    # Timeout for a single pipeline run. Set to 6 hours.
    # TODO(b/158009615): Tune this timeout to align with our observation.
    # Note: the Chicago Taxi dataset grows over time. The 6-hour timeout here
    # was calibrated according to our empirical study in b/150222976 and might
    # need to be adjusted occasionally.
    _TIME_OUT = datetime.timedelta(hours=6)

    # KFP client polling interval, in seconds
    _POLLING_INTERVAL = 60

    # TODO(b/156784019): temporary workaround.
    # Number of retries when `get_run` returns remote error.
    _N_RETRIES = 5

    # The base container image name to use when building the image used in tests.
    _BASE_CONTAINER_IMAGE = os.environ['KFP_E2E_BASE_CONTAINER_IMAGE']

    # The project id to use to run tests.
    _GCP_PROJECT_ID = os.environ['KFP_E2E_GCP_PROJECT_ID']

    # The GCP region in which the end-to-end test is run.
    _GCP_REGION = os.environ['KFP_E2E_GCP_REGION']

    # The GCP zone in which the cluster is created.
    _GCP_ZONE = os.environ['KFP_E2E_GCP_ZONE']

    # The GCP bucket to use to write output artifacts.
    _BUCKET_NAME = os.environ['KFP_E2E_BUCKET_NAME']

    # The GCP GKE cluster name where the KFP deployment is installed.
    _CLUSTER_NAME = os.environ['KFP_E2E_CLUSTER_NAME']

    # The location of test user module file.
    # It is retrieved from inside the container subject to testing.
    _MODULE_FILE = '/tfx-src/tfx/examples/chicago_taxi_pipeline/taxi_utils.py'

    # Parameterize worker type/count for easily ramping up the pipeline scale.
    _WORKER_COUNT = data_types.RuntimeParameter(
        name='worker_count',
        default=2,
        ptype=int,
    )

    _WORKER_TYPE = data_types.RuntimeParameter(
        name='worker_type',
        default='standard',
        ptype=str,
    )

    # Parameterize parameter server count for easily ramping up the scale.
    _PARAMETER_SERVER_COUNT = data_types.RuntimeParameter(
        name='parameter_server_count',
        default=1,
        ptype=int,
    )

    _MODEL_NAME = 'chicago_taxi'

    _AI_PLATFORM_SERVING_ARGS = {
        'model_name': _MODEL_NAME,
        'project_id': _GCP_PROJECT_ID,
        'regions': [_GCP_REGION],
    }

    # TODO(b/151114974): Remove `disk_size_gb` flag after default is increased.
    # TODO(b/151116587): Remove `shuffle_mode` flag after default is changed.
    _BEAM_PIPELINE_ARGS = [
        '--runner=DataflowRunner',
        '--project=' + _GCP_PROJECT_ID,
        '--temp_location=gs://' +
        os.path.join(_BUCKET_NAME, 'dataflow', 'tmp'),
        '--region=' + _GCP_REGION,

        # To avoid consuming in-use global IP addresses, configure the Dataflow
        # workers not to use public IPs. If workers need access to the public
        # Internet, Cloud NAT must be configured for the VPC in which Dataflow
        # runs.
        '--no_use_public_ips',

        # Temporary overrides of defaults.
        '--disk_size_gb=50',
        '--experiments=shuffle_mode=auto',
    ]

    @classmethod
    def tearDownClass(cls):
        super(kubeflow_test_utils.BaseKubeflowTest, cls).tearDownClass()
        # Delete the cluster created in the test.
        delete_cluster_command = [
            'gcloud', 'container', 'clusters', 'delete', cls._CLUSTER_NAME,
            '--region=%s' % cls._GCP_ZONE, '--quiet'
        ]
        logging.info(
            subprocess.check_output(delete_cluster_command).decode('utf-8'))

    def _get_workflow_name(self, pipeline_name: Text) -> Text:
        """Gets the Argo workflow name using pipeline name."""
        get_workflow_name_command = (
            'argo --namespace %s list | grep -o "%s[^ ]*"' %
            (self._KFP_NAMESPACE, pipeline_name))
        # Need to explicitly decode because the test fixture is running on
        # Python 3.5. Also need to remove the new line at the end of the string.
        return subprocess.check_output(get_workflow_name_command,
                                       shell=True).decode('utf-8')[:-1]

    def _get_workflow_log(self, pipeline_name: Text) -> Text:
        """Gets the workflow log for all the pods using pipeline name."""
        get_workflow_log_command = [
            'argo', '--namespace', self._KFP_NAMESPACE, 'logs', '-w',
            self._get_workflow_name(pipeline_name)
        ]
        # Need to explicitly decode because the test fixture is running on
        # Python 3.5.
        return subprocess.check_output(get_workflow_log_command).decode(
            'utf-8')

    def _assert_successful_run_completion(self, host: Text, run_id: Text,
                                          pipeline_name: Text,
                                          timeout: datetime.timedelta):
        """Waits and asserts a successful KFP pipeline execution.

    Args:
      host: the endpoint of the KFP deployment.
      run_id: the run ID of the execution, can be obtained from the respoonse
        when submitting the pipeline.
      pipeline_name: the name of the pipeline under test.
      timeout: maximal waiting time for this execution, in timedelta.

    Raises:
      RuntimeError: when timeout exceeds after waiting for specified duration.
    """

        status = kubeflow_test_utils.poll_kfp_with_retry(
            host=host,
            run_id=run_id,
            retry_limit=self._N_RETRIES,
            timeout=timeout,
            polling_interval=self._POLLING_INTERVAL)

        workflow_log = self._get_workflow_log(pipeline_name)

        self.assertEqual(
            status.lower(), kubeflow_test_utils.KFP_SUCCESS_STATUS,
            'Pipeline %s failed to complete successfully: %s' %
            (pipeline_name, workflow_log))

    def _compile_and_run_pipeline(self, pipeline: tfx_pipeline.Pipeline,
                                  **kwargs):
        """Compiles and runs a KFP pipeline.

    In this method, provided TFX pipeline will be submitted via kfp.Client()
    instead of from Argo.

    Args:
      pipeline: The logical pipeline to run.
      **kwargs: Key-value pairs of runtime paramters passed to the pipeline
        execution.
    """
        client = kfp.Client(host=self._KFP_ENDPOINT)

        pipeline_name = pipeline.pipeline_info.pipeline_name
        config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
            kubeflow_metadata_config=self._get_kubeflow_metadata_config(),
            tfx_image=self._CONTAINER_IMAGE)
        kubeflow_dag_runner.KubeflowDagRunner(config=config).run(pipeline)

        file_path = os.path.join(self._test_dir,
                                 '{}.tar.gz'.format(pipeline_name))
        self.assertTrue(tf.io.gfile.exists(file_path))

        run_result = client.create_run_from_pipeline_package(
            pipeline_file=file_path, arguments=kwargs)
        run_id = run_result.run_id

        self._assert_successful_run_completion(host=self._KFP_ENDPOINT,
                                               run_id=run_id,
                                               pipeline_name=pipeline_name,
                                               timeout=self._TIME_OUT)

    def testFullTaxiGcpPipeline(self):
        pipeline_name = 'gcp-perf-test-full-e2e-test-{}'.format(
            test_utils.random_id())

        # Custom CAIP training job using a testing image.
        ai_platform_training_args = {
            'project': self._GCP_PROJECT_ID,
            'region': self._GCP_REGION,
            'scaleTier': 'CUSTOM',
            'masterType': 'large_model',
            'masterConfig': {
                'imageUri': self._CONTAINER_IMAGE
            },
            'workerType': self._WORKER_TYPE,
            'parameterServerType': 'standard',
            'workerCount': self._WORKER_COUNT,
            'parameterServerCount': self._PARAMETER_SERVER_COUNT
        }

        pipeline = taxi_pipeline_kubeflow_gcp.create_pipeline(
            pipeline_name=pipeline_name,
            pipeline_root=self._pipeline_root(pipeline_name),
            module_file=self._MODULE_FILE,
            ai_platform_training_args=ai_platform_training_args,
            ai_platform_serving_args=self._AI_PLATFORM_SERVING_ARGS,
            beam_pipeline_args=self._BEAM_PIPELINE_ARGS)
        # TODO(b/162451308): Add this clean-up back after we re-enable AIP pusher
        # when AIP prediction service supports TF>=2.3.
        # self.addCleanup(kubeflow_test_utils.delete_ai_platform_model,
        #                 self._MODEL_NAME)
        self._compile_and_run_pipeline(
            pipeline=pipeline,
            query_sample_rate=1,
            # (1M * batch_size=200) / 200M records ~ 1 epoch
            train_steps=1000000,
            eval_steps=10000,
            worker_count=20,
            parameter_server_count=3,
        )
Example #28
def generate_pipeline(pipeline_name, pipeline_root, train_data, test_data,
                      train_steps, eval_steps, pusher_target, runner):
    module_file = 'util.py'  # util.py is a file in the same folder

    # RuntimeParameter is only supported on KubeflowDagRunner currently
    if runner == 'kubeflow':
        pipeline_root_param = os.path.join('gs://{{kfp-default-bucket}}',
                                           pipeline_name, '{{workflow.uid}}')
        train_data_param = data_types.RuntimeParameter(
            name='train-data',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/train',
            ptype=Text)
        test_data_param = data_types.RuntimeParameter(
            name='test-data',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/test',
            ptype=Text)
        pusher_target_param = data_types.RuntimeParameter(
            name='pusher-destination',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/serving',
            ptype=Text)
    else:
        pipeline_root_param = pipeline_root
        train_data_param = train_data
        test_data_param = test_data
        pusher_target_param = pusher_target

    examples = external_input(train_data_param)
    example_gen = CsvExampleGen(input=examples, instance_name="train")

    test_examples = external_input(test_data_param)
    test_example_gen = CsvExampleGen(input=test_examples,
                                     output_config={
                                         'split_config': {
                                             'splits': [{
                                                 'name': 'test',
                                                 'hash_buckets': 1
                                             }]
                                         }
                                     },
                                     instance_name="test")

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True
                           )  # infer_feature_shape controls sparse or dense

    # Transform is too slow on my side.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        module_file=module_file,
        train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
        eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
        instance_name="train",
        enable_cache=False)

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='target')],
        # Adding more slicing specs, e.g. tfma.SlicingSpec(feature_keys=['var_0', 'var_1']),
        # prevents the Evaluator from outputting a BLESSED status; it appears to be a bug in TFMA.
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'binary_accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.4}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        # baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config,
        instance_name="eval5")

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination={
                        'filesystem': {
                            'base_directory': pusher_target_param
                        }
                    })

    bulk_inferrer = BulkInferrer(
        examples=test_example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        # model_blessing=evaluator.outputs['blessing'],
        data_spec=bulk_inferrer_pb2.DataSpec(),
        model_spec=bulk_inferrer_pb2.ModelSpec(),
        instance_name="bulkInferrer")

    hello = component.HelloComponent(
        input_data=bulk_inferrer.outputs['inference_result'],
        instance_name='csvGen')

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root_param,
        components=[
            example_gen, statistics_gen, schema_gen, transform, trainer,
            model_resolver, evaluator, pusher, hello, test_example_gen,
            bulk_inferrer
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            os.path.join(pipeline_root, 'metadata.sqlite')),
        beam_pipeline_args=['--direct_num_workers=0'])
Example #29
def create_pipeline(
        pipeline_name: Text,
        pipeline_root: Text,
        module_file: Text,
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text],
        beam_pipeline_args: Optional[List[Text]] = None) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

  Args:
    pipeline_name: name of the TFX pipeline being created.
    pipeline_root: root directory of the pipeline. Should be a valid GCS path.
    module_file: uri of the module files used in Trainer and Transform
      components.
    ai_platform_training_args: Args of CAIP training job. Please refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job
      for detailed description.
    ai_platform_serving_args: Args of CAIP model deployment. Please refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.models
      for detailed description.
    beam_pipeline_args: Optional list of beam pipeline options. Please refer to
      https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options.
      When this argument is not provided, the default is to use GCP
      DataflowRunner with 50GB disk size as specified in this function. If an
      empty list is passed in, default specified by Beam will be used, which can
      be found at
      https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options

  Returns:
    A TFX pipeline object.
  """

    # The rate at which to sample rows from the Taxi dataset using BigQuery.
    # The full taxi dataset is > 200M records.  In the interest of resource
    # savings and time, we've set the default for this example to be much smaller.
    # Feel free to crank it up and process the full dataset!
    # By default it generates a 0.1% random sample.
    query_sample_rate = data_types.RuntimeParameter(name='query_sample_rate',
                                                    ptype=float,
                                                    default=0.001)

    # This is the upper bound of FARM_FINGERPRINT in BigQuery (i.e. the max
    # value of a signed int64).
    max_int64 = '0x7FFFFFFFFFFFFFFF'

    # The query that extracts the examples from BigQuery. The Chicago Taxi dataset
    # used for this example is a public dataset available on Google AI Platform.
    # https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips
    query = """
          SELECT
            pickup_community_area,
            fare,
            EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
            EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
            EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
            UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
            pickup_latitude,
            pickup_longitude,
            dropoff_latitude,
            dropoff_longitude,
            trip_miles,
            pickup_census_tract,
            dropoff_census_tract,
            payment_type,
            company,
            trip_seconds,
            dropoff_community_area,
            tips
          FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
          WHERE (ABS(FARM_FINGERPRINT(unique_key)) / {max_int64})
            < {query_sample_rate}""".format(
        max_int64=max_int64, query_sample_rate=str(query_sample_rate))

    # Beam args to run data processing on DataflowRunner.
    # TODO(b/151114974): Remove `disk_size_gb` flag after default is increased.
    # TODO(b/151116587): Remove `shuffle_mode` flag after default is changed.
    if beam_pipeline_args is None:
        beam_pipeline_args = [
            '--runner=DataflowRunner',
            '--experiments=shuffle_mode=auto',
            '--project=' + _project_id,
            '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
            '--region=' + _gcp_region,
            '--disk_size_gb=50',
        ]

    # Number of training steps.
    train_steps = data_types.RuntimeParameter(
        name='train_steps',
        default=10000,
        ptype=int,
    )

    # Number of evaluation steps.
    eval_steps = data_types.RuntimeParameter(
        name='eval_steps',
        default=5000,
        ptype=int,
    )

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Update ai_platform_training_args if distributed training was enabled.
    # Number of worker machines used in distributed training.
    worker_count = data_types.RuntimeParameter(
        name='worker_count',
        default=2,
        ptype=int,
    )

    # Type of worker machines used in distributed training.
    worker_type = data_types.RuntimeParameter(
        name='worker_type',
        default='standard',
        ptype=str,
    )

    local_training_args = copy.deepcopy(ai_platform_training_args)

    if FLAGS.distributed_training:
        local_training_args.update({
            # You can specify the machine types, the number of replicas for workers
            # and parameter servers.
            # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#ScaleTier
            'scaleTier': 'CUSTOM',
            'masterType': 'large_model',
            'workerType': worker_type,
            'parameterServerType': 'standard',
            'workerCount': worker_count,
            'parameterServerCount': 1
        })

    # Uses user-provided Python function that implements a model using TF-Learn
    # to train a model on Google Cloud AI Platform.
    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.Executor),
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY: local_training_args
        })

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'binary_accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the model
    # to Google Cloud AI Platform if the check passed.
    pusher = Pusher(custom_executor_spec=executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor),
                    model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    custom_config={
                        ai_platform_pusher_executor.SERVING_ARGS_KEY:
                        ai_platform_serving_args
                    })

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, model_resolver, evaluator, pusher
        ],
        beam_pipeline_args=beam_pipeline_args,
    )
Example #30
# Name of pipeline_root parameter.
_PIPELINE_ROOT = 'pipeline-root'

# Pipeline root is by default specified as a RuntimeParameter when running on
# KubeflowDagRunner. This constant offers users an easy access to the pipeline
# root placeholder when defining a pipeline. For example,
#
# pusher = Pusher(
#     model_export=trainer.outputs['model'],
#     model_blessing=model_validator.outputs['blessing'],
#     push_destination=pusher_pb2.PushDestination(
#         filesystem=pusher_pb2.PushDestination.Filesystem(
#             base_directory=os.path.join(
#                 str(pipeline.ROOT_PARAMETER), 'model_serving'))))
ROOT_PARAMETER = data_types.RuntimeParameter(name=_PIPELINE_ROOT, ptype=Text)


class Pipeline(object):
    """Logical TFX pipeline object.

    Attributes:
      pipeline_args: kwargs used to create real pipeline implementation. This is
        forwarded to PipelineRunners instead of consumed in this class. This
        should include:
        - pipeline_name: Required. The unique name of this pipeline.
        - pipeline_root: Required. The root of the pipeline outputs.
      components: logical components of this pipeline.
      pipeline_info: An instance of data_types.PipelineInfo that contains basic
        properties of the pipeline.
      enable_cache: whether or not cache is enabled for this run.