Python Trainer 예제들, tfx.components.trainer.component.Trainer Python 예제들

예제 #1

0

파일 보기

파일: training_pipeline.py 프로젝트: vingovan/zenml

    def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
        """
        Assemble the ordered TFX components of the training pipeline.

        Components are created in this order: data ingestion (plus its
        statistics and schema), splitting (plus its statistics and schema),
        preprocessing and training. An evaluator and a deployer are appended
        only when the matching step is present in the configuration.

        Args:
            config: A ZenML configuration in dictionary format. The entry
             under `keys.GlobalKeys.STEPS` holds the per-step settings.

        Returns:
            The TFX components, listed in the order they were created.

        """
        steps = config[keys.GlobalKeys.STEPS]

        ############
        # RAW DATA #
        ############
        data_step = steps[keys.TrainingSteps.DATA]
        data_gen = DataGen(
            source=data_step[keys.StepKeys.SOURCE],
            source_args=data_step[keys.StepKeys.ARGS],
        ).with_id(GDPComponent.DataGen.name)
        raw_examples = data_gen.outputs.examples

        data_statistics = StatisticsGen(examples=raw_examples).with_id(
            GDPComponent.DataStatistics.name)

        data_schema = SchemaGen(
            statistics=data_statistics.outputs.output,
        ).with_id(GDPComponent.DataSchema.name)

        components = [data_gen, data_statistics, data_schema]

        #################
        #   SPLITTING   #
        #################
        # The split consumes the raw examples together with the schema and
        # statistics that were derived from them.
        split_step = steps[keys.TrainingSteps.SPLIT]
        split_gen = SplitGen(
            input_examples=raw_examples,
            source=split_step[keys.StepKeys.SOURCE],
            source_args=split_step[keys.StepKeys.ARGS],
            schema=data_schema.outputs.schema,
            statistics=data_statistics.outputs.output,
        ).with_id(GDPComponent.SplitGen.name)
        split_examples = split_gen.outputs.examples

        split_statistics = StatisticsGen(examples=split_examples).with_id(
            GDPComponent.SplitStatistics.name)

        split_schema = SchemaGen(
            statistics=split_statistics.outputs.output,
        ).with_id(GDPComponent.SplitSchema.name)

        # Every downstream component works against the post-split schema.
        schema_channel = split_schema.outputs.schema

        components += [split_gen, split_statistics, split_schema]

        #################
        # PREPROCESSING #
        #################
        transform = Transform(
            preprocessing_fn=constants.PREPROCESSING_FN,
            examples=split_examples,
            schema=schema_channel,
            custom_config=steps[keys.TrainingSteps.PREPROCESSING],
        ).with_id(GDPComponent.Transform.name)

        components.append(transform)

        ############
        # TRAINING #
        ############
        training_backend: TrainingLocalBackend = self.backends_dict[
            TrainingLocalBackend.BACKEND_KEY]
        trainer_executor_spec = training_backend.get_executor_spec()

        # NOTE: this is the object stored in `steps`, so merging the backend
        # settings below updates the training step configuration in place.
        trainer_config = steps[keys.TrainingSteps.TRAINING]
        trainer_config.update(training_backend.get_custom_config())

        trainer = Trainer(
            transformed_examples=transform.outputs.transformed_examples,
            transform_graph=transform.outputs.transform_graph,
            run_fn=constants.TRAINER_FN,
            schema=schema_channel,
            train_args=trainer_pb2.TrainArgs(),
            eval_args=trainer_pb2.EvalArgs(),
            custom_executor_spec=trainer_executor_spec,
            custom_config=trainer_config,
        ).with_id(GDPComponent.Trainer.name)

        components.append(trainer)

        #############
        # EVALUATOR #
        #############
        if keys.TrainingSteps.EVALUATION in steps:
            from zenml.utils import source_utils

            # Split the dotted path at its last dot: everything before it is
            # resolved to an absolute path, the last part names the .py file.
            eval_module, _, eval_module_file = \
                constants.EVALUATOR_MODULE_FN.rpartition('.')
            module_dir = source_utils.get_absolute_path_from_module(
                eval_module)
            custom_extractor_path = os.path.join(module_dir,
                                                 eval_module_file) + '.py'

            eval_step: TFMAEvaluator = TFMAEvaluator.from_config(
                steps[keys.TrainingSteps.EVALUATION])
            evaluator = Evaluator(
                examples=transform.outputs.transformed_examples,
                model=trainer.outputs.model,
                eval_config=eval_step.build_eval_config(),
                module_file=custom_extractor_path,
            ).with_id(GDPComponent.Evaluator.name)
            components.append(evaluator)

        ###########
        # SERVING #
        ###########
        if keys.TrainingSteps.DEPLOYMENT in steps:
            serving_args = steps[keys.TrainingSteps.DEPLOYMENT]['args']

            project_id = serving_args['project_id']
            output_base_dir = self.artifact_store.path
            # Without an explicit model name, fall back to the pipeline name
            # with dashes replaced by underscores.
            model_name = (serving_args['model_name']
                          if 'model_name' in serving_args
                          else self.pipeline_name().replace('-', '_'))

            deployer = GCAIPDeployer(output_base_dir=output_base_dir,
                                     project_id=project_id,
                                     model_name=model_name)

            pusher_config = deployer.build_pusher_config()
            pusher_executor_spec = deployer.get_executor_spec()

            pusher = Pusher(
                model_export=trainer.outputs.output,
                custom_executor_spec=pusher_executor_spec,
                **pusher_config,
            ).with_id(GDPComponent.Deployer.name)

            components.append(pusher)

        return components

예제 #2

0

파일 보기

파일: parameterized_tfx_oss.py 프로젝트: xinzhangcmu/pipelines

def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
    """Build the simple Kubeflow-based Chicago Taxi TFX pipeline.

    Args:
      pipeline_root: The root of the pipeline output.
      csv_input_location: The location of the input data directory.
      taxi_module_file: The location of the module file shared by Transform
        and Trainer.
      enable_cache: Whether to enable cache or not.

    Returns:
      A logical TFX pipeline.Pipeline object.
    """
    # Ingestion: read the CSV data found at the parameterized location.
    example_gen = CsvExampleGen(input=external_input(csv_input_location))
    raw_examples = example_gen.outputs['examples']

    # Statistics, schema and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs['schema']

    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Transform and Trainer are both driven by the same module file.
    transform = Transform(
        examples=raw_examples,
        schema=schema,
        module_file=taxi_module_file,
    )
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
    )
    trained_model = trainer.outputs['model']

    # Model analysis sliced by the 'trip_start_hour' column.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model=trained_model,
        feature_slicing_spec=slicing_spec,
    )
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # The serving directory is placed under the pipeline root parameter.
    serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_dir)),
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, model_validator, pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name='parameterized_tfx_oss',
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
    )

예제 #3

0

파일 보기

파일: taxi_pipeline_runtime_parameter.py 프로젝트: etarakci-hvl/tfx

def _create_parameterized_pipeline(
    pipeline_name: Text,
    pipeline_root: Optional[Text] = _pipeline_root,
    enable_cache: Optional[bool] = True,
    direct_num_workers: Optional[int] = 1) -> pipeline.Pipeline:
  """Build a simple TFX pipeline configured through RuntimeParameters.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    enable_cache: Whether to enable cache in this pipeline.
    direct_num_workers: Number of workers executing the underlying beam
      pipeline in the executors.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  # --- Runtime parameters -------------------------------------------------
  # Path to the CSV data file, under which there should be a data.csv file.
  data_root_param = data_types.RuntimeParameter(
      name='data-root', default='gs://my-bucket/data', ptype=Text)

  # Path to the module file.
  taxi_module_file_param = data_types.RuntimeParameter(
      name='module-file',
      default='gs://my-bucket/modules/taxi_utils.py',
      ptype=Text)

  # Number of steps used for training.
  train_steps = data_types.RuntimeParameter(
      name='train-steps', default=10, ptype=int)

  # Number of steps used for evaluation.
  eval_steps = data_types.RuntimeParameter(
      name='eval-steps', default=5, ptype=int)

  # Column name for slicing.
  slicing_column = data_types.RuntimeParameter(
      name='slicing-column', default='trip_start_hour', ptype=Text)

  # --- Components ---------------------------------------------------------
  # The input data location is parameterized by data_root_param.
  example_gen = CsvExampleGen(input=external_input(data_root_param))
  raw_examples = example_gen.outputs['examples']

  statistics_gen = StatisticsGen(input_data=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  schema_gen = SchemaGen(stats=statistics, infer_feature_shape=False)
  schema = schema_gen.outputs['schema']

  example_validator = ExampleValidator(stats=statistics, schema=schema)

  # The module file used by both Transform and Trainer is parameterized by
  # taxi_module_file_param.
  transform = Transform(
      input_data=raw_examples,
      schema=schema,
      module_file=taxi_module_file_param)

  # The step counts are the RuntimeParameters named 'train-steps' and
  # 'eval-steps', respectively.
  trainer = Trainer(
      module_file=taxi_module_file_param,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_output=transform.outputs['transform_graph'],
      train_args={'num_steps': train_steps},
      eval_args={'num_steps': eval_steps})
  trained_model = trainer.outputs['model']

  # The name of the slicing column is specified as a RuntimeParameter.
  evaluator = Evaluator(
      examples=raw_examples,
      model_exports=trained_model,
      feature_slicing_spec={
          'specs': [{'column_for_slicing': [slicing_column]}]
      })
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # TODO(b/145949533) Currently we use this hack to ensure push_destination can
  # be correctly parameterized and interpreted.
  # pipeline root will be specified as a dsl.PipelineParam with the name
  # pipeline-root, see:
  # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
  pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
  blessing = model_validator.outputs['blessing']
  serving_dir = os.path.join(str(pipeline_root_param), 'model_serving')
  pusher = Pusher(
      model_export=trained_model,
      model_blessing=blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)))

  components = [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      trainer, evaluator, model_validator, pusher,
  ]
  # TODO(b/141578059): The multi-processing API might change.
  beam_args = ['--direct_num_workers=%d' % direct_num_workers]

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=enable_cache,
      beam_pipeline_args=beam_args,
  )

예제 #4

0

파일 보기

파일: taxi_pipeline_portable_beam.py 프로젝트: mandarcthorat/tfx

def _create_pipeline():
    """Build the Chicago taxi TFX pipeline using the portable Beam runner."""
    # Ingestion: brings data into the pipeline or otherwise joins/converts
    # training data.
    example_gen = CsvExampleGen(input_base=csv_input(_data_root))
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics files.
    schema_gen = SchemaGen(stats=statistics)
    schema = schema_gen.outputs.output

    # Anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=_taxi_module_file,
    )

    # Trains with the user-provided Python function that implements a model
    # using TF-Learn.
    trainer = Trainer(
        module_file=_taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = trainer.outputs.output

    # TFMA evaluation statistics, sliced by 'trip_start_hour'.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing_spec,
    )

    # Quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # Pushes the model to a file destination if it passed validation.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        push_destination=push_destination,
    )

    metadata_config = metadata.sqlite_metadata_connection_config(
        _metadata_db_root)

    extra_pipeline_args = {
        # LINT.IfChange
        'beam_pipeline_args': [
            # ----- Beam Args -----.
            '--runner=PortableRunner',
            # Points to the job server started in
            # setup_beam_on_(flink|spark).sh
            '--job_endpoint=localhost:8099',
            '--environment_type=LOOPBACK',
            # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
            # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
            # Note: 100 worker threads are used to mitigate the issue with
            # scheduling work between the Beam runner and the SDK harness.
            # Flink and Spark can process unlimited work items concurrently
            # while SdkHarness can only process 1 work item per worker
            # thread. Having 100 threads lets 100 tasks execute concurrently,
            # avoiding the scheduling issue in most cases. If the threads
            # are exhausted, Beam prints the relevant message in the log.
            '--experiments=worker_threads=100',
            # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
            '--experiments=pre_optimize=all',
            # ----- Flink runner-specific Args -----.
            # TODO(b/126725506): Set the task parallelism based on cpu cores.
            # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
            '--execution_mode_for_batch=BATCH_FORCED',
        ],
        # LINT.ThenChange(tfx/examples/chicago_taxi/setup_beam_on_portable_beam.sh)
    }

    return pipeline.Pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        enable_cache=True,
        metadata_connection_config=metadata_config,
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, evaluator, model_validator, pusher,
        ],
        additional_pipeline_args=extra_pipeline_args,
    )

예제 #5

0

파일 보기

def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     module_file: Text,
                     presto_config: presto_config_pb2.PrestoConnConfig,
                     query: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Build the Chicago taxi TFX pipeline fed by a Presto query.

    Args:
      pipeline_name: Name given to the pipeline.
      pipeline_root: Root passed to the pipeline for its output.
      module_file: Module file handed to both Transform and Trainer.
      presto_config: Connection configuration passed to PrestoExampleGen.
      query: Query passed to PrestoExampleGen.
      serving_model_dir: Base directory the Pusher writes the model to.
      metadata_path: Path handed to the SQLite metadata connection config.

    Returns:
      The assembled pipeline.Pipeline object.
    """
    # Ingestion: brings data into the pipeline or otherwise joins/converts
    # training data.
    example_gen = PrestoExampleGen(presto_config, query=query)
    raw_examples = example_gen.outputs['examples']

    # Statistics over the data, for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics files.
    schema_gen = SchemaGen(statistics=statistics)
    schema = schema_gen.outputs['schema']

    # Anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples,
        schema=schema,
        module_file=module_file,
    )

    # Trains with the user-provided Python function that implements a model
    # using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = trainer.outputs['model']

    # TFMA evaluation statistics, sliced by 'trip_start_hour'.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing_spec,
    )

    # Quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # Pushes the model to a file destination if it passed validation.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=push_destination,
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, model_validator, pusher,
    ]
    metadata_config = metadata.sqlite_metadata_connection_config(
        metadata_path)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata_config,
        additional_pipeline_args={},
    )

예제 #6

0

파일 보기

파일: taxi_pipeline_runtime_parameter.py 프로젝트: ysjeon7/tfx

def _create_parameterized_pipeline(
        pipeline_name: Text,
        pipeline_root: Optional[Text] = _pipeline_root,
        enable_cache: Optional[bool] = True,
        direct_num_workers: Optional[int] = 1) -> pipeline.Pipeline:
    """Build a simple TFX pipeline configured through RuntimeParameters.

    Args:
      pipeline_name: The name of the pipeline.
      pipeline_root: The root of the pipeline output.
      enable_cache: Whether to enable cache in this pipeline.
      direct_num_workers: Number of workers executing the underlying beam
        pipeline in the executors.

    Returns:
      A logical TFX pipeline.Pipeline object.
    """
    # --- Runtime parameters ----------------------------------------------
    # Path to the CSV data file, under which there should be a data.csv
    # file.
    data_root = data_types.RuntimeParameter(
        name='data-root', default='gs://my-bucket/data', ptype=Text)

    # Path to the transform module file.
    transform_module_file = data_types.RuntimeParameter(
        name='transform-module',
        default='gs://my-bucket/modules/transform_module.py',
        ptype=Text)

    # Path to the trainer module file.
    trainer_module_file = data_types.RuntimeParameter(
        name='trainer-module',
        default='gs://my-bucket/modules/trainer_module.py',
        ptype=Text)

    # Number of steps used for training.
    train_steps = data_types.RuntimeParameter(
        name='train-steps', default=10, ptype=int)

    # Number of steps used for evaluation.
    eval_steps = data_types.RuntimeParameter(
        name='eval-steps', default=5, ptype=int)

    # Column name for slicing.
    slicing_column = data_types.RuntimeParameter(
        name='slicing-column', default='trip_start_hour', ptype=Text)

    # --- Components ------------------------------------------------------
    # The input data location is parameterized by data_root.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs['schema']

    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # The module file used by Transform is parameterized by
    # transform_module_file.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=transform_module_file,
    )

    # The step counts are the RuntimeParameters named 'train-steps' and
    # 'eval-steps', respectively.
    trainer = Trainer(
        module_file=trainer_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_output=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
    )
    trained_model = trainer.outputs['model']

    # The name of the slicing column is specified as a RuntimeParameter.
    evaluator = Evaluator(
        examples=raw_examples,
        model=trained_model,
        feature_slicing_spec={
            'specs': [{'column_for_slicing': [slicing_column]}]
        },
    )
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # The serving directory is placed under the pipeline root parameter.
    blessing = model_validator.outputs['blessing']
    serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=blessing,
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_dir)),
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, model_validator, pusher,
    ]
    # TODO(b/142684737): The multi-processing API might change.
    beam_args = ['--direct_num_workers=%d' % direct_num_workers]

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
        beam_pipeline_args=beam_args,
    )

예제 #7

0

파일 보기

def _create_pipeline():
  """Build the Chicago taxi TFX components, with a Slack approval step.

  Returns:
    The list of components, in the order they were created.
  """
  # Ingestion: brings data into the pipeline or otherwise joins/converts
  # training data.
  example_gen = CsvExampleGen(input_base=csv_input(_data_root))
  raw_examples = example_gen.outputs.examples

  # Statistics over the data, for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=raw_examples)
  statistics = statistics_gen.outputs.output

  # Schema generated from the statistics files.
  schema_gen = SchemaGen(stats=statistics)
  schema = schema_gen.outputs.output

  # Anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(stats=statistics, schema=schema)

  # Transformations and feature engineering for training and serving.
  transform = Transform(
      input_data=raw_examples,
      schema=schema,
      module_file=_taxi_module_file)

  # Trains with the user-provided Python function that implements a model
  # using TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=schema,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  trained_model = trainer.outputs.output

  # TFMA evaluation statistics, sliced by 'trip_start_hour'.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=raw_examples,
      model_exports=trained_model,
      feature_slicing_spec=slicing_spec)

  # Quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # This custom component serves as a bridge between the pipeline and human
  # model reviewers, enabling a review-and-push workflow in the model
  # development cycle. It uses the Slack API to send a message with the model
  # URI info to a user-defined Slack channel and waits for a go / no-go
  # decision from the same channel:
  #   * To approve the model, reply to the thread started by SlackComponent's
  #     bot with 'lgtm' or 'approve'.
  #   * To reject the model, reply to the thread started by SlackComponent's
  #     bot with 'decline' or 'reject'.
  slack_validator = SlackComponent(
      model_export=trained_model,
      model_blessing=model_validator.outputs.blessing,
      slack_token=_slack_token,
      channel_id=_channel_id,
      timeout_sec=3600,
  )

  # Pushes the model to a file destination; the blessing consumed here is the
  # one emitted by the Slack component.
  push_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=_serving_model_dir))
  pusher = Pusher(
      model_export=trained_model,
      model_blessing=slack_validator.outputs.slack_blessing,
      push_destination=push_destination)

  components = [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      trainer, evaluator, model_validator, slack_validator, pusher,
  ]
  return components

예제 #8

0

파일 보기

파일: taxi_pipeline_kubeflow_gcp_bqml.py 프로젝트: ysjeon7/tfx

def _create_pipeline(
    pipeline_name: Text, pipeline_root: Text, query: Text, module_file: Text,
    beam_pipeline_args: List[Text], ai_platform_training_args: Dict[Text, Text],
    bigquery_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
  """Build the Chicago taxi pipeline with TFX and Kubeflow Pipelines.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root passed to the pipeline for its output.
    query: Query passed to BigQueryExampleGen.
    module_file: Module file handed to both Transform and Trainer.
    beam_pipeline_args: Beam arguments forwarded to the pipeline.
    ai_platform_training_args: Passed to the Trainer inside its custom config
      under the 'ai_platform_training_args' key.
    bigquery_serving_args: Passed to the Pusher inside its custom config
      under the 'bigquery_serving_args' key.

  Returns:
    The assembled pipeline.Pipeline object.
  """
  # Ingestion: brings data into the pipeline or otherwise joins/converts
  # training data.
  example_gen = BigQueryExampleGen(query=query)
  raw_examples = example_gen.outputs['examples']

  # Statistics over the data, for visualization and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  # Schema generated from the statistics files.
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
  schema = schema_gen.outputs['schema']

  # Anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Transformations and feature engineering for training and serving.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=module_file)

  # Trains with the user-provided Python function that implements a model
  # using TF-Learn, on Google Cloud AI Platform via a custom executor.
  trainer_executor = executor_spec.ExecutorClassSpec(
      ai_platform_trainer_executor.Executor)
  trainer = Trainer(
      custom_executor_spec=trainer_executor,
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
      custom_config={'ai_platform_training_args': ai_platform_training_args})
  trained_model = trainer.outputs['model']

  # TFMA evaluation statistics, sliced by 'trip_start_hour'.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=raw_examples,
      model=trained_model,
      feature_slicing_spec=slicing_spec)

  # Quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # Pushes the model to Google Cloud BigQuery ML, via a custom executor, if
  # it passed validation.
  pusher_executor = executor_spec.ExecutorClassSpec(
      bigquery_ml_pusher_executor.Executor)
  pusher = Pusher(
      custom_executor_spec=pusher_executor,
      model=trained_model,
      model_blessing=model_validator.outputs['blessing'],
      custom_config={'bigquery_serving_args': bigquery_serving_args})

  components = [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      trainer, evaluator, model_validator, pusher,
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      beam_pipeline_args=beam_pipeline_args,
  )

예제 #9

0

파일 보기

파일: taxi_pipeline_kubeflow.py 프로젝트: NunoEdgarGFlowHub/tfx

def _create_pipeline():
    """Build the Chicago taxi TFX pipeline targeted at Kubeflow Pipelines.

    All settings are read from module-level names (``_query``,
    ``_taxi_utils``, ``_ai_platform_training_args``,
    ``_ai_platform_serving_args``, ``_serving_model_dir``,
    ``_pipeline_root``, ``_project_id``, ``_output_bucket`` and
    ``_gcp_region``).

    Returns:
        The ``pipeline.Pipeline`` named ``chicago_taxi_pipeline_kubeflow``
        made of the nine components created below.
    """
    # Ingestion: examples are produced from the BigQuery query in `_query`.
    example_gen = BigQueryExampleGen(query=_query)
    raw_examples = example_gen.outputs.examples

    # Statistics over the ingested examples; consumed by schema generation
    # and by example validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    stats = statistics_gen.outputs.output

    # Schema generated from the statistics.
    infer_schema = SchemaGen(stats=stats)
    schema = infer_schema.outputs.output

    # Anomaly detection based on the statistics and the schema.
    validate_stats = ExampleValidator(stats=stats, schema=schema)

    # Transformations / feature engineering driven by the `_taxi_utils`
    # module file.
    transform = Transform(
        input_data=raw_examples, schema=schema, module_file=_taxi_utils)

    # Keyword arguments common to both Trainer variants below.
    shared_trainer_kwargs = dict(
        module_file=_taxi_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    try:
        from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
        # Custom AI Platform executor; per the original note this path
        # requires TFX >= 0.14.
        trainer = Trainer(
            executor_class=ai_platform_trainer_executor.Executor,
            custom_config={
                'ai_platform_training_args': _ai_platform_training_args
            },
            **shared_trainer_kwargs)
    except ImportError:
        # Extension module unavailable: fall back to the deprecated flag.
        trainer = Trainer(
            custom_config={'cmle_training_args': _ai_platform_training_args},
            **shared_trainer_kwargs)
    trained_model = trainer.outputs.output

    # TFMA evaluation, sliced on the `trip_start_hour` column.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push the model once it has been blessed by the validator.
    shared_pusher_kwargs = dict(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
    )
    try:
        from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
        # Deploy with the AI Platform pusher executor (TFX >= 0.14 per the
        # original note).
        pusher = Pusher(
            executor_class=ai_platform_pusher_executor.Executor,
            custom_config={
                'ai_platform_serving_args': _ai_platform_serving_args
            },
            **shared_pusher_kwargs)
    except ImportError:
        # Deprecated flag; this variant also sets a filesystem destination.
        serving_destination = pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=_serving_model_dir))
        pusher = Pusher(
            custom_config={'cmle_serving_args': _ai_platform_serving_args},
            push_destination=serving_destination,
            **shared_pusher_kwargs)

    # Beam flags forwarded through `additional_pipeline_args`.
    dataflow_args = [
        '--runner=DataflowRunner',
        '--experiments=shuffle_mode=auto',
        '--project=' + _project_id,
        '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
        '--region=' + _gcp_region,
    ]
    extra_pipeline_args = {
        'beam_pipeline_args': dataflow_args,
        # Optional args:
        # 'tfx_image': custom docker image to use for components.
        # This is needed if TFX package is not installed from an RC
        # or released version.
    }

    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_pipeline_kubeflow',
        pipeline_root=_pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args=extra_pipeline_args,
        log_root='/var/tmp/tfx/logs',
    )

예제 #10

0

파일 보기

def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Build the Iris flowers TFX pipeline.

    Args:
        pipeline_name: Forwarded as the pipeline's ``pipeline_name``.
        pipeline_root: Forwarded as the pipeline's ``pipeline_root``.
        data_root: Location handed to ``external_input`` to create the
            CsvExampleGen input.
        module_file: Module file given to both Transform and Trainer.
        serving_model_dir: Base directory of the Pusher's filesystem
            push destination.
        metadata_path: Path passed to
            ``metadata.sqlite_metadata_connection_config``.

    Returns:
        The assembled ``pipeline.Pipeline`` (caching enabled).
    """
    # Ingestion from the CSV data under `data_root`.
    example_gen = CsvExampleGen(input_base=external_input(data_root))
    raw_examples = example_gen.outputs.examples

    # Statistics over the examples, shared by SchemaGen and ExampleValidator.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    stats = statistics_gen.outputs.output

    # Schema generation (feature shapes are not inferred).
    infer_schema = SchemaGen(stats=stats, infer_feature_shape=False)
    schema = infer_schema.outputs.output

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(stats=stats, schema=schema)

    # Transformations / feature engineering from `module_file`.
    transform = Transform(
        input_data=raw_examples, schema=schema, module_file=module_file)

    # Model training using the user-provided `module_file`.
    training_steps = trainer_pb2.TrainArgs(num_steps=10000)
    evaluation_steps = trainer_pb2.EvalArgs(num_steps=5000)
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=training_steps,
        eval_args=evaluation_steps)

    # TFMA evaluation of the trained model.
    model_analyzer = Evaluator(
        examples=raw_examples, model_exports=trainer.outputs.output)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=raw_examples, model=trainer.outputs['output'])

    # Push to `serving_model_dir` once the validator has blessed the model.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        push_destination=serving_destination)

    sqlite_config = metadata.sqlite_metadata_connection_config(metadata_path)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=sqlite_config,
        additional_pipeline_args={},
    )

예제 #11

0

파일 보기

파일: parameterized_tfx_oss.py 프로젝트: ucdmkt/pipelines

def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
    taxi_module_file: Text, enable_cache: bool):
  """Builds a small Chicago Taxi TFX pipeline for Kubeflow.

  Args:
    pipeline_root: Forwarded as the pipeline's `pipeline_root`.
    csv_input_location: Location handed to `csv_input` to create the
      CsvExampleGen input.
    taxi_module_file: Module file given to both Transform and Trainer.
    enable_cache: Forwarded as the pipeline's `enable_cache`.

  Returns:
    The `pipeline.Pipeline` named `parameterized_tfx_oss`.
  """
  # Ingestion and data understanding.
  example_gen = CsvExampleGen(input_base=csv_input(csv_input_location))
  raw_examples = example_gen.outputs.examples

  statistics_gen = StatisticsGen(input_data=raw_examples)
  stats = statistics_gen.outputs.output

  infer_schema = SchemaGen(stats=stats, infer_feature_shape=False)
  schema = infer_schema.outputs.output

  validate_stats = ExampleValidator(stats=stats, schema=schema)

  # Feature engineering and (deliberately short) training.
  transform = Transform(
      input_data=raw_examples, schema=schema, module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=schema,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5))
  trained_model = trainer.outputs.output

  # Evaluation sliced on `trip_start_hour`, followed by model validation.
  hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model_exports=trained_model,
      feature_slicing_spec=hourly_slices)
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # Hack: ensuring push_destination can be correctly parameterized and
  # interpreted. The pipeline root will be specified as a dsl.PipelineParam
  # with the name pipeline-root, see:
  # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
  _pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
  serving_dir = os.path.join(str(_pipeline_root_param), 'model_serving')
  pusher = Pusher(
      model_export=trained_model,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)))

  return pipeline.Pipeline(
      pipeline_name='parameterized_tfx_oss',
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=enable_cache,
  )

예제 #12

0

파일 보기

def _create_pipeline(pipeline_name: Text,
                     pipeline_root: Text,
                     data_root: Text,
                     module_file: Text,
                     serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int = 1) -> pipeline.Pipeline:
    """Build the Chicago taxi TFX pipeline.

    Args:
        pipeline_name: Forwarded as the pipeline's ``pipeline_name``.
        pipeline_root: Forwarded as the pipeline's ``pipeline_root``.
        data_root: Location handed to ``external_input`` to create the
            CsvExampleGen input.
        module_file: Module file given to both Transform and Trainer.
        serving_model_dir: Base directory of the Pusher's filesystem
            push destination.
        metadata_path: Path passed to
            ``metadata.sqlite_metadata_connection_config``.
        direct_num_workers: Value substituted into the
            ``--direct_num_workers`` Beam flag.

    Returns:
        The assembled ``pipeline.Pipeline`` (caching enabled).
    """
    # Ingestion from the CSV data under `data_root`.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics over the examples, shared by SchemaGen and ExampleValidator.
    statistics_gen = StatisticsGen(examples=raw_examples)
    stats = statistics_gen.outputs['statistics']

    # Schema generation (feature shapes are not inferred).
    infer_schema = SchemaGen(statistics=stats, infer_feature_shape=False)
    schema = infer_schema.outputs['schema']

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(statistics=stats, schema=schema)

    # Transformations / feature engineering from `module_file`.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Model training using the user-provided `module_file`.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs['model']

    # TFMA evaluation, sliced on the `trip_start_hour` column.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to `serving_model_dir` once the validator has blessed the model.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=serving_destination)

    # Note that direct_num_workers != 1 will enable multi-process for TFX,
    # we hide the FnApiRunner[1] setting from user, but this is subject to
    # change if Beam offers pure flag setup.
    # [1]https://issues.apache.org/jira/browse/BEAM-3645
    beam_args = ['--direct_num_workers=%s' % direct_num_workers]

    sqlite_config = metadata.sqlite_metadata_connection_config(metadata_path)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=sqlite_config,
        beam_pipeline_args=beam_args,
        additional_pipeline_args={},
    )

예제 #13

0

파일 보기

파일: training_pipeline.py 프로젝트: swipswaps/zenml

    def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
        """
        Builds the training pipeline as a series of TFX components.

        The sequencer, evaluator and deployer stages are only added when
        their step keys are present in the configured steps. The trainer
        step's configuration is copied before the backend's custom config is
        merged into it, so this method does not write into `config`.

        Args:
            config: A ZenML configuration in dictionary format.

        Returns:
            A chronological list of TFX components making up the training
             pipeline.

        """
        steps = config[keys.GlobalKeys.PIPELINE][keys.PipelineKeys.STEPS]

        component_list = []

        ############
        # RAW DATA #
        ############
        data_config = steps[keys.TrainingSteps.DATA]
        data = DataGen(name=self.datasource.name,
                       source=data_config[keys.StepKeys.SOURCE],
                       source_args=data_config[keys.StepKeys.ARGS]).with_id(
                           GDPComponent.DataGen.name)

        statistics_data = StatisticsGen(
            examples=data.outputs.examples).with_id(
                GDPComponent.DataStatistics.name)

        schema_data = SchemaGen(
            statistics=statistics_data.outputs.output, ).with_id(
                GDPComponent.DataSchema.name)

        component_list.extend([data, statistics_data, schema_data])

        # `datapoints` / `schema` always refer to the most recent examples
        # and schema; later stages rebind them.
        datapoints = data.outputs.examples

        #############
        # SPLITTING #
        #############
        split_config = steps[keys.TrainingSteps.SPLIT]
        splits = SplitGen(
            input_examples=datapoints,
            source=split_config[keys.StepKeys.SOURCE],
            source_args=split_config[keys.StepKeys.ARGS],
            schema=schema_data.outputs.schema,
            statistics=statistics_data.outputs.output,
        ).with_id(GDPComponent.SplitGen.name)

        datapoints = splits.outputs.examples

        statistics_split = StatisticsGen(examples=datapoints).with_id(
            GDPComponent.SplitStatistics.name)

        schema_split = SchemaGen(
            statistics=statistics_split.outputs.output, ).with_id(
                GDPComponent.SplitSchema.name)

        schema = schema_split.outputs.schema

        component_list.extend([splits, statistics_split, schema_split])

        ##############
        # SEQUENCING #
        ##############
        # Optional stage: only built when a sequencer step is configured.
        if keys.TrainingSteps.SEQUENCER in steps:
            sequencer_config = steps[keys.TrainingSteps.SEQUENCER]
            sequencer = Sequencer(
                input_examples=datapoints,
                schema=schema,
                statistics=statistics_split.outputs.statistics,
                source=sequencer_config[keys.StepKeys.SOURCE],
                source_args=sequencer_config[keys.StepKeys.ARGS]).with_id(
                    GDPComponent.Sequencer.name)

            sequencer_statistics = StatisticsGen(
                examples=sequencer.outputs.output_examples).with_id(
                    GDPComponent.SequencerStatistics.name)

            sequencer_schema = SchemaGen(
                statistics=sequencer_statistics.outputs.output,
                infer_feature_shape=True,
            ).with_id(GDPComponent.SequencerSchema.name)

            # Downstream stages consume the sequenced examples and schema.
            datapoints = sequencer.outputs.output_examples
            schema = sequencer_schema.outputs.schema

            component_list.extend(
                [sequencer, sequencer_statistics, sequencer_schema])

        #################
        # PREPROCESSING #
        #################
        transform = Transform(
            preprocessing_fn=constants.PREPROCESSING_FN,
            examples=datapoints,
            schema=schema,
            custom_config=steps[keys.TrainingSteps.PREPROCESSER]).with_id(
                GDPComponent.Transform.name)

        component_list.append(transform)

        ############
        # TRAINING #
        ############
        training_backend: TrainingBaseBackend = \
            self.steps_dict[keys.TrainingSteps.TRAINER].backend

        # default to local
        if training_backend is None:
            training_backend = TrainingBaseBackend()

        training_kwargs = {
            'custom_executor_spec': training_backend.get_executor_spec(),
            # Shallow copy: the update() below must not write the backend's
            # settings back into the caller's `config`.
            'custom_config': dict(steps[keys.TrainingSteps.TRAINER]),
        }
        training_kwargs['custom_config'].update(
            training_backend.get_custom_config())

        trainer = Trainer(
            transformed_examples=transform.outputs.transformed_examples,
            transform_graph=transform.outputs.transform_graph,
            run_fn=constants.TRAINER_FN,
            schema=schema,
            train_args=trainer_pb2.TrainArgs(),
            eval_args=trainer_pb2.EvalArgs(),
            **training_kwargs).with_id(GDPComponent.Trainer.name)

        component_list.append(trainer)

        #############
        # EVALUATOR #
        #############
        # Optional stage: only built when an evaluator step is configured.
        if keys.TrainingSteps.EVALUATOR in steps:
            from zenml.utils import source_utils
            # Split the dotted EVALUATOR_MODULE_FN into its parent module
            # path and its last component, then assemble a `.py` file path
            # that is handed to the Evaluator as `module_file`.
            eval_module = '.'.join(
                constants.EVALUATOR_MODULE_FN.split('.')[:-1])
            eval_module_file = constants.EVALUATOR_MODULE_FN.split('.')[-1]
            abs_path = source_utils.get_absolute_path_from_module(eval_module)
            custom_extractor_path = os.path.join(abs_path,
                                                 eval_module_file) + '.py'
            eval_step: TFMAEvaluator = TFMAEvaluator.from_config(
                steps[keys.TrainingSteps.EVALUATOR])
            eval_config = eval_step.build_eval_config()
            evaluator = Evaluator(
                examples=transform.outputs.transformed_examples,
                model=trainer.outputs.model,
                eval_config=eval_config,
                module_file=custom_extractor_path,
            ).with_id(GDPComponent.Evaluator.name)
            component_list.append(evaluator)

        ###########
        # SERVING #
        ###########
        # Optional stage: only built when a deployer step is configured.
        if keys.TrainingSteps.DEPLOYER in steps:
            deployer: BaseDeployerStep = \
                self.steps_dict[keys.TrainingSteps.DEPLOYER]
            pusher_config = deployer._build_pusher_args()
            pusher_executor_spec = deployer._get_executor_spec()
            pusher = Pusher(model_export=trainer.outputs.output,
                            custom_executor_spec=pusher_executor_spec,
                            **pusher_config).with_id(
                                GDPComponent.Deployer.name)

            component_list.append(pusher)

        return component_list

예제 #14

0

파일 보기

def _create_pipeline():
  """Builds the component list of the Chicago taxi TFX pipeline.

  Settings are read from module-level names (`_max_int64`,
  `_query_sample_rate`, `_taxi_utils`, `_ai_platform_training_args`,
  `_ai_platform_serving_args` and `_serving_model_dir`).

  Returns:
    A list with the nine components created below, in creation order.
  """
  # The two placeholders are filled from the module-level `_max_int64` and
  # `_query_sample_rate`.
  query_template = """
          SELECT
            pickup_community_area,
            fare,
            EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
            EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
            EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
            UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
            pickup_latitude,
            pickup_longitude,
            dropoff_latitude,
            dropoff_longitude,
            trip_miles,
            pickup_census_tract,
            dropoff_census_tract,
            payment_type,
            company,
            trip_seconds,
            dropoff_community_area,
            tips
          FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
          WHERE (ABS(FARM_FINGERPRINT(unique_key)) / {max_int64})
            < {query_sample_rate}"""
  query = query_template.format(
      max_int64=_max_int64, query_sample_rate=_query_sample_rate)

  # Ingestion: examples are produced from the BigQuery query above.
  example_gen = BigQueryExampleGen(query=query)
  raw_examples = example_gen.outputs.examples

  # Statistics over the examples, shared by SchemaGen and ExampleValidator.
  statistics_gen = StatisticsGen(input_data=raw_examples)
  stats = statistics_gen.outputs.output

  # Schema generated from the statistics.
  infer_schema = SchemaGen(stats=stats)
  schema = infer_schema.outputs.output

  # Anomaly detection based on statistics and schema.
  validate_stats = ExampleValidator(stats=stats, schema=schema)

  # Transformations / feature engineering driven by `_taxi_utils`.
  transform = Transform(
      input_data=raw_examples, schema=schema, module_file=_taxi_utils)

  # Keyword arguments common to both Trainer variants below.
  shared_trainer_kwargs = dict(
      module_file=_taxi_utils,
      transformed_examples=transform.outputs.transformed_examples,
      schema=schema,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
  )
  try:
    from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
    # Custom AI Platform executor; per the original note this path requires
    # TFX >= 0.14.
    trainer = Trainer(
        executor_class=ai_platform_trainer_executor.Executor,
        custom_config={'ai_platform_training_args': _ai_platform_training_args},
        **shared_trainer_kwargs)
  except ImportError:
    # Extension module unavailable: fall back to the deprecated flag.
    trainer = Trainer(
        custom_config={'cmle_training_args': _ai_platform_training_args},
        **shared_trainer_kwargs)
  trained_model = trainer.outputs.output

  # TFMA evaluation, sliced on the `trip_start_hour` column.
  hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model_exports=trained_model,
      feature_slicing_spec=hourly_slices)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # Both Pusher variants share the model, the blessing and the filesystem
  # destination; only the executor and the custom_config key differ.
  shared_pusher_kwargs = dict(
      model_export=trained_model,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)),
  )
  try:
    from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
    # Deploy with the AI Platform pusher executor (TFX >= 0.14 per the
    # original note).
    pusher = Pusher(
        executor_class=ai_platform_pusher_executor.Executor,
        custom_config={'ai_platform_serving_args': _ai_platform_serving_args},
        **shared_pusher_kwargs)
  except ImportError:
    # Extension module unavailable: fall back to the deprecated flag.
    pusher = Pusher(
        custom_config={'cmle_serving_args': _ai_platform_serving_args},
        **shared_pusher_kwargs)

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]

예제 #15

0

파일 보기

    def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
        """
        Builds the NLP pipeline as a series of TFX components.

        The trainer step's configuration is copied before the backend's
        custom config is merged into it, so this method does not write into
        `config`.

        Args:
            config: A ZenML configuration in dictionary format.

        Returns:
            A list of TFX components making up the NLP pipeline, in the
             order: tokenizer, data, data statistics, data schema, splits,
             trainer.

        """
        steps = config[keys.GlobalKeys.PIPELINE][keys.PipelineKeys.STEPS]

        component_list = []

        ############
        # RAW DATA #
        ############
        data_config = steps[keys.NLPSteps.DATA]
        data = DataGen(name=self.datasource.name,
                       source=data_config[keys.StepKeys.SOURCE],
                       source_args=data_config[keys.StepKeys.ARGS]).with_id(
                           GDPComponent.DataGen.name)

        #############
        # TOKENIZER #
        #############
        tokenizer_config = steps[keys.NLPSteps.TOKENIZER]
        tokenizer = Tokenizer(
            source=tokenizer_config[keys.StepKeys.SOURCE],
            source_args=tokenizer_config[keys.StepKeys.ARGS],
            examples=data.outputs.examples,
        ).with_id(GDPComponent.Tokenizer.name)

        # NOTE: the tokenizer is added to the list before `data` (which is
        # appended further down); this ordering is kept from the original.
        component_list.append(tokenizer)

        # Statistics and schema are computed on the tokenizer's output
        # examples, not on the raw data.
        statistics_data = StatisticsGen(
            examples=tokenizer.outputs.output_examples).with_id(
                GDPComponent.DataStatistics.name)

        schema_data = SchemaGen(
            statistics=statistics_data.outputs.output,
            infer_feature_shape=True,
        ).with_id(GDPComponent.DataSchema.name)

        split_config = steps[keys.NLPSteps.SPLIT]
        splits = SplitGen(
            input_examples=tokenizer.outputs.output_examples,
            source=split_config[keys.StepKeys.SOURCE],
            source_args=split_config[keys.StepKeys.ARGS],
            schema=schema_data.outputs.schema,
            statistics=statistics_data.outputs.output,
        ).with_id(GDPComponent.SplitGen.name)

        component_list.extend([data, statistics_data, schema_data, splits])

        ############
        # TRAINING #
        ############
        training_backend: Optional[TrainingBaseBackend] = \
            self.steps_dict[keys.NLPSteps.TRAINER].backend

        # default to local
        if training_backend is None:
            training_backend = TrainingBaseBackend()

        training_kwargs = {
            'custom_executor_spec': training_backend.get_executor_spec(),
            # Shallow copy: the update() below must not write the backend's
            # settings back into the caller's `config`.
            'custom_config': dict(steps[keys.NLPSteps.TRAINER]),
        }
        training_kwargs['custom_config'].update(
            training_backend.get_custom_config())

        trainer = Trainer(examples=splits.outputs.examples,
                          run_fn=constants.TRAINER_FN,
                          schema=schema_data.outputs.schema,
                          train_args=trainer_pb2.TrainArgs(),
                          eval_args=trainer_pb2.EvalArgs(),
                          **training_kwargs).with_id(GDPComponent.Trainer.name)

        component_list.append(trainer)

        return component_list

예제 #16

0

파일 보기

def _create__pipeline(pipeline_name: Text, pipeline_root: Text,
                      data_root: Text, module_file: Text,
                      ai_platform_training_args: Dict[Text, Text],
                      ai_platform_serving_args: Dict[Text, Text],
                      beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Assemble the online news pipeline with TFX.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root forwarded to ``pipeline.Pipeline``.
        data_root: Location handed to ``external_input`` for the CSV data.
        module_file: Module file given to both Transform and Trainer.
        ai_platform_training_args: Placed in the Trainer's custom config
            under the ``'ai_platform_training_args'`` key.
        ai_platform_serving_args: Placed in the Pusher's custom config
            under the ``'ai_platform_serving_args'`` key.
        beam_pipeline_args: Forwarded to ``pipeline.Pipeline``.

    Returns:
        A ``pipeline.Pipeline`` containing the nine components built here.
    """
    # Ingestion: bring the CSV data into the pipeline.
    csv_source = external_input(data_root)
    ingest = CsvExampleGen(input_base=csv_source)
    raw_examples = ingest.outputs.examples

    # Statistics over the data, used for schema inference and validation.
    stats = StatisticsGen(input_data=raw_examples)
    stats_channel = stats.outputs.output

    # Schema generated from the statistics.
    schema_gen = SchemaGen(stats=stats_channel)
    schema_channel = schema_gen.outputs.output

    # Anomaly detection based on statistics and schema.
    validator = ExampleValidator(stats=stats_channel, schema=schema_channel)

    # Transformations / feature engineering shared by training and serving.
    transform_step = Transform(input_data=raw_examples,
                               schema=schema_channel,
                               module_file=module_file)

    # Training runs through the AI Platform trainer executor, using the
    # model code from the user-provided module file.
    training_executor = executor_spec.ExecutorClassSpec(
        ai_platform_trainer_executor.Executor)
    trainer_step = Trainer(
        custom_executor_spec=training_executor,
        module_file=module_file,
        transformed_examples=transform_step.outputs.transformed_examples,
        schema=schema_channel,
        transform_output=transform_step.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'ai_platform_training_args': ai_platform_training_args})
    trained_model = trainer_step.outputs.output

    # Evaluation statistics, sliced along the 'weekday' column.
    weekday_slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=['weekday'])
    ])
    evaluator_step = Evaluator(examples=raw_examples,
                               model_exports=trained_model,
                               feature_slicing_spec=weekday_slicing)

    # Quality validation of the candidate model (compared to a baseline).
    model_check = ModelValidator(examples=raw_examples, model=trained_model)

    # Pushes the model through the AI Platform pusher executor once it has
    # been blessed by the validator.
    serving_executor = executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor)
    pusher_step = Pusher(
        custom_executor_spec=serving_executor,
        model_export=trained_model,
        model_blessing=model_check.outputs.blessing,
        custom_config={'ai_platform_serving_args': ai_platform_serving_args})

    # Note: enable_cache is not passed to the Pipeline here.
    ordered_components = [
        ingest, stats, schema_gen, validator, transform_step, trainer_step,
        evaluator_step, model_check, pusher_step
    ]
    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=ordered_components,
                             beam_pipeline_args=beam_pipeline_args)

예제 #17

0

파일 보기

  def testAIPlatformTrainerPipeline(self):
    """Runs a Trainer-only pipeline on AI Platform Training and checks it.

    The pipeline consists of the three importer nodes held on the test case
    plus a single Trainer using the AI Platform trainer executor. After the
    run, the test asserts that exactly one Trainer execution exists and that
    it produced exactly one eval model and one serving model export.
    """
    pipeline_name = 'kubeflow-aip-trainer-test-{}'.format(self._random_id())

    # Training job arguments. A CUSTOM scale tier with a parameter server
    # and two workers is requested to test that distributed training
    # behaves properly.
    training_args = {
        'project': self._gcp_project_id,
        'region': self._gcp_region,
        'jobDir': os.path.join(self._pipeline_root(pipeline_name), 'tmp'),
        'masterConfig': {
            'imageUri': self._container_image,
        },
        'scaleTier': 'CUSTOM',
        'masterType': 'large_model',
        'parameterServerType': 'standard',
        'parameterServerCount': 1,
        'workerType': 'standard',
        'workerCount': 2,
    }

    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.Executor),
        module_file=self._trainer_module,
        transformed_examples=self.transformed_examples_importer
        .outputs['result'],
        schema=self.schema_importer.outputs['result'],
        transform_graph=self.transform_graph_importer.outputs['result'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY: training_args
        })

    components = [
        self.schema_importer,
        self.transformed_examples_importer,
        self.transform_graph_importer,
        trainer,
    ]
    self._compile_and_run_pipeline(
        self._create_pipeline(pipeline_name, components))

    # There must be only one execution of Trainer.
    model_root = os.path.join(
        self._pipeline_root(pipeline_name), 'Trainer', 'model')
    executions = tf.io.gfile.listdir(model_root)
    self.assertEqual(1, len(executions))

    # There must be only one saved model each for serving and eval.
    model_uri = os.path.join(model_root, executions[0])
    eval_models = tf.io.gfile.listdir(path_utils.eval_model_dir(model_uri))
    self.assertEqual(1, len(eval_models))

    serving_export_dir = os.path.join(
        path_utils.serving_model_dir(model_uri), 'export', 'chicago-taxi')
    serving_models = tf.io.gfile.listdir(serving_export_dir)
    self.assertEqual(1, len(serving_models))

예제 #18

0

파일 보기

def _create_test_pipeline(pipeline_name: Text, pipeline_root: Text,
                          csv_input_location: Text, taxi_module_file: Text,
                          container_image: Text):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline for testing.

    Args:
        pipeline_name: The name of the pipeline.
        pipeline_root: The root of the pipeline output. The Pusher writes
            to the ``model_serving`` directory beneath it.
        csv_input_location: The location of the input data directory.
        taxi_module_file: The location of the module file for
            Transform/Trainer.
        container_image: The container image to use; passed as
            ``tfx_image`` in the additional pipeline args.

    Returns:
        A logical TFX pipeline.Pipeline object.
    """
    csv_source = dsl_utils.csv_input(csv_input_location)
    ingest = CsvExampleGen(input_base=csv_source)
    raw_examples = ingest.outputs.examples

    stats = StatisticsGen(input_data=raw_examples)
    stats_channel = stats.outputs.output

    schema_gen = SchemaGen(stats=stats_channel)
    schema_channel = schema_gen.outputs.output

    validator = ExampleValidator(stats=stats_channel, schema=schema_channel)

    transform_step = Transform(input_data=raw_examples,
                               schema=schema_channel,
                               module_file=taxi_module_file)

    trainer_step = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform_step.outputs.transformed_examples,
        schema=schema_channel,
        transform_output=transform_step.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer_step.outputs.output

    # Evaluation is sliced along the trip_start_hour column.
    hourly_slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    evaluator_step = Evaluator(examples=raw_examples,
                               model_exports=trained_model,
                               feature_slicing_spec=hourly_slicing)

    model_check = ModelValidator(examples=raw_examples, model=trained_model)

    # Blessed models are pushed to a filesystem directory under the
    # pipeline root.
    serving_dir = os.path.join(pipeline_root, 'model_serving')
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_dir))
    pusher_step = Pusher(model_export=trained_model,
                         model_blessing=model_check.outputs.blessing,
                         push_destination=destination)

    ordered_components = [
        ingest, stats, schema_gen, validator, transform_step, trainer_step,
        evaluator_step, model_check, pusher_step
    ]
    return tfx_pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=ordered_components,
        log_root='/var/tmp/tfx/logs',
        additional_pipeline_args={
            'tfx_image': container_image,
        },
    )

예제 #19

0

파일 보기

파일: pipeline.py 프로젝트: Jwuthri/tfx

def create_pipeline(pipeline_name, pipeline_root, input_path,
                    tf_transform_file, tf_trainer_file, serving_model_basedir,
                    **kwargs):
    """Build a TFX pipeline that imports gzipped TFRecord examples.

    Args:
        pipeline_name: Name forwarded to ``Pipeline``.
        pipeline_root: Root forwarded to ``Pipeline``.
        input_path: Location handed to ``tfrecord_input``.
        tf_transform_file: Module file used by the Transform component.
        tf_trainer_file: Module file used by the Trainer component.
        serving_model_basedir: Filesystem directory the Pusher writes to.
        **kwargs: Extra keyword arguments forwarded to ``Pipeline``.

    Returns:
        The ``Pipeline`` with its ``components`` attribute set to the nine
        components built here.
    """
    tfrecord_source = tfrecord_input(input_path)

    # TODO: add the input pattern as an airflow var.
    tfrecord_split = example_gen_pb2.Input.Split(
        name='tfrecord', pattern='data_tfrecord-*.gz')
    input_config = example_gen_pb2.Input(splits=[tfrecord_split])

    # Train/eval split uses hash buckets in a 2:1 ratio.
    # TODO: add the hash bucket counts as airflow vars.
    train_split = example_gen_pb2.SplitConfig.Split(
        name='train', hash_buckets=2)
    eval_split = example_gen_pb2.SplitConfig.Split(
        name='eval', hash_buckets=1)
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(
            splits=[train_split, eval_split]))

    ingest = ImportExampleGen(input_base=tfrecord_source,
                              input_config=input_config,
                              output_config=output_config)
    raw_examples = ingest.outputs.examples

    stats = StatisticsGen(input_data=raw_examples)
    stats_channel = stats.outputs.output

    schema_gen = SchemaGen(stats=stats_channel)
    schema_channel = schema_gen.outputs.output

    validator = ExampleValidator(stats=stats_channel, schema=schema_channel)

    transform_step = Transform(input_data=raw_examples,
                               schema=schema_channel,
                               module_file=tf_transform_file)

    trainer_step = Trainer(
        module_file=tf_trainer_file,
        transformed_examples=transform_step.outputs.transformed_examples,
        schema=schema_channel,
        transform_output=transform_step.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer_step.outputs.output

    # TODO: add your slicing column; the column list is currently empty.
    slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=[])
    ])
    evaluator_step = Evaluator(examples=raw_examples,
                               model_exports=trained_model,
                               feature_slicing_spec=slicing)

    model_check = ModelValidator(examples=raw_examples, model=trained_model)

    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_basedir))
    pusher_step = Pusher(model_export=trained_model,
                         model_blessing=model_check.outputs.blessing,
                         push_destination=destination)

    # The components are attached after construction rather than passed to
    # the Pipeline constructor.
    assembled = Pipeline(pipeline_name=pipeline_name,
                         pipeline_root=pipeline_root,
                         **kwargs)
    assembled.components = [
        ingest, stats, schema_gen, validator, transform_step, trainer_step,
        evaluator_step, model_check, pusher_step
    ]
    return assembled

예제 #20

0

파일 보기

파일: taxi_pipeline_kubeflow.py 프로젝트: robertlugg/tfx

def _create_pipeline(
        pipeline_name: Text, pipeline_root: Text, query: Text,
        module_file: Text, serving_model_dir: Text,
        beam_pipeline_args: List[Text],
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

    The Trainer and Pusher use the AI Platform custom executors when the
    ``tfx.extensions.google_cloud_ai_platform`` modules can be imported, and
    fall back to the deprecated ``cmle_*`` custom-config flags otherwise.
    Only a failure of the import itself selects the fallback; an
    ``ImportError`` raised while constructing a component propagates.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root forwarded to ``pipeline.Pipeline``.
        query: Query handed to ``BigQueryExampleGen``.
        module_file: Module file given to both Transform and Trainer.
        serving_model_dir: Filesystem push destination; only used by the
            deprecated Pusher fallback.
        beam_pipeline_args: Passed to the pipeline under the
            ``'beam_pipeline_args'`` key of ``additional_pipeline_args``.
        ai_platform_training_args: Training arguments placed in the
            Trainer's custom config.
        ai_platform_serving_args: Serving arguments placed in the Pusher's
            custom config.

    Returns:
        A ``pipeline.Pipeline`` containing the nine components built here.
    """
    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output,
                             infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=module_file)

    # Arguments shared by both Trainer variants below.
    trainer_kwargs = {
        'module_file': module_file,
        'transformed_examples': transform.outputs.transformed_examples,
        'schema': infer_schema.outputs.output,
        'transform_output': transform.outputs.transform_output,
        'train_args': trainer_pb2.TrainArgs(num_steps=10000),
        'eval_args': trainer_pb2.EvalArgs(num_steps=5000),
    }

    # Uses user-provided Python function that implements a model using TF-Learn
    # to train a model on Google Cloud AI Platform. The try body holds only
    # the optional import so that errors raised while building the component
    # are not mistaken for a missing extension.
    try:
        from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
    except ImportError:
        # Train using a deprecated flag.
        trainer = Trainer(
            custom_config={'cmle_training_args': ai_platform_training_args},
            **trainer_kwargs)
    else:
        # Train using a custom executor. This requires TFX >= 0.14.
        trainer = Trainer(
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                ai_platform_trainer_executor.Executor),
            custom_config={
                'ai_platform_training_args': ai_platform_training_args
            },
            **trainer_kwargs)

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)

    # Arguments shared by both Pusher variants below.
    pusher_kwargs = {
        'model_export': trainer.outputs.output,
        'model_blessing': model_validator.outputs.blessing,
    }

    # Checks whether the model passed the validation steps and pushes the model
    # to a destination if check passed. As above, only the import is guarded.
    try:
        from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
    except ImportError:
        # Deploy the model on Google Cloud AI Platform, using a deprecated flag.
        pusher = Pusher(
            custom_config={'cmle_serving_args': ai_platform_serving_args},
            push_destination=pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory=serving_model_dir)),
            **pusher_kwargs)
    else:
        # Deploy the model on Google Cloud AI Platform. This requires TFX >=0.14.
        pusher = Pusher(
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                ai_platform_pusher_executor.Executor),
            custom_config={
                'ai_platform_serving_args': ai_platform_serving_args
            },
            **pusher_kwargs)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'beam_pipeline_args': beam_pipeline_args,
        },
        log_root='/var/tmp/tfx/logs',
    )

예제 #21

0

파일 보기

def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

    Args:
        pipeline_root: The root of the pipeline output.
        csv_input_location: The location of the input data directory.
        taxi_module_file: The location of the module file for
            Transform/Trainer.
        enable_cache: Whether to enable cache or not.

    Returns:
        A logical TFX pipeline.Pipeline object.
    """
    csv_source = external_input(csv_input_location)
    ingest = CsvExampleGen(input=csv_source)
    raw_examples = ingest.outputs['examples']

    stats = StatisticsGen(examples=raw_examples)
    stats_channel = stats.outputs['statistics']

    schema_gen = SchemaGen(
        statistics=stats_channel,
        infer_feature_shape=False,
    )
    schema_channel = schema_gen.outputs['schema']

    validator = ExampleValidator(
        statistics=stats_channel,
        schema=schema_channel,
    )

    transform_step = Transform(
        examples=raw_examples,
        schema=schema_channel,
        module_file=taxi_module_file,
    )

    trainer_step = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform_step.outputs['transformed_examples'],
        schema=schema_channel,
        transform_graph=transform_step.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
    )
    trained_model = trainer_step.outputs['model']

    # --- TFMA config for model evaluation and validation ---

    # Validation thresholds for metrics saved with the model are keyed by
    # metric name in the thresholds map.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.5}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))

    # The metrics listed here are in addition to those saved with the model
    # (assuming either a keras model or EvalSavedModel is used). Any metrics
    # added into the saved model (for example using
    # model.compile(..., metrics=[...]), etc) will be computed automatically.
    metrics_spec = tfma.MetricsSpec(
        metrics=[tfma.MetricConfig(class_name='ExampleCount')],
        thresholds={'binary_accuracy': accuracy_threshold})

    # Using signature 'eval' implies the use of an EvalSavedModel. To use a
    # serving model, remove the signature so it defaults to 'serving_default'
    # and add a label_key.
    model_spec = tfma.ModelSpec(signature_name='eval')

    slicing = [
        # An empty slice spec means the overall slice, i.e. the whole dataset.
        tfma.SlicingSpec(),
        # Data can be sliced along a feature column; here trip_start_hour.
        tfma.SlicingSpec(feature_keys=['trip_start_hour']),
    ]

    eval_config = tfma.EvalConfig(model_specs=[model_spec],
                                  metrics_specs=[metrics_spec],
                                  slicing_specs=slicing)

    evaluator_step = Evaluator(
        examples=raw_examples,
        model=trained_model,
        eval_config=eval_config,
    )

    # The push destination is built from the string form of
    # pipeline.ROOT_PARAMETER rather than from the pipeline_root argument.
    serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
    pusher_step = Pusher(
        model=trained_model,
        model_blessing=evaluator_step.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_dir)),
    )

    ordered_components = [
        ingest, stats, schema_gen, validator, transform_step, trainer_step,
        evaluator_step, pusher_step
    ]
    return pipeline.Pipeline(
        pipeline_name='parameterized_tfx_oss',
        pipeline_root=pipeline_root,
        components=ordered_components,
        enable_cache=enable_cache,
    )

예제 #22

0

파일 보기

def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Assemble a CSV-based TFX pipeline trained with ``GenericExecutor``.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root forwarded to ``pipeline.Pipeline``.
        data_root: Location handed to ``external_input`` for the CSV data.
        module_file: Module file given to both Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        direct_num_workers: Value formatted into the
            ``--direct_num_workers`` Beam pipeline argument.

    Returns:
        A ``pipeline.Pipeline`` with caching enabled, containing the nine
        nodes built here.
    """
    csv_source = external_input(data_root)
    ingest = CsvExampleGen(input=csv_source)
    raw_examples = ingest.outputs['examples']

    stats = StatisticsGen(examples=raw_examples)
    stats_channel = stats.outputs['statistics']

    schema_gen = SchemaGen(statistics=stats_channel,
                           infer_feature_shape=True)
    schema_channel = schema_gen.outputs['schema']

    validator = ExampleValidator(statistics=stats_channel,
                                 schema=schema_channel)

    transform_step = Transform(examples=raw_examples,
                               schema=schema_channel,
                               module_file=module_file)

    trainer_step = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        transformed_examples=transform_step.outputs['transformed_examples'],
        schema=schema_channel,
        transform_graph=transform_step.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500))
    trained_model = trainer_step.outputs['model']

    # Resolver node whose model output serves as the Evaluator's baseline.
    resolver_cls = latest_blessed_model_resolver.LatestBlessedModelResolver
    baseline_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=resolver_cls,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Threshold on sparse_categorical_accuracy: an absolute lower bound plus
    # a change threshold where higher is better.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.9}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='species')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(thresholds={
                'sparse_categorical_accuracy': accuracy_threshold
            })
        ])

    evaluator_step = Evaluator(
        examples=raw_examples,
        model=trained_model,
        baseline_model=baseline_resolver.outputs['model'],
        eval_config=eval_config)

    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher_step = Pusher(model=trained_model,
                         model_blessing=evaluator_step.outputs['blessing'],
                         push_destination=destination)

    ordered_components = [
        ingest, stats, schema_gen, validator, transform_step, trainer_step,
        baseline_resolver, evaluator_step, pusher_step
    ]
    beam_args = ['--direct_num_workers=%d' % direct_num_workers]
    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=ordered_components,
                             beam_pipeline_args=beam_args,
                             enable_cache=True)

예제 #23

0

파일 보기

def _create_pipeline():
    """Assemble the chicago taxi pipeline with TFX.

    Takes no arguments; data location, module file, serving directory,
    pipeline root, metadata DB root and logger overrides are read from
    module-level names (``_data_root``, ``_taxi_module_file``,
    ``_serving_model_dir``, ``_pipeline_root``, ``_metadata_db_root``,
    ``logger_overrides``).

    Returns:
        A ``pipeline.Pipeline`` named ``'taxi_solution'`` with caching
        enabled, containing the nine components built here.
    """
    # Ingestion: bring the CSV data into the pipeline.
    csv_source = csv_input(_data_root)
    ingest = CsvExampleGen(input_base=csv_source)
    raw_examples = ingest.outputs.examples

    # Statistics over the data, used for schema inference and validation.
    stats = StatisticsGen(input_data=raw_examples)
    stats_channel = stats.outputs.output

    # Schema generated from the statistics.
    schema_gen = SchemaGen(stats=stats_channel)
    schema_channel = schema_gen.outputs.output

    # Anomaly detection based on statistics and schema.
    validator = ExampleValidator(stats=stats_channel, schema=schema_channel)

    # Transformations / feature engineering shared by training and serving.
    transform_step = Transform(input_data=raw_examples,
                               schema=schema_channel,
                               module_file=_taxi_module_file)

    # Trains using the model code from the user-provided module file.
    trainer_step = Trainer(
        module_file=_taxi_module_file,
        transformed_examples=transform_step.outputs.transformed_examples,
        schema=schema_channel,
        transform_output=transform_step.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer_step.outputs.output

    # Evaluation statistics, sliced along the trip_start_hour column.
    hourly_slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    evaluator_step = Evaluator(examples=raw_examples,
                               model_exports=trained_model,
                               feature_slicing_spec=hourly_slicing)

    # Quality validation of the candidate model (compared to a baseline).
    model_check = ModelValidator(examples=raw_examples, model=trained_model)

    # Pushes the model to a file destination once it has been blessed.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher_step = Pusher(model_export=trained_model,
                         model_blessing=model_check.outputs.blessing,
                         push_destination=destination)

    ordered_components = [
        ingest, stats, schema_gen, validator, transform_step, trainer_step,
        evaluator_step, model_check, pusher_step
    ]
    return pipeline.Pipeline(
        pipeline_name='taxi_solution',
        pipeline_root=_pipeline_root,
        components=ordered_components,
        enable_cache=True,
        metadata_db_root=_metadata_db_root,
        additional_pipeline_args={'logger_args': logger_overrides},
    )

예제 #24

0

파일 보기

파일: tfx-kfp.py 프로젝트: rakesh283343/kfp_notebook_example

def _create_pipeline(
        pipeline_name: Text, pipeline_root: Text, query: Text,
        module_file: Text, beam_pipeline_args: List[Text],
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
    """Assemble the chicago taxi pipeline with TFX and Kubeflow Pipelines.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root forwarded to ``pipeline.Pipeline``.
        query: Query handed to ``BigQueryExampleGen``.
        module_file: Module file given to both Transform and Trainer.
        beam_pipeline_args: Forwarded to ``pipeline.Pipeline``.
        ai_platform_training_args: Placed in the Trainer's custom config
            under ``ai_platform_trainer_executor.TRAINING_ARGS_KEY``.
        ai_platform_serving_args: Placed in the Pusher's custom config
            under ``ai_platform_pusher_executor.SERVING_ARGS_KEY``.

    Returns:
        A ``pipeline.Pipeline`` containing the nine nodes built here.
    """
    # Ingestion: bring the query results into the pipeline.
    ingest = BigQueryExampleGen(query=query)
    raw_examples = ingest.outputs['examples']

    # Statistics over the data, used for schema inference and validation.
    stats = StatisticsGen(examples=raw_examples)
    stats_channel = stats.outputs['statistics']

    # Schema generated from the statistics.
    schema_step = SchemaGen(statistics=stats_channel,
                            infer_feature_shape=False)
    schema_channel = schema_step.outputs['schema']

    # Anomaly detection based on statistics and schema.
    validator = ExampleValidator(statistics=stats_channel,
                                 schema=schema_channel)

    # Transformations / feature engineering shared by training and serving.
    transform_step = Transform(examples=raw_examples,
                               schema=schema_channel,
                               module_file=module_file)

    # Training runs through the AI Platform trainer executor, using the
    # model code from the user-provided module file.
    training_executor = executor_spec.ExecutorClassSpec(
        ai_platform_trainer_executor.Executor)
    trainer_step = Trainer(
        custom_executor_spec=training_executor,
        module_file=module_file,
        transformed_examples=transform_step.outputs['transformed_examples'],
        schema=schema_channel,
        transform_graph=transform_step.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY:
            ai_platform_training_args
        })
    trained_model = trainer_step.outputs['model']

    # Get the latest blessed model for model validation.
    resolver_cls = latest_blessed_model_resolver.LatestBlessedModelResolver
    baseline_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=resolver_cls,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # TFMA computes evaluation statistics over features of the model and
    # performs quality validation of the candidate against the baseline.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={'binary_accuracy': accuracy_threshold})
        ])

    # Change threshold will be ignored if there is no baseline (first run).
    evaluator_step = Evaluator(
        examples=raw_examples,
        model=trained_model,
        baseline_model=baseline_resolver.outputs['model'],
        eval_config=eval_config)

    # Pushes the model through the AI Platform pusher executor once it has
    # been blessed by the evaluator.
    serving_executor = executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor)
    pusher_step = Pusher(
        custom_executor_spec=serving_executor,
        model=trained_model,
        model_blessing=evaluator_step.outputs['blessing'],
        custom_config={
            ai_platform_pusher_executor.SERVING_ARGS_KEY:
            ai_platform_serving_args
        })

    ordered_components = [
        ingest, stats, schema_step, validator, transform_step, trainer_step,
        baseline_resolver, evaluator_step, pusher_step
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=ordered_components,
        beam_pipeline_args=beam_pipeline_args,
    )

예제 #25

0

파일 보기

def _create_pipeline():
    """Assemble the component list for the Chicago taxi TFX pipeline.

    Reads the module-level settings ``_query_sample_rate``, ``_taxi_utils``,
    ``_cmle_training_args``, ``_cmle_serving_args`` and
    ``_serving_model_dir``.

    Returns:
        A list holding, in order: the example generator, statistics
        generator, schema generator, example validator, transform, trainer,
        evaluator, model validator and pusher components.
    """
    # Query template; the sample rate is substituted into the WHERE clause.
    query_template = """
          SELECT
            pickup_community_area,
            fare,
            EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
            EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
            EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
            UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
            pickup_latitude,
            pickup_longitude,
            dropoff_latitude,
            dropoff_longitude,
            trip_miles,
            pickup_census_tract,
            dropoff_census_tract,
            payment_type,
            company,
            trip_seconds,
            dropoff_community_area,
            tips
          FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
          WHERE RAND() < {}"""
    sampled_query = query_template.format(_query_sample_rate)

    # Ingestion: run the query and expose the rows as examples.
    ingest = BigQueryExampleGen(query=sampled_query)
    raw_examples = ingest.outputs.examples

    # Statistics over the ingested examples.
    stats = StatisticsGen(input_data=raw_examples)
    stats_output = stats.outputs.output

    # Schema derived from those statistics.
    schema_component = SchemaGen(stats=stats_output)
    schema_output = schema_component.outputs.output

    # Anomaly detection from the statistics and the schema.
    anomaly_check = ExampleValidator(stats=stats_output, schema=schema_output)

    # Feature engineering driven by the user module.
    preprocessing = Transform(
        input_data=raw_examples,
        schema=schema_output,
        module_file=_taxi_utils,
    )

    # Model training driven by the same user module.
    training = Trainer(
        module_file=_taxi_utils,
        transformed_examples=preprocessing.outputs.transformed_examples,
        schema=schema_output,
        transform_output=preprocessing.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'cmle_training_args': _cmle_training_args},
    )
    trained_model = training.outputs.output

    # Model evaluation, sliced by trip start hour.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluation = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    # Quality validation of the candidate model; produces the blessing.
    validation = ModelValidator(examples=raw_examples, model=trained_model)

    # Push step: receives the validator's blessing and a filesystem
    # destination rooted at the serving model directory.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    push = Pusher(
        model_export=trained_model,
        model_blessing=validation.outputs.blessing,
        custom_config={'cmle_serving_args': _cmle_serving_args},
        push_destination=destination,
    )

    return [
        ingest,
        stats,
        schema_component,
        anomaly_check,
        preprocessing,
        training,
        evaluation,
        validation,
        push,
    ]

예제 #26

0

파일 보기

def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Build the cifar10 TFX pipeline.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root location forwarded to ``pipeline.Pipeline``.
        data_root: Location handed to ``external_input`` for the input data.
        module_file: User module passed to both Transform and Trainer.
        serving_model_dir: Base directory of the filesystem push destination.
        metadata_path: Path given to
            ``metadata.sqlite_metadata_connection_config``.

    Returns:
        The assembled ``pipeline.Pipeline`` with caching enabled.
    """
    # Ingestion: import the two record files as the 'train' and 'eval' splits.
    split_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord'),
    ])
    ingest = ImportExampleGen(
        input=external_input(data_root),
        input_config=split_config,
    )
    raw_examples = ingest.outputs['examples']

    # Statistics over the ingested examples.
    stats = StatisticsGen(examples=raw_examples)
    stats_channel = stats.outputs['statistics']

    # Schema derived from the statistics, with feature shapes inferred.
    schema_component = SchemaGen(
        statistics=stats_channel,
        infer_feature_shape=True,
    )
    schema_channel = schema_component.outputs['schema']

    # Anomaly detection from the statistics and the schema.
    anomaly_check = ExampleValidator(
        statistics=stats_channel,
        schema=schema_channel,
    )

    # Feature engineering driven by the user module.
    preprocessing = Transform(
        examples=raw_examples,
        schema=schema_channel,
        module_file=module_file,
    )

    # Model training on the transformed examples.
    training = Trainer(
        module_file=module_file,
        examples=preprocessing.outputs['transformed_examples'],
        schema=schema_channel,
        transform_graph=preprocessing.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500),
    )
    trained_model = training.outputs['model']

    # Model evaluation with a single, empty slicing spec.
    evaluation = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
            specs=[evaluator_pb2.SingleSlicingSpec()]),
    )

    # Quality validation of the candidate model; produces the blessing.
    validation = ModelValidator(examples=raw_examples, model=trained_model)

    # Push step: receives the validator's blessing and a filesystem
    # destination rooted at serving_model_dir.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    push = Pusher(
        model=trained_model,
        model_blessing=validation.outputs['blessing'],
        push_destination=destination,
    )

    stages = [
        ingest,
        stats,
        schema_component,
        anomaly_check,
        preprocessing,
        training,
        evaluation,
        validation,
        push,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={},
    )

예제 #27

0

파일 보기

def _create_pipeline(
        pipeline_name: Text, pipeline_root: Text, module_file: Text,
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root location forwarded to ``pipeline.Pipeline``.
        module_file: User module passed to both Transform and Trainer.
        ai_platform_training_args: Training arguments placed in the Trainer's
            ``custom_config`` under
            ``ai_platform_trainer_executor.TRAINING_ARGS_KEY``. When
            ``FLAGS.distributed_training`` is set, a copy extended with the
            custom scale-tier settings is used; the caller's dict is left
            untouched.
        ai_platform_serving_args: Serving arguments placed in the Pusher's
            ``custom_config`` under
            ``ai_platform_pusher_executor.SERVING_ARGS_KEY``.

    Returns:
        The assembled ``pipeline.Pipeline``.
    """

    # The rate at which to sample rows from the Taxi dataset using BigQuery.
    # The full taxi dataset is > 200M record.  In the interest of resource
    # savings and time, we've set the default for this example to be much smaller.
    # Feel free to crank it up and process the full dataset!
    # By default it generates a 0.1% random sample.
    query_sample_rate = data_types.RuntimeParameter(name='query-sample-rate',
                                                    ptype=float,
                                                    default=0.001)

    # This is the upper bound of FARM_FINGERPRINT in Bigquery (ie the max value of
    # signed int64).
    max_int64 = '0x7FFFFFFFFFFFFFFF'

    # The query that extracts the examples from BigQuery. The Chicago Taxi dataset
    # used for this example is a public dataset available on Google AI Platform.
    # https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips
    query = """
          SELECT
            pickup_community_area,
            fare,
            EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
            EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
            EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
            UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
            pickup_latitude,
            pickup_longitude,
            dropoff_latitude,
            dropoff_longitude,
            trip_miles,
            pickup_census_tract,
            dropoff_census_tract,
            payment_type,
            company,
            trip_seconds,
            dropoff_community_area,
            tips
          FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
          WHERE (ABS(FARM_FINGERPRINT(unique_key)) / {max_int64})
            < {query_sample_rate}""".format(
        max_int64=max_int64, query_sample_rate=str(query_sample_rate))

    # Beam args to run data processing on DataflowRunner.
    # TODO(b/151114974): Remove `disk_size_gb` flag after default is increased.
    # TODO(b/151116587): Remove `shuffle_mode` flag after default is changed.
    beam_pipeline_args = [
        '--runner=DataflowRunner',
        '--experiments=shuffle_mode=auto',
        '--project=' + _project_id,
        '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
        '--region=' + _gcp_region,
        '--disk_size_gb=50',
    ]

    # Number of steps in training (passed to the Trainer as `num_steps`).
    train_steps = data_types.RuntimeParameter(
        name='train-steps',
        default=10000,
        ptype=int,
    )

    # Number of steps in evaluation (passed to the Trainer as `num_steps`).
    eval_steps = data_types.RuntimeParameter(
        name='eval-steps',
        default=5000,
        ptype=int,
    )

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Extend the training args if distributed training was enabled.
    # Number of worker machines used in distributed training.
    worker_count = data_types.RuntimeParameter(
        name='worker-count',
        default=2,
        ptype=int,
    )

    # Type of worker machines used in distributed training.
    worker_type = data_types.RuntimeParameter(
        name='worker-type',
        default='standard',
        ptype=str,
    )

    if FLAGS.distributed_training:
        # Work on a shallow copy so the caller's dict is not mutated as a side
        # effect of building the pipeline.
        ai_platform_training_args = dict(ai_platform_training_args)
        ai_platform_training_args.update({
            # You can specify the machine types, the number of replicas for workers
            # and parameter servers.
            # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#ScaleTier
            'scaleTier': 'CUSTOM',
            'masterType': 'large_model',
            'workerType': worker_type,
            'parameterServerType': 'standard',
            'workerCount': worker_count,
            'parameterServerCount': 1
        })

    # Uses user-provided Python function that implements a model using TF-Learn
    # to train a model on Google Cloud AI Platform.
    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.Executor),
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY:
            ai_platform_training_args
        })

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'binary_accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the model
    # to  Google Cloud AI Platform if check passed.
    pusher = Pusher(custom_executor_spec=executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor),
                    model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    custom_config={
                        ai_platform_pusher_executor.SERVING_ARGS_KEY:
                        ai_platform_serving_args
                    })

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, model_resolver, evaluator, pusher
        ],
        beam_pipeline_args=beam_pipeline_args,
    )

예제 #28

0

파일 보기

def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Build the Chicago taxi TFX pipeline from CSV input.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root location forwarded to ``pipeline.Pipeline``.
        data_root: Location handed to ``external_input`` for the CSV data.
        module_file: User module passed to both Transform and Trainer.
        serving_model_dir: Base directory of the filesystem push destination.
        direct_num_workers: Value formatted into the
            ``--direct_num_workers`` Beam pipeline argument.

    Returns:
        The assembled ``pipeline.Pipeline``.
    """
    # Ingestion: read the CSV input as examples.
    ingest = CsvExampleGen(input=external_input(data_root))
    raw_examples = ingest.outputs['examples']

    # Statistics over the ingested examples.
    stats = StatisticsGen(examples=raw_examples)
    stats_channel = stats.outputs['statistics']

    # Schema derived from the statistics, without inferring feature shapes.
    schema_component = SchemaGen(
        statistics=stats_channel,
        infer_feature_shape=False,
    )
    schema_channel = schema_component.outputs['schema']

    # Anomaly detection from the statistics and the schema.
    anomaly_check = ExampleValidator(
        statistics=stats_channel,
        schema=schema_channel,
    )

    # Feature engineering driven by the user module.
    preprocessing = Transform(
        examples=raw_examples,
        schema=schema_channel,
        module_file=module_file,
    )

    # Model training with the default Trainer executor (no custom executor
    # spec is supplied here).
    training = Trainer(
        module_file=module_file,
        transformed_examples=preprocessing.outputs['transformed_examples'],
        schema=schema_channel,
        transform_graph=preprocessing.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = training.outputs['model']

    # Resolver that supplies the latest blessed model as the baseline.
    baseline_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=(
            latest_blessed_model_resolver.LatestBlessedModelResolver),
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing),
    )

    # Evaluation config: overall and per-trip_start_hour slices, with a
    # value threshold and a change threshold on binary accuracy.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}),
    )
    evaluation_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour']),
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={'binary_accuracy': accuracy_threshold}),
        ],
    )
    # Change threshold will be ignored if there is no baseline (first run).
    evaluation = Evaluator(
        examples=raw_examples,
        model=trained_model,
        baseline_model=baseline_resolver.outputs['model'],
        eval_config=evaluation_config,
    )

    # Push step: receives the Evaluator's blessing and a filesystem
    # destination rooted at serving_model_dir.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    push = Pusher(
        model=trained_model,
        model_blessing=evaluation.outputs['blessing'],
        push_destination=destination,
    )

    stages = [
        ingest,
        stats,
        schema_component,
        anomaly_check,
        preprocessing,
        training,
        baseline_resolver,
        evaluation,
        push,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )

예제 #29

0

파일 보기

def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Build the Chicago taxi TFX pipeline from CSV input.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root location forwarded to ``pipeline.Pipeline``.
        data_root: Location handed to ``external_input`` for the CSV data.
        module_file: User module passed to both Transform and Trainer.
        serving_model_dir: Base directory of the filesystem push destination.
        direct_num_workers: Value formatted into the
            ``--direct_num_workers`` Beam pipeline argument.

    Returns:
        The assembled ``pipeline.Pipeline``.
    """
    # Ingestion: read the CSV input as examples.
    ingest = CsvExampleGen(input=external_input(data_root))
    raw_examples = ingest.outputs['examples']

    # Statistics over the ingested examples.
    stats = StatisticsGen(examples=raw_examples)
    stats_channel = stats.outputs['statistics']

    # Schema derived from the statistics, without inferring feature shapes.
    schema_component = SchemaGen(
        statistics=stats_channel,
        infer_feature_shape=False,
    )
    schema_channel = schema_component.outputs['schema']

    # Anomaly detection from the statistics and the schema.
    anomaly_check = ExampleValidator(
        statistics=stats_channel,
        schema=schema_channel,
    )

    # Feature engineering driven by the user module.
    preprocessing = Transform(
        examples=raw_examples,
        schema=schema_channel,
        module_file=module_file,
    )

    # Model training with the default Trainer executor (no custom executor
    # spec is supplied here).
    training = Trainer(
        module_file=module_file,
        transformed_examples=preprocessing.outputs['transformed_examples'],
        schema=schema_channel,
        transform_graph=preprocessing.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = training.outputs['model']

    # Model evaluation, sliced by trip start hour.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluation = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    # Quality validation of the candidate model; produces the blessing.
    validation = ModelValidator(examples=raw_examples, model=trained_model)

    # Push step: receives the validator's blessing and a filesystem
    # destination rooted at serving_model_dir.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    push = Pusher(
        model=trained_model,
        model_blessing=validation.outputs['blessing'],
        push_destination=destination,
    )

    stages = [
        ingest,
        stats,
        schema_component,
        anomaly_check,
        preprocessing,
        training,
        evaluation,
        validation,
        push,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
        additional_pipeline_args={},
    )

예제 #30

0

파일 보기

파일: taxi_pipeline_with_inference.py 프로젝트: zxlzr/tfx

def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Build the Chicago taxi TFX pipeline with a bulk inference stage.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root location forwarded to ``pipeline.Pipeline``.
        training_data_root: Location handed to ``external_input`` for the
            training CSV data.
        inference_data_root: Location handed to ``external_input`` for the
            inference CSV data.
        module_file: User module passed to both Transform and Trainer.
        metadata_path: Path given to
            ``metadata.sqlite_metadata_connection_config``.
        direct_num_workers: Value formatted into the
            ``--direct_num_workers`` Beam pipeline argument.

    Returns:
        The assembled ``pipeline.Pipeline`` with caching enabled.
    """
    # Training ingestion: read the training CSV input as examples.
    train_ingest = CsvExampleGen(
        input_base=external_input(training_data_root),
        instance_name='training_example_gen',
    )
    train_examples = train_ingest.outputs['examples']

    # Statistics over the training examples.
    stats = StatisticsGen(input_data=train_examples)
    stats_channel = stats.outputs['output']

    # Schema derived from the statistics, without inferring feature shapes.
    schema_component = SchemaGen(
        stats=stats_channel,
        infer_feature_shape=False,
    )
    schema_channel = schema_component.outputs['output']

    # Anomaly detection from the statistics and the schema.
    anomaly_check = ExampleValidator(
        stats=stats_channel,
        schema=schema_channel,
    )

    # Feature engineering driven by the user module.
    preprocessing = Transform(
        input_data=train_examples,
        schema=schema_channel,
        module_file=module_file,
    )

    # Model training on the transformed examples.
    training = Trainer(
        module_file=module_file,
        transformed_examples=preprocessing.outputs['transformed_examples'],
        schema=schema_channel,
        transform_output=preprocessing.outputs['transform_output'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = training.outputs['output']

    # Model evaluation, sliced by trip start hour.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluation = Evaluator(
        examples=train_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    # Quality validation of the candidate model; produces the blessing.
    validation = ModelValidator(examples=train_examples, model=trained_model)

    # Inference ingestion: a single 'unlabelled' split with 100 hash buckets.
    unlabelled_output = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='unlabelled',
                                              hash_buckets=100),
        ]))
    inference_ingest = CsvExampleGen(
        input_base=external_input(inference_data_root),
        output_config=unlabelled_output,
        instance_name='inference_example_gen',
    )

    # Offline batch inference over the inference examples.
    # Empty data_spec.example_splits will result in using all splits.
    batch_inference = BulkInferrer(
        examples=inference_ingest.outputs['examples'],
        model_export=trained_model,
        model_blessing=validation.outputs['blessing'],
        data_spec=bulk_inferrer_pb2.DataSpec(),
        model_spec=bulk_inferrer_pb2.ModelSpec(),
    )

    stages = [
        train_ingest,
        inference_ingest,
        stats,
        schema_component,
        anomaly_check,
        preprocessing,
        training,
        evaluation,
        validation,
        batch_inference,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )