Пример #1
0
                                 input_config=input_config)
    return span_example


# %%

if __name__ == '__main__':
    # Interactive (notebook-style) walkthrough: each `# %%` cell builds one
    # ExampleGen variant and executes it through the interactive context.
    context = InteractiveContext(pipeline_root=config.PIPELINE_ROOT)

    # %%
    # Load the raw CSV into a DataFrame; the frame is not used by any other
    # statement in this block.
    complaint_df = pd.read_csv(config.DATA_FILE_PATH, encoding='utf-8')

    # %%
    # ImportExampleGen with TFRecord input.
    # NOTE(review): tfrecord_data_writer presumably writes the TFRecord files
    # that ImportExampleGen reads from RECORD_DIR_PATH -- confirm against the
    # helper's definition. Its return value is not used in this block.
    complaint_tfrecord = tfrecord_data_writer(file_path=config.DATA_FILE_PATH)
    example_gen = ImportExampleGen(input_base=config.RECORD_DIR_PATH)
    context.run(example_gen)

    # %%
    # Plain CSV input via CsvExampleGen (rebinds `example_gen`).
    example_gen = CsvExampleGen(input_base=config.DATA_DIR_PATH)
    context.run(example_gen)

    # %%
    # Data split: run the component returned by the data_split helper.
    split_example_gen = data_split(file_path=config.DATA_SPLITS_DIR_PATH)
    context.run(split_example_gen)

    # %%
    #Existing Data Split
    #Won't run through as there is no train folder
Пример #2
0
def build_pipeline(timestamp: str) -> pipeline.Pipeline:
    """Gather the TFX components and produce the output pipeline.

    Side effect: the ``serving_model_dir``, ``pipeline_root_dir`` and
    ``metadata_path`` entries of the module-level ``conf['beam']`` mapping
    are rewritten in place by appending ``/beam/OL653374`` (plus
    ``/<timestamp>`` for the first two). Calling this function more than
    once in the same process therefore appends the suffix again each time.

    Args:
        timestamp: Run identifier appended to the serving-model and
            pipeline-root directories.

    Returns:
        The assembled ``pipeline.Pipeline``, with caching disabled and a
        SQLite metadata connection at ``conf['beam']['metadata_path']``.
    """
    # NOTE: these assignments mutate the shared configuration on purpose of
    # the original design; later reads in this function use the new values.
    conf['beam']['serving_model_dir'] = f"{conf['beam']['serving_model_dir']}/beam/OL{653374}/{timestamp}"
    conf['beam']['pipeline_root_dir'] = f"{conf['beam']['pipeline_root_dir']}/beam/OL{653374}/{timestamp}"
    conf['beam']['metadata_path'] = f"{conf['beam']['metadata_path']}/beam/OL{653374}"

    logging.info("Serving model dir is now %s", conf['beam']['serving_model_dir'])

    # Ingest pre-built examples from the configured training-data location.
    example_gen = ImportExampleGen(input_base=conf['train_data'])

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False
    )

    # Transform and Trainer load their user code from the same module file.
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file']
    )

    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema']
    )

    trainer = Trainer(
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file'],
        # GenericExecutor makes the Trainer call run_fn instead of trainer_fn.
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        train_args=trainer_pb2.TrainArgs(num_steps=conf['train_args_steps']),
        eval_args=trainer_pb2.EvalArgs(num_steps=50)
    )

    metrics = [
        tfma.metrics.ExampleCount(name='example_count'),
        tfma.metrics.WeightedExampleCount(name='weighted_example_count'),
        tf.keras.metrics.BinaryCrossentropy(name='binary_crossentropy'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc', num_thresholds=10),
        tf.keras.metrics.AUC(
            name='auc_precision_recall', curve='PR', num_thresholds=100),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tfma.metrics.MeanLabel(name='mean_label'),
        tfma.metrics.MeanPrediction(name='mean_prediction'),
        tfma.metrics.Calibration(name='calibration'),
        tfma.metrics.ConfusionMatrixPlot(name='confusion_matrix_plot'),
        tfma.metrics.CalibrationPlot(name='calibration_plot')
    ]
    my_metrics_specs = tfma.metrics.specs_from_metrics(metrics)

    # No metric thresholds are attached to these specs. To gate on a metric,
    # build the specs from tfma.MetricConfig(..., threshold=
    # tfma.MetricThreshold(...)) entries instead.
    # NOTE(review): confirm how the Evaluator decides blessing when no
    # thresholds are configured.
    eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(label_key='label')
        ],
        metrics_specs=my_metrics_specs,
        slicing_specs=[
            tfma.SlicingSpec(),
        ])

    # Supplies the baseline model channel consumed by the Evaluator below.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    # Pushes to the (already rewritten) serving model directory.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=conf['beam']['serving_model_dir'])))

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        transform,
        example_validator,
        trainer,
        model_resolver,
        evaluator,
        pusher
    ]

    tfx_pipeline = pipeline.Pipeline(
        pipeline_name=conf['beam']['pipeline_name'],
        pipeline_root=conf['beam']['pipeline_root_dir'],
        components=components,
        enable_cache=False,
        metadata_connection_config=(
            metadata.sqlite_metadata_connection_config(conf['beam']['metadata_path'])
        )
    )

    return tfx_pipeline
Пример #3
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, module_file_lite: Text,
                     serving_model_dir: Text, serving_model_dir_lite: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Build the handwritten digit classification pipeline using TFX.

    Ingestion, statistics, schema, validation and Transform are shared; two
    Trainer/Evaluator/Pusher chains ('mnist' and 'mnist_lite') hang off them.
    """
    # Data ingestion.
    example_gen = ImportExampleGen(input_base=data_root)
    raw_examples = example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    stats = statistics_gen.outputs['statistics']

    # Schema derived from the statistics.
    schema_gen = SchemaGen(statistics=stats, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Anomaly detection against the schema.
    example_validator = ExampleValidator(statistics=stats, schema=schema)

    # Feature engineering shared by training and serving.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)

    def _build_trainer(trainer_module, name):
        """Return a GenericExecutor Trainer reading the Transform outputs."""
        return Trainer(
            module_file=trainer_module,
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                GenericExecutor),
            examples=transform.outputs['transformed_examples'],
            transform_graph=transform.outputs['transform_graph'],
            schema=schema_gen.outputs['schema'],
            train_args=trainer_pb2.TrainArgs(num_steps=5000),
            eval_args=trainer_pb2.EvalArgs(num_steps=100),
            instance_name=name)

    def _filesystem_destination(directory):
        """Return a PushDestination pointing at a filesystem directory."""
        return pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=directory))

    # Trainer driven by the user-provided module that trains a Keras model.
    trainer = _build_trainer(module_file, 'mnist')

    # Same model as above, but the module converts it into a TFLite one.
    trainer_lite = _build_trainer(module_file_lite, 'mnist_lite')

    # TODO(b/150949276): Add resolver back once it supports two trainers.

    # TFMA configuration: overall slice only, with a lower bound on accuracy
    # used for quality validation of a candidate model.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.8}))
    accuracy_metric = tfma.MetricConfig(
        class_name='SparseCategoricalAccuracy',
        threshold=accuracy_threshold)
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image_class')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[tfma.MetricsSpec(metrics=[accuracy_metric])])

    # The TFLite config is a copy that only differs in the model type flag.
    eval_config_lite = tfma.EvalConfig()
    eval_config_lite.CopyFrom(eval_config)
    eval_config_lite.model_specs[0].model_type = 'tf_lite'

    # Evaluation of the regular model.
    evaluator = Evaluator(examples=raw_examples,
                          model=trainer.outputs['model'],
                          eval_config=eval_config,
                          instance_name='mnist')

    # Evaluation of the TFLite model.
    evaluator_lite = Evaluator(examples=raw_examples,
                               model=trainer_lite.outputs['model'],
                               eval_config=eval_config_lite,
                               instance_name='mnist_lite')

    # Push the regular model, conditioned on its evaluator's blessing.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=_filesystem_destination(serving_model_dir),
        instance_name='mnist')

    # Push the TFLite model, conditioned on its evaluator's blessing.
    pusher_lite = Pusher(
        model=trainer_lite.outputs['model'],
        model_blessing=evaluator_lite.outputs['blessing'],
        push_destination=_filesystem_destination(serving_model_dir_lite),
        instance_name='mnist_lite')

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        trainer_lite,
        evaluator,
        evaluator_lite,
        pusher,
        pusher_lite,
    ]
    metadata_config = metadata.sqlite_metadata_connection_config(
        metadata_path)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata_config,
        beam_pipeline_args=beam_pipeline_args)
Пример #4
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text, beam_pipeline_args: List[Text]):
  """Creates the pipeline, rooted at <pipeline_root>/pipelines/<pipeline_name>."""
  root_dir = os.path.join(pipeline_root, 'pipelines', pipeline_name)

  # IMPORTANT: the payload format must be set to FORMAT_PROTO.
  example_gen = ImportExampleGen(
      input_base=data_root,
      payload_format=example_gen_pb2.FORMAT_PROTO)

  # The decoder is created by `make_decoder` in the user module.
  data_view_provider = provider_component.TfGraphDataViewProvider(
      module_file=module_file,
      create_decoder_func='make_decoder')

  data_view_binder = binder_component.DataViewBinder(
      example_gen.outputs['examples'],
      data_view_provider.outputs['data_view'])
  bound_examples = data_view_binder.outputs['output_examples']

  statistics_gen = StatisticsGen(examples=bound_examples)

  schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])
  schema = schema_gen.outputs['schema']

  # Important: Transform materialization must be disabled.
  transform = Transform(
      examples=bound_examples,
      schema=schema,
      module_file=module_file,
      materialize=False)

  # The trainer reads the bound examples together with the transform graph.
  trainer = Trainer(
      examples=bound_examples,
      transform_graph=transform.outputs['transform_graph'],
      module_file=module_file,
      train_args=trainer_pb2.TrainArgs(num_steps=1000),
      schema=schema,
      eval_args=trainer_pb2.EvalArgs(num_steps=10))

  model_spec = tfma.ModelSpec(
      signature_name='',
      label_key='relevance',
      padding_options=tfma.config.PaddingOptions(
          label_float_padding=-1.0, prediction_float_padding=-1.0))

  # Lower bound on 'metric/ndcg_10', applied to the overall slice.
  overall_ndcg_threshold = tfma.PerSliceMetricThreshold(
      slicing_specs=[tfma.SlicingSpec()],
      threshold=tfma.MetricThreshold(
          value_threshold=tfma.GenericValueThreshold(
              lower_bound={'value': 0.6})))
  ndcg_metrics_spec = tfma.MetricsSpec(
      per_slice_thresholds={
          'metric/ndcg_10':
              tfma.config.PerSliceMetricThresholds(
                  thresholds=[overall_ndcg_threshold])
      })

  eval_config = tfma.EvalConfig(
      model_specs=[model_spec],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['query_tokens']),
      ],
      metrics_specs=[ndcg_metrics_spec])

  evaluator = Evaluator(
      examples=bound_examples,
      model=trainer.outputs['model'],
      eval_config=eval_config,
      schema=schema)

  # Push the model to the filesystem destination, conditioned on the
  # evaluator's blessing.
  destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=destination)

  components = [
      example_gen,
      data_view_provider,
      data_view_binder,
      statistics_gen,
      schema_gen,
      transform,
      trainer,
      evaluator,
      pusher,
  ]

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=root_dir,
      components=components,
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir_lite: Text,
                     metadata_path: Text, labels_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Build the CIFAR10 image classification pipeline using TFX."""
    # The dataset ships with pre-defined splits, so map them explicitly.
    # Switch the patterns to train_whole/* and test_whole/* to use the whole
    # CIFAR-10 dataset.
    train_split = example_gen_pb2.Input.Split(name='train', pattern='train/*')
    eval_split = example_gen_pb2.Input.Split(name='eval', pattern='test/*')
    input_config = example_gen_pb2.Input(splits=[train_split, eval_split])

    # Data ingestion.
    example_gen = ImportExampleGen(input_base=data_root,
                                   input_config=input_config)
    raw_examples = example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    stats = statistics_gen.outputs['statistics']

    # Schema derived from the statistics.
    schema_gen = SchemaGen(statistics=stats, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Anomaly detection against the schema.
    example_validator = ExampleValidator(statistics=stats, schema=schema)

    # Feature engineering shared by training and serving.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)
    transformed_examples = transform.outputs['transformed_examples']

    # Training with the user-provided module.
    # Step counts below target the small dataset in the data folder (128
    # train and 128 test samples): 160 train steps are 40 epochs on that
    # train set and 4 eval steps are 1 epoch on that test set. When training
    # on the whole dataset, use 18744 train steps (24 epochs on the whole
    # train set) and 156 eval steps (1 epoch on the whole test set).
    trainer = Trainer(module_file=module_file,
                      examples=transformed_examples,
                      transform_graph=transform.outputs['transform_graph'],
                      schema=schema,
                      train_args=trainer_pb2.TrainArgs(num_steps=160),
                      eval_args=trainer_pb2.EvalArgs(num_steps=4),
                      custom_config={'labels_path': labels_path})

    # Resolve the latest blessed model to serve as the validation baseline.
    model_resolver = ResolverNode(
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))
    model_resolver = model_resolver.with_id('latest_blessed_model_resolver')

    # TFMA configuration for quality validation of a candidate model
    # (compared to a baseline).
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.55}),
        # The change threshold is ignored when no baseline model is resolved
        # from MLMD (first run).
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-3}))
    accuracy_metric = tfma.MetricConfig(
        class_name='SparseCategoricalAccuracy',
        threshold=accuracy_threshold)
    eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(label_key='label_xf', model_type='tf_lite')
        ],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[tfma.MetricsSpec(metrics=[accuracy_metric])])

    # Evaluation runs on the materialized examples output by Transform
    # because:
    # 1. the decoding_png function currently performed within Transform is
    #    not compatible with TFLite;
    # 2. MLKit requires deserialized (float32) tensor image inputs.
    # For deployment, the same logic that is performed within Transform must
    # be reproduced client-side.
    evaluator = Evaluator(examples=transformed_examples,
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Push the model to the filesystem destination, conditioned on the
    # evaluator's blessing.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir_lite))
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=destination)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            model_resolver,
            evaluator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
Пример #6
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Path,
    module_file: Path,
    serving_model_path: Path,
    metadata_path: Path,
    data_path: Path,
) -> pipeline.Pipeline:
    """Prepare the Gta1 dataset and build a train-and-push TFX pipeline.

    The examples are read from the dataset builder's data directory;
    ``data_path`` is not referenced by this function.
    """
    dataset_builder = Gta1()
    dataset_builder.download_and_prepare()

    # A single input split matching the builder's tfrecord shards ...
    train_input = example_gen_pb2.Input.Split(
        name="train", pattern="*.tfrecord-[0-9]*")
    input_config = example_gen_pb2.Input(splits=[train_input])

    # ... re-split on output into train/eval with 9 and 1 hash buckets.
    output_splits = [
        example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=9),
        example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=1),
    ]
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=output_splits))

    # Data ingestion.
    example_gen = ImportExampleGen(
        input_base=dataset_builder.data_dir,
        input_config=input_config,
        output_config=output_config,
    )
    raw_examples = example_gen.outputs["examples"]

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)

    # Schema derived from the statistics.
    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs["statistics"],
        infer_feature_shape=True,
    )
    schema = schema_gen.outputs["schema"]

    # Transform and Trainer both load their code from the same module file.
    module_path = str(module_file)

    # Feature engineering shared by training and serving.
    transform = Transform(
        module_file=module_path,
        examples=raw_examples,
        schema=schema,
        materialize=True,
    )

    # Training with the user-provided module.
    trainer = Trainer(
        module_file=module_path,
        examples=transform.outputs["transformed_examples"],
        transform_graph=transform.outputs["transform_graph"],
        schema=schema,
        train_args=trainer_pb2.TrainArgs(num_steps=10_000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500),
    )

    # Push the trained model to the filesystem destination. No blessing is
    # wired in: this pipeline has no evaluator.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=str(serving_model_path)))
    pusher = Pusher(
        model=trainer.outputs["model"],
        push_destination=destination,
    )

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        transform,
        trainer,
        pusher,
    ]
    metadata_config = metadata.sqlite_metadata_connection_config(
        str(metadata_path))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=str(pipeline_root),
        components=components,
        metadata_connection_config=metadata_config,
        enable_cache=True,
    )
Пример #7
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    enable_cache: bool,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
    trainer_custom_config: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Build a TFX pipeline from ingestion through training.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root directory passed to the pipeline.
        data_path: Location of the input examples.
        enable_cache: Passed through to ``pipeline.Pipeline``.
        preprocessing_fn: Preprocessing function reference given to Transform.
        run_fn: Training function reference given to the Trainer.
        train_args: Training arguments given to the Trainer.
        eval_args: Evaluation arguments given to the Trainer.
        serving_model_dir: Only referenced by the disabled pusher scaffold
            below; currently unused.
        metadata_connection_config: Passed through to ``pipeline.Pipeline``.
        beam_pipeline_args: Passed through to ``pipeline.Pipeline``.
        ai_platform_training_args: When not None, the Trainer uses the AI
            Platform executor and these arguments are added to its custom
            config under ``TRAINING_ARGS_KEY``.
        ai_platform_serving_args: Only referenced by the disabled pusher
            scaffold below; currently unused.
        trainer_custom_config: Extra custom config for the Trainer. It is
            kept when ``ai_platform_training_args`` is also supplied.

    Returns:
        The assembled ``pipeline.Pipeline``.
    """
    components = []

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = ImportExampleGen(input=external_input(data_path))
    components.append(example_gen)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    components.append(statistics_gen)

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    components.append(schema_gen)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          preprocessing_fn=preprocessing_fn)
    components.append(transform)

    # Trainer arguments for the default (local) GenericExecutor.
    trainer_args = {
        'run_fn': run_fn,
        'transformed_examples': transform.outputs['transformed_examples'],
        'schema': schema_gen.outputs['schema'],
        'transform_graph': transform.outputs['transform_graph'],
        'train_args': train_args,
        'eval_args': eval_args,
        'custom_executor_spec':
            executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
        'custom_config': trainer_custom_config,
    }

    if ai_platform_training_args is not None:
        # Merge instead of overwrite so the caller's trainer_custom_config is
        # not silently dropped when training on AI Platform. A copy is used so
        # the caller's dict is never mutated; the AI Platform key wins on a
        # name clash.
        ai_platform_custom_config = dict(trainer_custom_config or {})
        ai_platform_custom_config[
            ai_platform_trainer_executor.TRAINING_ARGS_KEY] = (
                ai_platform_training_args)
        trainer_args.update({
            'custom_executor_spec':
                executor_spec.ExecutorClassSpec(
                    ai_platform_trainer_executor.GenericExecutor),
            'custom_config': ai_platform_custom_config,
        })
    trainer = Trainer(**trainer_args)
    components.append(trainer)

    # TODO in TFX <= 2.22.0 we need a workaround to enable the pusher. Pusher is disabled until we move sample to >
    #  TFX==2.22.00

    #
    # pusher_args = {
    #         'model': trainer.outputs['model'],
    #         'model_blessing': blessing_importer.outputs['result'],
    #         'push_destination': pusher_pb2.PushDestination(
    #                 filesystem=pusher_pb2.PushDestination.Filesystem(
    #                         base_directory=serving_model_dir)),
    # }
    # if ai_platform_serving_args is not None:
    #     pusher_args.update({
    #             'custom_executor_spec': executor_spec.ExecutorClassSpec(
    #                     ai_platform_pusher_executor.Executor),
    #             'custom_config': {
    #                     ai_platform_pusher_executor.SERVING_ARGS_KEY: ai_platform_serving_args
    #             },
    #     })
    # pusher = Pusher(**pusher_args)  # pylint: disable=unused-variable
    # Temporary disable pusher.
    # components.append(pusher)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
Пример #8
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Build the cifar10 pipeline with TFX."""
    # Ingest the two pre-split record files as the train and eval splits.
    examples = external_input(data_root)
    split_patterns = [
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord'),
    ]
    input_split = example_gen_pb2.Input(splits=split_patterns)
    example_gen = ImportExampleGen(input=examples, input_config=input_split)
    raw_examples = example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    stats = statistics_gen.outputs['statistics']

    # Schema derived from the statistics.
    infer_schema = SchemaGen(statistics=stats, infer_feature_shape=True)
    schema = infer_schema.outputs['schema']

    # Anomaly detection against the schema.
    validate_stats = ExampleValidator(statistics=stats, schema=schema)

    # Feature engineering shared by training and serving.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)

    # Training with the user-provided module.
    trainer = Trainer(module_file=module_file,
                      examples=transform.outputs['transformed_examples'],
                      schema=schema,
                      transform_graph=transform.outputs['transform_graph'],
                      train_args=trainer_pb2.TrainArgs(num_steps=1000),
                      eval_args=trainer_pb2.EvalArgs(num_steps=500))
    trained_model = trainer.outputs['model']

    # TFMA evaluation with a single, default slicing spec.
    slicing = evaluator_pb2.FeatureSlicingSpec(
        specs=[evaluator_pb2.SingleSlicingSpec()])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing)

    # Quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # Push the model to the filesystem destination, conditioned on the
    # validator's blessing.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(model=trained_model,
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=destination)

    components = [
        example_gen,
        statistics_gen,
        infer_schema,
        validate_stats,
        transform,
        trainer,
        evaluator,
        model_validator,
        pusher,
    ]

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
    )
Пример #9
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Assemble the TFX pipeline for the handwritten digit classification example.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root path forwarded to ``pipeline.Pipeline``.
        data_root: Input location wrapped with ``external_input``.
        module_file: User module handed to both Transform and Trainer.
        serving_model_dir: Base directory of the Pusher's filesystem
            push destination.
        metadata_path: Path used to build the SQLite metadata connection
            config.
        direct_num_workers: Value substituted into the Beam
            ``--direct_num_workers`` argument.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Data ingestion.
    example_gen = ImportExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics over the ingested examples, reused by schema inference and
    # by example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema inferred from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = infer_schema.outputs['schema']

    # Anomaly detection against the inferred schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering defined in the user module.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)

    # Training with the generic executor on the transformed examples.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema,
        train_args=trainer_pb2.TrainArgs(num_steps=5000),
        eval_args=trainer_pb2.EvalArgs(num_steps=100))
    trained_model = trainer.outputs['model']

    # Resolves the latest blessed model, used below as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Accuracy must reach the 0.8 lower bound; the change threshold compares
    # the candidate with the baseline model.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.8}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image_class')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={'sparse_categorical_accuracy': accuracy_threshold})
        ])

    # TFMA-based evaluation of the trained model against the baseline.
    model_analyzer = Evaluator(examples=raw_examples,
                               model=trained_model,
                               baseline_model=model_resolver.outputs['model'],
                               eval_config=eval_config)

    # Pushes the model to the serving directory, gated on the evaluator's
    # blessing output.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(model=trained_model,
                    model_blessing=model_analyzer.outputs['blessing'],
                    push_destination=push_destination)

    pipeline_components = [
        example_gen,
        statistics_gen,
        infer_schema,
        validate_stats,
        transform,
        trainer,
        model_resolver,
        model_analyzer,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
Пример #10
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    module_file: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Assemble the Chicago taxi TFX pipeline.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root path forwarded to ``pipeline.Pipeline``.
        data_path: Input location wrapped with ``external_input``.
        module_file: User module handed to both Transform and Trainer.
        train_args: Training arguments forwarded to the Trainer.
        eval_args: Evaluation arguments forwarded to the Trainer.
        eval_accuracy_threshold: Lower bound applied to ``BinaryAccuracy``.
        serving_model_dir: Base directory of the Pusher's filesystem
            push destination.
        metadata_connection_config: Forwarded to ``pipeline.Pipeline``.
        beam_pipeline_args: Forwarded to ``pipeline.Pipeline``.
        ai_platform_training_args: When not ``None``, the Trainer uses the
            AI Platform executor and receives these values in its custom
            config.
        ai_platform_serving_args: When not ``None``, the Pusher uses the
            AI Platform executor and receives these values in its custom
            config.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Data ingestion.
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # example_gen = BigQueryExampleGen(query=query)
    example_gen = ImportExampleGen(input=external_input(data_path))
    raw_examples = example_gen.outputs['examples']

    # Statistics over the ingested examples.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema inferred from the statistics.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Anomaly detection against the inferred schema.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering defined in the user module.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)

    # Trainer: generic executor by default, AI Platform executor when
    # training args for it are supplied.
    trainer_kwargs = dict(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=train_args,
        eval_args=eval_args,
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            trainer_executor.GenericExecutor),
    )
    if ai_platform_training_args is not None:
        trainer_kwargs['custom_executor_spec'] = (
            executor_spec.ExecutorClassSpec(
                ai_platform_trainer_executor.GenericExecutor))
        trainer_kwargs['custom_config'] = {
            ai_platform_trainer_executor.TRAINING_ARGS_KEY:
            ai_platform_training_args,
        }
    trainer = Trainer(**trainer_kwargs)
    trained_model = trainer.outputs['model']

    # Resolves the latest blessed model, used below as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Evaluation config: BinaryAccuracy with a value threshold and a change
    # threshold relative to the baseline.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': eval_accuracy_threshold}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='label')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(class_name='BinaryAccuracy',
                                  threshold=accuracy_threshold)
            ])
        ])
    evaluator = Evaluator(
        examples=raw_examples,
        model=trained_model,
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Pusher: filesystem destination by default, AI Platform executor when
    # serving args for it are supplied. Gated on the evaluator's blessing.
    pusher_kwargs = dict(
        model=trained_model,
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)),
    )
    if ai_platform_serving_args is not None:
        pusher_kwargs['custom_executor_spec'] = (
            executor_spec.ExecutorClassSpec(
                ai_platform_pusher_executor.Executor))
        pusher_kwargs['custom_config'] = {
            ai_platform_pusher_executor.SERVING_ARGS_KEY:
            ai_platform_serving_args
        }
    pusher = Pusher(**pusher_kwargs)

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # TODO(step 8): Change this value to control caching of execution results.
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
Пример #11
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Assemble the TFX pipeline for the handwritten digit classification example.

    This variant stops after evaluation; no pusher is configured.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root path forwarded to ``pipeline.Pipeline``.
        data_root: Input location wrapped with ``external_input``.
        module_file: User module handed to both Transform and Trainer.
        metadata_path: Path used to build the SQLite metadata connection
            config.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Data ingestion.
    example_gen = ImportExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics over the ingested examples.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema inferred from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = infer_schema.outputs['schema']

    # Anomaly detection against the inferred schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering defined in the user module.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)

    # Training with the generic executor on the transformed examples.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema,
        train_args=trainer_pb2.TrainArgs(num_steps=5000),
        eval_args=trainer_pb2.EvalArgs(num_steps=100))

    # TFMA-based evaluation; no thresholds are configured here.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image/class')],
        slicing_specs=[tfma.SlicingSpec()])
    model_analyzer = Evaluator(examples=raw_examples,
                               model=trainer.outputs['model'],
                               eval_config=eval_config)

    # TODO(ananthr): support infra validator, model validation in evaluator,
    # and pusher component.

    pipeline_components = [
        example_gen,
        statistics_gen,
        infer_schema,
        validate_stats,
        transform,
        trainer,
        model_analyzer,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
    )
Пример #12
0
def create_pipeline(pipeline_name: Text,
                    pipeline_root: Text,
                    data_root: Text,
                    test_data_root: Text,
                    module_file: Text,
                    serving_model_dir: Text,
                    enable_cache: bool,
                    metadata_connection_config: Optional[
                        metadata_store_pb2.ConnectionConfig] = None,
                    beam_pipeline_args: Optional[List[Text]] = None):
    """Assemble a TFX pipeline that trains on one dataset and evaluates on another.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root path forwarded to ``pipeline.Pipeline``.
        data_root: Input base of the training data.
        test_data_root: Input base of the test data used by the Evaluator.
        module_file: User module handed to both Transform and Trainer.
        serving_model_dir: Base directory of the Pusher's filesystem
            push destination.
        enable_cache: Forwarded to ``pipeline.Pipeline``.
        metadata_connection_config: Forwarded to ``pipeline.Pipeline``.
        beam_pipeline_args: Forwarded to ``pipeline.Pipeline``.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Hash-bucket split of the training input: 8 buckets for 'train' and
    # 2 buckets for 'eval'.
    train_eval_split = example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=8),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=2),
    ])
    output_config = example_gen_pb2.Output(split_config=train_eval_split)

    # Load the training data into the pipeline.
    example_gen = ImportExampleGen(input_base=data_root,
                                   output_config=output_config,
                                   instance_name="train_data")
    train_examples = example_gen.outputs['examples']

    # Load the test data as a second, separately named ExampleGen.
    test_example_gen = ImportExampleGen(input_base=test_data_root,
                                        instance_name="test_data")

    # Compute statistics over the training data.
    statistics_gen = StatisticsGen(examples=train_examples)
    statistics = statistics_gen.outputs['statistics']

    # Generate a schema from the StatisticsGen output.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Check the data for anomalies such as missing values.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering defined in the user module.
    transform = Transform(examples=train_examples,
                          schema=schema,
                          module_file=module_file)

    # Training with the generic executor on the transformed examples.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema,
        train_args=trainer_pb2.TrainArgs(num_steps=160),
        eval_args=trainer_pb2.EvalArgs(num_steps=4),
    )
    trained_model = trainer.outputs['model']

    # Resolves the latest blessed model, used below as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Model spec settings follow https://github.com/tensorflow/tfx/issues/3016
    keras_model_spec = tfma.ModelSpec(label_key='label',
                                      model_type='tf_keras',
                                      signature_name="serving_default")
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.2}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-3}))
    eval_config = tfma.EvalConfig(
        model_specs=[keras_model_spec],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(class_name='SparseCategoricalAccuracy',
                                  threshold=accuracy_threshold)
            ])
        ])

    # Evaluation runs on the separately imported test data.
    evaluator = Evaluator(examples=test_example_gen.outputs['examples'],
                          model=trained_model,
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Pushes the model to the serving directory, gated on the evaluator's
    # blessing output.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(model=trained_model,
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=push_destination)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            test_example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            model_resolver,
            evaluator,
            pusher,
        ],
        enable_cache=enable_cache,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
Пример #13
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    query: Optional[Text] = None,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
  """Assemble a TFX pipeline that currently runs ingestion through Transform.

  A Trainer is constructed but is not part of the returned pipeline, and the
  resolver / evaluator / pusher stage is commented out. As a result
  ``eval_accuracy_threshold``, ``serving_model_dir`` and
  ``ai_platform_serving_args`` are only referenced by the disabled section.

  Args:
    pipeline_name: Name forwarded to ``pipeline.Pipeline``.
    pipeline_root: Root path forwarded to ``pipeline.Pipeline``.
    data_path: Input location wrapped with ``external_input``; used when
      ``query`` is falsy.
    preprocessing_fn: Forwarded to Transform as ``preprocessing_fn``.
    run_fn: Forwarded to Trainer as ``run_fn``.
    train_args: Training arguments forwarded to the Trainer.
    eval_args: Evaluation arguments forwarded to the Trainer.
    eval_accuracy_threshold: Only used by the disabled evaluator section.
    serving_model_dir: Only used by the disabled pusher section.
    query: When truthy, data is read with ``BigQueryExampleGen`` using this
      query instead of ``ImportExampleGen``.
    metadata_connection_config: Forwarded to ``pipeline.Pipeline``.
    beam_pipeline_args: Forwarded to ``pipeline.Pipeline``.
    ai_platform_training_args: When truthy, the Trainer uses the AI Platform
      executor and receives these values in its custom config.
    ai_platform_serving_args: Only used by the disabled pusher section.

  Returns:
    The configured ``pipeline.Pipeline``.
  """
  # Data ingestion: BigQuery when a query is given, otherwise imported records.
  # example_gen = CsvExampleGen(input=external_input(data_path))
  example_gen = (BigQueryExampleGen(query=query) if query else
                 ImportExampleGen(input=external_input(data_path)))
  raw_examples = example_gen.outputs['examples']

  # Statistics, inferred schema and anomaly detection.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = schema_gen.outputs['schema']
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Feature engineering.
  transform = Transform(examples=raw_examples,
                        schema=schema,
                        preprocessing_fn=preprocessing_fn)

  # Trainer: generic executor by default, AI Platform executor when training
  # args for it are supplied.
  trainer_kwargs = dict(
    run_fn=run_fn,
    transformed_examples=transform.outputs['transformed_examples'],
    schema=schema,
    transform_graph=transform.outputs['transform_graph'],
    train_args=train_args,
    eval_args=eval_args,
    custom_executor_spec=executor_spec.ExecutorClassSpec(
      trainer_executor.GenericExecutor),
  )
  if ai_platform_training_args:
    trainer_kwargs['custom_executor_spec'] = executor_spec.ExecutorClassSpec(
      ai_platform_trainer_executor.GenericExecutor)
    trainer_kwargs['custom_config'] = {
      ai_platform_trainer_executor.TRAINING_ARGS_KEY:
        ai_platform_training_args,
    }
  # NOTE: the trainer is built here but left out of the components list below.
  trainer = Trainer(**trainer_kwargs)

  # NOTE: the remaining stages (model resolver, evaluator, pusher) are
  # disabled; the original code is kept below for reference.
  #
  # model_resolver = ResolverNode(instance_name='latest_blessed_model_resolver',
  #                               resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
  #                               model=Channel(type=Model),
  #                               model_blessing=Channel(type=ModelBlessing))

  # eval_config = tfma.EvalConfig(
  #     model_specs=[tfma.ModelSpec(label_key='tips')],
  #     slicing_specs=[tfma.SlicingSpec()],
  #     metrics_specs=[
  #         tfma.MetricsSpec(
  #             thresholds={
  #                 'binary_accuracy':
  #                 tfma.config.MetricThreshold(
  #                     value_threshold=tfma.GenericValueThreshold(
  #                         lower_bound={'value': eval_accuracy_threshold}),
  #                     change_threshold=tfma.GenericChangeThreshold(
  #                         direction=tfma.MetricDirection.HIGHER_IS_BETTER,
  #                         absolute={'value': -1e-10}))
  #             })
  #     ])
  # evaluator = Evaluator(examples=example_gen.outputs['examples'],
  #                       model=trainer.outputs['model'],
  #                       baseline_model=model_resolver.outputs['model'],
  #                       eval_config=eval_config)

  # pusher_args = {
  #   'model':
  #     trainer.outputs['model'],
  #   'model_blessing':
  #     evaluator.outputs['blessing'],
  #   'push_destination':
  #     pusher_pb2.PushDestination(
  #       filesystem=pusher_pb2.PushDestination.Filesystem(
  #         base_directory=serving_model_dir)),
  # }
  # if ai_platform_serving_args:
  #   pusher_args.update({
  #     'custom_executor_spec':
  #       executor_spec.ExecutorClassSpec(
  #         ai_platform_pusher_executor.Executor),
  #     'custom_config': {
  #       ai_platform_pusher_executor.SERVING_ARGS_KEY:
  #         ai_platform_serving_args
  #     },
  #   })
  # pusher = Pusher(**pusher_args)

  active_components = [
    example_gen,
    statistics_gen,
    schema_gen,
    example_validator,
    transform,
    # trainer,
    # model_resolver,
    # evaluator,
    # pusher
  ]
  return pipeline.Pipeline(
    pipeline_name=pipeline_name,
    pipeline_root=pipeline_root,
    components=active_components,
    enable_cache=True,
    metadata_connection_config=metadata_connection_config,
    beam_pipeline_args=beam_pipeline_args,
  )
Пример #14
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     custom_config: Dict[Text, Any], module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Assemble the TFX pipeline for the handwritten digit classification example.

    Besides the standard components, this variant filters the examples before
    Transform and evaluates on a stratified sample of the filtered data.

    Args:
        pipeline_name: Name forwarded to ``pipeline.Pipeline``.
        pipeline_root: Root path forwarded to ``pipeline.Pipeline``.
        data_root: Input base of the data to import.
        custom_config: Passed to ``FromCustomConfig`` and to the Trainer.
        module_file: User module handed to both Transform and Trainer.
        serving_model_dir: Base directory of the Pusher's filesystem
            push destination.
        metadata_path: Path used to build the SQLite metadata connection
            config.
        beam_pipeline_args: Forwarded to ``pipeline.Pipeline``.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Store the configuration along with the pipeline run so results can be
    # reproduced.
    pipeline_configuration = FromCustomConfig(custom_config=custom_config)
    configuration_channel = pipeline_configuration.outputs[
        'pipeline_configuration']

    # Data ingestion.
    example_gen = ImportExampleGen(input_base=data_root)
    raw_examples = example_gen.outputs['examples']

    # Statistics over the ingested examples.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema inferred from the statistics.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Anomaly detection against the inferred schema.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Create a filtered dataset - today we only want a model for small digits.
    example_filter = Filter(examples=raw_examples,
                            pipeline_configuration=configuration_channel,
                            splits_to_transform=['train', 'eval'],
                            splits_to_copy=[])
    filtered_examples = example_filter.outputs['filtered_examples']

    # Create a stratified dataset for evaluation: only the 'eval' split is
    # transformed, 'train' is copied through.
    stratified_examples = StratifiedSampler(
        examples=filtered_examples,
        pipeline_configuration=configuration_channel,
        samples_per_key=1200,
        splits_to_transform=['eval'],
        splits_to_copy=['train'])

    # Feature engineering on the filtered examples.
    transform = Transform(examples=filtered_examples,
                          schema=schema,
                          module_file=module_file)

    # Trains a Keras model from the user module with the generic executor.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        custom_config=custom_config,
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema,
        train_args=trainer_pb2.TrainArgs(num_steps=5000),
        eval_args=trainer_pb2.EvalArgs(num_steps=100)).with_id(u'trainer')
    trained_model = trainer.outputs['model']

    # Evaluation config: accuracy must reach the 0.8 lower bound.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.8}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image_class')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(class_name='SparseCategoricalAccuracy',
                                  threshold=accuracy_threshold)
            ])
        ])

    # TFMA-based evaluation on the stratified examples.
    evaluator = Evaluator(
        examples=stratified_examples.outputs['stratified_examples'],
        model=trained_model,
        eval_config=eval_config).with_id(u'evaluator')

    # Pushes the model to the serving directory, gated on the evaluator's
    # blessing output.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=evaluator.outputs['blessing'],
        push_destination=push_destination).with_id(u'pusher')

    pipeline_components = [
        pipeline_configuration,
        example_gen,
        example_filter,
        stratified_examples,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        evaluator,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
Пример #15
0
def create_pipeline(pipeline_name: Text, pipeline_root: Text,
                    metadata_path: Text) -> Pipeline:
    """Assemble a TFX pipeline from the module-level path and key constants.

    Input data, module files, label key and serving directory come from
    ``DATA_PATH``, ``TRANSFORM_MODULE``, ``TRAINER_MODULE``, ``LABEL_KEY`` and
    ``SERVING_MODEL_DIR``.

    Args:
        pipeline_name: Name forwarded to ``Pipeline``.
        pipeline_root: Root path forwarded to ``Pipeline``.
        metadata_path: Path used to build the SQLite metadata connection
            config.

    Returns:
        The configured ``Pipeline``.
    """
    # Read the dataset and split it into train / eval with a 4:1 hash-bucket
    # ratio.
    train_eval_split = example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=4),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
    ])
    example_gen = ImportExampleGen(
        input=tfrecord_input(DATA_PATH),
        output_config=example_gen_pb2.Output(split_config=train_eval_split))
    raw_examples = example_gen.outputs['examples']

    # Dataset statistics.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema inferred from the statistics.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Data validation and anomaly detection.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=TRANSFORM_MODULE)

    # Training with the generic executor on the transformed examples.
    trainer = Trainer(
        module_file=TRAINER_MODULE,
        examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        train_args=trainer_pb2.TrainArgs(num_steps=200),
        eval_args=trainer_pb2.EvalArgs(num_steps=35))
    trained_model = trainer.outputs['model']

    # Evaluation config: MeanAbsoluteError with an upper bound of 0.7.
    mae_metric = tfma.MetricConfig(
        class_name='MeanAbsoluteError',
        threshold=tfma.MetricThreshold(
            value_threshold=tfma.GenericValueThreshold(
                upper_bound={'value': 0.7})))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key=LABEL_KEY)],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[tfma.MetricsSpec(metrics=[mae_metric])])
    evaluator = Evaluator(examples=raw_examples,
                          model=trained_model,
                          eval_config=eval_config)

    # Pushes the model to the serving directory, gated on the evaluator's
    # blessing output.
    pusher = Pusher(
        model=trained_model,
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=SERVING_MODEL_DIR)))

    return Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            evaluator,
            pusher,
        ],
        enable_cache=True,
        beam_pipeline_args=['--direct_num_workers=0'])
Пример #16
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
    schema_uri: Text = 'gs://raw_data_layer/schema/',
) -> pipeline.Pipeline:
    """Implements the Centernet pipeline with TFX.

    The pipeline consists of, in order: ImportExampleGen, StatisticsGen,
    SchemaGen, an ImporterNode for a hand-written schema, ExampleValidator,
    Transform and Trainer. ExampleValidator, Transform and Trainer all
    consume the imported schema; the SchemaGen output is produced but is not
    wired into any downstream component here.

    Args:
        pipeline_name: Name passed through to `pipeline.Pipeline`.
        pipeline_root: Root directory passed through to `pipeline.Pipeline`.
        data_path: Location handed to `external_input` for ImportExampleGen.
        preprocessing_fn: Passed to Transform as `preprocessing_fn`.
        run_fn: Passed to Trainer as `run_fn`.
        train_args: Training arguments forwarded to Trainer.
        eval_args: Evaluation arguments forwarded to Trainer.
        eval_accuracy_threshold: Accepted for interface compatibility; not
            read by this function.
        serving_model_dir: Accepted for interface compatibility; not read by
            this function.
        metadata_connection_config: Forwarded to `pipeline.Pipeline`.
        beam_pipeline_args: Forwarded to `pipeline.Pipeline`.
        ai_platform_training_args: When not None, Trainer is configured with
            the AI Platform executor and these arguments are placed in its
            `custom_config` under `TRAINING_ARGS_KEY`.
        ai_platform_serving_args: Accepted for interface compatibility; not
            read by this function.
        schema_uri: Source URI of the manually crafted schema loaded by the
            ImporterNode. Defaults to the location that was previously
            hard-coded.

    Returns:
        A `pipeline.Pipeline` holding the components above, with caching
        enabled.
    """
    components = []

    # Two output splits, 'train' with 3 hash buckets and 'eval' with 1,
    # partitioned on the 'image/filename' feature.
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(
            splits=[
                example_gen_pb2.SplitConfig.Split(name='train',
                                                  hash_buckets=3),
                example_gen_pb2.SplitConfig.Split(name='eval',
                                                  hash_buckets=1),
            ],
            partition_feature_name='image/filename'))

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = ImportExampleGen(input=external_input(data_path),
                                   output_config=output_config)
    components.append(example_gen)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'],
                                   stats_options=STATS_OPTIONS)
    components.append(statistics_gen)

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    components.append(schema_gen)

    # Imports the manually crafted schema; downstream components use this
    # schema rather than the one inferred by SchemaGen.
    importer_node = ImporterNode(
        instance_name='import_user_schema',
        source_uri=schema_uri,
        artifact_type=tfx.types.standard_artifacts.Schema)
    components.append(importer_node)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=importer_node.outputs['result'])
    components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=importer_node.outputs['result'],
                          preprocessing_fn=preprocessing_fn)
    components.append(transform)

    # Trainer keyword arguments. These defaults use the GenericExecutor from
    # `trainer_executor`; they are overridden below when AI Platform training
    # arguments are supplied.
    trainer_args = {
        'run_fn': run_fn,
        'transformed_examples': transform.outputs['transformed_examples'],
        'schema': importer_node.outputs['result'],
        'transform_graph': transform.outputs['transform_graph'],
        'train_args': train_args,
        'eval_args': eval_args,
        'custom_executor_spec':
            executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
    }
    if ai_platform_training_args is not None:
        # Swap in the AI Platform executor and hand it the training args.
        trainer_args.update({
            'custom_executor_spec':
                executor_spec.ExecutorClassSpec(
                    ai_platform_trainer_executor.GenericExecutor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                    ai_platform_training_args,
            },
        })
    trainer = Trainer(**trainer_args)
    components.append(trainer)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # Change this value to control caching of execution results. Default value
        # is `False`.
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )