def _create_pipeline():
    """Assemble the Chicago taxi data-validation pipeline.

    CSV data under ``_data_root`` flows through CsvExampleGen, StatisticsGen,
    SchemaGen and ExampleValidator.

    Returns:
        A ``pipeline.Pipeline`` named 'chicago_taxi_simple' with caching
        enabled, rooted at ``_pipeline_root``.
    """
    # Ingest the CSV input as examples.
    example_gen = CsvExampleGen(input=csv_input(_data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics are reused by schema inference and anomaly detection.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Derive a schema from the statistics.
    infer_schema = SchemaGen(statistics=statistics)

    # Detect anomalies by checking the statistics against the schema.
    validate_stats = ExampleValidator(
        statistics=statistics,
        schema=infer_schema.outputs['schema'])

    stages = [example_gen, statistics_gen, infer_schema, validate_stats]
    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_simple',
        pipeline_root=_pipeline_root,
        components=stages,
        enable_cache=True,
        metadata_db_root=_metadata_db_root,
    )
def create_pipeline():
    """Assemble the end-to-end pipeline over the CSV data under DATA_DIR.

    Stages run from ingestion through statistics, schema inference, example
    validation, transform, training, evaluation and model validation to a
    filesystem push into SERVING_DIR.

    Returns:
        A ``pipeline.Pipeline`` named PIPELINE_NAME and rooted at DAGS_DIR,
        with caching enabled and INFO-level logging under LOGS_DIR.
    """
    # Read data in; this is also where the data could be split.
    example_gen = CsvExampleGen(
        input_base=csv_input(DATA_DIR), name='iris_example')
    raw_examples = example_gen.outputs.examples

    # Generate feature statistics.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    stats = statistics_gen.outputs.output

    # Infer a schema for the data.
    infer_schema = SchemaGen(stats=stats)
    schema = infer_schema.outputs.output

    # Identify anomalies in training and serving data.
    validate_stats = ExampleValidator(stats=stats, schema=schema)

    # Feature engineering; emits a SavedModel that does the preprocessing.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=TRANSFORM_MODULE_FILE)

    # Train a model.
    trainer = Trainer(
        module_file=MODEL_MODULE_FILE,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs.output

    # Evaluate the model on different slices of the data.
    model_analyzer = Evaluator(
        examples=raw_examples, model_exports=trained_model)

    # Compare the new model against a baseline on the same dataset.
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push a blessed model to the serving directory.
    serving_target = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=SERVING_DIR))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        push_destination=serving_target)

    stages = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
    logger_settings = {'log_root': LOGS_DIR, 'log_level': logging.INFO}
    return pipeline.Pipeline(
        pipeline_name=PIPELINE_NAME,
        pipeline_root=DAGS_DIR,
        components=stages,
        enable_cache=True,
        metadata_db_root=METADATA_DIR,
        additional_pipeline_args={'logger_args': logger_settings})
def _create_pipeline():
    """Assemble the Chicago taxi ingest/statistics/schema pipeline.

    The Beam arguments select the DataflowRunner and are built from the
    module-level ``_project_id``, ``_output_dir`` and ``_gcp_region`` values.

    Returns:
        A ``pipeline.Pipeline`` named 'chicago_taxi_pipeline_kubeflow'.
    """
    # Ingest the CSV input as examples.
    example_gen = CsvExampleGen(input=csv_input(_data_root))

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])

    dataflow_args = [
        '--runner=DataflowRunner',
        '--experiments=shuffle_mode=auto',
        '--project=' + _project_id,
        '--temp_location=' + os.path.join(_output_dir, 'tmp'),
        '--region=' + _gcp_region,
    ]
    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_pipeline_kubeflow',
        pipeline_root=_pipeline_root,
        components=[example_gen, statistics_gen, infer_schema],
        additional_pipeline_args={'beam_pipeline_args': dataflow_args},
        log_root='/var/tmp/tfx/logs',
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Assemble the Chicago taxi data-validation pipeline.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Location handed to ``external_input`` for CSV ingestion.
        metadata_path: Path used to build the SQLite metadata connection.

    Returns:
        A cached ``pipeline.Pipeline`` with example generation, statistics,
        schema inference and example validation.
    """
    # Ingest the external CSV input as examples.
    example_gen = CsvExampleGen(input_base=external_input(data_root))

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])
    stats = statistics_gen.outputs['output']

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(stats=stats)

    # Detect anomalies by checking the statistics against the schema.
    validate_stats = ExampleValidator(
        stats=stats, schema=infer_schema.outputs['output'])

    metadata_config = metadata.sqlite_metadata_connection_config(metadata_path)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen, infer_schema, validate_stats],
        enable_cache=True,
        metadata_connection_config=metadata_config,
        additional_pipeline_args={},
    )
def _create_test_pipeline(
        pipeline_root: Text,
        csv_input_location: Text,
        taxi_module_file: Text,
        output_bucket: Text,
        enable_cache: bool,
        pipeline_name: Text = 'chicago_taxi_pipeline_simple'):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

    Args:
        pipeline_root: The root of the pipeline output.
        csv_input_location: The location of the input data directory.
        taxi_module_file: The location of the module file for
            Transform/Trainer.
        output_bucket: Base location; the model is pushed to its
            'model_serving' subdirectory.
        enable_cache: Whether to enable cache or not.
        pipeline_name: The name of the pipeline. Defaults to the name that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A logical TFX pipeline.Pipeline object.
    """
    examples = csv_input(csv_input_location)
    example_gen = CsvExampleGen(input_base=examples)
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
    infer_schema = SchemaGen(
        stats=statistics_gen.outputs.output, infer_feature_shape=False)
    validate_stats = ExampleValidator(
        stats=statistics_gen.outputs.output,
        schema=infer_schema.outputs.output)
    # The same module file supplies both the transform and trainer code.
    transform = Transform(
        input_data=example_gen.outputs.examples,
        schema=infer_schema.outputs.output,
        module_file=taxi_module_file)
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    # Evaluation is sliced by the 'trip_start_hour' column.
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))
    model_validator = ModelValidator(
        examples=example_gen.outputs.examples, model=trainer.outputs.output)
    pusher = Pusher(
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(output_bucket, 'model_serving'))))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=enable_cache,
    )
def testTaxiPipelineNewStyleCompatibility(self):
    """Checks that new-style channel keys alias the old-style keys.

    Each component is built with the new-style keyword arguments; the
    assertions verify that the old and new key names resolve to the very same
    object in the component's ``inputs`` / ``outputs``.
    """
    example_gen = CsvExampleGen(input=external_input('/tmp/fake/path'))
    self.assertIs(example_gen.inputs['input'],
                  example_gen.inputs['input_base'])
    raw_examples = example_gen.outputs['examples']

    statistics_gen = StatisticsGen(examples=raw_examples)
    self.assertIs(statistics_gen.inputs['examples'],
                  statistics_gen.inputs['input_data'])
    statistics = statistics_gen.outputs['statistics']

    infer_schema = SchemaGen(statistics=statistics)
    self.assertIs(infer_schema.inputs['statistics'],
                  infer_schema.inputs['stats'])
    self.assertIs(infer_schema.outputs['schema'],
                  infer_schema.outputs['output'])
    schema = infer_schema.outputs['schema']

    example_validator = ExampleValidator(statistics=statistics, schema=schema)
    self.assertIs(example_validator.inputs['statistics'],
                  example_validator.inputs['stats'])
    self.assertIs(example_validator.outputs['anomalies'],
                  example_validator.outputs['output'])

    transform = Transform(
        examples=raw_examples,
        schema=schema,
        module_file='/tmp/fake/module/file')
    self.assertIs(transform.inputs['examples'],
                  transform.inputs['input_data'])
    self.assertIs(transform.outputs['transform_graph'],
                  transform.outputs['transform_output'])

    trainer = Trainer(
        module_file='/tmp/fake/module/file',
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    self.assertIs(trainer.inputs['transform_graph'],
                  trainer.inputs['transform_output'])
    self.assertIs(trainer.outputs['model'], trainer.outputs['output'])

    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model=trainer.outputs['model'],
        feature_slicing_spec=hourly_slices)
    self.assertIs(evaluator.inputs['model'],
                  evaluator.inputs['model_exports'])

    model_validator = ModelValidator(
        examples=raw_examples, model=trainer.outputs['model'])

    # The pusher deliberately consumes the old-style 'output' key.
    pusher = Pusher(
        model=trainer.outputs['output'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory='/fake/serving/dir')))
    self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
    self.assertIs(pusher.outputs['pushed_model'],
                  pusher.outputs['model_push'])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Assemble the Iris flowers pipeline.

    The trainer consumes the raw examples directly; there is no Transform
    stage in this pipeline.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Location handed to ``external_input`` for CSV ingestion.
        module_file: Module file passed to the Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        metadata_path: Path used to build the SQLite metadata connection.

    Returns:
        A cached ``pipeline.Pipeline`` containing the eight stages below.
    """
    # Ingest the external CSV input as examples.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = infer_schema.outputs['schema']

    # Detect anomalies by checking the statistics against the schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Train with the user-provided module file.
    trainer = Trainer(
        module_file=module_file,
        examples=raw_examples,
        schema=schema,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs['model']

    # Compute evaluation statistics for the trained model.
    model_analyzer = Evaluator(
        examples=raw_examples, model_exports=trained_model)

    # Quality validation of the candidate model.
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to the serving directory, gated on the validator's blessing.
    serving_target = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=serving_target)

    stages = [
        example_gen, statistics_gen, infer_schema, validate_stats, trainer,
        model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={},
    )
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
    """Build the component list of a simple Chicago Taxi test pipeline.

    Args:
        pipeline_root: The root of the pipeline output; the model is pushed
            to its 'model_serving' subdirectory.
        csv_input_location: The location of the input data directory.
        transform_module: The location of the transform module file.
        trainer_module: The location of the trainer module file.

    Returns:
        The nine components, from example generation through to the pusher.
    """
    example_gen = CsvExampleGen(
        input=dsl_utils.csv_input(csv_input_location))
    raw_examples = example_gen.outputs['examples']

    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = infer_schema.outputs['schema']

    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    transform = Transform(
        examples=raw_examples, schema=schema, module_file=transform_module)

    # Step counts are kept tiny (10 train / 5 eval).
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    trained_model = trainer.outputs['model']

    # Evaluation is sliced by the 'trip_start_hour' column.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model=trained_model,
        feature_slicing_spec=hourly_slices)

    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    serving_dir = os.path.join(pipeline_root, 'model_serving')
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_dir)))

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
def testCsvExampleGenOnDataflowRunner(self):
    """Runs a pipeline containing only CsvExampleGen via the Dataflow setup."""
    name_suffix = self._random_id()
    pipeline_name = 'kubeflow-csv-example-gen-dataflow-test-{}'.format(
        name_suffix)
    example_gen = CsvExampleGen(input=dsl_utils.csv_input(self._data_root))
    dataflow_pipeline = self._create_dataflow_pipeline(
        pipeline_name, [example_gen])
    self._compile_and_run_pipeline(dataflow_pipeline)
def create_pipeline():
    """Build the component list for the step-by-step Chicago taxi pipeline.

    Only CSV ingestion is active. The remaining stages are kept commented
    out, tagged with the step at which they are meant to be enabled.

    Returns:
        A list holding the active components (currently just example_gen).
    """
    # Ingest the CSV input under data_root as examples.
    example_gen = CsvExampleGen(input_base=csv_input(data_root))

    # Computes statistics over data for visualization and example validation.
    # statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3

    # Generates schema based on statistics files.
    # schema_gen = SchemaGen(stats=statistics_gen.outputs.output) # Step 3

    # Performs anomaly detection based on statistics and data schema.
    # validate_stats = ExampleValidator( # Step 3
    #     stats=statistics_gen.outputs.output, # Step 3
    #     schema=schema_gen.outputs.output) # Step 3

    # Performs transformations and feature engineering in training and serving.
    # transform = Transform( # Step 4
    #     input_data=example_gen.outputs.examples, # Step 4
    #     schema=schema_gen.outputs.output, # Step 4
    #     module_file=taxi_module_file) # Step 4

    # Uses user-provided Python function that implements a model using TF-Learn.
    # trainer = Trainer( # Step 5
    #     module_file=taxi_module_file, # Step 5
    #     transformed_examples=transform.outputs.transformed_examples, # Step 5
    #     schema=schema_gen.outputs.output, # Step 5
    #     transform_output=transform.outputs.transform_output, # Step 5
    #     train_steps=10000, # Step 5
    #     eval_steps=5000, # Step 5
    #     warm_starting=True) # Step 5

    # Uses TFMA to compute a evaluation statistics over features of a model.
    # model_analyzer = Evaluator( # Step 6
    #     examples=example_gen.outputs.examples, # Step 6
    #     model_exports=trainer.outputs.output) # Step 6

    # Performs quality validation of a candidate model (compared to a baseline).
    # model_validator = ModelValidator( # Step 7
    #     examples=example_gen.outputs.examples, model=trainer.outputs.output) # Step 7

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    # pusher = Pusher( # Step 7
    #     model_export=trainer.outputs.output, # Step 7
    #     model_blessing=model_validator.outputs.blessing, # Step 7
    #     serving_model_dir=serving_model_dir) # Step 7

    active_components = [
        example_gen,
        # statistics_gen, schema_gen, validate_stats, # Step 3
        # transform, # Step 4
        # trainer, # Step 5
        # model_analyzer, # Step 6
        # model_validator, pusher # Step 7
    ]
    return active_components
def _create_pipeline():
    """Build the component list of the full Chicago taxi pipeline.

    Module-level settings (``_data_root``, ``_taxi_module_file``,
    ``_serving_model_dir``) supply the input, user module and push target.

    Returns:
        The nine components, from example generation through to the pusher.
    """
    # Ingest the CSV input as examples.
    example_gen = CsvExampleGen(input_base=csv_input(_data_root))
    raw_examples = example_gen.outputs.examples

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    stats = statistics_gen.outputs.output

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(stats=stats)
    schema = infer_schema.outputs.output

    # Detect anomalies by checking the statistics against the schema.
    validate_stats = ExampleValidator(stats=stats, schema=schema)

    # Transformations and feature engineering from the taxi module file.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=_taxi_module_file)

    # Train with the same user-provided module file.
    trainer = Trainer(
        module_file=_taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs.output

    # Evaluate the model, sliced by the 'trip_start_hour' column.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices)

    # Quality validation of the candidate model.
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to the serving directory, gated on the validator's blessing.
    serving_target = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        push_destination=serving_target)

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
def create_pipeline():
    """Build a one-component pipeline that only ingests CSV data.

    Returns:
        A single-element list holding the CsvExampleGen component that reads
        from ``data_dir``.
    """
    # Bring the CSV data into the pipeline.
    csv_source = csv_input(data_dir)
    return [CsvExampleGen(input_base=csv_source)]
def create_pipeline():
    """Build the component list of the full Chicago taxi pipeline.

    Input is read from the 'simple' subdirectory of ``data_root``.

    Returns:
        The nine components, from example generation through to the pusher.
    """
    # Ingest the CSV input as examples.
    csv_source = csv_input(os.path.join(data_root, 'simple'))
    example_gen = CsvExampleGen(input_base=csv_source)
    raw_examples = example_gen.outputs.examples

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    stats = statistics_gen.outputs.output

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(stats=stats)
    schema = infer_schema.outputs.output

    # Detect anomalies by checking the statistics against the schema.
    validate_stats = ExampleValidator(stats=stats, schema=schema)

    # Transformations and feature engineering from the taxi module file.
    transform = Transform(
        input_data=raw_examples, schema=schema, module_file=taxi_module_file)

    # Train with the same user-provided module file.
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_steps=10000,
        eval_steps=5000,
        warm_starting=True)
    trained_model = trainer.outputs.output

    # Compute evaluation statistics for the trained model.
    model_analyzer = Evaluator(
        examples=raw_examples, model_exports=trained_model)

    # Quality validation of the candidate model.
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to the serving directory, gated on the validator's blessing.
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        serving_model_dir=serving_model_dir)

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text) -> pipeline.Pipeline:
    """Assemble a Chicago taxi pipeline that only ingests CSV data.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Passed directly to CsvExampleGen as ``input_base``.

    Returns:
        A cached ``pipeline.Pipeline`` with a single CsvExampleGen component.
    """
    # The only stage: bring the CSV data into the pipeline.
    stages = [CsvExampleGen(input_base=data_root)]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        enable_cache=True,
        additional_pipeline_args={},
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text,
                     serving_model_dir: Text) -> pipeline.Pipeline:
    """Assemble the nine-stage pipeline with explicit train/eval input splits.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Location handed to ``external_input`` for CSV ingestion.
        module_file: Module file shared by Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.

    Returns:
        A ``pipeline.Pipeline`` with ``log_root`` set to '/var/tmp/tfx/logs'.
    """
    # 'train' and 'eval' splits are matched to separate CSV file patterns.
    split_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='iris_training.csv'),
        example_gen_pb2.Input.Split(name='eval', pattern='iris_test.csv')
    ])
    example_gen = CsvExampleGen(
        input_base=external_input(data_root), input_config=split_config)
    raw_examples = example_gen.outputs.examples

    statistics_gen = StatisticsGen(input_data=raw_examples)
    stats = statistics_gen.outputs.output

    infer_schema = SchemaGen(stats=stats)
    schema = infer_schema.outputs.output

    validate_stats = ExampleValidator(stats=stats, schema=schema)

    transform = Transform(
        input_data=raw_examples, schema=schema, module_file=module_file)

    trainer = Trainer(
        module_file=module_file,
        examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500))
    trained_model = trainer.outputs.output

    # A single empty slicing spec: no slicing column is named.
    slicing = evaluator_pb2.FeatureSlicingSpec(
        specs=[evaluator_pb2.SingleSlicingSpec()])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing)

    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    serving_target = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        push_destination=serving_target)

    stages = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        log_root='/var/tmp/tfx/logs',
    )
def _create_pipeline(pipeline_name: str, pipeline_root: str, data_root: str,
                     metadata_path: str) -> pipeline.Pipeline:
    """Assemble a Chicago taxi pipeline that only ingests CSV data.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Passed directly to CsvExampleGen as ``input_base``.
        metadata_path: Path used to build the SQLite metadata connection.

    Returns:
        A cached ``pipeline.Pipeline`` with a single CsvExampleGen component.
    """
    # The only stage: bring the CSV data into the pipeline.
    stages = [CsvExampleGen(input_base=data_root)]
    metadata_config = metadata.sqlite_metadata_connection_config(metadata_path)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        enable_cache=True,
        metadata_connection_config=metadata_config,
        additional_pipeline_args={},
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Assemble a Chicago taxi pipeline with ingestion and statistics only.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Location handed to ``external_input`` for CSV ingestion.
        metadata_path: Path used to build the SQLite metadata connection.

    Returns:
        A cached ``pipeline.Pipeline`` with CsvExampleGen and StatisticsGen.
    """
    # Ingest the external CSV input as examples.
    example_gen = CsvExampleGen(input_base=external_input(data_root))

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])

    metadata_config = metadata.sqlite_metadata_connection_config(metadata_path)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen],
        enable_cache=True,
        metadata_connection_config=metadata_config)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Assemble the nine-stage pipeline evaluated with a TFMA EvalConfig.

    Channels are read with dict-style access and their canonical key names
    ('statistics', 'schema', 'model', ...) throughout, instead of mixing in
    the legacy attribute-style aliases (``outputs.output`` etc.).

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Location handed to ``external_input`` for CSV ingestion.
        module_file: Module file shared by Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        direct_num_workers: Value formatted into the Beam
            ``--direct_num_workers`` argument.

    Returns:
        The assembled ``pipeline.Pipeline``.
    """
    examples = external_input(data_root)
    example_gen = CsvExampleGen(input=examples)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=infer_schema.outputs['schema'],
        module_file=module_file)
    trainer = Trainer(
        module_file=module_file,
        examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # A single default SlicingSpec: no slicing column is named.
    eval_config = tfma.EvalConfig(slicing_specs=[tfma.SlicingSpec()])
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        eval_config=eval_config)
    model_validator = ModelValidator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers]
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text) -> pipeline.Pipeline:
    """Assemble a Chicago taxi pipeline: ingestion, statistics and schema.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Passed directly to CsvExampleGen as ``input_base``.

    Returns:
        A cached ``pipeline.Pipeline`` with the three stages below.
    """
    # Bring the CSV data into the pipeline.
    example_gen = CsvExampleGen(input_base=data_root)
    raw_examples = example_gen.outputs['examples']

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(statistics=statistics)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen, infer_schema],
        enable_cache=True,
        additional_pipeline_args={},
    )
def _create_pipeline():
    """Assemble the Chicago taxi pipeline: ingestion, statistics and schema.

    Returns:
        A ``pipeline.Pipeline`` named 'chicago_taxi_simple' with caching
        enabled, rooted at ``_pipeline_root``.
    """
    # Ingest the CSV input as examples.
    example_gen = CsvExampleGen(input_base=csv_input(_data_root))
    raw_examples = example_gen.outputs.examples

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(input_data=raw_examples)

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    stages = [example_gen, statistics_gen, infer_schema]
    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_simple',
        pipeline_root=_pipeline_root,
        components=stages,
        enable_cache=True,
        metadata_db_root=_metadata_db_root,
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Assemble the Chicago taxi pipeline for a portable Beam runner.

    Args:
        pipeline_name: Name given to the pipeline.
        pipeline_root: Root passed to the pipeline as ``pipeline_root``.
        data_root: Location handed to ``external_input`` for CSV ingestion.
        module_file: Module file shared by Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        metadata_path: Path used to build the SQLite metadata connection.

    Returns:
        A cached ``pipeline.Pipeline`` whose Beam args target the
        PortableRunner job server at localhost:8099.
    """
    # Ingest the external CSV input as examples.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Compute statistics over the examples.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Generate a schema from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = infer_schema.outputs['schema']

    # Detect anomalies by checking the statistics against the schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering from the module file.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Train with the same user-provided module file.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs['model']

    # Evaluate the model, sliced by the 'trip_start_hour' column.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices)

    # Quality validation of the candidate model.
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to the serving directory, gated on the validator's blessing.
    serving_target = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=serving_target)

    # LINT.IfChange
    beam_pipeline_args = [
        # ----- Beam Args -----.
        '--runner=PortableRunner',
        # Points to the job server started in
        # setup_beam_on_(flink|spark).sh
        '--job_endpoint=localhost:8099',
        '--environment_type=LOOPBACK',
        # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
        # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
        # Note; We use 100 worker threads to mitigate the issue with
        # scheduling work between the Beam runner and SDK harness. Flink
        # and Spark can process unlimited work items concurrently while
        # SdkHarness can only process 1 work item per worker thread.
        # Having 100 threads will let 100 tasks execute concurrently
        # avoiding scheduling issue in most cases. In case the threads are
        # exhausted, beam print the relevant message in the log.
        '--experiments=worker_threads=100',
        # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
        '--experiments=pre_optimize=all',
        # ----- Flink runner-specific Args -----.
        # TODO(b/126725506): Set the task parallelism based on cpu cores.
        # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
        '--execution_mode_for_batch=BATCH_FORCED',
    ]
    # LINT.ThenChange(setup/setup_beam_on_spark.sh)
    # LINT.ThenChange(../chicago_taxi/setup_beam_on_flink.sh)

    stages = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=stages,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={'beam_pipeline_args': beam_pipeline_args})
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Builds the Chicago taxi example pipeline from standard TFX components.

  Args:
    pipeline_name: Name handed to the resulting pipeline.
    pipeline_root: Root location handed to the resulting pipeline.
    data_root: Location of the CSV input consumed by CsvExampleGen.
    module_file: User module shared by the Transform and Trainer components.
    serving_model_dir: Filesystem directory configured as the push destination.
    direct_num_workers: Value for the `--direct_num_workers` Beam argument.

  Returns:
    A pipeline.Pipeline made of the nine components created here.
  """
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs['examples']

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = infer_schema.outputs['schema']

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(statistics=statistics, schema=schema)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=module_file)

  # Trains a model with the user-provided code found in module_file.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
  )
  candidate_model = trainer.outputs['model']

  # Uses TFMA to compute evaluation statistics over features of a model,
  # sliced along the trip_start_hour column.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model_exports=candidate_model,
      feature_slicing_spec=slicing_spec)

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=raw_examples, model=candidate_model)

  # Checks whether the model passed the validation steps and pushes the model
  # to the serving_model_dir filesystem destination if the check passed.
  push_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=push_destination)

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
      additional_pipeline_args={},
  )
def _create_parameterized_pipeline(
    pipeline_name: Text,
    pipeline_root: Optional[Text] = _pipeline_root,
    enable_cache: Optional[bool] = True,
    direct_num_workers: Optional[int] = 1) -> pipeline.Pipeline:
  """Creates a simple TFX pipeline whose inputs are RuntimeParameters.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    enable_cache: Whether to enable cache in this pipeline.
    direct_num_workers: Number of workers executing the underlying beam
      pipeline in the executors.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  # --- Pipeline parameters -------------------------------------------------
  # Path to the CSV data file, under which there should be a data.csv file.
  data_root_param = data_types.RuntimeParameter(
      name='data-root', default='gs://my-bucket/data', ptype=Text)

  # Path to the module file.
  taxi_module_file_param = data_types.RuntimeParameter(
      name='module-file',
      default='gs://my-bucket/modules/taxi_utils.py',
      ptype=Text)

  # Number of steps in training (fed to train_args['num_steps']).
  train_steps = data_types.RuntimeParameter(
      name='train-steps', default=10, ptype=int)

  # Number of steps in evaluation (fed to eval_args['num_steps']).
  eval_steps = data_types.RuntimeParameter(
      name='eval-steps', default=5, ptype=int)

  # Column name for slicing.
  slicing_column = data_types.RuntimeParameter(
      name='slicing-column', default='trip_start_hour', ptype=Text)

  # --- Components -----------------------------------------------------------
  # The input data location is parameterized by data_root_param.
  example_gen = CsvExampleGen(input=external_input(data_root_param))
  raw_examples = example_gen.outputs['examples']

  statistics_gen = StatisticsGen(input_data=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  infer_schema = SchemaGen(stats=statistics, infer_feature_shape=False)
  schema = infer_schema.outputs['schema']

  validate_stats = ExampleValidator(stats=statistics, schema=schema)

  # The module file used in the Transform and Trainer components is
  # parameterized by taxi_module_file_param.
  transform = Transform(
      input_data=raw_examples,
      schema=schema,
      module_file=taxi_module_file_param)

  # The numbers of steps in train_args and eval_args are RuntimeParameters
  # named 'train-steps' and 'eval-steps', respectively.
  trainer = Trainer(
      module_file=taxi_module_file_param,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_output=transform.outputs['transform_graph'],
      train_args={'num_steps': train_steps},
      eval_args={'num_steps': eval_steps})
  candidate_model = trainer.outputs['model']

  # The name of the slicing column is specified as a RuntimeParameter.
  model_analyzer = Evaluator(
      examples=raw_examples,
      model_exports=candidate_model,
      feature_slicing_spec=dict(
          specs=[{'column_for_slicing': [slicing_column]}]))

  model_validator = ModelValidator(
      examples=raw_examples, model=candidate_model)

  # TODO(b/145949533) Currently we use this hack to ensure push_destination can
  # be correctly parameterized and interpreted.
  # pipeline root will be specified as a dsl.PipelineParam with the name
  # pipeline-root, see:
  # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
  pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
  serving_dir = os.path.join(str(pipeline_root_param), 'model_serving')
  pusher = Pusher(
      model_export=candidate_model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)))

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=enable_cache,
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
  )
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  # Ingestion: CSV input located by the csv_input_location parameter.
  example_gen = CsvExampleGen(input=external_input(csv_input_location))
  raw_examples = example_gen.outputs['examples']

  # Statistics, schema and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']
  infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = infer_schema.outputs['schema']
  validate_stats = ExampleValidator(statistics=statistics, schema=schema)

  # Transform and Trainer share the same user module file.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
  )
  candidate_model = trainer.outputs['model']

  # Evaluation is sliced along the trip_start_hour column.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      feature_slicing_spec=slicing_spec,
  )
  model_validator = ModelValidator(
      examples=raw_examples, model=candidate_model)

  # The serving directory is derived from the pipeline root parameter.
  serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)),
  )

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name='parameterized_tfx_oss',
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=enable_cache,
  )
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  # Ingestion: CSV input located by the csv_input_location parameter.
  example_gen = CsvExampleGen(input=external_input(csv_input_location))
  raw_examples = example_gen.outputs['examples']

  # Statistics, schema and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']
  infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = infer_schema.outputs['schema']
  validate_stats = ExampleValidator(statistics=statistics, schema=schema)

  # Transform and Trainer share the same user module file.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
  )
  candidate_model = trainer.outputs['model']

  # --- TFMA config for Model Evaluation and Validation ----------------------
  # To add validation thresholds for metrics saved with the model, add them
  # keyed by metric name to the thresholds map.
  accuracy_threshold = tfma.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.5}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  # The metrics added here are in addition to those saved with the model
  # (assuming either a keras model or EvalSavedModel is used). Any metrics
  # added into the saved model (for example using
  # model.compile(..., metrics=[...]), etc) will be computed automatically.
  metrics_spec = tfma.MetricsSpec(
      metrics=[tfma.MetricConfig(class_name='ExampleCount')],
      thresholds={'binary_accuracy': accuracy_threshold})
  eval_config = tfma.EvalConfig(
      model_specs=[
          # Using signature 'eval' implies the use of an EvalSavedModel. To
          # use a serving model remove the signature so that it defaults to
          # 'serving_default' and add a label_key.
          tfma.ModelSpec(signature_name='eval')
      ],
      metrics_specs=[metrics_spec],
      slicing_specs=[
          # An empty slice spec means the overall slice, i.e. the whole
          # dataset.
          tfma.SlicingSpec(),
          # Data can be sliced along a feature column. In this case, data is
          # sliced along feature column trip_start_hour.
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ])

  model_analyzer = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      eval_config=eval_config,
  )

  # The serving directory is derived from the pipeline root parameter; the
  # blessing comes from the Evaluator.
  serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_analyzer.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)),
  )

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name='parameterized_tfx_oss',
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=enable_cache,
  )
def _create_pipeline():
  """Builds the components of the chicago taxi pipeline with Slack approval.

  Returns:
    The list of ten TFX components, in the order they were created.
  """
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=csv_input(_data_root))
  raw_examples = example_gen.outputs.examples

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=raw_examples)
  statistics = statistics_gen.outputs.output

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics)
  schema = infer_schema.outputs.output

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(stats=statistics, schema=schema)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=raw_examples, schema=schema, module_file=_taxi_module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=schema,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  candidate_model = trainer.outputs.output

  # Uses TFMA to compute evaluation statistics over features of a model.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model_exports=candidate_model,
      feature_slicing_spec=slicing_spec)

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=raw_examples, model=candidate_model)

  # This custom component serves as a bridge between the pipeline and human
  # model reviewers to enable a review-and-push workflow in the model
  # development cycle. It utilizes the Slack API to send a message to a
  # user-defined Slack channel with model URI info and waits for a go / no-go
  # decision from the same Slack channel:
  #   * To approve the model, users need to reply to the thread sent out by
  #     the bot started by SlackComponent with 'lgtm' or 'approve'.
  #   * To reject the model, users need to reply to the thread sent out by
  #     the bot started by SlackComponent with 'decline' or 'reject'.
  slack_validator = SlackComponent(
      model_export=candidate_model,
      model_blessing=model_validator.outputs.blessing,
      slack_token=_slack_token,
      channel_id=_channel_id,
      timeout_sec=3600,
  )

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  push_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=_serving_model_dir))
  pusher = Pusher(
      model_export=candidate_model,
      model_blessing=slack_validator.outputs.slack_blessing,
      push_destination=push_destination)

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, slack_validator, pusher
  ]
  return components
def _create_parameterized_pipeline(
    pipeline_name: Text,
    pipeline_root: Optional[Text] = _pipeline_root,
    enable_cache: Optional[bool] = True,
    direct_num_workers: Optional[int] = 1) -> pipeline.Pipeline:
  """Creates a simple TFX pipeline whose inputs are RuntimeParameters.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    enable_cache: Whether to enable cache in this pipeline.
    direct_num_workers: Number of workers executing the underlying beam
      pipeline in the executors.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  # --- Pipeline parameters -------------------------------------------------
  # Path to the CSV data file, under which there should be a data.csv file.
  data_root = data_types.RuntimeParameter(
      name='data-root', default='gs://my-bucket/data', ptype=Text)

  # Path to the transform module file.
  transform_module_file = data_types.RuntimeParameter(
      name='transform-module',
      default='gs://my-bucket/modules/transform_module.py',
      ptype=Text)

  # Path to the trainer module file.
  trainer_module_file = data_types.RuntimeParameter(
      name='trainer-module',
      default='gs://my-bucket/modules/trainer_module.py',
      ptype=Text)

  # Number of steps in training (fed to train_args['num_steps']).
  train_steps = data_types.RuntimeParameter(
      name='train-steps', default=10, ptype=int)

  # Number of steps in evaluation (fed to eval_args['num_steps']).
  eval_steps = data_types.RuntimeParameter(
      name='eval-steps', default=5, ptype=int)

  # Column name for slicing.
  slicing_column = data_types.RuntimeParameter(
      name='slicing-column', default='trip_start_hour', ptype=Text)

  # --- Components -----------------------------------------------------------
  # The input data location is parameterized by data_root.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs['examples']

  statistics_gen = StatisticsGen(input_data=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = infer_schema.outputs['schema']

  validate_stats = ExampleValidator(statistics=statistics, schema=schema)

  # The module file used in the Transform component is parameterized by
  # transform_module_file.
  transform = Transform(
      input_data=raw_examples,
      schema=schema,
      module_file=transform_module_file)

  # The Trainer module is parameterized by trainer_module_file; the numbers of
  # steps in train_args and eval_args are RuntimeParameters named
  # 'train-steps' and 'eval-steps', respectively.
  trainer = Trainer(
      module_file=trainer_module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_output=transform.outputs['transform_graph'],
      train_args={'num_steps': train_steps},
      eval_args={'num_steps': eval_steps})
  candidate_model = trainer.outputs['model']

  # The name of the slicing column is specified as a RuntimeParameter.
  model_analyzer = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      feature_slicing_spec=dict(
          specs=[{'column_for_slicing': [slicing_column]}]))

  model_validator = ModelValidator(
      examples=raw_examples, model=candidate_model)

  # The serving directory is derived from the pipeline root parameter.
  serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
  pusher = Pusher(
      model_export=candidate_model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)))

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=enable_cache,
      # TODO(b/142684737): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Builds a TFX pipeline that trains with GenericExecutor and a baseline.

  Args:
    pipeline_name: Name handed to the resulting pipeline.
    pipeline_root: Root location handed to the resulting pipeline.
    data_root: Location of the CSV input consumed by CsvExampleGen.
    module_file: User module shared by the Transform and Trainer components.
    serving_model_dir: Filesystem directory configured as the push destination.
    direct_num_workers: Value for the `--direct_num_workers` Beam argument.

  Returns:
    A pipeline.Pipeline made of the nine components created here.
  """
  # Ingestion.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs['examples']

  # Statistics, schema (with feature shapes inferred) and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']
  infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=True)
  schema = infer_schema.outputs['schema']
  validate_stats = ExampleValidator(statistics=statistics, schema=schema)

  # Transform and Trainer share the same user module file.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=module_file)
  trainer = Trainer(
      module_file=module_file,
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=1000),
      eval_args=trainer_pb2.EvalArgs(num_steps=500))
  candidate_model = trainer.outputs['model']

  # Resolver configured with LatestBlessedModelResolver; its model output is
  # the Evaluator's baseline.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # Thresholds on sparse_categorical_accuracy over the overall slice.
  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.9}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key='species')],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={'sparse_categorical_accuracy': accuracy_threshold})
      ])

  model_analyzer = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # The Pusher takes its blessing from the Evaluator and targets
  # serving_model_dir on the filesystem.
  push_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_analyzer.outputs['blessing'],
      push_destination=push_destination)

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_resolver, model_analyzer, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
      enable_cache=True)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Builds the Chicago taxi example pipeline with a baseline-aware Evaluator.

  Args:
    pipeline_name: Name handed to the resulting pipeline.
    pipeline_root: Root location handed to the resulting pipeline.
    data_root: Location of the CSV input consumed by CsvExampleGen.
    module_file: User module shared by the Transform and Trainer components.
    serving_model_dir: Filesystem directory configured as the push destination.
    direct_num_workers: Value for the `--direct_num_workers` Beam argument.

  Returns:
    A pipeline.Pipeline made of the nine components created here.
  """
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs['examples']

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = infer_schema.outputs['schema']

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(statistics=statistics, schema=schema)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=module_file)

  # Trains a model with the user-provided code found in module_file.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
  )
  candidate_model = trainer.outputs['model']

  # Get the latest blessed model for model validation.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # Uses TFMA to compute evaluation statistics over features of a model and
  # perform quality validation of a candidate model (compared to a baseline).
  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={'binary_accuracy': accuracy_threshold})
      ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      baseline_model=model_resolver.outputs['model'],
      # Change threshold will be ignored if there is no baseline (first run).
      eval_config=eval_config)

  # Checks whether the model passed the validation steps and pushes the model
  # to the serving_model_dir filesystem destination if the check passed.
  push_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_analyzer.outputs['blessing'],
      push_destination=push_destination)

  components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_resolver, model_analyzer, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      # TODO(b/142684737): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
  )
def _create_pipeline():
  """Implements the chicago taxi pipeline with TFX.

  Only the example-ingestion component is active; the remaining components are
  kept as commented-out code tagged with the step at which to enable them.

  Returns:
    A pipeline.Pipeline named 'taxi' containing the active components.
  """
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=csv_input(_data_root))

  # Computes statistics over data for visualization and example validation.
  # pylint: disable=line-too-long
  # statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3
  # pylint: enable=line-too-long

  # Generates schema based on statistics files.
  # infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Step 3

  # Performs anomaly detection based on statistics and data schema.
  # validate_stats = ExampleValidator( # Step 3
  #     stats=statistics_gen.outputs.output, # Step 3
  #     schema=infer_schema.outputs.output) # Step 3

  # Performs transformations and feature engineering in training and serving.
  # transform = Transform( # Step 4
  #     input_data=example_gen.outputs.examples, # Step 4
  #     schema=infer_schema.outputs.output, # Step 4
  #     module_file=_taxi_module_file) # Step 4

  # Uses user-provided Python function that implements a model using TF-Learn.
  # trainer = Trainer( # Step 5
  #     module_file=_taxi_module_file, # Step 5
  #     transformed_examples=transform.outputs.transformed_examples, # Step 5
  #     schema=infer_schema.outputs.output, # Step 5
  #     transform_output=transform.outputs.transform_output, # Step 5
  #     train_args=trainer_pb2.TrainArgs(num_steps=10000), # Step 5
  #     eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Step 5

  # Uses TFMA to compute a evaluation statistics over features of a model.
  # model_analyzer = Evaluator( # Step 6
  #     examples=example_gen.outputs.examples, # Step 6
  #     model_exports=trainer.outputs.output, # Step 6
  #     feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ # Step 6
  #         evaluator_pb2.SingleSlicingSpec( # Step 6
  #             column_for_slicing=['trip_start_hour']) # Step 6
  #     ])) # Step 6

  # Performs quality validation of a candidate model (compared to a baseline).
  # model_validator = ModelValidator( # Step 7
  #     examples=example_gen.outputs.examples, # Step 7
  #     model=trainer.outputs.output) # Step 7

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  # pusher = Pusher( # Step 7
  #     model_export=trainer.outputs.output, # Step 7
  #     model_blessing=model_validator.outputs.blessing, # Step 7
  #     push_destination=pusher_pb2.PushDestination( # Step 7
  #         filesystem=pusher_pb2.PushDestination.Filesystem( # Step 7
  #             base_directory=_serving_model_dir))) # Step 7

  components = [
      example_gen,
      # statistics_gen, infer_schema, validate_stats, # Step 3
      # transform, # Step 4
      # trainer, # Step 5
      # model_analyzer, # Step 6
      # model_validator, pusher # Step 7
  ]
  return pipeline.Pipeline(
      pipeline_name='taxi',
      pipeline_root=_pipeline_root,
      components=components,
      enable_cache=True,
      metadata_db_root=_metadata_db_root,
      additional_pipeline_args={'logger_args': logger_overrides},
  )