def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
    """
    Builds the training pipeline as a series of TFX components.

    The data, split, preprocessing and training components are always
    created. The evaluator and the deployer are only appended when their
    step keys are present in the `steps` section of the configuration.

    Args:
        config: A ZenML configuration in dictionary format. The training
         step config is copied before backend settings are merged into it,
         so that entry of the caller's dictionary is left untouched.

    Returns:
        A chronological list of TFX components making up the training
         pipeline.
    """
    steps = config[keys.GlobalKeys.STEPS]

    component_list = []

    ############
    # RAW DATA #
    ############
    data_config = steps[keys.TrainingSteps.DATA]
    data = DataGen(
        source=data_config[keys.StepKeys.SOURCE],
        source_args=data_config[keys.StepKeys.ARGS],
    ).with_id(GDPComponent.DataGen.name)

    statistics_data = StatisticsGen(
        examples=data.outputs.examples,
    ).with_id(GDPComponent.DataStatistics.name)

    schema_data = SchemaGen(
        statistics=statistics_data.outputs.output,
    ).with_id(GDPComponent.DataSchema.name)

    component_list.extend([data, statistics_data, schema_data])

    datapoints = data.outputs.examples

    #############
    # SPLITTING #
    #############
    # The split consumes the raw examples together with the schema and
    # statistics computed on them above.
    split_config = steps[keys.TrainingSteps.SPLIT]
    splits = SplitGen(
        input_examples=datapoints,
        source=split_config[keys.StepKeys.SOURCE],
        source_args=split_config[keys.StepKeys.ARGS],
        schema=schema_data.outputs.schema,
        statistics=statistics_data.outputs.output,
    ).with_id(GDPComponent.SplitGen.name)

    # From here on, downstream components work on the split examples.
    datapoints = splits.outputs.examples

    statistics_split = StatisticsGen(
        examples=datapoints,
    ).with_id(GDPComponent.SplitStatistics.name)

    schema_split = SchemaGen(
        statistics=statistics_split.outputs.output,
    ).with_id(GDPComponent.SplitSchema.name)

    schema = schema_split.outputs.schema

    component_list.extend([splits, statistics_split, schema_split])

    #################
    # PREPROCESSING #
    #################
    transform = Transform(
        preprocessing_fn=constants.PREPROCESSING_FN,
        examples=datapoints,
        schema=schema,
        custom_config=steps[keys.TrainingSteps.PREPROCESSING],
    ).with_id(GDPComponent.Transform.name)

    component_list.extend([transform])

    ############
    # TRAINING #
    ############
    training_backend: TrainingLocalBackend = \
        self.backends_dict[TrainingLocalBackend.BACKEND_KEY]

    training_kwargs = {
        'custom_executor_spec': training_backend.get_executor_spec(),
        # Shallow copy: merging the backend settings below must not write
        # into the configuration dictionary owned by the caller.
        'custom_config': dict(steps[keys.TrainingSteps.TRAINING]),
    }
    training_kwargs['custom_config'].update(
        training_backend.get_custom_config())

    trainer = Trainer(
        transformed_examples=transform.outputs.transformed_examples,
        transform_graph=transform.outputs.transform_graph,
        run_fn=constants.TRAINER_FN,
        schema=schema,
        train_args=trainer_pb2.TrainArgs(),
        eval_args=trainer_pb2.EvalArgs(),
        **training_kwargs
    ).with_id(GDPComponent.Trainer.name)

    component_list.extend([trainer])

    #############
    # EVALUATOR #
    #############
    if keys.TrainingSteps.EVALUATION in steps:
        from zenml.utils import source_utils

        # Split the dotted EVALUATOR_MODULE_FN into the part before the
        # last dot (resolved to an absolute path) and the final segment
        # (used as the '.py' file name inside that path).
        eval_module = '.'.join(
            constants.EVALUATOR_MODULE_FN.split('.')[:-1])
        eval_module_file = constants.EVALUATOR_MODULE_FN.split('.')[-1]
        abs_path = source_utils.get_absolute_path_from_module(eval_module)
        custom_extractor_path = os.path.join(abs_path,
                                             eval_module_file) + '.py'

        eval_step: TFMAEvaluator = TFMAEvaluator.from_config(
            steps[keys.TrainingSteps.EVALUATION])
        eval_config = eval_step.build_eval_config()

        evaluator = Evaluator(
            examples=transform.outputs.transformed_examples,
            model=trainer.outputs.model,
            eval_config=eval_config,
            module_file=custom_extractor_path,
        ).with_id(GDPComponent.Evaluator.name)

        component_list.append(evaluator)

    ###########
    # SERVING #
    ###########
    if keys.TrainingSteps.DEPLOYMENT in steps:
        serving_args = steps[keys.TrainingSteps.DEPLOYMENT]['args']

        project_id = serving_args['project_id']
        output_base_dir = self.artifact_store.path
        if 'model_name' in serving_args:
            model_name = serving_args['model_name']
        else:
            # No explicit name given: derive one from the pipeline name,
            # with dashes replaced by underscores.
            model_name = self.pipeline_name().replace('-', '_')

        gcaip_deployer = GCAIPDeployer(output_base_dir=output_base_dir,
                                       project_id=project_id,
                                       model_name=model_name)

        pusher_config = gcaip_deployer.build_pusher_config()
        pusher_executor_spec = gcaip_deployer.get_executor_spec()

        pusher = Pusher(
            model_export=trainer.outputs.output,
            custom_executor_spec=pusher_executor_spec,
            **pusher_config
        ).with_id(GDPComponent.Deployer.name)

        component_list.append(pusher)

    return component_list
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
    """Assembles the parameterized Chicago Taxi TFX pipeline.

    The CSV input location and the Transform/Trainer module file are passed
    in as runtime parameters rather than fixed strings.

    Args:
      pipeline_root: The root of the pipeline output.
      csv_input_location: The location of the input data directory.
      taxi_module_file: The location of the module file for Transform/Trainer.
      enable_cache: Whether to enable cache or not.

    Returns:
      A logical TFX pipeline.Pipeline object.
    """
    # Ingestion.
    example_gen = CsvExampleGen(input=external_input(csv_input_location))
    raw_examples = example_gen.outputs['examples']

    # Statistics, inferred schema and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']
    infer_schema = SchemaGen(
        statistics=statistics,
        infer_feature_shape=False,
    )
    schema = infer_schema.outputs['schema']
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering and training share the same module file.
    transform = Transform(
        examples=raw_examples,
        schema=schema,
        module_file=taxi_module_file,
    )
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
    )
    trained_model = trainer.outputs['model']

    # Model analysis sliced on a single column, then model validation.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model=trained_model,
        feature_slicing_spec=slicing_spec,
    )
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # The serving directory is placed under the pipeline root parameter.
    serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_dir)),
    )

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats,
        transform, trainer, model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name='parameterized_tfx_oss',
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
    )
def _create_parameterized_pipeline(
        pipeline_name: Text,
        pipeline_root: Optional[Text] = _pipeline_root,
        enable_cache: Optional[bool] = True,
        direct_num_workers: Optional[int] = 1) -> pipeline.Pipeline:
    """Creates a simple TFX pipeline with RuntimeParameter.

    Args:
      pipeline_name: The name of the pipeline.
      pipeline_root: The root of the pipeline output.
      enable_cache: Whether to enable cache in this pipeline.
      direct_num_workers: Number of workers executing the underlying beam
        pipeline in the executors. When None, no `--direct_num_workers` flag
        is passed to Beam.

    Returns:
      A logical TFX pipeline.Pipeline object.
    """
    # First, define the pipeline parameters.
    # Path to the CSV data file, under which there should be a data.csv file.
    data_root_param = data_types.RuntimeParameter(
        name='data-root',
        default='gs://my-bucket/data',
        ptype=Text,
    )

    # Path to the module file.
    taxi_module_file_param = data_types.RuntimeParameter(
        name='module-file',
        default='gs://my-bucket/modules/taxi_utils.py',
        ptype=Text,
    )

    # Number of steps in training (passed to train_args as `num_steps`).
    train_steps = data_types.RuntimeParameter(
        name='train-steps',
        default=10,
        ptype=int,
    )

    # Number of steps in evaluation (passed to eval_args as `num_steps`).
    eval_steps = data_types.RuntimeParameter(
        name='eval-steps',
        default=5,
        ptype=int,
    )

    # Column name for slicing.
    slicing_column = data_types.RuntimeParameter(
        name='slicing-column',
        default='trip_start_hour',
        ptype=Text,
    )

    # The input data location is parameterized by data_root_param.
    examples = external_input(data_root_param)
    example_gen = CsvExampleGen(input=examples)

    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])
    infer_schema = SchemaGen(
        stats=statistics_gen.outputs['statistics'], infer_feature_shape=False)
    validate_stats = ExampleValidator(
        stats=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # The module file used in the Transform and Trainer components is
    # parameterized by taxi_module_file_param.
    transform = Transform(
        input_data=example_gen.outputs['examples'],
        schema=infer_schema.outputs['schema'],
        module_file=taxi_module_file_param)

    # The numbers of steps in train_args and eval_args are specified as
    # RuntimeParameters named 'train-steps' and 'eval-steps', respectively.
    trainer = Trainer(
        module_file=taxi_module_file_param,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_output=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps})

    # The name of the slicing column is specified as a RuntimeParameter.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['model'],
        feature_slicing_spec=dict(specs=[{
            'column_for_slicing': [slicing_column]
        }]))
    model_validator = ModelValidator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])

    # TODO(b/145949533) Currently we use this hack to ensure push_destination
    # can be correctly parameterized and interpreted.
    # pipeline root will be specified as a dsl.PipelineParam with the name
    # pipeline-root, see:
    # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
    pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
    pusher = Pusher(
        model_export=trainer.outputs['model'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(
                    str(pipeline_root_param), 'model_serving'))))

    # The signature allows direct_num_workers=None, but '%d' % None raises
    # TypeError, so the flag is only emitted for an actual number.
    beam_pipeline_args = []
    if direct_num_workers is not None:
        beam_pipeline_args.append(
            '--direct_num_workers=%d' % direct_num_workers)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=enable_cache,
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline():
    """Builds the Chicago taxi TFX pipeline configured for a portable Beam runner.

    Returns:
      A pipeline.Pipeline made of the nine components created below, with a
      SQLite metadata connection and the Beam arguments defined in this
      function.
    """
    # Ingest the CSV data found under the module-level data root.
    example_gen = CsvExampleGen(input_base=csv_input(_data_root))
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    infer_schema = SchemaGen(stats=statistics)
    schema = infer_schema.outputs.output

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=_taxi_module_file)

    # Model training driven by the user-provided module file.
    trainer = Trainer(
        module_file=_taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs.output

    # Evaluation statistics over features of the model, sliced by one column.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing_spec)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # Pushes the model to a file destination if the validation check passed.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        push_destination=push_destination)

    # LINT.IfChange
    beam_pipeline_args = [
        # ----- Beam Args -----.
        '--runner=PortableRunner',
        # Points to the job server started in
        # setup_beam_on_(flink|spark).sh
        '--job_endpoint=localhost:8099',
        '--environment_type=LOOPBACK',
        # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
        # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
        # Note: 100 worker threads are used to mitigate the issue with
        # scheduling work between the Beam runner and the SDK harness.
        # Flink and Spark can process unlimited work items concurrently
        # while SdkHarness can only process 1 work item per worker thread.
        # Having 100 threads lets 100 tasks execute concurrently, avoiding
        # the scheduling issue in most cases. In case the threads are
        # exhausted, Beam prints the relevant message in the log.
        '--experiments=worker_threads=100',
        # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
        '--experiments=pre_optimize=all',
        # ----- Flink runner-specific Args -----.
        # TODO(b/126725506): Set the task parallelism based on cpu cores.
        # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
        '--execution_mode_for_batch=BATCH_FORCED',
    ]
    # LINT.ThenChange(tfx/examples/chicago_taxi/setup_beam_on_portable_beam.sh)

    return pipeline.Pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            _metadata_db_root),
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'beam_pipeline_args': beam_pipeline_args,
        },
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     module_file: Text,
                     presto_config: presto_config_pb2.PrestoConnConfig,
                     query: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Builds the Chicago taxi TFX pipeline reading its examples via Presto.

    Args:
      pipeline_name: Name given to the resulting pipeline.
      pipeline_root: Root passed through to the pipeline.
      module_file: Module file handed to both Transform and Trainer.
      presto_config: Connection config passed to PrestoExampleGen.
      query: Query passed to PrestoExampleGen.
      serving_model_dir: Base directory of the Pusher's filesystem destination.
      metadata_path: Path used for the SQLite metadata connection config.

    Returns:
      A pipeline.Pipeline containing the nine components created below.
    """
    # Ingestion through the Presto example generator.
    example_gen = PrestoExampleGen(presto_config, query=query)
    raw_examples = example_gen.outputs['examples']

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics.
    infer_schema = SchemaGen(statistics=statistics)
    schema = infer_schema.outputs['schema']

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Model training driven by the user-provided module file.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs['model']

    # Evaluation statistics over features of the model, sliced by one column.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing_spec)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # Pushes the model to a file destination if the validation check passed.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=push_destination)

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats,
        transform, trainer, model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={},
    )
def _create_parameterized_pipeline(
        pipeline_name: Text,
        pipeline_root: Optional[Text] = _pipeline_root,
        enable_cache: Optional[bool] = True,
        direct_num_workers: Optional[int] = 1) -> pipeline.Pipeline:
    """Creates a simple TFX pipeline with RuntimeParameter.

    Args:
      pipeline_name: The name of the pipeline.
      pipeline_root: The root of the pipeline output.
      enable_cache: Whether to enable cache in this pipeline.
      direct_num_workers: Number of workers executing the underlying beam
        pipeline in the executors. When None, no `--direct_num_workers` flag
        is passed to Beam.

    Returns:
      A logical TFX pipeline.Pipeline object.
    """
    # First, define the pipeline parameters.
    # Path to the CSV data file, under which there should be a data.csv file.
    data_root = data_types.RuntimeParameter(
        name='data-root',
        default='gs://my-bucket/data',
        ptype=Text,
    )

    # Path to the transform module file.
    transform_module_file = data_types.RuntimeParameter(
        name='transform-module',
        default='gs://my-bucket/modules/transform_module.py',
        ptype=Text,
    )

    # Path to the trainer module file.
    trainer_module_file = data_types.RuntimeParameter(
        name='trainer-module',
        default='gs://my-bucket/modules/trainer_module.py',
        ptype=Text,
    )

    # Number of steps in training (passed to train_args as `num_steps`).
    train_steps = data_types.RuntimeParameter(
        name='train-steps',
        default=10,
        ptype=int,
    )

    # Number of steps in evaluation (passed to eval_args as `num_steps`).
    eval_steps = data_types.RuntimeParameter(
        name='eval-steps',
        default=5,
        ptype=int,
    )

    # Column name for slicing.
    slicing_column = data_types.RuntimeParameter(
        name='slicing-column',
        default='trip_start_hour',
        ptype=Text,
    )

    # The input data location is parameterized by data_root.
    examples = external_input(data_root)
    example_gen = CsvExampleGen(input=examples)

    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])
    infer_schema = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # The module file used in the Transform component is parameterized by
    # transform_module_file.
    transform = Transform(
        input_data=example_gen.outputs['examples'],
        schema=infer_schema.outputs['schema'],
        module_file=transform_module_file)

    # The Trainer uses its own module file (trainer_module_file). The numbers
    # of steps in train_args and eval_args are specified as RuntimeParameters
    # named 'train-steps' and 'eval-steps', respectively.
    trainer = Trainer(
        module_file=trainer_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_output=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps})

    # The name of the slicing column is specified as a RuntimeParameter.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        feature_slicing_spec=dict(specs=[{
            'column_for_slicing': [slicing_column]
        }]))
    model_validator = ModelValidator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])

    # The serving directory is placed under the pipeline root parameter.
    pusher = Pusher(
        model_export=trainer.outputs['model'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(
                    str(pipeline.ROOT_PARAMETER), 'model_serving'))))

    # The signature allows direct_num_workers=None, but '%d' % None raises
    # TypeError, so the flag is only emitted for an actual number.
    beam_pipeline_args = []
    if direct_num_workers is not None:
        beam_pipeline_args.append(
            '--direct_num_workers=%d' % direct_num_workers)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=enable_cache,
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline():
    """Builds the Chicago taxi TFX components with a Slack approval step.

    Returns:
      The list of ten components in execution order. The caller is
      responsible for wrapping them in a pipeline.
    """
    # Ingest the CSV data found under the module-level data root.
    example_gen = CsvExampleGen(input_base=csv_input(_data_root))
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    infer_schema = SchemaGen(stats=statistics)
    schema = infer_schema.outputs.output

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=_taxi_module_file)

    # Model training driven by the user-provided module file.
    trainer = Trainer(
        module_file=_taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs.output

    # Evaluation statistics over features of the model, sliced by one column.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing_spec)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # This custom component serves as a bridge between the pipeline and human
    # model reviewers, enabling a review-and-push workflow in the model
    # development cycle. It uses the Slack API to send a message with the
    # model URI info to a user-defined Slack channel and waits for a
    # go / no-go decision from the same channel:
    #   * To approve the model, users reply to the thread started by the
    #     SlackComponent bot with 'lgtm' or 'approve'.
    #   * To reject the model, users reply to that thread with 'decline' or
    #     'reject'.
    slack_validator = SlackComponent(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        slack_token=_slack_token,
        channel_id=_channel_id,
        timeout_sec=3600,
    )

    # The Pusher is gated on the Slack blessing rather than directly on the
    # ModelValidator blessing, and pushes to a file destination.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=slack_validator.outputs.slack_blessing,
        push_destination=push_destination)

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, slack_validator, pusher
    ]
def _create_pipeline(
        pipeline_name: Text, pipeline_root: Text, query: Text,
        module_file: Text, beam_pipeline_args: List[Text],
        ai_platform_training_args: Dict[Text, Text],
        bigquery_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
    """Builds the Chicago taxi TFX pipeline that trains on AI Platform and pushes to BigQuery ML.

    Args:
      pipeline_name: Name given to the resulting pipeline.
      pipeline_root: Root passed through to the pipeline.
      query: Query passed to BigQueryExampleGen.
      module_file: Module file handed to both Transform and Trainer.
      beam_pipeline_args: Beam arguments passed through to the pipeline.
      ai_platform_training_args: Stored in the Trainer's custom config under
        'ai_platform_training_args'.
      bigquery_serving_args: Stored in the Pusher's custom config under
        'bigquery_serving_args'.

    Returns:
      A pipeline.Pipeline containing the nine components created below.
    """
    # Ingestion from BigQuery.
    example_gen = BigQueryExampleGen(query=query)
    raw_examples = example_gen.outputs['examples']

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = infer_schema.outputs['schema']

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Training uses the AI Platform trainer executor in place of the default.
    trainer_executor = executor_spec.ExecutorClassSpec(
        ai_platform_trainer_executor.Executor)
    trainer = Trainer(
        custom_executor_spec=trainer_executor,
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'ai_platform_training_args': ai_platform_training_args})
    trained_model = trainer.outputs['model']

    # Evaluation statistics over features of the model, sliced by one column.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model=trained_model,
        feature_slicing_spec=slicing_spec)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=trained_model)

    # Pushing uses the BigQuery ML pusher executor; no filesystem
    # push_destination is configured here.
    pusher_executor = executor_spec.ExecutorClassSpec(
        bigquery_ml_pusher_executor.Executor)
    pusher = Pusher(
        custom_executor_spec=pusher_executor,
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        custom_config={'bigquery_serving_args': bigquery_serving_args})

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats,
        transform, trainer, model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

    The Trainer and Pusher use the Google Cloud AI Platform executors when
    they can be imported. Otherwise each falls back to the deprecated
    `cmle_*` custom-config flags. Only a failure of the import itself
    triggers the fallback; errors raised while constructing a component
    propagate to the caller.

    Returns:
      A pipeline.Pipeline containing the nine components created below,
      configured with Dataflow beam arguments.
    """
    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=_query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        stats=statistics_gen.outputs.output,
        schema=infer_schema.outputs.output)

    # Performs transformations and feature engineering in training and
    # serving.
    transform = Transform(
        input_data=example_gen.outputs.examples,
        schema=infer_schema.outputs.output,
        module_file=_taxi_utils)

    # Uses user-provided Python function that implements a model using
    # TF-Learn to train a model on Google Cloud AI Platform.
    # Arguments shared by both Trainer variants.
    trainer_kwargs = dict(
        module_file=_taxi_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    # The try covers only the import, so an ImportError raised while building
    # the component is not mistaken for "extension not installed".
    try:
        from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
    except ImportError:
        # Train using a deprecated flag.
        trainer = Trainer(
            custom_config={'cmle_training_args': _ai_platform_training_args},
            **trainer_kwargs)
    else:
        # Train using a custom executor. This requires TFX >= 0.14.
        trainer = Trainer(
            executor_class=ai_platform_trainer_executor.Executor,
            custom_config={
                'ai_platform_training_args': _ai_platform_training_args
            },
            **trainer_kwargs)

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a
    # baseline).
    model_validator = ModelValidator(
        examples=example_gen.outputs.examples,
        model=trainer.outputs.output)

    # Checks whether the model passed the validation steps and pushes the
    # model to a destination if check passed.
    # Arguments shared by both Pusher variants.
    pusher_kwargs = dict(
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
    )
    try:
        from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
    except ImportError:
        # Deploy the model on Google Cloud AI Platform, using a deprecated
        # flag. This variant additionally sets a filesystem push destination.
        pusher = Pusher(
            custom_config={'cmle_serving_args': _ai_platform_serving_args},
            push_destination=pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory=_serving_model_dir)),
            **pusher_kwargs)
    else:
        # Deploy the model on Google Cloud AI Platform. This requires
        # TFX >= 0.14.
        pusher = Pusher(
            executor_class=ai_platform_pusher_executor.Executor,
            custom_config={
                'ai_platform_serving_args': _ai_platform_serving_args
            },
            **pusher_kwargs)

    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_pipeline_kubeflow',
        pipeline_root=_pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'beam_pipeline_args': [
                '--runner=DataflowRunner',
                '--experiments=shuffle_mode=auto',
                '--project=' + _project_id,
                '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
                '--region=' + _gcp_region,
            ],
            # Optional args:
            # 'tfx_image': custom docker image to use for components.
            # This is needed if TFX package is not installed from an RC
            # or released version.
        },
        log_root='/var/tmp/tfx/logs',
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Assemble the Iris flowers pipeline out of TFX components.

    Args:
        pipeline_name: Name forwarded to the resulting pipeline.
        pipeline_root: Root location forwarded to the resulting pipeline.
        data_root: Location passed to ``external_input`` as the CSV source.
        module_file: User module file given to both Transform and Trainer.
        serving_model_dir: Base directory of the Pusher's filesystem
            destination.
        metadata_path: Path used to build the SQLite metadata connection
            config.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Ingestion: bring the CSV data into the pipeline.
    csv_source = external_input(data_root)
    example_gen = CsvExampleGen(input_base=csv_source)
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    schema_gen = SchemaGen(stats=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs.output

    # Anomaly detection based on the statistics and the schema.
    example_validator = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=module_file,
    )

    # Model training driven by the user-provided module file.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )

    # TFMA evaluation statistics for the trained model.
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trainer.outputs.output,
    )

    # Quality validation of the candidate model (compared to a baseline).
    validator = ModelValidator(
        examples=raw_examples,
        model=trainer.outputs['output'],
    )

    # Push the model to the file destination if the validation check passed.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model_export=trainer.outputs.output,
        model_blessing=validator.outputs.blessing,
        push_destination=serving_destination,
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
    metadata_config = metadata.sqlite_metadata_connection_config(
        metadata_path)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata_config,
        additional_pipeline_args={},
    )
def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, enable_cache: bool):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

    The pipeline name is fixed to ``'parameterized_tfx_oss'``.

    Args:
        pipeline_root: The root of the pipeline output.
        csv_input_location: The location of the input data directory.
        taxi_module_file: The location of the module file for
            Transform/Trainer.
        enable_cache: Whether to enable cache or not.

    Returns:
        A logical TFX pipeline.Pipeline object.
    """
    csv_source = csv_input(csv_input_location)
    example_gen = CsvExampleGen(input_base=csv_source)
    raw_examples = example_gen.outputs.examples

    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    schema_gen = SchemaGen(stats=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs.output

    example_validator = ExampleValidator(stats=statistics, schema=schema)

    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=taxi_module_file,
    )

    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
    )
    trained_model = trainer.outputs.output

    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    validator = ModelValidator(examples=raw_examples, model=trained_model)

    # Hack: ensuring push_destination can be correctly parameterized and
    # interpreted. The pipeline root will be specified as a dsl.PipelineParam
    # with the name pipeline-root, see:
    # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
    _pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
    serving_dir = os.path.join(str(_pipeline_root_param), 'model_serving')
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=validator.outputs.blessing,
        push_destination=serving_destination,
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name='parameterized_tfx_oss',
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int = 1) -> pipeline.Pipeline:
    """Assemble the Chicago taxi pipeline out of TFX components.

    Args:
        pipeline_name: Name forwarded to the resulting pipeline.
        pipeline_root: Root location forwarded to the resulting pipeline.
        data_root: Location passed to ``external_input`` as the CSV source.
        module_file: User module file given to both Transform and Trainer.
        serving_model_dir: Base directory of the Pusher's filesystem
            destination.
        metadata_path: Path used to build the SQLite metadata connection
            config.
        direct_num_workers: Value placed in the ``--direct_num_workers`` Beam
            pipeline argument.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Ingestion: bring the CSV data into the pipeline.
    csv_source = external_input(data_root)
    example_gen = CsvExampleGen(input=csv_source)
    raw_examples = example_gen.outputs['examples']

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs['schema']

    # Anomaly detection based on the statistics and the schema.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples,
        schema=schema,
        module_file=module_file,
    )

    # Model training driven by the user-provided module file.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = trainer.outputs['model']

    # TFMA evaluation statistics, sliced by trip_start_hour.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    # Quality validation of the candidate model (compared to a baseline).
    validator = ModelValidator(examples=raw_examples, model=trained_model)

    # Push the model to the file destination if the validation check passed.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=validator.outputs['blessing'],
        push_destination=serving_destination,
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
    metadata_config = metadata.sqlite_metadata_connection_config(
        metadata_path)
    # Note that direct_num_workers != 1 will enable multi-process for TFX;
    # the FnApiRunner[1] setting is hidden from the user, but this is subject
    # to change if Beam offers a pure flag setup.
    # [1] https://issues.apache.org/jira/browse/BEAM-3645
    beam_args = ['--direct_num_workers=%s' % direct_num_workers]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata_config,
        beam_pipeline_args=beam_args,
        additional_pipeline_args={},
    )
def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
    """
    Builds the training pipeline as a series of TFX components.

    The data, split, preprocessing and training stages are always built.
    The sequencing, evaluation and deployment stages are only added when
    their step key is present in the configuration. The passed `config`
    is read but never modified.

    Args:
        config: A ZenML configuration in dictionary format. The steps are
         read from config[GlobalKeys.PIPELINE][PipelineKeys.STEPS].

    Returns:
        A chronological list of TFX components making up the training
         pipeline.
    """
    steps = config[keys.GlobalKeys.PIPELINE][keys.PipelineKeys.STEPS]

    component_list = []

    ############
    # RAW DATA #
    ############
    data_config = steps[keys.TrainingSteps.DATA]
    data = DataGen(
        name=self.datasource.name,
        source=data_config[keys.StepKeys.SOURCE],
        source_args=data_config[keys.StepKeys.ARGS],
    ).with_id(GDPComponent.DataGen.name)

    statistics_data = StatisticsGen(
        examples=data.outputs.examples,
    ).with_id(GDPComponent.DataStatistics.name)

    schema_data = SchemaGen(
        statistics=statistics_data.outputs.output,
    ).with_id(GDPComponent.DataSchema.name)

    component_list.extend([data, statistics_data, schema_data])

    # `datapoints` and `schema` always refer to the latest stage's outputs,
    # so each later stage consumes whatever the previous one produced.
    datapoints = data.outputs.examples

    #############
    # SPLITTING #
    #############
    split_config = steps[keys.TrainingSteps.SPLIT]
    splits = SplitGen(
        input_examples=datapoints,
        source=split_config[keys.StepKeys.SOURCE],
        source_args=split_config[keys.StepKeys.ARGS],
        schema=schema_data.outputs.schema,
        statistics=statistics_data.outputs.output,
    ).with_id(GDPComponent.SplitGen.name)

    datapoints = splits.outputs.examples

    statistics_split = StatisticsGen(
        examples=datapoints,
    ).with_id(GDPComponent.SplitStatistics.name)

    schema_split = SchemaGen(
        statistics=statistics_split.outputs.output,
    ).with_id(GDPComponent.SplitSchema.name)

    schema = schema_split.outputs.schema

    component_list.extend([splits, statistics_split, schema_split])

    ##############
    # SEQUENCING #
    ##############
    if keys.TrainingSteps.SEQUENCER in steps:
        sequencer_config = steps[keys.TrainingSteps.SEQUENCER]
        sequencer = Sequencer(
            input_examples=datapoints,
            schema=schema,
            statistics=statistics_split.outputs.statistics,
            source=sequencer_config[keys.StepKeys.SOURCE],
            source_args=sequencer_config[keys.StepKeys.ARGS],
        ).with_id(GDPComponent.Sequencer.name)

        sequencer_statistics = StatisticsGen(
            examples=sequencer.outputs.output_examples,
        ).with_id(GDPComponent.SequencerStatistics.name)

        sequencer_schema = SchemaGen(
            statistics=sequencer_statistics.outputs.output,
            infer_feature_shape=True,
        ).with_id(GDPComponent.SequencerSchema.name)

        # Downstream stages work on the sequenced data and its schema.
        datapoints = sequencer.outputs.output_examples
        schema = sequencer_schema.outputs.schema

        component_list.extend(
            [sequencer, sequencer_statistics, sequencer_schema])

    #################
    # PREPROCESSING #
    #################
    transform = Transform(
        preprocessing_fn=constants.PREPROCESSING_FN,
        examples=datapoints,
        schema=schema,
        custom_config=steps[keys.TrainingSteps.PREPROCESSER],
    ).with_id(GDPComponent.Transform.name)

    component_list.append(transform)

    ############
    # TRAINING #
    ############
    training_backend: TrainingBaseBackend = \
        self.steps_dict[keys.TrainingSteps.TRAINER].backend

    # default to local
    if training_backend is None:
        training_backend = TrainingBaseBackend()

    custom_executor_spec = training_backend.get_executor_spec()

    # Merge the backend's settings into a *copy* of the trainer step config.
    # Updating steps[TRAINER] in place would silently alter the caller's
    # `config` dictionary.
    trainer_custom_config = dict(steps[keys.TrainingSteps.TRAINER])
    trainer_custom_config.update(training_backend.get_custom_config())

    trainer = Trainer(
        transformed_examples=transform.outputs.transformed_examples,
        transform_graph=transform.outputs.transform_graph,
        run_fn=constants.TRAINER_FN,
        schema=schema,
        train_args=trainer_pb2.TrainArgs(),
        eval_args=trainer_pb2.EvalArgs(),
        custom_executor_spec=custom_executor_spec,
        custom_config=trainer_custom_config,
    ).with_id(GDPComponent.Trainer.name)

    component_list.append(trainer)

    #############
    # EVALUATOR #
    #############
    if keys.TrainingSteps.EVALUATOR in steps:
        from zenml.utils import source_utils

        # Split the dotted path into its parent module and final component,
        # then turn that into the path of the corresponding .py file.
        eval_module, _, eval_module_file = \
            constants.EVALUATOR_MODULE_FN.rpartition('.')
        abs_path = source_utils.get_absolute_path_from_module(eval_module)
        custom_extractor_path = os.path.join(abs_path,
                                             eval_module_file) + '.py'

        eval_step: TFMAEvaluator = TFMAEvaluator.from_config(
            steps[keys.TrainingSteps.EVALUATOR])
        eval_config = eval_step.build_eval_config()

        evaluator = Evaluator(
            examples=transform.outputs.transformed_examples,
            model=trainer.outputs.model,
            eval_config=eval_config,
            module_file=custom_extractor_path,
        ).with_id(GDPComponent.Evaluator.name)

        component_list.append(evaluator)

    ###########
    # SERVING #
    ###########
    if keys.TrainingSteps.DEPLOYER in steps:
        deployer: BaseDeployerStep = \
            self.steps_dict[keys.TrainingSteps.DEPLOYER]

        pusher_config = deployer._build_pusher_args()
        pusher_executor_spec = deployer._get_executor_spec()

        pusher = Pusher(
            model_export=trainer.outputs.output,
            custom_executor_spec=pusher_executor_spec,
            **pusher_config,
        ).with_id(GDPComponent.Deployer.name)

        component_list.append(pusher)

    return component_list
def _create_pipeline():
    """Build the component list of the Chicago taxi pipeline with TFX.

    Data is read from BigQuery. Training and pushing target Google Cloud AI
    Platform, falling back to the deprecated ``cmle_*`` custom-config flags
    when the AI Platform executors cannot be imported.

    Returns:
        The list of the nine TFX components, in construction order.
    """
    # The pieces below concatenate to a single-line query; only the two
    # placeholders are substituted.
    query_template = (
        ' SELECT pickup_community_area, fare, '
        'EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month, '
        'EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour, '
        'EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day, '
        'UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp, '
        'pickup_latitude, pickup_longitude, '
        'dropoff_latitude, dropoff_longitude, '
        'trip_miles, pickup_census_tract, dropoff_census_tract, '
        'payment_type, company, trip_seconds, '
        'dropoff_community_area, tips '
        'FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` '
        'WHERE (ABS(FARM_FINGERPRINT(unique_key)) / {max_int64}) '
        '< {query_sample_rate}'
    )
    query = query_template.format(
        max_int64=_max_int64, query_sample_rate=_query_sample_rate)

    # Ingestion: bring the query result into the pipeline.
    example_gen = BigQueryExampleGen(query=query)
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    schema_gen = SchemaGen(stats=statistics)
    schema = schema_gen.outputs.output

    # Anomaly detection based on the statistics and the schema.
    example_validator = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=_taxi_utils,
    )

    # Arguments shared by both ways of constructing the Trainer.
    trainer_kwargs = dict(
        module_file=_taxi_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    try:
        from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
        # Train using a custom executor. This requires TFX >= 0.14.
        trainer = Trainer(
            executor_class=ai_platform_trainer_executor.Executor,
            custom_config={
                'ai_platform_training_args': _ai_platform_training_args
            },
            **trainer_kwargs)
    except ImportError:
        # Train using a deprecated flag.
        trainer = Trainer(
            custom_config={'cmle_training_args': _ai_platform_training_args},
            **trainer_kwargs)
    trained_model = trainer.outputs.output

    # TFMA evaluation statistics, sliced by trip_start_hour.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    # Quality validation of the candidate model (compared to a baseline).
    validator = ModelValidator(examples=raw_examples, model=trained_model)

    # Arguments shared by both ways of constructing the Pusher.
    pusher_kwargs = dict(
        model_export=trained_model,
        model_blessing=validator.outputs.blessing,
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=_serving_model_dir)),
    )
    try:
        from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
        # Deploy the model on Google Cloud AI Platform. This requires
        # TFX >= 0.14.
        pusher = Pusher(
            executor_class=ai_platform_pusher_executor.Executor,
            custom_config={
                'ai_platform_serving_args': _ai_platform_serving_args
            },
            **pusher_kwargs)
    except ImportError:
        # Deploy the model on Google Cloud AI Platform, using a deprecated
        # flag.
        pusher = Pusher(
            custom_config={'cmle_serving_args': _ai_platform_serving_args},
            **pusher_kwargs)

    return [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
    """
    Builds the NLP pipeline as a series of TFX components.

    The stages built are: data ingestion, tokenization, statistics and
    schema over the tokenized examples, splitting, and training. The
    passed `config` is read but never modified.

    Args:
        config: A ZenML configuration in dictionary format. The steps are
         read from config[GlobalKeys.PIPELINE][PipelineKeys.STEPS].

    Returns:
        A list of TFX components making up the NLP pipeline.
    """
    steps = config[keys.GlobalKeys.PIPELINE][keys.PipelineKeys.STEPS]

    component_list = []

    ############
    # RAW DATA #
    ############
    data_config = steps[keys.NLPSteps.DATA]
    data = DataGen(
        name=self.datasource.name,
        source=data_config[keys.StepKeys.SOURCE],
        source_args=data_config[keys.StepKeys.ARGS],
    ).with_id(GDPComponent.DataGen.name)

    #############
    # TOKENIZER #
    #############
    tokenizer_config = steps[keys.NLPSteps.TOKENIZER]
    tokenizer = Tokenizer(
        source=tokenizer_config[keys.StepKeys.SOURCE],
        source_args=tokenizer_config[keys.StepKeys.ARGS],
        examples=data.outputs.examples,
    ).with_id(GDPComponent.Tokenizer.name)

    # NOTE(review): the tokenizer is added to the list before the data
    # component it consumes, so the list is not strictly chronological.
    # The order is kept as-is here — confirm whether it matters downstream.
    component_list.append(tokenizer)

    # Statistics, schema and splits are all computed on the tokenized
    # examples rather than on the raw data.
    tokenized_examples = tokenizer.outputs.output_examples

    statistics_data = StatisticsGen(
        examples=tokenized_examples,
    ).with_id(GDPComponent.DataStatistics.name)

    schema_data = SchemaGen(
        statistics=statistics_data.outputs.output,
        infer_feature_shape=True,
    ).with_id(GDPComponent.DataSchema.name)

    split_config = steps[keys.NLPSteps.SPLIT]
    splits = SplitGen(
        input_examples=tokenized_examples,
        source=split_config[keys.StepKeys.SOURCE],
        source_args=split_config[keys.StepKeys.ARGS],
        schema=schema_data.outputs.schema,
        statistics=statistics_data.outputs.output,
    ).with_id(GDPComponent.SplitGen.name)

    component_list.extend([data, statistics_data, schema_data, splits])

    ############
    # TRAINING #
    ############
    training_backend: Optional[TrainingBaseBackend] = \
        self.steps_dict[keys.NLPSteps.TRAINER].backend

    # default to local
    if training_backend is None:
        training_backend = TrainingBaseBackend()

    custom_executor_spec = training_backend.get_executor_spec()

    # Merge the backend's settings into a *copy* of the trainer step config.
    # Updating steps[TRAINER] in place would silently alter the caller's
    # `config` dictionary.
    trainer_custom_config = dict(steps[keys.NLPSteps.TRAINER])
    trainer_custom_config.update(training_backend.get_custom_config())

    trainer = Trainer(
        examples=splits.outputs.examples,
        run_fn=constants.TRAINER_FN,
        schema=schema_data.outputs.schema,
        train_args=trainer_pb2.TrainArgs(),
        eval_args=trainer_pb2.EvalArgs(),
        custom_executor_spec=custom_executor_spec,
        custom_config=trainer_custom_config,
    ).with_id(GDPComponent.Trainer.name)

    component_list.append(trainer)

    return component_list
def _create__pipeline(pipeline_name: Text, pipeline_root: Text,
                      data_root: Text, module_file: Text,
                      ai_platform_training_args: Dict[Text, Text],
                      ai_platform_serving_args: Dict[Text, Text],
                      beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Assemble the online news pipeline out of TFX components.

    Args:
        pipeline_name: Name forwarded to the resulting pipeline.
        pipeline_root: Root location forwarded to the resulting pipeline.
        data_root: Location passed to ``external_input`` as the CSV source.
        module_file: User module file given to both Transform and Trainer.
        ai_platform_training_args: Passed to the Trainer's custom config
            under ``'ai_platform_training_args'``.
        ai_platform_serving_args: Passed to the Pusher's custom config under
            ``'ai_platform_serving_args'``.
        beam_pipeline_args: Forwarded unchanged to the pipeline.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Ingestion: bring the CSV data into the pipeline.
    csv_source = external_input(data_root)
    example_gen = CsvExampleGen(input_base=csv_source)
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    schema_gen = SchemaGen(stats=statistics)
    schema = schema_gen.outputs.output

    # Anomaly detection based on the statistics and the schema.
    example_validator = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=module_file,
    )

    # Model training with the user-provided module file (TensorFlow
    # Estimators API), run through the AI Platform trainer executor.
    trainer_executor = executor_spec.ExecutorClassSpec(
        ai_platform_trainer_executor.Executor)
    trainer = Trainer(
        custom_executor_spec=trainer_executor,
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'ai_platform_training_args': ai_platform_training_args},
    )
    trained_model = trainer.outputs.output

    # TFMA evaluation statistics, sliced by weekday.
    weekday_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=['weekday']),
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=weekday_slices,
    )

    # Quality validation of the candidate model (compared to a baseline).
    validator = ModelValidator(examples=raw_examples, model=trained_model)

    # Push the model through the AI Platform pusher executor if the
    # validation check passed.
    pusher_executor = executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor)
    pusher = Pusher(
        custom_executor_spec=pusher_executor,
        model_export=trained_model,
        model_blessing=validator.outputs.blessing,
        custom_config={'ai_platform_serving_args': ai_platform_serving_args},
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
    # enable_cache is not passed, so the Pipeline default applies.
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        beam_pipeline_args=beam_pipeline_args,
    )
def testAIPlatformTrainerPipeline(self):
    """Trainer-only test pipeline on AI Platform Training.

    Runs a pipeline made of three importers plus a Trainer, then checks
    that exactly one Trainer output exists and that it holds exactly one
    eval model and one serving export.
    """
    pipeline_name = 'kubeflow-aip-trainer-test-{}'.format(self._random_id())

    # Custom scale tier with a parameter server and two workers, to test
    # that distributed training behaves properly.
    aip_training_args = {
        'project': self._gcp_project_id,
        'region': self._gcp_region,
        'jobDir': os.path.join(self._pipeline_root(pipeline_name), 'tmp'),
        'masterConfig': {
            'imageUri': self._container_image,
        },
        'scaleTier': 'CUSTOM',
        'masterType': 'large_model',
        'parameterServerType': 'standard',
        'parameterServerCount': 1,
        'workerType': 'standard',
        'workerCount': 2,
    }

    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.Executor),
        module_file=self._trainer_module,
        transformed_examples=(
            self.transformed_examples_importer.outputs['result']),
        schema=self.schema_importer.outputs['result'],
        transform_graph=self.transform_graph_importer.outputs['result'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY: aip_training_args,
        },
    )

    components = [
        self.schema_importer,
        self.transformed_examples_importer,
        self.transform_graph_importer,
        trainer,
    ]
    test_pipeline = self._create_pipeline(pipeline_name, components)
    self._compile_and_run_pipeline(test_pipeline)

    # There must be only one execution of Trainer.
    model_base_dir = os.path.join(
        self._pipeline_root(pipeline_name), 'Trainer', 'model')
    trainer_runs = tf.io.gfile.listdir(model_base_dir)
    self.assertEqual(1, len(trainer_runs))

    # There must be only one saved model each for serving and eval.
    model_uri = os.path.join(model_base_dir, trainer_runs[0])
    eval_models = tf.io.gfile.listdir(path_utils.eval_model_dir(model_uri))
    self.assertEqual(1, len(eval_models))

    serving_export_dir = os.path.join(
        path_utils.serving_model_dir(model_uri), 'export', 'chicago-taxi')
    serving_models = tf.io.gfile.listdir(serving_export_dir)
    self.assertEqual(1, len(serving_models))
def _create_test_pipeline(pipeline_name: Text, pipeline_root: Text,
                          csv_input_location: Text, taxi_module_file: Text,
                          container_image: Text):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline for testing.

    Args:
        pipeline_name: The name of the pipeline.
        pipeline_root: The root of the pipeline output. Also the parent of
            the ``model_serving`` directory the Pusher writes to.
        csv_input_location: The location of the input data directory.
        taxi_module_file: The location of the module file for
            Transform/Trainer.
        container_image: The container image to use; passed as ``tfx_image``
            in the additional pipeline args.

    Returns:
        A logical TFX pipeline.Pipeline object.
    """
    csv_source = dsl_utils.csv_input(csv_input_location)
    example_gen = CsvExampleGen(input_base=csv_source)
    raw_examples = example_gen.outputs.examples

    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    schema_gen = SchemaGen(stats=statistics)
    schema = schema_gen.outputs.output

    example_validator = ExampleValidator(stats=statistics, schema=schema)

    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=taxi_module_file,
    )

    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = trainer.outputs.output

    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    validator = ModelValidator(examples=raw_examples, model=trained_model)

    serving_dir = os.path.join(pipeline_root, 'model_serving')
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=validator.outputs.blessing,
        push_destination=serving_destination,
    )

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
    return tfx_pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        log_root='/var/tmp/tfx/logs',
        additional_pipeline_args={
            'tfx_image': container_image,
        },
    )
def create_pipeline(pipeline_name, pipeline_root, input_path,
                    tf_transform_file, tf_trainer_file, serving_model_basedir,
                    **kwargs):
    """Build a TFX pipeline that reads TFRecord input.

    Args:
        pipeline_name: Name forwarded to the ``Pipeline`` constructor.
        pipeline_root: Root location forwarded to the ``Pipeline``
            constructor.
        input_path: Location passed to ``tfrecord_input``.
        tf_transform_file: Module file for the Transform component.
        tf_trainer_file: Module file for the Trainer component.
        serving_model_basedir: Base directory of the Pusher's filesystem
            destination.
        **kwargs: Extra keyword arguments forwarded to the ``Pipeline``
            constructor.

    Returns:
        The ``Pipeline`` with its ``components`` attribute set.
    """
    tfrecord_source = tfrecord_input(input_path)

    # TODO: make the input pattern an Airflow variable.
    input_split = example_gen_pb2.Input.Split(
        name='tfrecord', pattern='data_tfrecord-*.gz')
    input_config = example_gen_pb2.Input(splits=[input_split])

    # TODO: make the hash bucket counts Airflow variables.
    train_split = example_gen_pb2.SplitConfig.Split(
        name='train', hash_buckets=2)
    eval_split = example_gen_pb2.SplitConfig.Split(
        name='eval', hash_buckets=1)
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(
            splits=[train_split, eval_split]))

    example_gen = ImportExampleGen(
        input_base=tfrecord_source,
        input_config=input_config,
        output_config=output_config,
    )
    raw_examples = example_gen.outputs.examples

    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    schema_gen = SchemaGen(stats=statistics)
    schema = schema_gen.outputs.output

    example_validator = ExampleValidator(stats=statistics, schema=schema)

    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=tf_transform_file,
    )

    trainer = Trainer(
        module_file=tf_trainer_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = trainer.outputs.output

    # TODO: add your slicing column.
    slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(column_for_slicing=[]),
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing,
    )

    validator = ModelValidator(examples=raw_examples, model=trained_model)

    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_basedir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=validator.outputs.blessing,
        push_destination=serving_destination,
    )

    # The components are assigned after construction rather than passed to
    # the constructor.
    built_pipeline = Pipeline(
        pipeline_name=pipeline_name, pipeline_root=pipeline_root, **kwargs)
    built_pipeline.components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
    return built_pipeline
def _create_pipeline(
        pipeline_name: Text, pipeline_root: Text, query: Text,
        module_file: Text, serving_model_dir: Text,
        beam_pipeline_args: List[Text],
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
    """Assemble the Chicago taxi pipeline with TFX and Kubeflow Pipelines.

    Training and pushing target Google Cloud AI Platform, falling back to
    the deprecated ``cmle_*`` custom-config flags when the AI Platform
    executors cannot be imported.

    Args:
        pipeline_name: Name forwarded to the resulting pipeline.
        pipeline_root: Root location forwarded to the resulting pipeline.
        query: Query handed to ``BigQueryExampleGen``.
        module_file: User module file given to both Transform and Trainer.
        serving_model_dir: Base directory of the filesystem push destination
            used by the fallback Pusher.
        beam_pipeline_args: Passed on under ``'beam_pipeline_args'`` in the
            additional pipeline args.
        ai_platform_training_args: Passed to the Trainer's custom config.
        ai_platform_serving_args: Passed to the Pusher's custom config.

    Returns:
        The configured ``pipeline.Pipeline``.
    """
    # Ingestion: bring the query result into the pipeline.
    example_gen = BigQueryExampleGen(query=query)
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, used for visualization and validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    schema_gen = SchemaGen(stats=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs.output

    # Anomaly detection based on the statistics and the schema.
    example_validator = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=module_file,
    )

    # Arguments shared by both ways of constructing the Trainer.
    trainer_kwargs = dict(
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    try:
        from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
        # Train using a custom executor. This requires TFX >= 0.14.
        trainer = Trainer(
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                ai_platform_trainer_executor.Executor),
            custom_config={
                'ai_platform_training_args': ai_platform_training_args
            },
            **trainer_kwargs)
    except ImportError:
        # Train using a deprecated flag.
        trainer = Trainer(
            custom_config={'cmle_training_args': ai_platform_training_args},
            **trainer_kwargs)
    trained_model = trainer.outputs.output

    # TFMA evaluation statistics, sliced by trip_start_hour.
    hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=hourly_slices,
    )

    # Quality validation of the candidate model (compared to a baseline).
    validator = ModelValidator(examples=raw_examples, model=trained_model)
    blessing = validator.outputs.blessing

    # Push the model to a destination if the validation check passed.
    try:
        from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
        # Deploy the model on Google Cloud AI Platform. This requires
        # TFX >= 0.14. No push_destination is passed on this path.
        pusher = Pusher(
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                ai_platform_pusher_executor.Executor),
            model_export=trained_model,
            model_blessing=blessing,
            custom_config={
                'ai_platform_serving_args': ai_platform_serving_args
            })
    except ImportError:
        # Deploy the model on Google Cloud AI Platform, using a deprecated
        # flag.
        serving_destination = pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir))
        pusher = Pusher(
            model_export=trained_model,
            model_blessing=blessing,
            custom_config={'cmle_serving_args': ai_platform_serving_args},
            push_destination=serving_destination)

    components = [
        example_gen, statistics_gen, schema_gen, example_validator,
        transform, trainer, evaluator, validator, pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        additional_pipeline_args={
            'beam_pipeline_args': beam_pipeline_args,
        },
        log_root='/var/tmp/tfx/logs',
    )
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
    """Build the parameterized, Kubeflow-based Chicago Taxi TFX pipeline.

    Args:
        pipeline_root: The root of the pipeline output.
        csv_input_location: Runtime parameter with the location of the input
            data directory.
        taxi_module_file: Runtime parameter with the location of the module
            file shared by Transform and Trainer.
        enable_cache: Whether to enable cache or not.

    Returns:
        A logical TFX pipeline.Pipeline object.
    """
    # Ingest the CSV data.
    example_gen = CsvExampleGen(input=external_input(csv_input_location))
    raw_examples = example_gen.outputs['examples']

    # Statistics, inferred schema and anomaly validation over the raw data.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']
    infer_schema = SchemaGen(
        statistics=statistics,
        infer_feature_shape=False,
    )
    schema = infer_schema.outputs['schema']
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Transform and Trainer are both driven by the same user module file.
    transform = Transform(
        examples=raw_examples,
        schema=schema,
        module_file=taxi_module_file,
    )
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
    )
    trained_model = trainer.outputs['model']

    # Thresholds for a metric saved with the model are keyed by metric name.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.5}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))

    # Metrics listed here are computed in addition to those saved with the
    # model (assuming a keras model or EvalSavedModel is used).
    metrics_spec = tfma.MetricsSpec(
        metrics=[tfma.MetricConfig(class_name='ExampleCount')],
        thresholds={'binary_accuracy': accuracy_threshold})

    eval_config = tfma.EvalConfig(
        # Signature 'eval' implies an EvalSavedModel. For a serving model,
        # drop the signature (defaults to 'serving_default') and add a
        # label_key instead.
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[metrics_spec],
        slicing_specs=[
            # Empty spec: the overall slice, i.e. the whole dataset.
            tfma.SlicingSpec(),
            # Slice along the trip_start_hour feature column.
            tfma.SlicingSpec(feature_keys=['trip_start_hour']),
        ])

    model_analyzer = Evaluator(
        examples=raw_examples,
        model=trained_model,
        eval_config=eval_config,
    )

    # Push to a 'model_serving' directory under the pipeline root parameter.
    serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_analyzer.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_dir)),
    )

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name='parameterized_tfx_oss',
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Build a CSV-to-pusher TFX pipeline with baseline model comparison.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root location passed to the pipeline.
        data_root: Location of the CSV input, wrapped via external_input.
        module_file: Module file shared by Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        direct_num_workers: Value formatted into the
            '--direct_num_workers' Beam argument.

    Returns:
        The assembled pipeline.Pipeline with caching enabled.
    """
    # Ingest the CSV data.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics, inferred schema and anomaly validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = infer_schema.outputs['schema']
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # The trainer runs with the GenericExecutor instead of the default one.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500))
    candidate_model = trainer.outputs['model']

    # Resolve the latest blessed model to serve as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.9}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='species')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={'sparse_categorical_accuracy': accuracy_threshold})
        ])

    model_analyzer = Evaluator(
        examples=raw_examples,
        model=candidate_model,
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    # Push the model to the serving directory, gated on the blessing.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=candidate_model,
        model_blessing=model_analyzer.outputs['blessing'],
        push_destination=destination)

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_resolver, model_analyzer, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
        enable_cache=True)
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX.

    All locations (data root, module file, serving directory, pipeline root,
    metadata DB root) and the logger overrides are read from module-level
    names.

    Returns:
        The assembled pipeline.Pipeline named 'taxi_solution'.
    """
    # Bring the CSV data into the pipeline.
    example_gen = CsvExampleGen(input_base=csv_input(_data_root))
    raw_examples = example_gen.outputs.examples

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    infer_schema = SchemaGen(stats=statistics)
    schema = infer_schema.outputs.output

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples,
        schema=schema,
        module_file=_taxi_module_file)

    # Train using the user-provided module file.
    trainer = Trainer(
        module_file=_taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs.output

    # TFMA evaluation, sliced on trip_start_hour.
    hourly_slice = evaluator_pb2.SingleSlicingSpec(
        column_for_slicing=['trip_start_hour'])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
            specs=[hourly_slice]))

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to the file destination, gated on the validator's blessing.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        push_destination=destination)

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name='taxi_solution',
        pipeline_root=_pipeline_root,
        components=components,
        enable_cache=True,
        metadata_db_root=_metadata_db_root,
        additional_pipeline_args={'logger_args': logger_overrides},
    )
def _create_pipeline(
        pipeline_name: Text, pipeline_root: Text, query: Text,
        module_file: Text, beam_pipeline_args: List[Text],
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root location passed to the pipeline.
        query: BigQuery query handed to BigQueryExampleGen.
        module_file: Module file shared by Transform and Trainer.
        beam_pipeline_args: Beam arguments forwarded to the pipeline.
        ai_platform_training_args: Passed to the AI Platform trainer executor
            through the Trainer's custom_config.
        ai_platform_serving_args: Passed to the AI Platform pusher executor
            through the Pusher's custom_config.

    Returns:
        The assembled pipeline.Pipeline.
    """
    # Bring the query results into the pipeline.
    example_gen = BigQueryExampleGen(query=query)
    raw_examples = example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs['schema']

    # Anomaly detection based on statistics and schema.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Train through the AI Platform trainer executor.
    trainer_executor = executor_spec.ExecutorClassSpec(
        ai_platform_trainer_executor.Executor)
    trainer = Trainer(
        custom_executor_spec=trainer_executor,
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args
        })
    candidate_model = trainer.outputs['model']

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # TFMA evaluation plus quality validation against the baseline.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(thresholds={'binary_accuracy': accuracy_threshold})
        ])

    evaluator = Evaluator(
        examples=raw_examples,
        model=candidate_model,
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Push through the AI Platform pusher executor, gated on the blessing.
    pusher_executor = executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor)
    pusher = Pusher(
        custom_executor_spec=pusher_executor,
        model=candidate_model,
        model_blessing=evaluator.outputs['blessing'],
        custom_config={
            ai_platform_pusher_executor.SERVING_ARGS_KEY:
                ai_platform_serving_args
        })

    components = [
        example_gen, statistics_gen, schema_gen, example_validator, transform,
        trainer, model_resolver, evaluator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX.

    The sample rate, module file, training/serving args and serving
    directory are read from module-level names.

    Returns:
        The list of pipeline components, in order (not a Pipeline object).
    """
    # Query template; the placeholder receives the module-level sample rate.
    query_template = (
        ' SELECT pickup_community_area, fare,'
        ' EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,'
        ' EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,'
        ' EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,'
        ' UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,'
        ' pickup_latitude, pickup_longitude,'
        ' dropoff_latitude, dropoff_longitude,'
        ' trip_miles, pickup_census_tract, dropoff_census_tract,'
        ' payment_type, company, trip_seconds,'
        ' dropoff_community_area, tips'
        ' FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`'
        ' WHERE RAND() < {}'
    )
    query = query_template.format(_query_sample_rate)

    # Bring the query results into the pipeline.
    example_gen = BigQueryExampleGen(query=query)
    raw_examples = example_gen.outputs.examples

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs.output

    # Schema generated from the statistics.
    infer_schema = SchemaGen(stats=statistics)
    schema = infer_schema.outputs.output

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=raw_examples, schema=schema, module_file=_taxi_utils)

    # Train using the user-provided module; training args go in custom_config.
    trainer = Trainer(
        module_file=_taxi_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'cmle_training_args': _cmle_training_args})
    trained_model = trainer.outputs.output

    # TFMA evaluation, sliced on trip_start_hour.
    hourly_slice = evaluator_pb2.SingleSlicingSpec(
        column_for_slicing=['trip_start_hour'])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
            specs=[hourly_slice]))

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push gated on the blessing; serving args travel in custom_config.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher = Pusher(
        model_export=trained_model,
        model_blessing=model_validator.outputs.blessing,
        custom_config={'cmle_serving_args': _cmle_serving_args},
        push_destination=destination)

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the cifar10 pipeline with TFX.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root location passed to the pipeline.
        data_root: Location of the input records, wrapped via external_input.
        module_file: Module file shared by Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        metadata_path: Path used for the sqlite metadata connection config.

    Returns:
        The assembled pipeline.Pipeline with caching enabled.
    """
    # Map the input files onto named 'train' and 'eval' splits.
    split_spec = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
    ])
    example_gen = ImportExampleGen(
        input=external_input(data_root), input_config=split_spec)
    raw_examples = example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = infer_schema.outputs['schema']

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Train using the user-provided module file.
    trainer = Trainer(
        module_file=module_file,
        examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500))
    trained_model = trainer.outputs['model']

    # TFMA evaluation with a single default slicing spec.
    slicing = evaluator_pb2.FeatureSlicingSpec(
        specs=[evaluator_pb2.SingleSlicingSpec()])
    evaluator = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=slicing)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to the file destination, gated on the validator's blessing.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=destination)

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, evaluator, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={},
    )
def _create_pipeline(
        pipeline_name: Text, pipeline_root: Text, module_file: Text,
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

    Several knobs (query sample rate, train/eval steps, worker count and
    worker type) are exposed as RuntimeParameters. The project id, output
    bucket, GCP region and FLAGS are read from module-level names.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root location passed to the pipeline.
        module_file: Module file shared by Transform and Trainer.
        ai_platform_training_args: Passed to the AI Platform trainer executor
            through the Trainer's custom_config. This dict is not modified;
            when distributed training is enabled an extended copy is used.
        ai_platform_serving_args: Passed to the AI Platform pusher executor
            through the Pusher's custom_config.

    Returns:
        The assembled pipeline.Pipeline.
    """
    # The rate at which to sample rows from the Taxi dataset using BigQuery.
    # The full taxi dataset is > 200M record. In the interest of resource
    # savings and time, the default for this example is much smaller.
    # By default it generates a 0.1% random sample.
    query_sample_rate = data_types.RuntimeParameter(
        name='query-sample-rate', ptype=float, default=0.001)

    # This is the upper bound of FARM_FINGERPRINT in Bigquery (ie the max
    # value of signed int64).
    max_int64 = '0x7FFFFFFFFFFFFFFF'

    # The query that extracts the examples from BigQuery. The Chicago Taxi
    # dataset used for this example is a public dataset:
    # https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips
    query = (
        ' SELECT pickup_community_area, fare,'
        ' EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,'
        ' EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,'
        ' EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,'
        ' UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,'
        ' pickup_latitude, pickup_longitude,'
        ' dropoff_latitude, dropoff_longitude,'
        ' trip_miles, pickup_census_tract, dropoff_census_tract,'
        ' payment_type, company, trip_seconds,'
        ' dropoff_community_area, tips'
        ' FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`'
        ' WHERE (ABS(FARM_FINGERPRINT(unique_key)) / {max_int64})'
        ' < {query_sample_rate}'
    ).format(max_int64=max_int64, query_sample_rate=str(query_sample_rate))

    # Beam args to run data processing on DataflowRunner.
    # TODO(b/151114974): Remove `disk_size_gb` flag after default is increased.
    # TODO(b/151116587): Remove `shuffle_mode` flag after default is changed.
    beam_pipeline_args = [
        '--runner=DataflowRunner',
        '--experiments=shuffle_mode=auto',
        '--project=' + _project_id,
        '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
        '--region=' + _gcp_region,
        '--disk_size_gb=50',
    ]

    # Number of steps in training (fed to the Trainer's num_steps).
    train_steps = data_types.RuntimeParameter(
        name='train-steps',
        default=10000,
        ptype=int,
    )

    # Number of steps in evaluation (fed to the Trainer's num_steps).
    eval_steps = data_types.RuntimeParameter(
        name='eval-steps',
        default=5000,
        ptype=int,
    )

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and
    # serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Number of worker machines used in distributed training.
    worker_count = data_types.RuntimeParameter(
        name='worker-count',
        default=2,
        ptype=int,
    )

    # Type of worker machines used in distributed training.
    worker_type = data_types.RuntimeParameter(
        name='worker-type',
        default='standard',
        ptype=str,
    )

    if FLAGS.distributed_training:
        # Extend a copy so the caller's dict is not mutated as a side effect
        # of building the pipeline.
        ai_platform_training_args = dict(ai_platform_training_args)
        ai_platform_training_args.update({
            # You can specify the machine types, the number of replicas for
            # workers and parameter servers.
            # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#ScaleTier
            'scaleTier': 'CUSTOM',
            'masterType': 'large_model',
            'workerType': worker_type,
            'parameterServerType': 'standard',
            'workerCount': worker_count,
            'parameterServerCount': 1
        })

    # Trains through the AI Platform trainer executor using the user-provided
    # module file.
    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.Executor),
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args
        })

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a
    # baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'binary_accuracy':
                        tfma.config.MetricThreshold(
                            value_threshold=tfma.GenericValueThreshold(
                                lower_bound={'value': 0.6}),
                            change_threshold=tfma.GenericChangeThreshold(
                                direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                                absolute={'value': -1e-10}))
                })
        ])

    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the
    # model through the AI Platform pusher executor if check passed.
    pusher = Pusher(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_pusher_executor.Executor),
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        custom_config={
            ai_platform_pusher_executor.SERVING_ARGS_KEY:
                ai_platform_serving_args
        })

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, model_resolver, evaluator, pusher
        ],
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Build the chicago taxi TFX pipeline with baseline model comparison.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root location passed to the pipeline.
        data_root: Location of the CSV input, wrapped via external_input.
        module_file: Module file shared by Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        direct_num_workers: Value formatted into the
            '--direct_num_workers' Beam argument.

    Returns:
        The assembled pipeline.Pipeline.
    """
    # Bring the CSV data into the pipeline.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = infer_schema.outputs['schema']

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Train using the user-provided module file (no custom executor is set).
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    candidate_model = trainer.outputs['model']

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # TFMA evaluation plus quality validation against the baseline.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}),
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(thresholds={'binary_accuracy': accuracy_threshold})
        ])

    model_analyzer = Evaluator(
        examples=raw_examples,
        model=candidate_model,
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Push to the filesystem serving directory, gated on the blessing.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=candidate_model,
        model_blessing=model_analyzer.outputs['blessing'],
        push_destination=destination)

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_resolver, model_analyzer, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Build the chicago taxi TFX pipeline with a ModelValidator gate.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root location passed to the pipeline.
        data_root: Location of the CSV input, wrapped via external_input.
        module_file: Module file shared by Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher writes to.
        direct_num_workers: Value formatted into the
            '--direct_num_workers' Beam argument.

    Returns:
        The assembled pipeline.Pipeline.
    """
    # Bring the CSV data into the pipeline.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema generated from the statistics.
    infer_schema = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = infer_schema.outputs['schema']

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=raw_examples, schema=schema, module_file=module_file)

    # Train using the user-provided module file (no custom executor is set).
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )
    trained_model = trainer.outputs['model']

    # TFMA evaluation, sliced on trip_start_hour.
    hourly_slice = evaluator_pb2.SingleSlicingSpec(
        column_for_slicing=['trip_start_hour'])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model_exports=trained_model,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
            specs=[hourly_slice]))

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=raw_examples, model=trained_model)

    # Push to the filesystem serving directory, gated on the blessing.
    destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=destination)

    components = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
        additional_pipeline_args={},
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX.

    A second example generator feeds a BulkInferrer for offline batch
    inference with the trained model.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root location passed to the pipeline.
        training_data_root: Location of the training CSV input.
        inference_data_root: Location of the inference CSV input.
        module_file: Module file shared by Transform and Trainer.
        metadata_path: Path used for the sqlite metadata connection config.
        direct_num_workers: Value formatted into the
            '--direct_num_workers' Beam argument.

    Returns:
        The assembled pipeline.Pipeline with caching enabled.
    """
    # Bring the training data into the pipeline.
    training_example_gen = CsvExampleGen(
        input_base=external_input(training_data_root),
        instance_name='training_example_gen')
    training_channel = training_example_gen.outputs['examples']

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=training_channel)
    statistics = statistics_gen.outputs['output']

    # Schema generated from the statistics.
    infer_schema = SchemaGen(stats=statistics, infer_feature_shape=False)
    schema = infer_schema.outputs['output']

    # Anomaly detection based on statistics and schema.
    validate_stats = ExampleValidator(stats=statistics, schema=schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        input_data=training_channel, schema=schema, module_file=module_file)

    # Train using the user-provided module file.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_output=transform.outputs['transform_output'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    trained_model = trainer.outputs['output']

    # TFMA evaluation, sliced on trip_start_hour.
    hourly_slice = evaluator_pb2.SingleSlicingSpec(
        column_for_slicing=['trip_start_hour'])
    model_analyzer = Evaluator(
        examples=training_channel,
        model_exports=trained_model,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
            specs=[hourly_slice]))

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=training_channel, model=trained_model)

    # Bring the inference data in as a single 'unlabelled' split.
    unlabelled_output = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='unlabelled',
                                              hash_buckets=100)
        ]))
    inference_example_gen = CsvExampleGen(
        input_base=external_input(inference_data_root),
        output_config=unlabelled_output,
        instance_name='inference_example_gen')

    # Offline batch inference over the inference examples.
    bulk_inferrer = BulkInferrer(
        examples=inference_example_gen.outputs['examples'],
        model_export=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        # Empty data_spec.example_splits will result in using all splits.
        data_spec=bulk_inferrer_pb2.DataSpec(),
        model_spec=bulk_inferrer_pb2.ModelSpec())

    components = [
        training_example_gen, inference_example_gen, statistics_gen,
        infer_schema, validate_stats, transform, trainer, model_analyzer,
        model_validator, bulk_inferrer
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])