def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int = 1) -> pipeline.Pipeline:
  """Assembles the Chicago taxi TFX pipeline with a filesystem push target.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root directory forwarded to `pipeline.Pipeline`.
    data_root: Location wrapped by `external_input` and read by CsvExampleGen.
    module_file: User module file handed to both Transform and Trainer.
    serving_model_dir: Base directory of the Pusher's filesystem destination.
    direct_num_workers: Value rendered into the `--direct_num_workers` Beam
      pipeline argument.

  Returns:
    The `pipeline.Pipeline` wiring all nine components together.
  """
  # Ingestion: wrap the CSV location and convert it into examples.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs['examples']

  # Statistics feed both schema inference and anomaly detection.
  statistics_gen = StatisticsGen(examples=raw_examples)
  stats_channel = statistics_gen.outputs['statistics']
  infer_schema = SchemaGen(
      statistics=stats_channel, infer_feature_shape=False)
  schema_channel = infer_schema.outputs['schema']
  validate_stats = ExampleValidator(
      statistics=stats_channel, schema=schema_channel)

  # Feature engineering defined by the user module file.
  transform = Transform(
      examples=raw_examples,
      schema=schema_channel,
      module_file=module_file)

  # Training uses the same user module file on the transformed examples.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_channel,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
  )
  candidate_model = trainer.outputs['model']

  # Evaluation is sliced on the 'trip_start_hour' column.
  hourly_slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model_exports=candidate_model,
      feature_slicing_spec=hourly_slicing)

  # The validator emits the blessing that gates the push below.
  model_validator = ModelValidator(
      examples=raw_examples, model=candidate_model)

  # The push destination is a plain filesystem directory.
  filesystem_target = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=filesystem_target)

  stages = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=stages,
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%s' % direct_num_workers],
      additional_pipeline_args={},
  )
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                    module_file: Text, serving_model_dir: Text,
                    beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Assembles the Chicago taxi TFX pipeline with baseline-aware evaluation.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root directory forwarded to `pipeline.Pipeline`.
    data_root: Input base directory read by CsvExampleGen.
    module_file: User module file handed to both Transform and Trainer.
    serving_model_dir: Base directory of the Pusher's filesystem destination.
    beam_pipeline_args: Beam arguments forwarded unchanged to the pipeline.

  Returns:
    The `pipeline.Pipeline`, with caching disabled and the default Kubernetes
    metadata connection config attached.
  """
  # Ingestion straight from the CSV directory.
  example_gen = CsvExampleGen(input_base=data_root)
  raw_examples = example_gen.outputs['examples']

  # Statistics feed both schema inference and anomaly detection.
  statistics_gen = StatisticsGen(examples=raw_examples)
  stats_channel = statistics_gen.outputs['statistics']
  schema_gen = SchemaGen(statistics=stats_channel, infer_feature_shape=False)
  schema_channel = schema_gen.outputs['schema']
  example_validator = ExampleValidator(
      statistics=stats_channel, schema=schema_channel)

  # Feature engineering defined by the user module file.
  transform = Transform(
      examples=raw_examples,
      schema=schema_channel,
      module_file=module_file)

  # Training uses the same user module file on the transformed examples.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_channel,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  candidate_model = trainer.outputs['model']

  # Resolves the latest blessed model, used as the evaluation baseline.
  model_resolver = resolver.Resolver(
      strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing),
  ).with_id('latest_blessed_model_resolver')

  # Accuracy must reach 0.6 and must not drop relative to the baseline.
  # Change threshold will be ignored if there is no baseline model resolved
  # from MLMD (first run).
  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(
          lower_bound={'value': 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={'accuracy': accuracy_threshold})
      ])
  evaluator = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # The push to the filesystem destination is gated on the evaluator's
  # blessing.
  filesystem_target = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=candidate_model,
      model_blessing=evaluator.outputs['blessing'],
      push_destination=filesystem_target)

  config = kubernetes_dag_runner.get_default_kubernetes_metadata_config()
  stages = [
      example_gen,
      statistics_gen,
      schema_gen,
      example_validator,
      transform,
      trainer,
      model_resolver,
      evaluator,
      pusher,
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=stages,
      enable_cache=False,
      metadata_connection_config=config,
      beam_pipeline_args=beam_pipeline_args)
def _create__pipeline(pipeline_name: Text, pipeline_root: Text,
                      data_root: Text, module_file: Text,
                      ai_platform_training_args: Dict[Text, Text],
                      ai_platform_serving_args: Dict[Text, Text],
                      beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Assembles the online news TFX pipeline using AI Platform executors.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root directory forwarded to `pipeline.Pipeline`; the
      Pusher's filesystem destination is `<pipeline_root>/serving_model`.
    data_root: Location wrapped by `external_input` and read by CsvExampleGen.
    module_file: User module file handed to both Transform and Trainer.
    ai_platform_training_args: Passed to the Trainer's custom config under
      `ai_platform_trainer_executor.TRAINING_ARGS_KEY`.
    ai_platform_serving_args: Passed to the Pusher's custom config under
      `ai_platform_pusher_executor.SERVING_ARGS_KEY`.
    beam_pipeline_args: Beam arguments forwarded unchanged to the pipeline.

  Returns:
    The `pipeline.Pipeline` wiring all eight components together.
  """
  # Ingestion: wrap the CSV location and convert it into examples.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs.examples

  # Statistics feed both schema inference and anomaly detection.
  statistics_gen = StatisticsGen(examples=raw_examples)
  stats_channel = statistics_gen.outputs.output
  infer_schema = SchemaGen(statistics=stats_channel)
  schema_channel = infer_schema.outputs.output
  validate_stats = ExampleValidator(
      stats=stats_channel, schema=schema_channel)

  # Feature engineering defined by the user module file.
  transform = Transform(
      examples=raw_examples,
      schema=schema_channel,
      module_file=module_file)

  # Training runs through the AI Platform trainer executor.
  trainer_executor = executor_spec.ExecutorClassSpec(
      ai_platform_trainer_executor.Executor)
  trainer = Trainer(
      custom_executor_spec=trainer_executor,
      module_file=module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=schema_channel,
      transform_graph=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
      custom_config={
          ai_platform_trainer_executor.TRAINING_ARGS_KEY:
              ai_platform_training_args
      })
  candidate_model = trainer.outputs.output

  # Thresholds are keyed by the name of a metric saved with the model.
  accuracy_threshold = tfma.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(
          lower_bound={'value': 0.1}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  eval_config = tfma.EvalConfig(
      # The 'eval' signature is requested from the model.
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={"accuracy": accuracy_threshold})
      ],
      slicing_specs=[
          # An empty slice spec means the overall slice (whole dataset).
          tfma.SlicingSpec(),
          # Additionally slice along the 'weekday' feature column.
          tfma.SlicingSpec(feature_keys=['weekday'])
      ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      eval_config=eval_config)

  # The push goes through the AI Platform pusher executor and is gated on
  # the evaluator's blessing.
  filesystem_target = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=os.path.join(pipeline_root, 'serving_model')))
  pusher_executor = executor_spec.ExecutorClassSpec(
      ai_platform_pusher_executor.Executor)
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_analyzer.outputs.blessing,
      push_destination=filesystem_target,
      custom_executor_spec=pusher_executor,
      custom_config={
          ai_platform_pusher_executor.SERVING_ARGS_KEY:
              ai_platform_serving_args
      })

  stages = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=stages,
      beam_pipeline_args=beam_pipeline_args)
def create_pipeline_components(
    pipeline_root: Text,
    transform_module: Text,
    trainer_module: Text,
    bigquery_query: Text = '',
    csv_input_location: Text = '',
) -> List[base_node.BaseNode]:
  """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output. The pusher writes to
      `<pipeline_root>/model_serving`.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.
    bigquery_query: The query to get input data from BigQuery. If not empty,
      BigQueryExampleGen will be used.
    csv_input_location: The location of the input data directory. If not
      empty, CsvExampleGen will be used.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.

  Raises:
    ValueError: If both or neither of `bigquery_query` and
      `csv_input_location` are provided.
  """
  # Exactly one of the two input sources must be set (logical XOR).
  if bool(bigquery_query) == bool(csv_input_location):
    # Adjacent literals are concatenated into ONE message; a comma here
    # would make ValueError carry a 2-tuple of args instead.
    raise ValueError(
        'Exactly one example gen is expected. '
        'Please provide either bigquery_query or csv_input_location.')

  if bigquery_query:
    example_gen = big_query_example_gen_component.BigQueryExampleGen(
        query=bigquery_query)
  else:
    examples = dsl_utils.external_input(csv_input_location)
    example_gen = components.CsvExampleGen(input=examples)

  statistics_gen = components.StatisticsGen(
      examples=example_gen.outputs['examples'])
  schema_gen = components.SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)
  example_validator = components.ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])
  transform = components.Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=transform_module)

  # Resolve the latest model artifact; it is fed to the trainer as base_model.
  latest_model_resolver = components.ResolverNode(
      instance_name='latest_model_resolver',
      resolver_class=latest_artifacts_resolver.LatestArtifactsResolver,
      model=channel.Channel(type=standard_artifacts.Model))
  trainer = components.Trainer(
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      base_model=latest_model_resolver.outputs['model'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )

  # Get the latest blessed model for model validation.
  model_resolver = components.ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=channel.Channel(type=standard_artifacts.Model),
      model_blessing=channel.Channel(type=standard_artifacts.ModelBlessing))

  # Set the TFMA config for Model Evaluation and Validation.
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      metrics_specs=[
          tfma.MetricsSpec(
              metrics=[tfma.MetricConfig(class_name='ExampleCount')],
              thresholds={
                  'binary_accuracy':
                      tfma.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.5}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ])
  evaluator = components.Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # Push is gated on the evaluator's blessing output.
  pusher = components.Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      latest_model_resolver, trainer, model_resolver, evaluator, pusher
  ]
def _create_pipeline(
    pipeline_name: Text, pipeline_root: Text, query: Text, module_file: Text,
    serving_model_dir: Text, beam_pipeline_args: List[Text],
    ai_platform_training_args: Dict[Text, Text],
    ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
  """Assembles the Chicago taxi TFX pipeline fed from BigQuery.

  The Trainer and Pusher prefer the AI Platform extension executors. If the
  extension modules cannot be imported, both fall back to the deprecated
  `cmle_*` custom-config keys.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root directory forwarded to `pipeline.Pipeline`.
    query: Query handed to BigQueryExampleGen.
    module_file: User module file handed to both Transform and Trainer.
    serving_model_dir: Base directory of the filesystem push destination;
      only used by the fallback Pusher.
    beam_pipeline_args: Placed under 'beam_pipeline_args' in the pipeline's
      additional args.
    ai_platform_training_args: Training arguments placed in the Trainer's
      custom config.
    ai_platform_serving_args: Serving arguments placed in the Pusher's custom
      config.

  Returns:
    The `pipeline.Pipeline` wiring all nine components together.
  """
  # Ingestion from BigQuery.
  example_gen = BigQueryExampleGen(query=query)
  raw_examples = example_gen.outputs.examples

  # Statistics feed both schema inference and anomaly detection.
  statistics_gen = StatisticsGen(input_data=raw_examples)
  stats_channel = statistics_gen.outputs.output
  infer_schema = SchemaGen(stats=stats_channel)
  schema_channel = infer_schema.outputs.output
  validate_stats = ExampleValidator(
      stats=stats_channel, schema=schema_channel)

  # Feature engineering defined by the user module file.
  transform = Transform(
      input_data=raw_examples,
      schema=schema_channel,
      module_file=module_file)
  transformed_examples = transform.outputs.transformed_examples
  transform_output = transform.outputs.transform_output

  # Trainer: prefer the AI Platform custom executor, else the deprecated flag.
  try:
    from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
    # Train using a custom executor. This requires TFX >= 0.14.
    trainer = Trainer(
        executor_class=ai_platform_trainer_executor.Executor,
        module_file=module_file,
        transformed_examples=transformed_examples,
        schema=schema_channel,
        transform_output=transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={
            'ai_platform_training_args': ai_platform_training_args
        })
  except ImportError:
    # Train using a deprecated flag.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transformed_examples,
        schema=schema_channel,
        transform_output=transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'cmle_training_args': ai_platform_training_args})
  candidate_model = trainer.outputs.output

  # Evaluation is sliced on the 'trip_start_hour' column.
  hourly_slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  model_analyzer = Evaluator(
      examples=raw_examples,
      model_exports=candidate_model,
      feature_slicing_spec=hourly_slicing)

  # The validator emits the blessing that gates the push below.
  model_validator = ModelValidator(
      examples=raw_examples, model=candidate_model)
  blessing = model_validator.outputs.blessing

  # Pusher: prefer the AI Platform custom executor, else the deprecated flag
  # with a filesystem destination.
  try:
    from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
    # Deploy the model on Google Cloud AI Platform. This requires TFX >=0.14.
    pusher = Pusher(
        executor_class=ai_platform_pusher_executor.Executor,
        model_export=candidate_model,
        model_blessing=blessing,
        custom_config={'ai_platform_serving_args': ai_platform_serving_args})
  except ImportError:
    # Deploy the model on Google Cloud AI Platform, using a deprecated flag.
    pusher = Pusher(
        model_export=candidate_model,
        model_blessing=blessing,
        custom_config={'cmle_serving_args': ai_platform_serving_args},
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)))

  stages = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=stages,
      additional_pipeline_args={
          'beam_pipeline_args': beam_pipeline_args,
          # Optional args:
          # 'tfx_image': custom docker image to use for components.
          # This is needed if TFX package is not installed from an RC
          # or released version.
      },
      log_root='/var/tmp/tfx/logs',
  )
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Pushes the model to the configured destination if it is blessed.

  Args:
    input_dict: Input dict from input key to a list of artifacts. The model
      to push is read from `input_dict[MODEL_KEY]`; the blessing check is
      delegated to `self.CheckBlessing(input_dict)`.
    output_dict: Output dict from key to a list of artifacts. The single
      artifact under `output_dict[PUSHED_MODEL_KEY]` is marked as pushed or
      not pushed, and receives an archive copy of the model when pushed.
    exec_properties: A dict of execution properties, including:
      - push_destination: JSON string of a pusher_pb2.PushDestination
        instance describing where to push the model.

  Returns:
    None

  Raises:
    NotImplementedError: If the destination is not 'filesystem', or the
      filesystem versioning is neither AUTO nor UNIX_TIMESTAMP.
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  pushed_artifact = artifact_utils.get_single_instance(
      output_dict[PUSHED_MODEL_KEY])

  # An unblessed model is recorded as not pushed and nothing is copied.
  if not self.CheckBlessing(input_dict):
    self._MarkNotPushed(pushed_artifact)
    return

  exported_model = artifact_utils.get_single_instance(input_dict[MODEL_KEY])
  source_path = path_utils.serving_model_path(exported_model.uri)

  # Push model to the destination, which can be listened by a model server.
  #
  # If the model was already copied to the destination before, the copy is
  # skipped but the push is still recorded to keep metadata tracking.
  # TODO(jyzhao): support rpc push and verification.
  destination = pusher_pb2.PushDestination()
  json_format.Parse(exec_properties['push_destination'], destination)
  kind = destination.WhichOneof('destination')
  if kind != 'filesystem':
    raise NotImplementedError('Invalid push destination {}'.format(kind))

  filesystem = destination.filesystem
  # AUTO versioning resolves to a UNIX timestamp.
  if filesystem.versioning == _Versioning.AUTO:
    filesystem.versioning = _Versioning.UNIX_TIMESTAMP
  if filesystem.versioning != _Versioning.UNIX_TIMESTAMP:
    raise NotImplementedError(
        'Invalid Versioning {}'.format(filesystem.versioning))

  version = str(int(time.time()))
  logging.info('Model version: %s', version)
  target_dir = os.path.join(filesystem.base_directory, version)
  if not fileio.exists(target_dir):
    # tf.serving won't load partial model, it will retry until fully copied.
    io_utils.copy_dir(source_path, target_dir)
    logging.info('Model written to serving path %s.', target_dir)
  else:
    logging.info(
        'Destination directory %s already exists, skipping current push.',
        target_dir)

  # Copy the model to pushing uri for archiving.
  io_utils.copy_dir(source_path, pushed_artifact.uri)
  self._MarkPushed(
      pushed_artifact,
      pushed_destination=target_dir,
      pushed_version=version)
  logging.info('Model pushed to %s.', pushed_artifact.uri)
def _create_pipeline(pipeline_name: Text,
                     pipeline_root: Text) -> pipeline.Pipeline:
  """Assembles the Iris flowers TFX pipeline.

  The data location and the user module file come from the module-level
  `_data_root_param` and `_module_file_param` rather than from arguments.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root directory forwarded to `pipeline.Pipeline`.

  Returns:
    The `pipeline.Pipeline` with caching enabled.
  """
  # Ingestion: wrap the CSV location and convert it into examples.
  example_gen = CsvExampleGen(input=external_input(_data_root_param))
  raw_examples = example_gen.outputs['examples']

  # Statistics feed both schema inference and anomaly detection.
  statistics_gen = StatisticsGen(examples=raw_examples)
  stats_channel = statistics_gen.outputs['statistics']
  infer_schema = SchemaGen(statistics=stats_channel, infer_feature_shape=True)
  schema_channel = infer_schema.outputs['schema']
  validate_stats = ExampleValidator(
      statistics=stats_channel, schema=schema_channel)

  # Feature engineering defined by the user module file.
  transform = Transform(
      examples=raw_examples,
      schema=schema_channel,
      module_file=_module_file_param)

  # Training goes through the generic trainer executor.
  trainer = Trainer(
      module_file=_module_file_param,
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      examples=transform.outputs['transformed_examples'],
      transform_graph=transform.outputs['transform_graph'],
      schema=schema_channel,
      train_args=trainer_pb2.TrainArgs(num_steps=100),
      eval_args=trainer_pb2.EvalArgs(num_steps=50))
  candidate_model = trainer.outputs['model']

  # Resolves the latest blessed model, used as the evaluation baseline.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # Accuracy must reach 0.9 and must not drop relative to the baseline.
  # Note: to compile this successfully you'll need TFMA at >= 0.21.5
  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(
          lower_bound={'value': 0.9}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  eval_config = tfma.EvalConfig(
      model_specs=[
          tfma.ModelSpec(name='candidate', label_key='variety'),
          tfma.ModelSpec(
              name='baseline', label_key='variety', is_baseline=True)
      ],
      slicing_specs=[
          tfma.SlicingSpec(),
          # Data can be sliced along a feature column. Required by TFMA
          # visualization.
          tfma.SlicingSpec(feature_keys=['sepal_length'])
      ],
      metrics_specs=[
          tfma.MetricsSpec(metrics=[
              tfma.MetricConfig(
                  class_name='SparseCategoricalAccuracy',
                  threshold=accuracy_threshold)
          ])
      ])

  # Change threshold will be ignored if there is no baseline (first run).
  model_analyzer = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # The push is gated on the evaluator's blessing; the destination sits
  # under the pipeline ROOT_PARAMETER rendered as a string.
  serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
  pusher = Pusher(
      model=candidate_model,
      model_blessing=model_analyzer.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)))

  stages = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_resolver, model_analyzer, pusher
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=stages,
      enable_cache=True,
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     trainer_module_file: Text, evaluator_module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Assembles the Penguin TFX pipeline (no Transform component).

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root directory forwarded to `pipeline.Pipeline`.
    data_root: Input base directory read by CsvExampleGen.
    trainer_module_file: User module file handed to the Trainer.
    evaluator_module_file: User module file handed to the Evaluator.
    serving_model_dir: Base directory of the Pusher's filesystem destination.
    metadata_path: Path used to build the sqlite metadata connection config.
    beam_pipeline_args: Beam arguments forwarded unchanged to the pipeline.

  Returns:
    The `pipeline.Pipeline` with caching enabled and sqlite-backed metadata.
  """
  # Ingestion straight from the CSV directory.
  example_gen = CsvExampleGen(input_base=data_root)
  raw_examples = example_gen.outputs['examples']

  # Statistics feed both schema inference and anomaly detection.
  statistics_gen = StatisticsGen(examples=raw_examples)
  stats_channel = statistics_gen.outputs['statistics']
  schema_gen = SchemaGen(statistics=stats_channel, infer_feature_shape=True)
  schema_channel = schema_gen.outputs['schema']
  example_validator = ExampleValidator(
      statistics=stats_channel, schema=schema_channel)

  # TODO(humichael): Handle applying transformation component in Milestone 3.

  # The trainer consumes the raw examples directly. Num_steps is not provided
  # during evaluation because the scikit-learn model loads and evaluates the
  # entire test set at once.
  trainer = Trainer(
      module_file=trainer_module_file,
      examples=raw_examples,
      schema=schema_channel,
      train_args=trainer_pb2.TrainArgs(num_steps=2000),
      eval_args=trainer_pb2.EvalArgs())
  candidate_model = trainer.outputs['model']

  # Resolves the latest blessed model, used as the evaluation baseline.
  model_resolver = ResolverNode(
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing),
  ).with_id('latest_blessed_model_resolver')

  # Accuracy must reach 0.6 and must not drop relative to the baseline.
  accuracy_threshold = tfma.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(
          lower_bound={'value': 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key='species')],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(metrics=[
              tfma.MetricConfig(
                  class_name='Accuracy', threshold=accuracy_threshold)
          ])
      ])
  evaluator = Evaluator(
      module_file=evaluator_module_file,
      examples=raw_examples,
      model=candidate_model,
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # The push to the filesystem destination is gated on the evaluator's
  # blessing.
  filesystem_target = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=candidate_model,
      model_blessing=evaluator.outputs['blessing'],
      push_destination=filesystem_target)

  stages = [
      example_gen,
      statistics_gen,
      schema_gen,
      example_validator,
      trainer,
      model_resolver,
      evaluator,
      pusher,
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=stages,
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args,
  )
def _create_pipeline(pipeline_name: str, pipeline_root: str, data_root: str,
                     module_file: str, serving_model_dir: str,
                     metadata_path: str,
                     beam_pipeline_args: List[str]) -> pipeline.Pipeline:
  """Assembles the BERT classification pipeline on the CoLA dataset.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root directory forwarded to `pipeline.Pipeline`.
    data_root: Input base directory read by CsvExampleGen; the 'train' split
      matches `train/*` and the 'eval' split matches `validation/*`.
    module_file: User module file handed to both Transform and Trainer.
    serving_model_dir: Base directory of the Pusher's filesystem destination.
    metadata_path: Path used to build the sqlite metadata connection config.
    beam_pipeline_args: Beam arguments forwarded unchanged to the pipeline.

  Returns:
    The `pipeline.Pipeline` with caching enabled and sqlite-backed metadata.
  """
  # Ingestion with explicit train/eval splits taken from subdirectories.
  train_split = example_gen_pb2.Input.Split(name='train', pattern='train/*')
  eval_split = example_gen_pb2.Input.Split(
      name='eval', pattern='validation/*')
  input_config = example_gen_pb2.Input(splits=[train_split, eval_split])
  example_gen = CsvExampleGen(input_base=data_root, input_config=input_config)
  raw_examples = example_gen.outputs['examples']

  # Statistics feed both schema inference and anomaly detection.
  statistics_gen = StatisticsGen(examples=raw_examples)
  stats_channel = statistics_gen.outputs['statistics']
  schema_gen = SchemaGen(statistics=stats_channel, infer_feature_shape=True)
  schema_channel = schema_gen.outputs['schema']
  example_validator = ExampleValidator(
      statistics=stats_channel, schema=schema_channel)

  # Feature engineering defined by the user module file.
  transform = Transform(
      examples=raw_examples,
      schema=schema_channel,
      module_file=module_file)

  # Training uses the same user module file on the transformed examples.
  trainer = Trainer(
      module_file=module_file,
      examples=transform.outputs['transformed_examples'],
      transform_graph=transform.outputs['transform_graph'],
      schema=schema_channel,
      # Adjust these steps when training on the full dataset.
      train_args=trainer_pb2.TrainArgs(num_steps=2),
      eval_args=trainer_pb2.EvalArgs(num_steps=1))
  candidate_model = trainer.outputs['model']

  # Resolves the latest blessed model, used as the evaluation baseline.
  model_resolver = resolver.Resolver(
      strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing),
  ).with_id('latest_blessed_model_resolver')

  # Accuracy must reach 0.5 and may drop by at most 1e-2 relative to the
  # baseline. Adjust the value threshold when training on the full dataset.
  # Change threshold will be ignored if there is no baseline model resolved
  # from MLMD (first run).
  accuracy_threshold = tfma.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(
          lower_bound={'value': 0.5}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-2}))
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key='label')],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(metrics=[
              tfma.MetricConfig(
                  class_name='SparseCategoricalAccuracy',
                  threshold=accuracy_threshold)
          ])
      ])
  evaluator = Evaluator(
      examples=raw_examples,
      model=candidate_model,
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # The push to the filesystem destination is gated on the evaluator's
  # blessing.
  filesystem_target = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=candidate_model,
      model_blessing=evaluator.outputs['blessing'],
      push_destination=filesystem_target)

  components = [
      example_gen,
      statistics_gen,
      schema_gen,
      example_validator,
      transform,
      trainer,
      model_resolver,
      evaluator,
      pusher,
  ]
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      enable_cache=True,
      beam_pipeline_args=beam_pipeline_args,
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Builds the Chicago taxi pipeline with TFX.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root directory handed to the pipeline.
        data_root: Location of the CSV input, wrapped with external_input.
        module_file: User module passed to both Transform and Trainer.
        serving_model_dir: Filesystem directory the Pusher pushes to.
        metadata_path: Path used for the SQLite metadata connection config.
        beam_pipeline_args: Beam arguments forwarded to the pipeline.

    Returns:
        The assembled pipeline.Pipeline, with caching enabled.
    """
    # Ingest the CSV data.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics feed both visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema is inferred from the statistics.
    infer_schema = SchemaGen(statistics=statistics,
                             infer_feature_shape=False)
    schema = infer_schema.outputs['schema']

    # Anomaly detection against the inferred schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering shared between training and serving.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)

    # Train with the user-provided module through the generic executor.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    candidate_model = trainer.outputs['model']

    # Resolve the latest blessed model to use as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # TFMA evaluation doubles as quality validation against the baseline.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}),
        # Ignored when there is no baseline (first run).
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    accuracy_metric = tfma.MetricConfig(class_name='BinaryAccuracy',
                                        threshold=accuracy_threshold)
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='tips')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[tfma.MetricsSpec(metrics=[accuracy_metric])])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model=candidate_model,
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    # Push to the file destination only when the model was blessed.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(model=candidate_model,
                    model_blessing=model_analyzer.outputs['blessing'],
                    push_destination=serving_destination)

    pipeline_components = [
        example_gen,
        statistics_gen,
        infer_schema,
        validate_stats,
        transform,
        trainer,
        model_resolver,
        model_analyzer,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
def create_pipeline(
        prev_run_root: Text,
        run_root: Text,
        pipeline_name: Text,
        pipeline_mod: Text,
        query: Text,
        beam_pipeline_args: Optional[List[Text]] = None,
        metadata_path: Optional[Text] = None,
        custom_config: Optional[Dict[Text, Any]] = None) -> pipeline.Pipeline:
    """Builds the incremental pipeline.

    The schema, transform graph and model are imported from under
    `prev_run_root`/serving, and the (re)trained model together with the
    imported schema and transform graph are pushed under `run_root`/serving.

    Args:
        prev_run_root: Root of the previous run to import artifacts from.
        run_root: Root of this run; 'data' below it is the pipeline root and
            'serving' below it holds the pushed artifacts.
        pipeline_name: Name given to the resulting pipeline.
        pipeline_mod: Module path; '<pipeline_mod>.trainer_fn' is handed to
            the Trainer.
        query: Query passed to BigQueryExampleGen.
        beam_pipeline_args: Beam arguments forwarded to the pipeline.
        metadata_path: When not None, a SQLite metadata connection config for
            this path is passed to the pipeline.
        custom_config: Forwarded to the Trainer unchanged.

    Returns:
        The assembled pipeline.Pipeline, with caching enabled.
    """

    def _serving_destination(subdir):
        # Filesystem push destination at <run_root>/serving/<subdir>.
        return pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(run_root, 'serving', subdir)))

    # Split the query results into train/eval with a 20:1 hash-bucket ratio.
    split_config = example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=20),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1),
    ])
    example_gen = BigQueryExampleGen(
        query=query,
        output_config=example_gen_pb2.Output(split_config=split_config))

    # Import the artifacts produced by the previous run.
    schema_importer = ImporterNode(
        instance_name='import_schema',
        source_uri=os.path.join(prev_run_root, 'serving/schema'),
        artifact_type=standard_artifacts.Schema,
        reimport=False)
    graph_importer = ImporterNode(
        instance_name='import_transform_graph',
        source_uri=os.path.join(prev_run_root, 'serving/transform_graph'),
        artifact_type=standard_artifacts.TransformGraph,
        reimport=False)
    model_importer = ImporterNode(
        instance_name='import_model',
        source_uri=os.path.join(prev_run_root, 'serving/model'),
        artifact_type=standard_artifacts.Model,
        reimport=False)
    imported_schema = schema_importer.outputs['result']
    imported_graph = graph_importer.outputs['result']

    # Apply the imported transform graph to the new examples.
    transform = TransformWithGraph(
        examples=example_gen.outputs['examples'],
        schema=imported_schema,
        transform_graph=imported_graph)

    # Continue training from the imported base model.
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=imported_schema,
        transform_graph=imported_graph,
        train_args=trainer_pb2.TrainArgs(),
        eval_args=trainer_pb2.EvalArgs(),
        trainer_fn='{}.trainer_fn'.format(pipeline_mod),
        base_model=model_importer.outputs['result'],
        custom_config=custom_config)

    # Not dependent on a blessing: the model is always pushed, regardless of
    # quality.
    pusher = AlwaysPusher(
        model=trainer.outputs['model'],
        push_destination=_serving_destination('model'))
    schema_pusher = SchemaPusher(
        artifact=imported_schema,
        push_destination=_serving_destination('schema'),
        instance_name='schema_pusher')
    transform_graph_pusher = TransformGraphPusher(
        artifact=imported_graph,
        push_destination=_serving_destination('transform_graph'),
        instance_name='transform_graph_pusher')

    # Only configure the metadata store when a path was supplied.
    optional_kwargs = {}
    if metadata_path is not None:
        optional_kwargs['metadata_connection_config'] = (
            metadata.sqlite_metadata_connection_config(metadata_path))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=os.path.join(run_root, 'data'),
        components=[
            example_gen,
            schema_importer,
            graph_importer,
            model_importer,
            transform,
            trainer,
            pusher,
            schema_pusher,
            transform_graph_pusher,
        ],
        enable_cache=True,
        beam_pipeline_args=beam_pipeline_args,
        **optional_kwargs)
def testTaxiPipelineNewStyleCompatibility(self):
    """Checks that new-style and old-style channel keys are the same object.

    Each taxi pipeline component is built with the new-style argument names,
    then its inputs/outputs are looked up under both the new and the old key
    and asserted to be identical.
    """

    def assert_alias(channels, new_key, old_key):
        # Both keys must resolve to the very same channel object.
        self.assertIs(channels[new_key], channels[old_key])

    fake_module = '/tmp/fake/module/file'

    example_gen = CsvExampleGen(input=external_input('/tmp/fake/path'))
    assert_alias(example_gen.inputs, 'input', 'input_base')

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    assert_alias(statistics_gen.inputs, 'examples', 'input_data')

    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])
    assert_alias(infer_schema.inputs, 'statistics', 'stats')
    assert_alias(infer_schema.outputs, 'schema', 'output')

    validate_examples = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    assert_alias(validate_examples.inputs, 'statistics', 'stats')
    assert_alias(validate_examples.outputs, 'anomalies', 'output')

    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=infer_schema.outputs['schema'],
        module_file=fake_module)
    assert_alias(transform.inputs, 'examples', 'input_data')
    assert_alias(transform.outputs, 'transform_graph', 'transform_output')

    trainer = Trainer(
        module_file=fake_module,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    assert_alias(trainer.inputs, 'transform_graph', 'transform_output')
    assert_alias(trainer.outputs, 'model', 'output')

    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        feature_slicing_spec=slicing_spec)
    assert_alias(evaluator.inputs, 'model', 'model_exports')
    assert_alias(evaluator.outputs, 'evaluation', 'output')

    model_validator = ModelValidator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])

    # The Pusher is fed the trainer's model through the old-style 'output'
    # key here.
    pusher = Pusher(
        model=trainer.outputs['output'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory='/fake/serving/dir')))
    assert_alias(pusher.inputs, 'model', 'model_export')
    assert_alias(pusher.outputs, 'pushed_model', 'model_push')
def create_tfx_pipeline(pipeline_name: Text, pipeline_root: Text,
                        data_root: Text, module_file: Text,
                        serving_model_dir: Text, metadata_path: Text,
                        direct_num_workers: int) -> pipeline.Pipeline:
    """Builds a TFX pipeline that trains on one TFRecord set and predicts on another.

    Training data is read from `data_root`/train and test data from
    `data_root`/test. The schema is inferred from the training data only and
    reused when transforming the test data.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root directory handed to the pipeline.
        data_root: Directory containing the 'train' and 'test' sub-directories.
        module_file: User module passed to both Transforms and the Trainer.
        serving_model_dir: Filesystem directory the Pusher pushes to.
        metadata_path: Path used for the SQLite metadata connection config.
        direct_num_workers: Value for the '--direct_num_workers' Beam flag.

    Returns:
        The assembled pipeline.Pipeline, with caching enabled.
    """
    # Ingest the train and test TFRecords through separate ExampleGens.
    train_examples = tfrecord_input(os.path.join(data_root, 'train'))
    train_example_gen = ImportExampleGen(input=train_examples,
                                         instance_name='train_example_gen')
    test_examples = tfrecord_input(os.path.join(data_root, 'test'))
    test_example_gen = ImportExampleGen(input=test_examples,
                                        instance_name='test_example_gen')
    train_channel = train_example_gen.outputs['examples']

    # Statistics are computed for the training data only.
    train_statistics_gen = StatisticsGen(examples=train_channel)
    # test_statistics_gen = StatisticsGen(examples=test_example_gen.outputs['examples'])

    # Schema is inferred from the training statistics.
    train_infer_schema = SchemaGen(
        statistics=train_statistics_gen.outputs['statistics'],
        infer_feature_shape=False)
    train_schema = train_infer_schema.outputs['schema']

    # Both splits are transformed with the same module and the train schema.
    train_transform = Transform(
        examples=train_channel,
        schema=train_schema,
        module_file=module_file,
        instance_name='train_transformer')
    test_transform = Transform(
        examples=test_example_gen.outputs['examples'],
        schema=train_schema,
        module_file=module_file,
        instance_name='test_transformer')

    # Train with the user-provided module; the custom executor spec is
    # required here.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        transformed_examples=train_transform.outputs['transformed_examples'],
        transform_graph=train_transform.outputs['transform_graph'],
        schema=train_schema,
        train_args=trainer_pb2.TrainArgs(num_steps=20),
        eval_args=trainer_pb2.EvalArgs(num_steps=10))
    trained_model = trainer.outputs['model']

    # Custom component fed the transformed test examples and the model.
    test_pred = custom_component.TestPredComponent(
        transformed_examples=test_transform.outputs['transformed_examples'],
        model=trained_model)

    # Evaluation config: only a value threshold, no baseline comparison.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='Survived')],
        metrics_specs=[
            tfma.MetricsSpec(thresholds={'BinaryAccuracy': accuracy_threshold}),
        ])
    evaluator = Evaluator(
        examples=train_channel,
        model=trained_model,
        eval_config=eval_config)

    # Push to the file destination, gated on the evaluator's blessing.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=evaluator.outputs['blessing'],
        push_destination=serving_destination)

    # NOTE(review): `evaluator` is deliberately left out of the component
    # list below (as in the original), yet `pusher` consumes its 'blessing'
    # output. Confirm whether the Pusher can resolve that input at runtime,
    # or whether the evaluator should be re-enabled.
    pipeline_components = [
        train_example_gen,
        train_statistics_gen,
        train_infer_schema,
        train_transform,
        trainer,
        test_example_gen,
        test_transform,
        test_pred,
        # evaluator,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def _create_pipeline():
    """Builds the Chicago taxi pipeline with a Slack review step before push.

    All configuration (data root, module file, Slack credentials, serving
    directory, pipeline name/root and metadata location) is read from
    module-level names.

    Returns:
        The assembled pipeline.Pipeline, with caching enabled.
    """
    # Ingest the CSV data.
    example_gen = CsvExampleGen(input=csv_input(_data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics feed both visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema is inferred from the statistics.
    infer_schema = SchemaGen(statistics=statistics)
    schema = infer_schema.outputs['schema']

    # Anomaly detection against the inferred schema.
    validate_stats = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering shared between training and serving.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=_taxi_module_file)

    # Train with the user-provided module.
    trainer = Trainer(
        module_file=_taxi_module_file,
        examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    candidate_model = trainer.outputs['model']

    # TFMA evaluation statistics, sliced by trip start hour.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour']),
    ])
    model_analyzer = Evaluator(
        examples=raw_examples,
        model=candidate_model,
        feature_slicing_spec=slicing_spec)

    # Quality validation of the candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=raw_examples,
                                     model=candidate_model)

    # Bridge between the pipeline and human reviewers, enabling a
    # review-and-push workflow. The component uses the Slack API to post the
    # model URI to the configured channel and then waits for a go / no-go
    # decision from that channel:
    #   * To approve, reply in the bot's thread with 'lgtm' or 'approve'.
    #   * To reject, reply in the bot's thread with 'decline' or 'reject'.
    slack_validator = SlackComponent(
        model=candidate_model,
        model_blessing=model_validator.outputs['blessing'],
        slack_token=_slack_token,
        slack_channel_id=_slack_channel_id,
        timeout_sec=3600,
    )

    # Push to the file destination, gated on the Slack blessing.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=_serving_model_dir))
    pusher = Pusher(
        model=candidate_model,
        model_blessing=slack_validator.outputs['slack_blessing'],
        push_destination=serving_destination)

    pipeline_components = [
        example_gen,
        statistics_gen,
        infer_schema,
        validate_stats,
        transform,
        trainer,
        model_analyzer,
        model_validator,
        slack_validator,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            _metadata_db_root),
    )
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Builds the Chicago taxi template pipeline with TFX.

    Every component is constructed, but only those appended to `components`
    become part of the returned pipeline. As written, that is the ExampleGen
    alone; the TODO(step N) markers show which lines to uncomment to enable
    the rest.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root directory handed to the pipeline.
        data_path: Location of the CSV input, wrapped with external_input.
        preprocessing_fn: Passed to Transform as `preprocessing_fn`.
        run_fn: Passed to Trainer as `run_fn`.
        train_args: Training arguments for the Trainer.
        eval_args: Evaluation arguments for the Trainer.
        eval_accuracy_threshold: Lower bound on 'BinaryAccuracy' for blessing.
        serving_model_dir: Filesystem directory the Pusher pushes to.
        metadata_connection_config: Forwarded to the pipeline unchanged.
        beam_pipeline_args: Beam arguments forwarded to the pipeline.
        ai_platform_training_args: When not None, the Trainer uses the AI
            Platform trainer executor with these args in its custom config.
        ai_platform_serving_args: When not None, the Pusher uses the AI
            Platform pusher executor with these args in its custom config.

    Returns:
        The assembled pipeline.Pipeline, with caching enabled.
    """
    components = []

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=external_input(data_path))
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # example_gen = big_query_example_gen_component.BigQueryExampleGen(
    #     query=query)
    components.append(example_gen)
    examples = example_gen.outputs['examples']

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=examples)
    # TODO(step 5): Uncomment here to add StatisticsGen to the pipeline.
    # components.append(statistics_gen)
    statistics = statistics_gen.outputs['statistics']

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
    # TODO(step 5): Uncomment here to add SchemaGen to the pipeline.
    # components.append(schema_gen)
    schema = schema_gen.outputs['schema']

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics,
        schema=schema)
    # TODO(step 5): Uncomment here to add ExampleValidator to the pipeline.
    # components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=examples,
                          schema=schema,
                          preprocessing_fn=preprocessing_fn)
    # TODO(step 6): Uncomment here to add Transform to the pipeline.
    # components.append(transform)

    # Trains with the user-provided run_fn. The local generic executor is the
    # default; it is swapped for the AI Platform executor when training args
    # for AI Platform are supplied.
    trainer_args = {
        'run_fn': run_fn,
        'transformed_examples': transform.outputs['transformed_examples'],
        'schema': schema,
        'transform_graph': transform.outputs['transform_graph'],
        'train_args': train_args,
        'eval_args': eval_args,
        'custom_executor_spec':
            executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
    }
    if ai_platform_training_args is not None:
        trainer_args['custom_executor_spec'] = executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.GenericExecutor)
        trainer_args['custom_config'] = {
            ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args,
        }
    trainer = Trainer(**trainer_args)
    # TODO(step 6): Uncomment here to add Trainer to the pipeline.
    # components.append(trainer)

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))
    # TODO(step 6): Uncomment here to add ResolverNode to the pipeline.
    # components.append(model_resolver)

    # Uses TFMA to compute evaluation statistics over features of a model and
    # performs quality validation of a candidate model (compared to a
    # baseline).
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': eval_accuracy_threshold}),
        # Ignored when there is no baseline (first run).
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    accuracy_metric = tfma.MetricConfig(class_name='BinaryAccuracy',
                                        threshold=accuracy_threshold)
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='big_tipper')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[tfma.MetricsSpec(metrics=[accuracy_metric])])
    evaluator = Evaluator(
        examples=examples,
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)
    # TODO(step 6): Uncomment here to add Evaluator to the pipeline.
    # components.append(evaluator)

    # Checks whether the model passed the validation steps and pushes the
    # model to a file destination if the check passed. The AI Platform pusher
    # executor is used instead when serving args for AI Platform are supplied.
    pusher_args = {
        'model': trainer.outputs['model'],
        'model_blessing': evaluator.outputs['blessing'],
        'push_destination':
            pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory=serving_model_dir)),
    }
    if ai_platform_serving_args is not None:
        pusher_args['custom_executor_spec'] = executor_spec.ExecutorClassSpec(
            ai_platform_pusher_executor.Executor)
        pusher_args['custom_config'] = {
            ai_platform_pusher_executor.SERVING_ARGS_KEY:
                ai_platform_serving_args
        }
    pusher = Pusher(**pusher_args)  # pylint: disable=unused-variable
    # TODO(step 6): Uncomment here to add Pusher to the pipeline.
    # components.append(pusher)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # TODO(step 8): Change this value to control caching of execution results.
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_parameterized_pipeline(
        pipeline_name: Text, pipeline_root: Text, enable_cache: bool,
        beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Creates a simple TFX pipeline driven by RuntimeParameters.

    The data location, the Transform and Trainer module files, and the train
    and eval step counts are all declared as RuntimeParameter objects with
    defaults.

    Args:
        pipeline_name: The name of the pipeline.
        pipeline_root: The root of the pipeline output.
        enable_cache: Whether to enable cache in this pipeline.
        beam_pipeline_args: Pipeline args for Beam jobs within Components.

    Returns:
        A logical TFX pipeline.Pipeline object.
    """
    # --- Pipeline parameters -------------------------------------------------
    # Path to the CSV data, under which there should be a data.csv file.
    data_root = data_types.RuntimeParameter(
        name='data-root',
        default='gs://my-bucket/data',
        ptype=Text,
    )
    # Path to the transform module file.
    transform_module_file = data_types.RuntimeParameter(
        name='transform-module',
        default='gs://my-bucket/modules/transform_module.py',
        ptype=Text,
    )
    # Path to the trainer module file.
    trainer_module_file = data_types.RuntimeParameter(
        name='trainer-module',
        default='gs://my-bucket/modules/trainer_module.py',
        ptype=Text,
    )
    # Number of training steps (passed as 'num_steps' in train_args).
    train_steps = data_types.RuntimeParameter(
        name='train-steps',
        default=10,
        ptype=int,
    )
    # Number of evaluation steps (passed as 'num_steps' in eval_args).
    eval_steps = data_types.RuntimeParameter(
        name='eval-steps',
        default=5,
        ptype=int,
    )

    # --- Components ----------------------------------------------------------
    # The input data location is parameterized by data_root.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    statistics_gen = StatisticsGen(input_data=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
    schema = schema_gen.outputs['schema']

    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Transform's module file is parameterized by transform_module_file.
    transform = Transform(input_data=raw_examples,
                          schema=schema,
                          module_file=transform_module_file)

    # Trainer's module file is parameterized by trainer_module_file, and its
    # step counts by the 'train-steps' and 'eval-steps' parameters.
    trainer = Trainer(
        module_file=trainer_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema,
        transform_output=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps})
    candidate_model = trainer.outputs['model']

    # Resolve the latest blessed model to use as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # TFMA evaluation doubles as quality validation against the baseline.
    accuracy_threshold = tfma.config.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}),
        # Ignored when there is no baseline (first run).
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour']),
        ],
        metrics_specs=[
            tfma.MetricsSpec(thresholds={'accuracy': accuracy_threshold}),
        ])
    evaluator = Evaluator(
        examples=raw_examples,
        model=candidate_model,
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    # Push below the pipeline root parameter, gated on the blessing.
    serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
    pusher = Pusher(
        model_export=candidate_model,
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_dir)))

    pipeline_components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=pipeline_components,
                             enable_cache=enable_cache,
                             beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     enable_tuning: bool,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Builds the Iris flowers pipeline with TFX, optionally with a Tuner.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root directory handed to the pipeline.
        data_root: Location of the CSV input, wrapped with external_input.
        module_file: User module passed to Transform, Tuner and Trainer.
        serving_model_dir: Filesystem directory the Pusher pushes to.
        metadata_path: Path used for the SQLite metadata connection config.
        enable_tuning: When true, a Tuner is added and its best
            hyperparameters are fed to the Trainer; otherwise the Trainer
            receives `hyperparameters=None`.
        direct_num_workers: Value for the '--direct_num_workers' Beam flag.

    Returns:
        The assembled pipeline.Pipeline, with caching enabled.
    """
    # Ingest the CSV data.
    example_gen = CsvExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics feed both visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema is inferred from the statistics.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Anomaly detection against the inferred schema.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering shared between training and serving.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)
    transformed_examples = transform.outputs['transformed_examples']
    transform_graph = transform.outputs['transform_graph']

    # Hyperparameter tuning with the user-provided module. Once the
    # hyperparameters are tuned, the Tuner can be dropped from the pipeline
    # and the Trainer fed with the tuned values directly.
    tuner = None
    best_hyperparameters = None
    if enable_tuning:
        tuner = Tuner(module_file=module_file,
                      examples=transformed_examples,
                      transform_graph=transform_graph,
                      train_args=trainer_pb2.TrainArgs(num_steps=20),
                      eval_args=trainer_pb2.EvalArgs(num_steps=5))
        best_hyperparameters = tuner.outputs['best_hyperparameters']

    # Train with the user-provided module.
    #
    # With a Tuner in the pipeline, the Trainer takes its
    # best_hyperparameters artifact as input for use in the user module.
    #
    # Without a Tuner, either import a previous Tuner's output through an
    # ImporterNode and feed that to the Trainer, or hard-code the tuned
    # hyperparameters in the user module and pass None here.
    #
    # Example of ImporterNode,
    #   hparams_importer = ImporterNode(
    #     instance_name='import_hparams',
    #     source_uri='path/to/best_hyperparameters.txt',
    #     artifact_type=HyperParameters)
    #   ...
    #   hyperparameters = hparams_importer.outputs['result'],
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transformed_examples,
        transform_graph=transform_graph,
        schema=schema,
        hyperparameters=best_hyperparameters,
        train_args=trainer_pb2.TrainArgs(num_steps=100),
        eval_args=trainer_pb2.EvalArgs(num_steps=5))
    candidate_model = trainer.outputs['model']

    # Resolve the latest blessed model to use as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # TFMA evaluation doubles as quality validation against the baseline.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(
            lower_bound={'value': 0.6}),
        # Ignored when there is no baseline (first run).
        change_threshold=tfma.GenericChangeThreshold(
            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
            absolute={'value': -1e-10}))
    accuracy_metric = tfma.MetricConfig(
        class_name='SparseCategoricalAccuracy',
        threshold=accuracy_threshold)
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='variety')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[tfma.MetricsSpec(metrics=[accuracy_metric])])
    evaluator = Evaluator(
        examples=raw_examples,
        model=candidate_model,
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    # Push to the file destination only when the model was blessed.
    serving_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(model=candidate_model,
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=serving_destination)

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    # The Tuner, when enabled, is appended after the fixed components.
    if tuner is not None:
        components.append(tuner)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text, module_file: Text,
                     module_file_lite: Text, serving_model_dir: Text,
                     serving_model_dir_lite: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Builds the handwritten digit classification example using TFX.

    Two Trainer/Evaluator/Pusher chains share one ingest/transform front end:
    the 'mnist' chain and the 'mnist_lite' chain, whose evaluation config
    marks the model as a TFLite model.

    Args:
        pipeline_name: Name given to the resulting pipeline.
        pipeline_root: Root directory handed to the pipeline.
        data_root: Location of the input, wrapped with external_input.
        module_file: User module for Transform and the 'mnist' Trainer.
        module_file_lite: User module for the 'mnist_lite' Trainer.
        serving_model_dir: Push directory for the 'mnist' model.
        serving_model_dir_lite: Push directory for the 'mnist_lite' model.
        metadata_path: Path used for the SQLite metadata connection config.
        beam_pipeline_args: Beam arguments forwarded to the pipeline.

    Returns:
        The assembled pipeline.Pipeline, with caching enabled.
    """
    # Ingest the data.
    example_gen = ImportExampleGen(input=external_input(data_root))
    raw_examples = example_gen.outputs['examples']

    # Statistics feed both visualization and example validation.
    statistics_gen = StatisticsGen(examples=raw_examples)
    statistics = statistics_gen.outputs['statistics']

    # Schema is inferred from the statistics.
    schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
    schema = schema_gen.outputs['schema']

    # Anomaly detection against the inferred schema.
    example_validator = ExampleValidator(statistics=statistics, schema=schema)

    # Feature engineering shared between training and serving.
    transform = Transform(examples=raw_examples,
                          schema=schema,
                          module_file=module_file)

    def _build_trainer(trainer_module, name):
        # Both trainers differ only in their module file and instance name.
        return Trainer(
            module_file=trainer_module,
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                GenericExecutor),
            examples=transform.outputs['transformed_examples'],
            transform_graph=transform.outputs['transform_graph'],
            schema=schema,
            train_args=trainer_pb2.TrainArgs(num_steps=5000),
            eval_args=trainer_pb2.EvalArgs(num_steps=100),
            instance_name=name)

    def _filesystem_destination(directory):
        # Filesystem push destination rooted at `directory`.
        return pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=directory))

    # Trains a Keras model with the user-provided module.
    trainer = _build_trainer(module_file, 'mnist')
    # Trains the same model, but converts it into a TFLite one.
    trainer_lite = _build_trainer(module_file_lite, 'mnist_lite')

    # TODO(b/150949276): Add resolver back once it supports two trainers.

    # TFMA evaluation doubles as quality validation of the candidate model.
    accuracy_metric = tfma.MetricConfig(
        class_name='SparseCategoricalAccuracy',
        threshold=tfma.config.MetricThreshold(
            value_threshold=tfma.GenericValueThreshold(
                lower_bound={'value': 0.8})))
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image_class')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[tfma.MetricsSpec(metrics=[accuracy_metric])])

    # The lite config is a copy that tells the evaluator the model is TFLite.
    eval_config_lite = tfma.EvalConfig()
    eval_config_lite.CopyFrom(eval_config)
    eval_config_lite.model_specs[0].model_type = 'tf_lite'

    # Evaluation of the regular model.
    evaluator = Evaluator(
        examples=raw_examples,
        model=trainer.outputs['model'],
        eval_config=eval_config,
        instance_name='mnist')
    # Evaluation of the TFLite model.
    evaluator_lite = Evaluator(
        examples=raw_examples,
        model=trainer_lite.outputs['model'],
        eval_config=eval_config_lite,
        instance_name='mnist_lite')

    # Push the regular model to its file destination if it was blessed.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=_filesystem_destination(serving_model_dir),
        instance_name='mnist')
    # Push the TFLite model to its file destination if it was blessed.
    pusher_lite = Pusher(
        model=trainer_lite.outputs['model'],
        model_blessing=evaluator_lite.outputs['blessing'],
        push_destination=_filesystem_destination(serving_model_dir_lite),
        instance_name='mnist_lite')

    pipeline_components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        trainer_lite,
        evaluator,
        evaluator_lite,
        pusher,
        pusher_lite,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=pipeline_components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
def generate_pipeline(pipeline_name, pipeline_root, data_root, train_steps,
                      eval_steps, pusher_target):
  """Assembles a CSV-fed train / evaluate / push TFX pipeline.

  Args:
    pipeline_name: Name given to the resulting pipeline.
    pipeline_root: Pipeline root; the SQLite metadata file 'metadata.sqlite'
      is placed directly underneath it.
    data_root: Location handed to `external_input` for CsvExampleGen.
    train_steps: `num_steps` used for the Trainer's TrainArgs.
    eval_steps: `num_steps` used for the Trainer's EvalArgs.
    pusher_target: Filesystem base directory the Pusher writes to.

  Returns:
    A pipeline.Pipeline with caching enabled.
  """
  ingest = CsvExampleGen(input=external_input(data_root))
  raw_examples = ingest.outputs['examples']

  stats = StatisticsGen(examples=raw_examples)
  schema = SchemaGen(
      statistics=stats.outputs['statistics'], infer_feature_shape=False)

  # The training code lives in util.py, a file in the same folder.
  train = Trainer(
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      examples=raw_examples,
      schema=schema.outputs['schema'],
      module_file='util.py',
      train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
      eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps))
  trained_model = train.outputs['model']

  # Resolves the latest blessed model. The node is part of the pipeline, but
  # its output is not fed to the Evaluator below.
  blessed_model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # TFMA configuration: overall slice only, with a single lower bound on
  # 'binary_accuracy' (the original author's note on this bound: "always
  # bless").
  accuracy_floor = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.4}))
  tfma_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key='target')],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={'binary_accuracy': accuracy_floor})
      ])

  # No baseline_model is supplied, so only the value threshold above is
  # configured for validation.
  evaluate = Evaluator(
      examples=raw_examples, model=trained_model, eval_config=tfma_config)

  # Pushes the model to a file destination, gated on the Evaluator blessing.
  destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=pusher_target))
  push = Pusher(
      model=trained_model,
      model_blessing=evaluate.outputs['blessing'],
      push_destination=destination)

  metadata_file = os.path.join(pipeline_root, 'metadata.sqlite')
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          ingest,
          stats,
          schema,
          train,
          blessed_model_resolver,
          evaluate,
          push,
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_file))
def create_test_pipeline():
  """Returns a slightly modified Iris example pipeline used for testing.

  All paths are fixed relative names ("iris_root", "tfx_root"). The pipeline
  runs in ASYNC execution mode with caching disabled.
  """
  name = "iris"
  iris_dir = "iris_root"
  tfx_dir = "tfx_root"
  serving_dir = os.path.join(iris_dir, "serving_model", name)
  csv_dir = os.path.join(tfx_dir, "data_path")
  root_dir = os.path.join(tfx_dir, "pipelines", name)

  ingest = CsvExampleGen(input_base=csv_dir)
  raw_examples = ingest.outputs["examples"]
  stats = StatisticsGen(examples=raw_examples)

  examples_importer = importer.Importer(
      source_uri="m/y/u/r/i",
      properties={"split_names": "['train', 'eval']"},
      custom_properties={
          "int_custom_property": 42,
          "str_custom_property": "42",
      },
      artifact_type=standard_artifacts.Examples)
  examples_importer = examples_importer.with_id("my_importer")

  schema = SchemaGen(
      statistics=stats.outputs["statistics"], infer_feature_shape=True)
  validator = ExampleValidator(
      statistics=stats.outputs["statistics"],
      schema=schema.outputs["schema"])

  # module_file is a RuntimeParameter so that the compiler's handling of
  # RuntimeParameter gets exercised.
  module_file_param = data_types.RuntimeParameter(
      name="module_file",
      default=os.path.join(iris_dir, "iris_utils.py"),
      ptype=str)
  train = Trainer(
      module_file=module_file_param,
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      examples=raw_examples,
      schema=schema.outputs["schema"],
      train_args=trainer_pb2.TrainArgs(num_steps=2000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5))
  # Using `TrainArgs` as a platform config is not sensible practice; it is
  # done here purely for testing.
  train = train.with_platform_config(
      config=trainer_pb2.TrainArgs(num_steps=2000))

  # producer_component_id="Evaluator" cannot be set on model_blessing: doing
  # so raises a "producer component should have already been compiled" error.
  blessed_model_resolver = resolver.Resolver(
      strategy_class=latest_blessed_model_strategy.LatestBlessedModelStrategy,
      baseline_model=Channel(
          type=standard_artifacts.Model, producer_component_id="Trainer"),
      model_blessing=Channel(type=standard_artifacts.ModelBlessing))
  blessed_model_resolver = blessed_model_resolver.with_id(
      "latest_blessed_model_resolver")

  accuracy_threshold = tfma.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={"value": 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={"value": -1e-10}))
  tfma_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name="eval")],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={"sparse_categorical_accuracy": accuracy_threshold})
      ])

  evaluate = Evaluator(
      examples=raw_examples,
      model=train.outputs["model"],
      baseline_model=blessed_model_resolver.outputs["baseline_model"],
      eval_config=tfma_config)

  destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_dir))
  push = Pusher(
      model=train.outputs["model"],
      model_blessing=evaluate.outputs["blessing"],
      push_destination=destination)

  return pipeline.Pipeline(
      pipeline_name=name,
      pipeline_root=root_dir,
      components=[
          ingest,
          stats,
          examples_importer,
          schema,
          validator,
          train,
          blessed_model_resolver,
          evaluate,
          push,
      ],
      enable_cache=False,
      beam_pipeline_args=["--my_testing_beam_pipeline_args=bar"],
      # As with the Trainer above, `TrainArgs` as platform config is only a
      # testing device, not sensible practice.
      platform_config=trainer_pb2.TrainArgs(num_steps=2000),
      execution_mode=pipeline.ExecutionMode.ASYNC)
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
  """Builds the 'parameterized_tfx_oss' Chicago Taxi pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file shared by Transform and
      Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  ingest = CsvExampleGen(input=external_input(csv_input_location))
  raw_examples = ingest.outputs['examples']

  stats = StatisticsGen(examples=raw_examples)
  schema = SchemaGen(
      statistics=stats.outputs['statistics'],
      infer_feature_shape=False,
  )
  validator = ExampleValidator(
      statistics=stats.outputs['statistics'],
      schema=schema.outputs['schema'],
  )
  preprocess = Transform(
      examples=raw_examples,
      schema=schema.outputs['schema'],
      module_file=taxi_module_file,
  )
  train = Trainer(
      module_file=taxi_module_file,
      transformed_examples=preprocess.outputs['transformed_examples'],
      schema=schema.outputs['schema'],
      transform_graph=preprocess.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
  )

  # --- TFMA config for model evaluation and validation ---
  # Signature 'eval' implies an EvalSavedModel. For a serving model, drop the
  # signature (it then defaults to 'serving_default') and add a label_key.
  model_spec = tfma.ModelSpec(signature_name='eval')

  # Validation thresholds for metrics saved with the model are keyed by
  # metric name.
  thresholds_by_metric = {
      'binary_accuracy':
          tfma.MetricThreshold(
              value_threshold=tfma.GenericValueThreshold(
                  lower_bound={'value': 0.5}),
              change_threshold=tfma.GenericChangeThreshold(
                  direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                  absolute={'value': -1e-10}))
  }
  # Metrics listed here come on top of those saved with the model (assuming a
  # keras model or an EvalSavedModel); anything added to the saved model, e.g.
  # through model.compile(..., metrics=[...]), is computed automatically.
  metrics_spec = tfma.MetricsSpec(
      metrics=[tfma.MetricConfig(class_name='ExampleCount')],
      thresholds=thresholds_by_metric)

  slices = [
      # An empty spec is the overall slice, i.e. the whole dataset.
      tfma.SlicingSpec(),
      # Additionally slice along the trip_start_hour feature column.
      tfma.SlicingSpec(feature_keys=['trip_start_hour']),
  ]
  tfma_config = tfma.EvalConfig(
      model_specs=[model_spec],
      metrics_specs=[metrics_spec],
      slicing_specs=slices)

  analyze = Evaluator(
      examples=raw_examples,
      model=train.outputs['model'],
      eval_config=tfma_config,
  )

  # The serving directory is derived from the string form of the pipeline
  # root parameter.
  serving_dir = os.path.join(str(pipeline.ROOT_PARAMETER), 'model_serving')
  push = Pusher(
      model=train.outputs['model'],
      model_blessing=analyze.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)),
  )

  return pipeline.Pipeline(
      pipeline_name='parameterized_tfx_oss',
      pipeline_root=pipeline_root,
      components=[
          ingest, stats, schema, validator, preprocess, train, analyze, push
      ],
      enable_cache=enable_cache,
  )
def _create_parameterized_pipeline(
    pipeline_name: Text,
    pipeline_root: Optional[Text] = pipeline_root,
    enable_cache: Optional[bool] = True) -> pipeline.Pipeline:
  """Builds a simple TFX pipeline driven by RuntimeParameters.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    enable_cache: Whether to enable cache in this pipeline.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  # ---- Pipeline parameters ----
  # Directory of the CSV input; a data.csv file is expected underneath it.
  data_root_param = data_types.RuntimeParameter(
      name='data-root',
      default='gs://my-bucket/data',
      ptype=Text,
  )
  # Location of the module file.
  taxi_module_file_param = data_types.RuntimeParameter(
      name='module-file',
      default='gs://my-bucket/modules/taxi_utils.py',
      ptype=Text,
  )
  # Step count passed as num_steps in train_args.
  train_steps = data_types.RuntimeParameter(
      name='train-steps',
      default=10,
      ptype=int,
  )
  # Step count passed as num_steps in eval_args.
  eval_steps = data_types.RuntimeParameter(
      name='eval-steps',
      default=5,
      ptype=int,
  )
  # Name of the column used for slicing.
  slicing_column = data_types.RuntimeParameter(
      name='slicing-column',
      default='trip_start_hour',
      ptype=Text,
  )

  # ---- Components ----
  # The input location comes from data_root_param.
  ingest = CsvExampleGen(input=external_input(data_root_param))
  raw_examples = ingest.outputs['examples']

  stats = StatisticsGen(input_data=raw_examples)
  schema = SchemaGen(
      stats=stats.outputs['statistics'], infer_feature_shape=False)
  validator = ExampleValidator(
      stats=stats.outputs['statistics'], schema=schema.outputs['schema'])

  # Transform and Trainer share the module file given by
  # taxi_module_file_param.
  preprocess = Transform(
      input_data=raw_examples,
      schema=schema.outputs['schema'],
      module_file=taxi_module_file_param)

  # num_steps for training / evaluation are the RuntimeParameters named
  # 'train-steps' and 'eval-steps' respectively.
  train = Trainer(
      module_file=taxi_module_file_param,
      transformed_examples=preprocess.outputs['transformed_examples'],
      schema=schema.outputs['schema'],
      transform_output=preprocess.outputs['transform_graph'],
      train_args={'num_steps': train_steps},
      eval_args={'num_steps': eval_steps})
  trained_model = train.outputs['model']

  # The slicing column name is itself a RuntimeParameter.
  slicing_spec = {'specs': [{'column_for_slicing': [slicing_column]}]}
  analyze = Evaluator(
      examples=raw_examples,
      model_exports=trained_model,
      feature_slicing_spec=slicing_spec)

  validate_model = ModelValidator(examples=raw_examples, model=trained_model)

  # Hack so that push_destination can be parameterized and interpreted
  # correctly: the pipeline root is specified as a dsl.PipelineParam named
  # 'pipeline-root', see:
  # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
  pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
  serving_dir = os.path.join(str(pipeline_root_param), 'model_serving')
  push = Pusher(
      model_export=trained_model,
      model_blessing=validate_model.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          ingest, stats, schema, validator, preprocess, train, analyze,
          validate_model, push
      ],
      enable_cache=enable_cache,
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text, module_file: Text,
                     serving_model_dir: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Builds the Chicago Taxi pipeline with Kubernetes-based infra validation.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root of the pipeline output.
    data_root: Directory passed to CsvExampleGen as `input_base`.
    module_file: Module file shared by Transform and Trainer.
    serving_model_dir: Filesystem base directory the Pusher writes to.
    beam_pipeline_args: Beam arguments forwarded unchanged to the pipeline.

  Returns:
    A pipeline.Pipeline object.
  """
  # Ingests the CSV data into the pipeline.
  ingest = CsvExampleGen(input_base=data_root)
  raw_examples = ingest.outputs['examples']

  # Statistics over the data, for visualization and example validation.
  stats = StatisticsGen(examples=raw_examples)

  # Schema derived from the statistics.
  schema = SchemaGen(
      statistics=stats.outputs['statistics'], infer_feature_shape=False)

  # Anomaly detection from statistics plus schema.
  validator = ExampleValidator(
      statistics=stats.outputs['statistics'],
      schema=schema.outputs['schema'])

  # Transformations and feature engineering for training and serving.
  preprocess = Transform(
      examples=raw_examples,
      schema=schema.outputs['schema'],
      module_file=module_file)

  # Trains a model from the user-provided module file, run through the custom
  # `Executor` spec.
  train = Trainer(
      module_file=module_file,
      custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
      transformed_examples=preprocess.outputs['transformed_examples'],
      schema=schema.outputs['schema'],
      transform_graph=preprocess.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
  )
  trained_model = train.outputs['model']

  # Latest blessed model, used as the baseline for model validation.
  blessed_model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # TFMA evaluation plus quality validation of the candidate model against
  # the baseline. The change threshold is ignored when no baseline model is
  # resolved from MLMD (first run).
  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  tfma_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour']),
      ],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={'accuracy': accuracy_threshold})
      ])
  evaluate = Evaluator(
      examples=raw_examples,
      model=trained_model,
      baseline_model=blessed_model_resolver.outputs['model'],
      eval_config=tfma_config)

  # Infra validation keeps an unservable model from being pushed. To use the
  # InfraValidator component, the persistent volume and its claim used by the
  # pipeline should have ReadWriteMany access mode.
  serving_spec = infra_validator_pb2.ServingSpec(
      tensorflow_serving=infra_validator_pb2.TensorFlowServing(
          tags=['latest']),
      kubernetes=infra_validator_pb2.KubernetesConfig())
  request_spec = infra_validator_pb2.RequestSpec(
      tensorflow_serving=infra_validator_pb2.TensorFlowServingRequestSpec())
  infra_check = InfraValidator(
      model=trained_model,
      examples=raw_examples,
      serving_spec=serving_spec,
      request_spec=request_spec)

  # Pushes the model to the filesystem destination, gated on both the
  # Evaluator blessing and the InfraValidator blessing.
  push = Pusher(
      model=trained_model,
      model_blessing=evaluate.outputs['blessing'],
      infra_blessing=infra_check.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          ingest,
          stats,
          schema,
          validator,
          preprocess,
          train,
          blessed_model_resolver,
          evaluate,
          infra_check,
          push,
      ],
      beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     worker_parallelism: int) -> pipeline.Pipeline:
  """Builds the Chicago Taxi pipeline configured for Beam's PortableRunner.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root of the pipeline output.
    data_root: Location handed to `external_input` for CsvExampleGen.
    module_file: Module file shared by Transform and Trainer.
    serving_model_dir: Filesystem base directory the Pusher writes to.
    metadata_path: Path of the SQLite metadata store.
    worker_parallelism: Value used for both `--sdk_worker_parallelism` and
      `--parallelism`.

  Returns:
    A pipeline.Pipeline with caching enabled.
  """
  # Ingests the CSV data into the pipeline.
  ingest = CsvExampleGen(input=external_input(data_root))
  raw_examples = ingest.outputs['examples']

  # Statistics over the data, for visualization and example validation.
  stats = StatisticsGen(examples=raw_examples)

  # Schema derived from the statistics.
  schema = SchemaGen(
      statistics=stats.outputs['statistics'], infer_feature_shape=False)

  # Anomaly detection from statistics plus schema.
  validator = ExampleValidator(
      statistics=stats.outputs['statistics'],
      schema=schema.outputs['schema'])

  # Transformations and feature engineering for training and serving.
  preprocess = Transform(
      examples=raw_examples,
      schema=schema.outputs['schema'],
      module_file=module_file)

  # Trains a model from the user-provided module file.
  train = Trainer(
      module_file=module_file,
      transformed_examples=preprocess.outputs['transformed_examples'],
      schema=schema.outputs['schema'],
      transform_graph=preprocess.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  trained_model = train.outputs['model']

  # Latest blessed model, used as the baseline for model validation.
  blessed_model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # TFMA evaluation plus quality validation of the candidate model against
  # the baseline. The change threshold is ignored when there is no baseline
  # (first run).
  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  tfma_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour']),
      ],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={'binary_accuracy': accuracy_threshold})
      ])
  analyze = Evaluator(
      examples=raw_examples,
      model=trained_model,
      baseline_model=blessed_model_resolver.outputs['model'],
      eval_config=tfma_config)

  # Pushes the model to a file destination, gated on the Evaluator blessing.
  push = Pusher(
      model=trained_model,
      model_blessing=analyze.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  # LINT.IfChange
  portable_runner_args = [
      # -------------------------- Beam Args --------------------------.
      '--runner=PortableRunner',
      # Points to the job server started in
      # setup_beam_on_{flink, spark}.sh
      '--job_endpoint=localhost:8099',
      '--environment_type=LOOPBACK',
      '--sdk_worker_parallelism=%d' % worker_parallelism,
      '--experiments=use_loopback_process_worker=True',
      # Setting environment_cache_millis to practically infinity enables
      # continual reuse of Beam SDK workers, improving performance.
      '--environment_cache_millis=1000000',
      # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
      '--experiments=pre_optimize=all',
      # Note: 100 worker threads are used to mitigate the issue with
      # scheduling work between the Beam runner and SDK harness. Flink
      # and Spark can process unlimited work items concurrently while
      # SdkHarness can only process 1 work item per worker thread.
      # Having 100 threads will let 100 tasks execute concurrently,
      # avoiding the scheduling issue in most cases. In case the threads are
      # exhausted, beam prints the relevant message in the log.
      # TODO(BEAM-8151) Remove worker_threads=100 after we start using a  # pylint: disable=g-bad-todo
      # virtually unlimited thread pool by default.
      '--experiments=worker_threads=100',
      # ---------------------- End of Beam Args -----------------------.
      # --------- Flink runner Args (ignored by Spark runner) ---------.
      '--parallelism=%d' % worker_parallelism,
      # TODO(FLINK-10672): Obviate setting BATCH_FORCED.  # pylint: disable=g-bad-todo
      '--execution_mode_for_batch=BATCH_FORCED',
      # ------------------ End of Flink runner Args -------------------.
  ]
  # LINT.ThenChange(setup/setup_beam_on_spark.sh)
  # LINT.ThenChange(setup/setup_beam_on_flink.sh)

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          ingest, stats, schema, validator, preprocess, train,
          blessed_model_resolver, analyze, push
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=portable_runner_args,
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text, module_file: Text,
                     serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Builds the Chicago Taxi pipeline using 'output'-keyed component outputs.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root of the pipeline output.
    data_root: Location handed to `external_input` for CsvExampleGen.
    module_file: Module file shared by Transform and Trainer.
    serving_model_dir: Filesystem base directory the Pusher writes to.
    metadata_path: Path of the SQLite metadata store.

  Returns:
    A pipeline.Pipeline with caching enabled.
  """
  # Ingests the CSV data into the pipeline.
  ingest = CsvExampleGen(input_base=external_input(data_root))
  raw_examples = ingest.outputs['examples']

  # Statistics over the data, for visualization and example validation.
  stats = StatisticsGen(input_data=raw_examples)

  # Schema derived from the statistics.
  schema = SchemaGen(stats=stats.outputs['output'])

  # Anomaly detection from statistics plus schema.
  validator = ExampleValidator(
      stats=stats.outputs['output'], schema=schema.outputs['output'])

  # Transformations and feature engineering for training and serving.
  preprocess = Transform(
      input_data=raw_examples,
      schema=schema.outputs['output'],
      module_file=module_file)

  # Trains a model from the user-provided module file.
  train = Trainer(
      module_file=module_file,
      transformed_examples=preprocess.outputs['transformed_examples'],
      schema=schema.outputs['output'],
      transform_output=preprocess.outputs['transform_output'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  trained_model = train.outputs['output']

  # TFMA evaluation statistics, sliced by trip_start_hour.
  hour_slicing = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  analyze = Evaluator(
      examples=raw_examples,
      model_exports=trained_model,
      feature_slicing_spec=hour_slicing)

  # Quality validation of the candidate model (compared to a baseline).
  validate_model = ModelValidator(examples=raw_examples, model=trained_model)

  # Pushes the model to a file destination, gated on the ModelValidator
  # blessing.
  push = Pusher(
      model_export=trained_model,
      model_blessing=validate_model.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          ingest, stats, schema, validator, preprocess, train, analyze,
          validate_model, push
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      additional_pipeline_args={},
  )
def _set_up_test_pipeline(self):
  """Creates the modified Iris example pipeline and stores it on `self`.

  Sets `self.test_pipeline_info` (built from the pipeline name and
  "iris_root") and `self._pipeline`; nothing is returned.
  """
  name = "iris"
  iris_dir = "iris_root"
  tfx_dir = "tfx_root"
  serving_dir = os.path.join(iris_dir, "serving_model", name)
  csv_dir = os.path.join(tfx_dir, "data_path")
  root_dir = os.path.join(tfx_dir, "pipelines", name)
  self.test_pipeline_info = data_types.PipelineInfo(name, iris_dir)

  ingest = CsvExampleGen(input=external_input(csv_dir))
  raw_examples = ingest.outputs["examples"]
  stats = StatisticsGen(examples=raw_examples)
  schema = SchemaGen(
      statistics=stats.outputs["statistics"], infer_feature_shape=True)
  validator = ExampleValidator(
      statistics=stats.outputs["statistics"],
      schema=schema.outputs["schema"])

  # module_file is a RuntimeParameter so that the compiler's handling of
  # RuntimeParameter gets exercised.
  module_file_param = data_types.RuntimeParameter(
      name="module_file",
      default=os.path.join(iris_dir, "iris_utils.py"),
      ptype=str)
  train = Trainer(
      module_file=module_file_param,
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      examples=raw_examples,
      schema=schema.outputs["schema"],
      train_args=trainer_pb2.TrainArgs(num_steps=2000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5))
  trained_model = train.outputs["model"]

  blessed_model_resolver = ResolverNode(
      instance_name="latest_blessed_model_resolver",
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={"value": 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={"value": -1e-10}))
  tfma_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name="eval")],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={"sparse_categorical_accuracy": accuracy_threshold})
      ])
  evaluate = Evaluator(
      examples=raw_examples,
      model=trained_model,
      baseline_model=blessed_model_resolver.outputs["model"],
      eval_config=tfma_config)

  destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_dir))
  push = Pusher(
      model=trained_model,
      model_blessing=evaluate.outputs["blessing"],
      push_destination=destination)

  self._pipeline = pipeline.Pipeline(
      pipeline_name=name,
      pipeline_root=root_dir,
      components=[
          ingest,
          stats,
          schema,
          validator,
          train,
          blessed_model_resolver,
          evaluate,
          push,
      ],
      enable_cache=True,
      beam_pipeline_args=[])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     data_root: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Builds the Chicago Taxi pipeline with local-Docker infra validation.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root of the pipeline output.
    data_root: Location handed to `external_input` for CsvExampleGen.
    module_file: Module file shared by Transform and Trainer.
    serving_model_dir: Filesystem base directory the Pusher writes to.
    metadata_path: Path of the SQLite metadata store.
    direct_num_workers: Value substituted into the `--direct_num_workers`
      Beam argument.

  Returns:
    A pipeline.Pipeline with caching enabled.
  """
  # Ingests the CSV data into the pipeline.
  ingest = CsvExampleGen(input=external_input(data_root))
  raw_examples = ingest.outputs['examples']

  # Statistics over the data, for visualization and example validation.
  stats = StatisticsGen(examples=raw_examples)

  # Schema derived from the statistics.
  schema = SchemaGen(
      statistics=stats.outputs['statistics'], infer_feature_shape=False)

  # Anomaly detection from statistics plus schema.
  validator = ExampleValidator(
      statistics=stats.outputs['statistics'],
      schema=schema.outputs['schema'])

  # Transformations and feature engineering for training and serving.
  preprocess = Transform(
      examples=raw_examples,
      schema=schema.outputs['schema'],
      module_file=module_file)

  # Trains a model from the user-provided module file.
  train = Trainer(
      module_file=module_file,
      transformed_examples=preprocess.outputs['transformed_examples'],
      schema=schema.outputs['schema'],
      transform_graph=preprocess.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  trained_model = train.outputs['model']

  # Latest blessed model, used as the baseline for model validation.
  blessed_model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # TFMA evaluation plus quality validation of the candidate model against
  # the baseline. The change threshold is ignored when there is no baseline
  # (first run).
  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  tfma_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour']),
      ],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={'binary_accuracy': accuracy_threshold})
      ])
  evaluate = Evaluator(
      examples=raw_examples,
      model=trained_model,
      baseline_model=blessed_model_resolver.outputs['model'],
      eval_config=tfma_config)

  # Infra validation keeps an unservable model from being pushed.
  serving_spec = infra_validator_pb2.ServingSpec(
      tensorflow_serving=infra_validator_pb2.TensorFlowServing(
          tags=['latest']),
      local_docker=infra_validator_pb2.LocalDockerConfig())
  request_spec = infra_validator_pb2.RequestSpec(
      tensorflow_serving=infra_validator_pb2.TensorFlowServingRequestSpec())
  infra_check = InfraValidator(
      model=trained_model,
      examples=raw_examples,
      serving_spec=serving_spec,
      request_spec=request_spec)

  # Pushes the model to a file destination, gated on both the Evaluator
  # blessing and the InfraValidator blessing.
  push = Pusher(
      model=trained_model,
      model_blessing=evaluate.outputs['blessing'],
      infra_blessing=infra_check.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  # TODO(b/142684737): The multi-processing API might change.
  direct_runner_args = ['--direct_num_workers=%d' % direct_num_workers]

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          ingest,
          stats,
          schema,
          validator,
          preprocess,
          train,
          blessed_model_resolver,
          evaluate,
          infra_check,
          push,
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=direct_runner_args)
def create_e2e_components(
    pipeline_root: str,
    csv_input_location: str,
    transform_module: str,
    trainer_module: str,
) -> List[BaseComponent]:
  """Builds the components of a small Chicago Taxi pipeline used for testing.

  The components are wired as: CsvExampleGen -> StatisticsGen -> SchemaGen ->
  ExampleValidator / Transform -> Trainer (warm-started from the output of a
  latest-artifact resolver) -> Evaluator / InfraValidator -> Pusher.

  Args:
    pipeline_root: The root of the pipeline output. The Pusher writes to the
      `model_serving` directory directly underneath it.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  # Data ingestion, statistics and schema.
  example_gen = CsvExampleGen(input_base=csv_input_location)
  examples = example_gen.outputs['examples']

  statistics_gen = StatisticsGen(examples=examples)
  statistics = statistics_gen.outputs['statistics']

  schema_gen = SchemaGen(statistics=statistics)
  schema = schema_gen.outputs['schema']

  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  transform = Transform(
      examples=examples, schema=schema, module_file=transform_module)

  # Resolves Model artifacts with LatestArtifactStrategy; the resolved model
  # is handed to the Trainer below as its `base_model`.
  latest_model_resolver = resolver.Resolver(
      strategy_class=latest_artifact_strategy.LatestArtifactStrategy,
      latest_model=Channel(type=Model)).with_id('latest_model_resolver')

  # Step counts are deliberately tiny (10 train / 5 eval).
  trainer = Trainer(
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      base_model=latest_model_resolver.outputs['latest_model'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )
  trained_model = trainer.outputs['model']

  # Set the TFMA config for Model Evaluation and Validation.
  accuracy_threshold = tfma.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.5}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  metrics_spec = tfma.MetricsSpec(
      metrics=[tfma.MetricConfig(class_name='ExampleCount')],
      thresholds={'accuracy': accuracy_threshold})
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      metrics_specs=[metrics_spec],
      slicing_specs=[
          # Overall slice plus a per-`trip_start_hour` slice.
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour']),
      ])

  # Note: no `baseline_model` is passed to this Evaluator.
  evaluator = Evaluator(
      examples=examples, model=trained_model, eval_config=eval_config)

  # Infra validation against TensorFlow Serving ('latest' tag) on Kubernetes.
  serving_spec = infra_validator_pb2.ServingSpec(
      tensorflow_serving=infra_validator_pb2.TensorFlowServing(
          tags=['latest']),
      kubernetes=infra_validator_pb2.KubernetesConfig())
  request_spec = infra_validator_pb2.RequestSpec(
      tensorflow_serving=infra_validator_pb2.TensorFlowServingRequestSpec())
  infra_validator = InfraValidator(
      model=trained_model,
      examples=examples,
      serving_spec=serving_spec,
      request_spec=request_spec)

  # The Pusher is gated on the evaluator's blessing only; the infra
  # validator's output is not wired into it.
  serving_dir = os.path.join(pipeline_root, 'model_serving')
  pusher = Pusher(
      model=trained_model,
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_dir)))

  return [
      example_gen,
      statistics_gen,
      schema_gen,
      example_validator,
      transform,
      latest_model_resolver,
      trainer,
      evaluator,
      infra_validator,
      pusher,
  ]
def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
       output_dict: Dict[Text, List[types.TfxArtifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Pushes the exported model to the target directory if it is blessed.

  When `self.CheckBlessing` rejects the inputs, nothing is copied and the
  method returns immediately. Otherwise the serving model is copied into the
  `model_push` artifact's URI and, unless that version directory is already
  present, into the filesystem destination given by `push_destination`.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model_export: exported model from trainer.
      - model_blessing: model blessing path from model_validator (not read
        here directly; presumably consumed by `CheckBlessing`).
    output_dict: Output dict from key to a list of artifacts, including:
      - model_push: A list of 'ModelPushPath' artifact of size one. It is
        marked as pushed and records the pushed model's URI and id.
    exec_properties: A dict of execution properties, including:
      - push_destination: JSON string of pusher_pb2.PushDestination instance,
        providing instruction of destination to push model.
      - custom_config: optional dict; a `cmle_serving_args` entry triggers
        the deprecated CMLE deployment path.

  Returns:
    None, except on the deprecated `cmle_serving_args` path, where whatever
    `runner.deploy_model_for_cmle_serving` returns is passed through.
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  if not self.CheckBlessing(input_dict, output_dict):
    return

  pushed_artifact = types.get_single_instance(output_dict['model_push'])
  push_uri = pushed_artifact.uri
  exported_artifact = types.get_single_instance(input_dict['model_export'])
  export_uri = exported_artifact.uri
  tf.logging.info('Model pushing.')

  # Locate the serving model inside the trainer's export.
  source_dir = path_utils.serving_model_path(export_uri)
  # Note: there is no logical model version right now. The version is the
  # basename of the serving model path, a timestamp mapped to the trainer's
  # exporter.
  model_version = os.path.basename(source_dir)
  tf.logging.info('Model version is %s', model_version)

  # First copy: into this execution's own output artifact.
  versioned_push_dir = os.path.join(push_uri, model_version)
  io_utils.copy_dir(source_dir, versioned_push_dir)
  tf.logging.info('Model written to %s.', push_uri)

  # Second copy: to a fixed outside path that a model server can watch.
  #
  # If that version directory already exists the copy is skipped. The model
  # validator might bless the same model twice (check mv driver) with
  # different blessing output; Pusher still processes the mv output again to
  # keep metadata tracking, but there is no need to copy outside again.
  # TODO(jyzhao): support rpc push and verification.
  push_destination = pusher_pb2.PushDestination()
  json_format.Parse(exec_properties['push_destination'], push_destination)
  serving_path = os.path.join(
      push_destination.filesystem.base_directory, model_version)
  if not tf.gfile.Exists(serving_path):
    # tf.serving won't load partial model, it will retry until fully copied.
    io_utils.copy_dir(source_dir, serving_path)
    tf.logging.info('Model written to serving path %s.', serving_path)
  else:
    tf.logging.info(
        'Destination directory %s already exists, skipping current push.',
        serving_path)

  # Record the push on the output artifact (done even when the outside copy
  # was skipped).
  pushed_artifact.set_int_custom_property('pushed', 1)
  pushed_artifact.set_string_custom_property('pushed_model', export_uri)
  pushed_artifact.set_int_custom_property('pushed_model_id',
                                          exported_artifact.id)
  tf.logging.info('Model pushed to %s.', serving_path)

  # Deprecated path: deploy to CMLE serving when explicitly configured.
  custom_config = exec_properties.get('custom_config')
  if not custom_config:
    return
  cmle_serving_args = custom_config.get('cmle_serving_args')
  if cmle_serving_args is None:
    return
  tf.logging.warn(
      '\'cmle_serving_args\' is deprecated, please use custom executor '
      'in tfx.extensions.google_cloud_ai_platform.pusher instead'
  )
  return runner.deploy_model_for_cmle_serving(
      serving_path, model_version, cmle_serving_args)
def testTaxiPipelineNewStyleCompatibility(self):
  """Checks that paired input/output keys of taxi components are aliases.

  The taxi pipeline components are constructed with the new-style argument
  names. After each construction the test asserts that the new-style key and
  its counterpart key (e.g. 'examples' / 'input_data') resolve to the very
  same object.
  """
  fake_module_file = '/tmp/fake/module/file'

  example_gen = CsvExampleGen(input_base='/tmp/fake/path')

  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  self.assertIs(statistics_gen.inputs['examples'],
                statistics_gen.inputs['input_data'])

  schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])
  self.assertIs(schema_gen.inputs['statistics'], schema_gen.inputs['stats'])
  self.assertIs(schema_gen.outputs['schema'], schema_gen.outputs['output'])

  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=fake_module_file)
  self.assertIs(transform.inputs['examples'], transform.inputs['input_data'])
  self.assertIs(transform.outputs['transform_graph'],
                transform.outputs['transform_output'])

  trainer = Trainer(
      module_file=fake_module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  self.assertIs(trainer.inputs['transform_graph'],
                trainer.inputs['transform_output'])
  self.assertIs(trainer.outputs['model'], trainer.outputs['output'])

  # Supplies the baseline model for the Evaluator below.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  accuracy_threshold = tfma.config.MetricThreshold(
      value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.6}),
      change_threshold=tfma.GenericChangeThreshold(
          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1e-10}))
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour']),
      ],
      metrics_specs=[
          tfma.MetricsSpec(thresholds={'accuracy': accuracy_threshold}),
      ])

  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)
  self.assertIs(evaluator.inputs['model'], evaluator.inputs['model_exports'])
  self.assertIs(evaluator.outputs['evaluation'], evaluator.outputs['output'])

  # The Pusher is fed through the trainer's 'output' key here, not 'model'.
  pusher = Pusher(
      model=trainer.outputs['output'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory='/fake/serving/dir')))
  self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
  self.assertIs(pusher.outputs['pushed_model'], pusher.outputs['model_push'])