def _GetFnArgs(self, input_dict: Dict[str, List[types.Artifact]],
               output_dict: Dict[str, List[types.Artifact]],
               exec_properties: Dict[str, Any]) -> fn_args_utils.FnArgs:
  if input_dict.get(standard_component_specs.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[standard_component_specs.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[standard_component_specs.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[standard_component_specs.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
  if result.custom_config and not isinstance(result.custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(result.custom_config))
  result.transform_output = result.transform_graph_path
  result.serving_model_dir = serving_model_dir
  result.eval_model_dir = eval_model_dir
  result.model_run_dir = model_run_dir
  result.schema_file = result.schema_path
  result.hyperparameters = hyperparameters_config
  return result
def _verify_model_exports(self):
  self.assertTrue(
      tf.io.gfile.exists(path_utils.eval_model_dir(self._model_exports.uri)))
  self.assertTrue(
      tf.io.gfile.exists(
          path_utils.serving_model_dir(self._model_exports.uri)))
def _trainer_fn(trainer_fn_args, schema):
  """Build the estimator using the high level API.

  Args:
    trainer_fn_args: Holds args used to train the model as name/value pairs.
    schema: Holds the schema of the training examples.

  Returns:
    A dict of the following:
      - estimator: The estimator that will be used for training and eval.
      - train_spec: Spec for training.
      - eval_spec: Spec for eval.
      - eval_input_receiver_fn: Input function for eval.
  """
  train_batch_size = 20
  eval_batch_size = 10

  train_input_fn = lambda: _input_fn(  # pylint: disable=g-long-lambda
      trainer_fn_args.train_files,
      trainer_fn_args.data_accessor,
      schema,
      batch_size=train_batch_size)

  eval_input_fn = lambda: _input_fn(  # pylint: disable=g-long-lambda
      trainer_fn_args.eval_files,
      trainer_fn_args.data_accessor,
      schema,
      batch_size=eval_batch_size)

  train_spec = tf.estimator.TrainSpec(
      train_input_fn, max_steps=trainer_fn_args.train_steps)

  serving_receiver_fn = lambda: _serving_input_receiver_fn(schema)

  exporter = tf.estimator.FinalExporter('iris', serving_receiver_fn)
  eval_spec = tf.estimator.EvalSpec(
      eval_input_fn,
      steps=trainer_fn_args.eval_steps,
      exporters=[exporter],
      name='iris-eval')

  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999, keep_checkpoint_max=1)

  export_dir = path_utils.serving_model_dir(trainer_fn_args.model_run_dir)
  run_config = run_config.replace(model_dir=export_dir)

  estimator = tf.keras.estimator.model_to_estimator(
      keras_model=_keras_model_builder(), config=run_config)

  # Create an input receiver for TFMA processing.
  eval_receiver_fn = lambda: _eval_input_receiver_fn(schema)

  return {
      'estimator': estimator,
      'train_spec': train_spec,
      'eval_spec': eval_spec,
      'eval_input_receiver_fn': eval_receiver_fn
  }
def testKerasModelPath(self):
  # Create folders based on Keras based Trainer output model directory.
  output_uri = os.path.join(self.get_temp_dir(), 'model_dir')
  serving_model_path = path_utils.serving_model_dir(output_uri)
  serving_model = os.path.join(serving_model_path, 'saved_model.pb')
  io_utils.write_string_file(serving_model, 'testing')

  # Test retrieving model folder.
  self.assertEqual(serving_model_path, path_utils.eval_model_path(output_uri))
  self.assertEqual(serving_model_path,
                   path_utils.serving_model_path(output_uri))
def testEstimatorModelPath(self, is_old_artifact):
  # Create folders based on Estimator based Trainer output model directory,
  # after Executor performs cleaning.
  output_uri = os.path.join(self.get_temp_dir(), 'model_dir')
  eval_model_path = path_utils.eval_model_dir(output_uri, is_old_artifact)
  eval_model = os.path.join(eval_model_path, 'saved_model.pb')
  io_utils.write_string_file(eval_model, 'testing')
  serving_model_path = path_utils.serving_model_dir(output_uri,
                                                    is_old_artifact)
  # Write the serving model into the serving dir (not the eval dir), so both
  # directories exist for the lookups below.
  serving_model = os.path.join(serving_model_path, 'saved_model.pb')
  io_utils.write_string_file(serving_model, 'testing')

  # Test retrieving model folder.
  self.assertEqual(eval_model_path,
                   path_utils.eval_model_path(output_uri, is_old_artifact))
  self.assertEqual(serving_model_path,
                   path_utils.serving_model_path(output_uri, is_old_artifact))

  self.assertEqual(
      eval_model_path,
      path_utils.get_model_dir_by_type(output_uri, path_constants.TFMA_EVAL,
                                       is_old_artifact))
  self.assertEqual(
      serving_model_path,
      path_utils.get_model_dir_by_type(output_uri, path_constants.TF_KERAS,
                                       is_old_artifact))
  self.assertEqual(
      serving_model_path,
      path_utils.get_model_dir_by_type(output_uri, path_constants.TF_GENERIC,
                                       is_old_artifact))
  self.assertEqual(
      serving_model_path,
      path_utils.get_model_dir_by_type(output_uri, path_constants.TF_ESTIMATOR,
                                       is_old_artifact))
  self.assertEqual(
      serving_model_path,
      path_utils.get_model_dir_by_type(output_uri, path_constants.TF_JS,
                                       is_old_artifact))
  self.assertEqual(
      serving_model_path,
      path_utils.get_model_dir_by_type(output_uri, path_constants.TF_LITE,
                                       is_old_artifact))
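# The block of assertions above encodes a routing rule worth stating once:
# get_model_dir_by_type resolves TFMA_EVAL to the eval model directory and
# every other model type to the serving model directory. A minimal sketch of
# that mapping; `_expected_model_dir` is a hypothetical helper, not part of
# path_utils.
def _expected_model_dir(output_uri, model_type, is_old_artifact):
  if model_type == path_constants.TFMA_EVAL:
    return path_utils.eval_model_dir(output_uri, is_old_artifact)
  return path_utils.serving_model_dir(output_uri, is_old_artifact)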
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
  if not isinstance(custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(custom_config))

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  if input_dict.get(constants.BASE_MODEL_KEY):
    base_model = path_utils.serving_model_path(
        artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
  else:
    base_model = None

  if input_dict.get(constants.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[constants.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
  result.transform_output = result.transform_graph_path
  result.serving_model_dir = serving_model_dir
  result.eval_model_dir = eval_model_dir
  result.model_run_dir = model_run_dir
  result.schema_file = result.schema_path
  result.base_model = base_model
  result.hyperparameters = hyperparameters_config
  result.custom_config = custom_config
  return result
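# The 'null'-vs-'{}' note in the comment above is easy to miss. A minimal
# sketch of why the `or {}` fallback is needed, using plain json.loads (an
# assumption: json_utils.loads behaves the same way for plain JSON values):
import json

assert json.loads('null') is None          # unset custom_config decodes to None
custom_config = json.loads('null') or {}   # the fallback restores a real dict
assert custom_config == {} and isinstance(custom_config, dict)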
def _assertNumberOfTrainerOutputIsOne(self, pipeline_name):
  """Makes sure there is exactly one Trainer execution and one output model."""
  # There must be only one execution of Trainer.
  trainer_output_base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Trainer', 'model')
  trainer_outputs = fileio.listdir(trainer_output_base_dir)
  self.assertEqual(1, len(trainer_outputs))

  # There must be only one saved model each for serving and eval.
  model_uri = os.path.join(trainer_output_base_dir, trainer_outputs[0])
  eval_model_dir = path_utils.eval_model_dir(model_uri)
  serving_model_dir = path_utils.serving_model_dir(model_uri)
  self.assertEqual(1, fileio.listdir(eval_model_dir).count('saved_model.pb'))
  self.assertEqual(1,
                   fileio.listdir(serving_model_dir).count('saved_model.pb'))
def ensemble_selection(
    problem_statement: Parameter[str],
    examples: InputArtifact[standard_artifacts.Examples],
    evaluation_split_name: Parameter[str],
    ensemble_size: Parameter[int],
    metric: Parameter[str],
    goal: Parameter[str],
    model: OutputArtifact[standard_artifacts.Model],
    input_model0: InputArtifact[standard_artifacts.Model] = None,
    input_model1: InputArtifact[standard_artifacts.Model] = None,
    input_model2: InputArtifact[standard_artifacts.Model] = None,
    input_model3: InputArtifact[standard_artifacts.Model] = None,
    input_model4: InputArtifact[standard_artifacts.Model] = None,
    input_model5: InputArtifact[standard_artifacts.Model] = None,
    input_model6: InputArtifact[standard_artifacts.Model] = None,
    input_model7: InputArtifact[standard_artifacts.Model] = None,
    input_model8: InputArtifact[standard_artifacts.Model] = None,
    input_model9: InputArtifact[standard_artifacts.Model] = None,
) -> None:  # pytype: disable=invalid-annotation,wrong-arg-types
  """Runs ensemble selection over the given input models."""
  problem_statement = text_format.Parse(problem_statement,
                                        ps_pb2.ProblemStatement())
  input_models = [
      input_model0, input_model1, input_model2, input_model3, input_model4,
      input_model5, input_model6, input_model7, input_model8, input_model9
  ]
  saved_model_paths = {
      str(i): path_utils.serving_model_path(model.uri)
      for i, model in enumerate(input_models)
      if model
  }
  logging.info('Saved model paths: %s', saved_model_paths)

  label_key = _label_key(problem_statement)
  es = es_lib.EnsembleSelection(
      problem_statement=problem_statement,
      saved_model_paths=saved_model_paths,
      ensemble_size=ensemble_size,
      metric=tf.keras.metrics.deserialize(json.loads(metric)),
      goal=goal)
  es.fit(*_data_from_examples(
      examples_path=os.path.join(examples.uri, evaluation_split_name),
      label_key=label_key))
  logging.info('Selected ensemble weights: %s', es.weights)

  es.save(export_path=os.path.join(
      path_utils.serving_model_dir(model.uri), 'export', 'serving'))
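# Because the optional input_modelN slots default to None, the
# saved_model_paths comprehension above silently drops unwired slots and keys
# the remaining models by their positional index. A self-contained sketch of
# just that filtering, with hypothetical uri strings standing in for artifacts:
input_models = ['gs://bucket/model0', None, 'gs://bucket/model2']
saved_model_paths = {str(i): m for i, m in enumerate(input_models) if m}
assert saved_model_paths == {'0': 'gs://bucket/model0', '2': 'gs://bucket/model2'}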
def testEstimatorModelPath(self):
  # Create folders based on Estimator based Trainer output model directory,
  # after Executor performs cleaning.
  output_uri = os.path.join(self.get_temp_dir(), 'model_dir')
  eval_model_path = path_utils.eval_model_dir(output_uri)
  eval_model = os.path.join(eval_model_path, 'saved_model.pb')
  io_utils.write_string_file(eval_model, 'testing')
  serving_model_path = path_utils.serving_model_dir(output_uri)
  # Write the serving model into the serving dir (not the eval dir), so both
  # directories exist for the lookups below.
  serving_model = os.path.join(serving_model_path, 'saved_model.pb')
  io_utils.write_string_file(serving_model, 'testing')

  # Test retrieving model folder.
  self.assertEqual(eval_model_path, path_utils.eval_model_path(output_uri))
  self.assertEqual(serving_model_path,
                   path_utils.serving_model_path(output_uri))
def _assertNumberOfTrainerOutputIsOne(self, pipeline_name):
  """Makes sure there is exactly one Trainer execution and one output model."""
  # There must be only one execution of Trainer.
  trainer_output_base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Trainer', 'model')
  trainer_outputs = tf.io.gfile.listdir(trainer_output_base_dir)
  self.assertEqual(1, len(trainer_outputs))

  # There must be only one saved model each for serving and eval.
  model_uri = os.path.join(trainer_output_base_dir, trainer_outputs[0])
  self.assertEqual(
      1, len(tf.io.gfile.listdir(path_utils.eval_model_dir(model_uri))))
  self.assertEqual(
      1,
      len(
          tf.io.gfile.listdir(
              os.path.join(
                  path_utils.serving_model_dir(model_uri), 'export',
                  'chicago-taxi'))))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor invokes a training_fn callback function provided by
  the user via the module_file parameter. With the tf.estimator returned by
  this function, the Trainer Executor then builds a TensorFlow model using
  the user-provided tf.estimator.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.

  Returns:
    None

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(zhitaoli): Deprecate this in a future version.
  if exec_properties.get('custom_config', None):
    cmle_args = exec_properties.get('custom_config',
                                    {}).get('cmle_training_args')
    if cmle_args:
      executor_class_path = '.'.join([Executor.__module__, Executor.__name__])
      absl.logging.warn(
          'Passing \'cmle_training_args\' to trainer directly is deprecated, '
          'please use extension executor at '
          'tfx.extensions.google_cloud_ai_platform.trainer.executor instead')
      return runner.start_cmle_training(input_dict, output_dict,
                                        exec_properties, executor_class_path,
                                        cmle_args)

  trainer_fn = self._GetTrainerFn(exec_properties)

  # Set up training parameters.
  train_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'train'))
  ]
  transform_output = artifact_utils.get_single_uri(
      input_dict['transform_output']) if input_dict.get(
          'transform_output', None) else None
  eval_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'eval'))
  ]
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict['schema']))

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = artifact_utils.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # Assemble warm start path if needed.
  warm_start_from = None
  if exec_properties.get('warm_starting') and exec_properties.get(
      'warm_start_from'):
    previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                      path_utils.SERVING_MODEL_DIR)
    if previous_model_dir and tf.io.gfile.exists(
        os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_from = previous_model_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = _HParamWrapper(
      # A list of uris for train files.
      train_files=train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=transform_output,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A list of uris for eval files.
      eval_files=eval_files,
      # A single uri for schema file.
      schema_file=schema_file,
      # Number of train steps.
      train_steps=train_steps,
      # Number of eval steps.
      eval_steps=eval_steps,
      # A single uri for the model directory to warm start from.
      warm_start_from=warm_start_from)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(hparams, schema)

  # Train the model.
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    serving_model_dir)

  # Export an eval savedmodel for TFMA.
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  absl.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
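# The warm-start assembly above only reuses a previous serving directory when
# a checkpoint state file is present in it. A standalone sketch, assuming
# path_utils.SERVING_MODEL_DIR resolves to the literal 'serving_model_dir' and
# _CHECKPOINT_FILE_NAME is TensorFlow's checkpoint state file 'checkpoint';
# the previous-run path is hypothetical.
import os
import tensorflow as tf

warm_start_from = None
previous_model_dir = os.path.join('/tmp/prev_run/Trainer/output/1',
                                  'serving_model_dir')
if tf.io.gfile.exists(os.path.join(previous_model_dir, 'checkpoint')):
  warm_start_from = previous_model_dir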
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> TrainerFnArgs:
  custom_config = exec_properties.get('custom_config') or {}
  if not isinstance(custom_config, dict):
    raise ValueError('Expect custom_config to be a dict but got %s instead' %
                     type(custom_config))

  # Set up training parameters.
  train_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY], 'train'))
  ]
  transform_output = artifact_utils.get_single_uri(
      input_dict[TRANSFORM_GRAPH_KEY]) if input_dict.get(
          TRANSFORM_GRAPH_KEY, None) else None
  eval_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY], 'eval'))
  ]
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))
  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  base_model = path_utils.serving_model_path(
      artifact_utils.get_single_uri(input_dict[BASE_MODEL_KEY])
  ) if input_dict.get(BASE_MODEL_KEY) else None
  if input_dict.get(HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = artifact_utils.get_single_uri(output_dict[OUTPUT_MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  return TrainerFnArgs(
      # A list of uris for train files.
      train_files=train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=transform_output,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A single uri for the output directory of the eval model.
      # Note that this is estimator only, Keras doesn't require it for TFMA.
      eval_model_dir=eval_model_dir,
      # A list of uris for eval files.
      eval_files=eval_files,
      # A single uri for schema file.
      schema_file=schema_file,
      # Number of train steps.
      train_steps=train_steps,
      # Number of eval steps.
      eval_steps=eval_steps,
      # Base model that will be used for this training job.
      base_model=base_model,
      # An optional kerastuner.HyperParameters config.
      hyperparameters=hyperparameters_config,
      # Additional parameters to pass to trainer function.
      **custom_config)
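# The num_steps comment above relies on a proto3 detail: unset int fields
# deserialize as 0, and `x or None` maps that sentinel to None, which
# tf.estimator interprets as "train until the input is exhausted". A worked
# example:
from tfx.proto import trainer_pb2

train_args = trainer_pb2.TrainArgs()            # num_steps left unset -> 0
assert train_args.num_steps == 0
assert (train_args.num_steps or None) is None   # sentinel becomes None
train_args.num_steps = 10000
assert (train_args.num_steps or None) == 10000  # explicit values pass through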
def trainer_fn(trainer_fn_args, schema):
  """Build the estimator using the high level API.

  Args:
    trainer_fn_args: Holds args used to train the model as name/value pairs.
    schema: Holds the schema of the training examples.

  Returns:
    A dict of the following:
      - estimator: The estimator that will be used for training and eval.
      - train_spec: Spec for training.
      - eval_spec: Spec for eval.
      - eval_input_receiver_fn: Input function for eval.
  """
  if trainer_fn_args.hyperparameters:
    hp = trainer_fn_args.hyperparameters
    first_dnn_layer_size = hp.get('first_dnn_layer_size')
    num_dnn_layers = hp.get('num_dnn_layers')
    dnn_decay_factor = hp.get('dnn_decay_factor')
  else:
    # Number of nodes in the first layer of the DNN.
    first_dnn_layer_size = 100
    num_dnn_layers = 4
    dnn_decay_factor = 0.7

  train_batch_size = 40
  eval_batch_size = 40

  # TODO(b/162532757): use _tfxio_input_fn exclusively once tfx-bsl post-0.22
  # is released.
  use_tfxio_input_fn = trainer_fn_args.get('use_tfxio_input_fn', False)
  input_fn = _tfxio_input_fn if use_tfxio_input_fn else _input_fn
  tf_transform_output = tft.TFTransformOutput(trainer_fn_args.transform_output)

  train_input_fn = lambda: input_fn(  # pylint: disable=g-long-lambda
      trainer_fn_args.train_files,
      trainer_fn_args.data_accessor,
      tf_transform_output,
      batch_size=train_batch_size)

  eval_input_fn = lambda: input_fn(  # pylint: disable=g-long-lambda
      trainer_fn_args.eval_files,
      trainer_fn_args.data_accessor,
      tf_transform_output,
      batch_size=eval_batch_size)

  train_spec = tf.estimator.TrainSpec(
      train_input_fn, max_steps=trainer_fn_args.train_steps)

  serving_receiver_fn = lambda: _example_serving_receiver_fn(  # pylint: disable=g-long-lambda
      tf_transform_output, schema)

  exporter = tf.estimator.FinalExporter('chicago-taxi', serving_receiver_fn)
  eval_spec = tf.estimator.EvalSpec(
      eval_input_fn,
      steps=trainer_fn_args.eval_steps,
      exporters=[exporter],
      name='chicago-taxi-eval')

  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999,
      # keep_checkpoint_max must be more than the number of worker replica
      # nodes if training is distributed, in order to avoid a race condition.
      keep_checkpoint_max=5)

  export_dir = path_utils.serving_model_dir(trainer_fn_args.model_run_dir)
  run_config = run_config.replace(model_dir=export_dir)
  warm_start_from = trainer_fn_args.base_model

  estimator = _build_estimator(
      # Construct layer sizes with exponential decay.
      hidden_units=[
          max(2, int(first_dnn_layer_size * dnn_decay_factor**i))
          for i in range(num_dnn_layers)
      ],
      config=run_config,
      warm_start_from=warm_start_from)

  # Create an input receiver for TFMA processing.
  receiver_fn = lambda: _eval_input_receiver_fn(  # pylint: disable=g-long-lambda
      tf_transform_output, schema)

  return {
      'estimator': estimator,
      'train_spec': train_spec,
      'eval_spec': eval_spec,
      'eval_input_receiver_fn': receiver_fn
  }
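# The hidden_units comprehension above is the whole "exponential decay" story;
# with the default hyperparameters it produces a concrete taper. Note that
# int() truncates and 100 * 0.7**2 lands just below 49 in binary floating
# point, so the third layer comes out as 48 rather than 49:
first_dnn_layer_size, num_dnn_layers, dnn_decay_factor = 100, 4, 0.7
hidden_units = [
    max(2, int(first_dnn_layer_size * dnn_decay_factor**i))
    for i in range(num_dnn_layers)
]
assert hidden_units == [100, 70, 48, 34]  # the max(2, ...) floor never fires here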
def testAIPlatformTrainerPipeline(self):
  """Trainer-only test pipeline on AI Platform Training."""
  pipeline_name = 'kubeflow-aip-trainer-test-{}'.format(self._random_id())
  pipeline = self._create_pipeline(pipeline_name, [
      self.schema_importer,
      self.transformed_examples_importer,
      self.transform_graph_importer,
      Trainer(
          custom_executor_spec=executor_spec.ExecutorClassSpec(
              ai_platform_trainer_executor.Executor),
          module_file=self._trainer_module,
          transformed_examples=self.transformed_examples_importer
          .outputs['result'],
          schema=self.schema_importer.outputs['result'],
          transform_graph=self.transform_graph_importer.outputs['result'],
          train_args=trainer_pb2.TrainArgs(num_steps=10),
          eval_args=trainer_pb2.EvalArgs(num_steps=5),
          custom_config={
              # Test that distributed training behaves properly.
              ai_platform_trainer_executor.TRAINING_ARGS_KEY: {
                  'project': self._gcp_project_id,
                  'region': self._gcp_region,
                  'jobDir': os.path.join(
                      self._pipeline_root(pipeline_name), 'tmp'),
                  'masterConfig': {
                      'imageUri': self._container_image,
                  },
                  'scaleTier': 'CUSTOM',
                  'masterType': 'large_model',
                  'parameterServerType': 'standard',
                  'parameterServerCount': 1,
                  'workerType': 'standard',
                  'workerCount': 2,
              }
          })
  ])
  self._compile_and_run_pipeline(pipeline)

  # There must be only one execution of Trainer.
  trainer_output_base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Trainer', 'model')
  trainer_outputs = tf.io.gfile.listdir(trainer_output_base_dir)
  self.assertEqual(1, len(trainer_outputs))

  # There must be only one saved model each for serving and eval.
  model_uri = os.path.join(trainer_output_base_dir, trainer_outputs[0])
  self.assertEqual(
      1, len(tf.io.gfile.listdir(path_utils.eval_model_dir(model_uri))))
  self.assertEqual(
      1,
      len(
          tf.io.gfile.listdir(
              os.path.join(
                  path_utils.serving_model_dir(model_uri), 'export',
                  'chicago-taxi'))))
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> TrainerFnArgs:
  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)

  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
  if not isinstance(custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(custom_config))

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  if input_dict.get(constants.BASE_MODEL_KEY):
    base_model = path_utils.serving_model_path(
        artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
  else:
    base_model = None

  if input_dict.get(constants.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[constants.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  return TrainerFnArgs(
      # A list of uris for train files.
      train_files=fn_args.train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=fn_args.transform_graph_path,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A single uri for the output directory of the eval model.
      # Note that this is estimator only, Keras doesn't require it for TFMA.
      eval_model_dir=eval_model_dir,
      # A list of uris for eval files.
      eval_files=fn_args.eval_files,
      # A single uri for the output directory of model training related files.
      model_run_dir=model_run_dir,
      # A single uri for schema file.
      schema_file=fn_args.schema_path,
      # Number of train steps.
      train_steps=fn_args.train_steps,
      # Number of eval steps.
      eval_steps=fn_args.eval_steps,
      # Base model that will be used for this training job.
      base_model=base_model,
      # An optional kerastuner.HyperParameters config.
      hyperparameters=hyperparameters_config,
      # Additional parameters to pass to trainer function.
      **custom_config)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor invokes a training_fn callback function provided by
  the user via the module_file parameter. With the tf.estimator returned by
  this function, the Trainer Executor then builds a TensorFlow model using
  the user-provided tf.estimator.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - model: Exported model.
      - model_run: Model training related outputs (e.g., Tensorboard logs).
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.
      - custom_config: Optional. JSON-serialized dict of additional parameters
        to pass to trainer function.

  Returns:
    None

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  fn_args = self._GetFnArgs(input_dict, output_dict, exec_properties)
  trainer_fn = udf_utils.get_fn(exec_properties, 'trainer_fn')

  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  # TODO(b/160795287): Deprecate estimator based executor.
  # Provide user with a modified fn_args, with model_run given as
  # the working directory. Executor will then copy user models to
  # model artifact directory.
  serving_dest = fn_args.serving_model_dir
  eval_dest = fn_args.eval_model_dir

  working_dir = fn_args.model_run_dir
  fn_args.serving_model_dir = path_utils.serving_model_dir(working_dir)
  fn_args.eval_model_dir = path_utils.eval_model_dir(working_dir)

  training_spec = trainer_fn(fn_args, schema)

  # Train the model.
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])

  absl.logging.info(
      'Training complete. Model written to %s. ModelRun written to %s',
      fn_args.serving_model_dir, fn_args.model_run_dir)

  # Export an eval savedmodel for TFMA. If distributed training, it must only
  # be written by the chief worker, as would be done for serving savedmodel.
  if _is_chief():
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=fn_args.eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    absl.logging.info('Exported eval_savedmodel to %s.',
                      fn_args.eval_model_dir)

    # TODO(b/160795287): Deprecate estimator based executor.
    # Copy serving and eval model from model_run to model artifact directory.
    serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
    io_utils.copy_dir(serving_source, serving_dest)
    absl.logging.info('Serving model copied to: %s.', serving_dest)

    eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
    io_utils.copy_dir(eval_source, eval_dest)
    absl.logging.info('Eval model copied to: %s.', eval_dest)
  else:
    absl.logging.info(
        'Model export is skipped because this is not the chief worker.')
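# _is_chief() is not shown in these snippets. The conventional way to make
# this decision is to inspect the TF_CONFIG environment variable that
# TensorFlow distributed training populates; the sketch below follows that
# convention and is an assumption, not necessarily the executor's exact
# implementation.
import json
import os


def _is_chief():
  """Returns True iff this process should export models (chief or local run)."""
  tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
  if not tf_config.get('cluster'):
    # No cluster spec means single-process training: act as chief.
    return True
  task = tf_config.get('task', {})
  # 'master' is the legacy name for the chief task in some clusters.
  return task.get('type') in ('chief', 'master')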
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor invokes a training_fn callback function provided by
  the user via the module_file parameter. With the tf.estimator returned by
  this function, the Trainer Executor then builds a TensorFlow model using
  the user-provided tf.estimator.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.

  Returns:
    None

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  custom_config = exec_properties.get('custom_config') or {}
  if not isinstance(custom_config, dict):
    raise ValueError('Expect custom_config to be a dict but got %s instead' %
                     type(custom_config))

  trainer_fn = self._GetTrainerFn(exec_properties)

  # Set up training parameters.
  train_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'train'))
  ]
  transform_output = artifact_utils.get_single_uri(
      input_dict['transform_output']) if input_dict.get(
          'transform_output', None) else None
  eval_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'eval'))
  ]
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict['schema']))
  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  base_model = path_utils.serving_model_path(
      artifact_utils.get_single_uri(input_dict['base_model'])
  ) if input_dict.get('base_model') else None
  if input_dict.get('hyperparameters'):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['hyperparameters']))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = artifact_utils.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  train_fn_args = TrainerFnArgs(
      # A list of uris for train files.
      train_files=train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=transform_output,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A list of uris for eval files.
      eval_files=eval_files,
      # A single uri for schema file.
      schema_file=schema_file,
      # Number of train steps.
      train_steps=train_steps,
      # Number of eval steps.
      eval_steps=eval_steps,
      # Base model that will be used for this training job.
      base_model=base_model,
      # An optional kerastuner.HyperParameters config.
      hyperparameters=hyperparameters_config,
      # Additional parameters to pass to trainer function.
      **custom_config)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(train_fn_args, schema)

  # Train the model.
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    serving_model_dir)

  # Export an eval savedmodel for TFMA.
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  absl.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
def Do(self, input_dict: Dict[str, List[Artifact]],
       output_dict: Dict[str, List[Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Recommends a tuner config.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - meta_train_features_N: MetaFeatures for the Nth train dataset.
      - hparams_train_N: HParams for the Nth train dataset.
      The maximum value of N is MAX_INPUTS.
    output_dict: Output dict from key to a list of artifacts.
    exec_properties: A dict of execution properties.

  Raises:
    NotImplementedError: If the algorithm given in exec_properties is not
      supported.
  """
  algorithm = exec_properties['algorithm']

  metafeatures_list = []
  # This should be agnostic to meta-feature type.
  for ix in range(MAX_INPUTS):
    metafeature_key = f'meta_train_features_{ix}'
    if metafeature_key in input_dict:
      metafeature_uri = os.path.join(
          artifact_utils.get_single_uri(input_dict[metafeature_key]),
          artifacts.MetaFeatures.DEFAULT_FILE_NAME)
      logging.info('Found %s at %s.', metafeature_key, metafeature_uri)
      metafeatures = json.loads(io_utils.read_string_file(metafeature_uri))
      metafeatures_list.append(metafeatures['metafeature'])

  all_hparams = []
  for ix in range(MAX_INPUTS):
    hparam_key = f'hparams_train_{ix}'
    if hparam_key in input_dict:
      hyperparameters_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict[hparam_key]))
      logging.info('Found %s at %s.', hparam_key, hyperparameters_file)
      hparams_json = json.loads(
          io_utils.read_string_file(hyperparameters_file))
      all_hparams.append(hparams_json['values'])

  if algorithm == MAJORITY_VOTING:
    discrete_search_space = self._create_search_space_using_voting(all_hparams)
    hparams_config_list = [discrete_search_space.get_config()]
  elif algorithm == NEAREST_NEIGHBOR:
    # Build the nearest-neighbor model.
    output_path = artifact_utils.get_single_uri(output_dict[OUTPUT_MODEL])
    serving_model_dir = path_utils.serving_model_dir(output_path)

    model = self._create_knn_model_from_metafeatures(metafeatures_list)
    # TODO(nikhilmehta): Consider adding signature here.
    model.save(serving_model_dir)

    # Collect all candidate HParams.
    hparams_list = self._convert_to_kerastuner_hyperparameters(all_hparams)
    hparams_config_list = [hparam.get_config() for hparam in hparams_list]
  else:
    raise NotImplementedError(f'The algorithm "{algorithm}" is not supported.')

  meta_hparams_path = os.path.join(
      artifact_utils.get_single_uri(output_dict[OUTPUT_HYPERPARAMS]),
      _DEFAULT_FILE_NAME)
  io_utils.write_string_file(meta_hparams_path,
                             json.dumps(hparams_config_list))
  logging.info('Meta HParams saved at %s', meta_hparams_path)
def Do(self, input_dict, output_dict, exec_properties):
  """Runs the trainer job on the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - transformed_examples: Transformed example.
      - transform_output: Input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(khaas): Move this to tfx/extensions.
  if exec_properties.get('custom_config', None):
    cmle_args = exec_properties.get('custom_config',
                                    {}).get('cmle_training_args')
    if cmle_args:
      return cmle_runner.start_cmle_training(input_dict, output_dict,
                                             exec_properties, cmle_args)

  trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                    'trainer_fn')

  # Set up training parameters.
  train_files = [
      _all_files_pattern(
          types.get_split_uri(input_dict['transformed_examples'], 'train'))
  ]
  transform_output = types.get_single_uri(input_dict['transform_output'])
  eval_files = _all_files_pattern(
      types.get_split_uri(input_dict['transformed_examples'], 'eval'))
  schema_file = io_utils.get_only_uri_in_dir(
      types.get_single_uri(input_dict['schema']))

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = types.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # Assemble warm start path if needed.
  warm_start_from = None
  if exec_properties.get('warm_starting') and exec_properties.get(
      'warm_start_from'):
    previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                      path_utils.SERVING_MODEL_DIR)
    if previous_model_dir and tf.gfile.Exists(
        os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_from = previous_model_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = tf.contrib.training.HParams(
      train_files=train_files,
      transform_output=transform_output,
      output_dir=output_path,
      serving_model_dir=serving_model_dir,
      eval_files=eval_files,
      schema_file=schema_file,
      train_steps=train_steps,
      eval_steps=eval_steps,
      warm_start_from=warm_start_from)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(hparams, schema)

  # Train the model.
  tf.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  tf.logging.info('Training complete. Model written to %s', serving_model_dir)

  # Export an eval savedmodel for TFMA.
  tf.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
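# Taken together, these snippets all lean on the same two helpers from
# tfx.utils.path_utils. A hypothetical end-to-end usage sketch (the model uri
# is a placeholder for a real Trainer output artifact uri):
import os
from tfx.utils import path_utils

model_uri = '/tmp/pipeline/Trainer/model/1'
serving_dir = path_utils.serving_model_dir(model_uri)  # serving SavedModel dir
eval_dir = path_utils.eval_model_dir(model_uri)        # TFMA eval SavedModel dir
serving_saved_model = os.path.join(serving_dir, 'saved_model.pb')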