def test_preprocessing_fn(self):
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  feature_spec = taxi_utils._get_raw_feature_spec(schema)
  working_dir = self.get_temp_dir()
  transform_output_path = os.path.join(working_dir, 'transform_output')
  transformed_examples_path = os.path.join(working_dir,
                                           'transformed_examples')

  # Run very simplified version of executor logic.
  # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
  # Generate legacy `DatasetMetadata` object. Future version of Transform
  # will accept the `Schema` proto directly.
  legacy_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
  with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
      examples = (
          p
          | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
              os.path.join(self._testdata_path, 'csv_example_gen/train/*'),
              coder=beam.coders.BytesCoder(),
              # TODO(b/114938612): Eventually remove this override.
              validate=False)
          | 'DecodeTrainData' >> beam.Map(decoder.decode))
      (transformed_examples, transformed_metadata), transform_fn = (
          (examples, legacy_metadata)
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              taxi_utils.preprocessing_fn))

      # WriteTransformFn writes transform_fn and metadata to subdirectories
      # tensorflow_transform.SAVED_MODEL_DIR and
      # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
      # pylint: disable=expression-not-assigned
      (transform_fn
       | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
           transform_output_path))

      encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
      (transformed_examples
       | 'EncodeTrainData' >> beam.Map(encoder.encode)
       | 'WriteTrainData' >> beam.io.WriteToTFRecord(
           os.path.join(transformed_examples_path,
                        'train/transformed_examples.gz'),
           coder=beam.coders.BytesCoder()))
      # pylint: enable=expression-not-assigned

  # Verify the output matches golden output.
  # NOTE: we don't verify that transformed examples match golden output.
  expected_transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(
          self._testdata_path,
          'transform/transform_output/transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(transform_output_path,
                   'transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  self.assertEqual(transformed_schema, expected_transformed_schema)
def compare_anomalies(output_uri: Text, expected_uri: Text) -> bool:
  """Compares anomalies files in output uri and recorded uri.

  Args:
    output_uri: pipeline output artifact uri.
    expected_uri: recorded pipeline output artifact uri.

  Returns:
    Whether the anomalies in the two locations are the same.
  """
  for dir_name, _, leaf_files in tf.io.gfile.walk(expected_uri):
    for leaf_file in leaf_files:
      expected_file_name = os.path.join(dir_name, leaf_file)
      file_name = os.path.join(
          dir_name.replace(expected_uri, output_uri, 1), leaf_file)
      anomalies = anomalies_pb2.Anomalies()
      io_utils.parse_pbtxt_file(os.path.join(output_uri, file_name),
                                anomalies)
      expected_anomalies = anomalies_pb2.Anomalies()
      io_utils.parse_pbtxt_file(
          os.path.join(expected_uri, expected_file_name), expected_anomalies)
      if expected_anomalies.anomaly_info != anomalies.anomaly_info:
        return False
  return True
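# Usage sketch for compare_anomalies: a minimal, hypothetical invocation. The
# URIs below are illustrative assumptions; in practice they come from MLMD
# artifact lookups for a fresh run and a recorded golden run.
output_uri = '/tmp/tfx_pipeline_output/ExampleValidator/anomalies/1'
expected_uri = '/testdata/recorded_pipeline/ExampleValidator/anomalies/1'
if not compare_anomalies(output_uri, expected_uri):
  raise AssertionError('Anomalies differ from the recorded golden output.')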
def annotate_schema(
    ignore_features: Parameter[str],
    original_schema: InputArtifact[standard_artifacts.Schema],
    schema: OutputArtifact[standard_artifacts.Schema],
) -> None:  # pytype: disable=invalid-annotation,wrong-arg-types
  r"""Updates a schema with additional metadata.

  Args:
    ignore_features: Newline ('\n') separated list of features to mark as
      disabled in the output schema.
    original_schema: The Schema artifact to modify.
    schema: The output Schema with updates.
  """
  schema_file = io_utils.get_only_uri_in_dir(original_schema.uri)
  dataset_schema = schema_pb2.Schema()
  io_utils.parse_pbtxt_file(schema_file, dataset_schema)

  ignore_features = ignore_features.split("\n")
  for feature in dataset_schema.feature:
    if feature.name in ignore_features:
      logging.info("Marking '%s' as DISABLED.", feature.name)
      feature.lifecycle_stage = schema_pb2.LifecycleStage.DISABLED

  io_utils.write_pbtxt_file(os.path.join(schema.uri, "schema.txt"),
                            dataset_schema)
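# Wiring sketch: the Parameter/InputArtifact/OutputArtifact annotations
# suggest annotate_schema is a TFX python-function component (e.g. wrapped
# with the @component decorator). Assuming so, it could be attached to a
# pipeline like this; the upstream schema_gen node and feature names are
# hypothetical.
schema_annotator = annotate_schema(
    ignore_features='feature_a\nfeature_b',
    original_schema=schema_gen.outputs['schema'])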
def testPreprocessingFn(self):
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  feature_spec = taxi_utils._get_raw_feature_spec(schema)
  working_dir = self.get_temp_dir()
  transform_graph_path = os.path.join(working_dir, 'transform_graph')
  transformed_examples_path = os.path.join(working_dir,
                                           'transformed_examples')

  # Run very simplified version of executor logic.
  # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
  # Generate legacy `DatasetMetadata` object. Future version of Transform
  # will accept the `Schema` proto directly.
  legacy_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(feature_spec))
  tfxio = tf_example_record.TFExampleRecord(
      file_pattern=os.path.join(self._testdata_path,
                                'csv_example_gen/Split-train/*'),
      telemetry_descriptors=['Tests'],
      schema=legacy_metadata.schema)
  with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
      examples = p | 'ReadTrainData' >> tfxio.BeamSource()
      (transformed_examples, transformed_metadata), transform_fn = (
          (examples, tfxio.TensorAdapterConfig())
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              taxi_utils.preprocessing_fn))

      # WriteTransformFn writes transform_fn and metadata to subdirectories
      # tensorflow_transform.SAVED_MODEL_DIR and
      # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
      # pylint: disable=expression-not-assigned
      (transform_fn
       | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
           transform_graph_path))

      encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
      (transformed_examples
       | 'EncodeTrainData' >> beam.Map(encoder.encode)
       | 'WriteTrainData' >> beam.io.WriteToTFRecord(
           os.path.join(transformed_examples_path,
                        'Split-train/transformed_examples.gz'),
           coder=beam.coders.BytesCoder()))
      # pylint: enable=expression-not-assigned

  # Verify the output matches golden output.
  # NOTE: we don't verify that transformed examples match golden output.
  expected_transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(
          self._testdata_path,
          'transform/transform_graph/transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(transform_graph_path,
                   'transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  # Clear annotations so we only have to test main schema.
  transformed_schema.ClearField('annotation')
  for feature in transformed_schema.feature:
    feature.ClearField('annotation')
  self.assertEqual(transformed_schema, expected_transformed_schema)
def testDo(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  eval_stats_artifact = standard_artifacts.ExampleStatistics()
  eval_stats_artifact.uri = os.path.join(source_data_dir, 'statistics_gen')
  eval_stats_artifact.split_names = artifact_utils.encode_split_names(
      ['train', 'eval', 'test'])

  schema_artifact = standard_artifacts.Schema()
  schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen')

  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  validation_output = standard_artifacts.ExampleAnomalies()
  validation_output.uri = os.path.join(output_data_dir, 'output')

  input_dict = {
      executor.STATISTICS_KEY: [eval_stats_artifact],
      executor.SCHEMA_KEY: [schema_artifact],
  }

  exec_properties = {
      # List needs to be serialized before being passed into Do function.
      executor.EXCLUDE_SPLITS_KEY: json_utils.dumps(['test'])
  }

  output_dict = {
      executor.ANOMALIES_KEY: [validation_output],
  }

  example_validator_executor = executor.Executor()
  example_validator_executor.Do(input_dict, output_dict, exec_properties)

  self.assertEqual(
      artifact_utils.encode_split_names(['train', 'eval']),
      validation_output.split_names)

  # Check example_validator outputs.
  train_anomalies_path = os.path.join(validation_output.uri, 'train',
                                      'anomalies.pbtxt')
  eval_anomalies_path = os.path.join(validation_output.uri, 'eval',
                                     'anomalies.pbtxt')
  self.assertTrue(tf.io.gfile.exists(train_anomalies_path))
  self.assertTrue(tf.io.gfile.exists(eval_anomalies_path))
  train_anomalies = io_utils.parse_pbtxt_file(train_anomalies_path,
                                              anomalies_pb2.Anomalies())
  eval_anomalies = io_utils.parse_pbtxt_file(eval_anomalies_path,
                                             anomalies_pb2.Anomalies())
  self.assertEqual(0, len(train_anomalies.anomaly_info))
  self.assertEqual(0, len(eval_anomalies.anomaly_info))

  # Assert 'test' split is excluded.
  train_file_path = os.path.join(validation_output.uri, 'test',
                                 'anomalies.pbtxt')
  self.assertFalse(tf.io.gfile.exists(train_file_path))
def _test_pipeline(pipeline_id: str, run_id: str):
  """Creates test pipeline with pipeline_id and run_id."""
  pipeline = pipeline_pb2.Pipeline()
  path = os.path.join(
      os.path.dirname(__file__), 'testdata', 'sync_pipeline.pbtxt')
  io_utils.parse_pbtxt_file(path, pipeline)
  pipeline.pipeline_info.id = pipeline_id
  runtime_parameter_utils.substitute_runtime_parameter(pipeline, {
      'pipeline_run_id': run_id,
  })
  return pipeline
def _test_pipeline(ir_path: str, pipeline_id: str, run_id: str,
                   deployment_config: Optional[message.Message]):
  """Creates test pipeline with pipeline_id and run_id."""
  pipeline = pipeline_pb2.Pipeline()
  io_utils.parse_pbtxt_file(ir_path, pipeline)
  pipeline.pipeline_info.id = pipeline_id
  runtime_parameter_utils.substitute_runtime_parameter(pipeline, {
      constants.PIPELINE_RUN_ID_PARAMETER_NAME: run_id,
  })
  if deployment_config:
    pipeline.deployment_config.Pack(deployment_config)
  return pipeline
def tuner_fn(fn_args: FnArgs) -> TunerFnResult:
  """Build the tuner using the KerasTuner API.

  Args:
    fn_args: Holds args as name/value pairs.
      - working_dir: working dir for tuning.
      - train_files: List of file paths containing training tf.Example data.
      - eval_files: List of file paths containing eval tf.Example data.
      - train_steps: number of train steps.
      - eval_steps: number of eval steps.
      - schema_path: optional schema of the input data.
      - transform_graph_path: optional transform graph produced by TFT.

  Returns:
    A namedtuple containing the following:
      - tuner: A BaseTuner that will be used for tuning.
      - fit_kwargs: Args to pass to tuner's run_trial function for fitting
        the model, e.g., the training and validation dataset. Required args
        depend on the above tuner's implementation.
  """
  hp = kerastuner.HyperParameters()
  # Defines search space.
  hp.Choice('learning_rate', [1e-1, 1e-3])
  hp.Int('num_layers', 1, 5)

  # RandomSearch is a subclass of the KerasTuner Tuner class.
  tuner = kerastuner.RandomSearch(
      _build_keras_model,
      max_trials=5,
      hyperparameters=hp,
      allow_new_entries=False,
      objective='val_sparse_categorical_accuracy',
      directory=fn_args.working_dir,
      project_name='test')

  schema = schema_pb2.Schema()
  io_utils.parse_pbtxt_file(fn_args.schema_path, schema)
  train_dataset = _input_fn(fn_args.train_files, schema)
  eval_dataset = _input_fn(fn_args.eval_files, schema)

  return TunerFnResult(
      tuner=tuner,
      fit_kwargs={
          'x': train_dataset,
          'validation_data': eval_dataset,
          'steps_per_epoch': fn_args.train_steps,
          'validation_steps': fn_args.eval_steps
      })
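# Consumption sketch: this mirrors how a Tuner executor (compare the Do
# method later in this section) typically drives the returned TunerFnResult.
# fn_args is assumed to be an already-populated FnArgs instance.
tuner_result = tuner_fn(fn_args)
tuner_result.tuner.search_space_summary()
# fit_kwargs is forwarded verbatim, so the datasets and step counts defined
# above reach the underlying run_trial calls unchanged.
tuner_result.tuner.search(**tuner_result.fit_kwargs)
tuner_result.tuner.results_summary()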
def testTrainerFn(self):
  temp_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  trainer_fn_args = trainer_executor.TrainerFnArgs(
      train_files=os.path.join(self._testdata_path,
                               'transform/transformed_examples/train/*.gz'),
      transform_output=os.path.join(self._testdata_path,
                                    'transform/transform_output/'),
      serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
      eval_files=os.path.join(self._testdata_path,
                              'transform/transformed_examples/eval/*.gz'),
      schema_file=schema_file,
      train_steps=1,
      eval_steps=1,
      base_model=os.path.join(self._testdata_path,
                              'trainer/current/serving_model_dir'),
      data_accessor=DataAccessor(
          tf_dataset_factory=tfxio_utils.get_tf_dataset_factory_from_artifact(
              [standard_artifacts.Examples()], []),
          record_batch_factory=None,
          data_view_decode_fn=None))
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = taxi_utils_bqml.trainer_fn(trainer_fn_args, schema)

  estimator = training_spec['estimator']
  train_spec = training_spec['train_spec']
  eval_spec = training_spec['eval_spec']
  eval_input_receiver_fn = training_spec['eval_input_receiver_fn']

  self.assertIsInstance(estimator, tf.estimator.Estimator)
  self.assertIsInstance(train_spec, tf.estimator.TrainSpec)
  self.assertIsInstance(eval_spec, tf.estimator.EvalSpec)
  self.assertIsInstance(eval_input_receiver_fn, types.FunctionType)

  # Train for one step, then eval for one step.
  eval_result, exports = tf.estimator.train_and_evaluate(
      estimator, train_spec, eval_spec)
  self.assertGreater(eval_result['loss'], 0.0)
  self.assertEqual(len(exports), 1)
  self.assertGreaterEqual(len(fileio.listdir(exports[0])), 1)

  # Export the eval saved model.
  eval_savedmodel_path = tfma.export.export_eval_savedmodel(
      estimator=estimator,
      export_dir_base=path_utils.eval_model_dir(temp_dir),
      eval_input_receiver_fn=eval_input_receiver_fn)
  self.assertGreaterEqual(len(fileio.listdir(eval_savedmodel_path)), 1)

  # Test exported serving graph.
  with tf.compat.v1.Session() as sess:
    metagraph_def = tf.compat.v1.saved_model.loader.load(
        sess, [tf.saved_model.SERVING], exports[0])
    self.assertIsInstance(metagraph_def, tf.compat.v1.MetaGraphDef)
def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  x_train, y_train = _input_fn(fn_args.train_files, fn_args.data_accessor,
                               schema)
  x_eval, y_eval = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                             schema)

  steps_per_epoch = _TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE

  model = MLPClassifier(
      hidden_layer_sizes=[8, 8, 8],
      activation='relu',
      solver='adam',
      batch_size=_TRAIN_BATCH_SIZE,
      learning_rate_init=0.0005,
      max_iter=int(fn_args.train_steps / steps_per_epoch),
      verbose=True)
  model.fit(x_train, y_train)
  absl.logging.info(model)

  score = model.score(x_eval, y_eval)
  absl.logging.info('Accuracy: %f', score)

  os.makedirs(fn_args.serving_model_dir)
  model_path = os.path.join(fn_args.serving_model_dir, 'model.pkl')
  with tf.io.gfile.GFile(model_path, 'wb+') as f:
    pickle.dump(model, f)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  # KerasTuner generates tuning state (e.g., oracle, trials) to working dir.
  working_dir = self._get_tmp_dir()

  train_path = artifact_utils.get_split_uri(input_dict['examples'], 'train')
  eval_path = artifact_utils.get_split_uri(input_dict['examples'], 'eval')
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict['schema']))
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  tuner_fn = self._GetTunerFn(exec_properties)
  tuner_spec = tuner_fn(working_dir, io_utils.all_files_pattern(train_path),
                        io_utils.all_files_pattern(eval_path), schema)
  tuner = tuner_spec.tuner

  tuner.search_space_summary()
  # TODO(jyzhao): assert v2 behavior as KerasTuner doesn't work in v1.
  # TODO(jyzhao): make epochs configurable.
  tuner.search(
      tuner_spec.train_dataset,
      epochs=5,
      validation_data=tuner_spec.eval_dataset)
  tuner.results_summary()

  best_hparams = tuner.oracle.get_best_trials(
      1)[0].hyperparameters.get_config()
  best_hparams_path = os.path.join(
      artifact_utils.get_single_uri(output_dict['study_best_hparams_path']),
      _DEFAULT_FILE_NAME)
  io_utils.write_string_file(best_hparams_path, json.dumps(best_hparams))
  absl.logging.info('Best HParams is written to %s.' % best_hparams_path)
def test_trainer_fn(self):
  temp_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  hparams = tf.contrib.training.HParams(
      train_files=os.path.join(temp_dir, 'train_files'),
      transform_output=os.path.join(self._testdata_path,
                                    'transform/transform_output/'),
      output_dir=os.path.join(temp_dir, 'output_dir'),
      serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
      eval_files=os.path.join(temp_dir, 'eval_files'),
      schema_file=schema_file,
      train_steps=10001,
      eval_steps=5000,
      verbosity='INFO',
      warm_start_from=os.path.join(temp_dir, 'serving_model_dir'))
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  training_spec = taxi_utils.trainer_fn(hparams, schema)

  self.assertIsInstance(training_spec['estimator'],
                        tf.estimator.DNNLinearCombinedClassifier)
  self.assertIsInstance(training_spec['train_spec'], tf.estimator.TrainSpec)
  self.assertIsInstance(training_spec['eval_spec'], tf.estimator.EvalSpec)
  self.assertIsInstance(training_spec['eval_input_receiver_fn'],
                        types.FunctionType)
def run_fn(fn_args: executor.TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    fn_args.serving_model_dir)

  # Export an eval savedmodel for TFMA
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=fn_args.eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  absl.logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)
def run_fn(fn_args: executor.TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    fn_args.serving_model_dir)

  # Export an eval savedmodel for TFMA
  # NOTE: When trained in distributed training cluster, eval_savedmodel must
  # be exported only by the chief worker.
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=fn_args.eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

  # Simulate writing a log to the path given by fn_args
  io_utils.write_string_file(
      os.path.join(fn_args.model_run_dir, 'fake_log.txt'), '')

  absl.logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)
def run_fn(fn_args):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  train_and_eval_spec = _create_train_and_eval_spec(fn_args, schema)

  # Train the model
  logging.info('Training model.')
  tf.estimator.train_and_evaluate(train_and_eval_spec['estimator'],
                                  train_and_eval_spec['train_spec'],
                                  train_and_eval_spec['eval_spec'])
  logging.info('Training complete. Model written to %s',
               fn_args.serving_model_dir)

  # Export an eval savedmodel for TFMA
  # NOTE: When trained in distributed training cluster, eval_savedmodel must
  # be exported only by the chief worker.
  logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=train_and_eval_spec['estimator'],
      export_dir_base=fn_args.eval_model_dir,
      eval_input_receiver_fn=train_and_eval_spec['eval_input_receiver_fn'])
  logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)
def train_custom_model(
    fn_args: TrainerFnArgs,
    inputs: Dict[str, Feature],
    outputs: Dict[str, Feature],
    custom_model: Callable[
        [TrainerFnArgs, tf.data.Dataset, tf.data.Dataset], Any],
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> None:
  if fn_args.transform_output is None:
    tf_transform_output = None
    schema = io_utils.parse_pbtxt_file(fn_args.schema_file,
                                       schema_pb2.Schema())
    feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
    for output_name in Features(outputs).names():
      feature_spec.pop(output_name)

    def transform_features(serialized_tf_examples):
      return tf.io.parse_example(serialized_tf_examples, feature_spec)

  else:
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)
    tft_layer = tf_transform_output.transform_features_layer()
    schema = tf_transform_output.transformed_metadata.schema
    feature_spec = tf_transform_output.raw_feature_spec()
    for output_name in Features(outputs).names():
      feature_spec.pop(output_name)

    def transform_features(serialized_tf_examples):
      parsed_features = tf.io.parse_example(serialized_tf_examples,
                                            feature_spec)
      return tft_layer(parsed_features)

  def build_dataset(files):
    return (
        fn_args.data_accessor.tf_dataset_factory(
            files,
            dataset_options.TensorFlowDatasetOptions(batch_size),
            schema,
        )
        .map(
            lambda batch: (
                Features(inputs).map(lambda name, _: batch[name]),
                Features(outputs).map(lambda name, _: batch[name]),
            )
        )
        .repeat()
    )

  distributed_strategy = tf.distribute.MirroredStrategy()
  with distributed_strategy.scope():
    custom_model(
        fn_args=fn_args,
        train_dataset=build_dataset(fn_args.train_files),
        eval_dataset=build_dataset(fn_args.eval_files),
        transform_features=transform_features,
    )
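# Callback sketch: a toy custom_model that train_custom_model could drive.
# This is a minimal example under stated assumptions, not the original API:
# a single numeric input feature 'x' and a binary label 'y' are assumed, and
# they must match the keys of the inputs/outputs dicts passed above.
import tensorflow as tf

def my_custom_model(fn_args, train_dataset, eval_dataset, transform_features):
  inp = tf.keras.Input(shape=(1,), name='x')
  hidden = tf.keras.layers.Dense(16, activation='relu')(inp)
  out = tf.keras.layers.Dense(1, activation='sigmoid', name='y')(hidden)
  model = tf.keras.Model(inp, out)
  model.compile(optimizer='adam', loss='binary_crossentropy')
  # build_dataset yields ({input_name: tensor}, {output_name: tensor})
  # batches, so Keras matches them against the layer names above.
  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps)
  model.save(fn_args.serving_model_dir)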
def test_trainer_fn(self):
  temp_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  output_dir = os.path.join(temp_dir, 'output_dir')
  hparams = tf.contrib.training.HParams(
      train_files=os.path.join(self._testdata_path,
                               'transform/transformed_examples/train/*.gz'),
      transform_output=os.path.join(self._testdata_path,
                                    'transform/transform_output/'),
      output_dir=output_dir,
      serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
      eval_files=os.path.join(self._testdata_path,
                              'transform/transformed_examples/eval/*.gz'),
      schema_file=schema_file,
      train_steps=1,
      eval_steps=1,
      verbosity='INFO',
      warm_start_from=os.path.join(self._testdata_path,
                                   'trainer/current/serving_model_dir'))
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = taxi_utils.trainer_fn(hparams, schema)

  estimator = training_spec['estimator']
  train_spec = training_spec['train_spec']
  eval_spec = training_spec['eval_spec']
  eval_input_receiver_fn = training_spec['eval_input_receiver_fn']

  self.assertIsInstance(estimator, tf.estimator.DNNLinearCombinedClassifier)
  self.assertIsInstance(train_spec, tf.estimator.TrainSpec)
  self.assertIsInstance(eval_spec, tf.estimator.EvalSpec)
  self.assertIsInstance(eval_input_receiver_fn, types.FunctionType)

  # Train for one step, then eval for one step.
  eval_result, exports = tf.estimator.train_and_evaluate(
      estimator, train_spec, eval_spec)
  self.assertGreater(eval_result['loss'], 0.0)
  self.assertEqual(len(exports), 1)
  self.assertGreaterEqual(len(tf.gfile.ListDirectory(exports[0])), 1)

  # Export the eval saved model.
  eval_savedmodel_path = tfma.export.export_eval_savedmodel(
      estimator=estimator,
      export_dir_base=path_utils.eval_model_dir(output_dir),
      eval_input_receiver_fn=eval_input_receiver_fn)
  self.assertGreaterEqual(len(tf.gfile.ListDirectory(eval_savedmodel_path)),
                          1)

  # Test exported serving graph.
  with tf.Session() as sess:
    metagraph_def = tf.compat.v1.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], exports[0])
    self.assertIsInstance(metagraph_def, tf.MetaGraphDef)
def run_fn(fn_args: FnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  x_train, y_train = _input_fn(fn_args.train_files, fn_args.data_accessor,
                               schema)
  x_eval, y_eval = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                             schema)

  steps_per_epoch = _TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE
  estimator = MLPClassifier(
      hidden_layer_sizes=[8, 8, 8],
      activation='relu',
      solver='adam',
      batch_size=_TRAIN_BATCH_SIZE,
      learning_rate_init=0.0005,
      max_iter=int(fn_args.train_steps / steps_per_epoch),
      verbose=True)

  # Create a pipeline that standardizes the input data before passing it to
  # an estimator. Once the scaler is fit, it will use the same mean and stdev
  # to transform inputs at both training and serving time.
  model = Pipeline([
      ('scaler', StandardScaler()),
      ('estimator', estimator),
  ])
  model.feature_keys = _FEATURE_KEYS
  model.label_key = _LABEL_KEY
  model.fit(x_train, y_train)
  absl.logging.info(model)

  score = model.score(x_eval, y_eval)
  absl.logging.info('Accuracy: %f', score)

  # Export the model as a pickle named model.pkl. AI Platform Prediction
  # expects sklearn model artifacts to follow this naming convention.
  os.makedirs(fn_args.serving_model_dir)
  model_path = os.path.join(fn_args.serving_model_dir, 'model.pkl')
  with fileio.open(model_path, 'wb+') as f:
    pickle.dump(model, f)
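# Round-trip sketch: loading a model pickled by the run_fn above, assuming
# the same model.pkl naming convention. The helper name is hypothetical.
import os
import pickle

import tensorflow as tf

def load_sklearn_model(serving_model_dir):
  model_path = os.path.join(serving_model_dir, 'model.pkl')
  with tf.io.gfile.GFile(model_path, 'rb') as f:
    return pickle.load(f)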
def run_fn(fn_args: TrainerFnArgs):
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  x_train, y_train = _input_fn(fn_args.train_files, fn_args.data_accessor,
                               schema)

  cls = SGDClassifier(
      loss='log',
      penalty='elasticnet',
      learning_rate='adaptive',
      eta0=2,
      verbose=1,
      tol=1e-2)
  count_vectorizer = CountVectorizer()
  pipeline = Pipeline([('vect', count_vectorizer), ('cls', cls)])

  grid = {
      'cls__alpha': [0.01, 0.5, 0.99, 2, 5],
      'cls__l1_ratio': [0.01, 0.5, 0.99]
  }
  grid_search_cv = GridSearchCV(pipeline, param_grid=grid, scoring='roc_auc')

  print(x_train.shape)
  print(x_train[0:2, :])

  # ravel here
  # model = grid_search_cv.fit(X=x_train.ravel(), y=y_train)
  model = pipeline.fit(X=x_train.ravel(), y=y_train)

  x_eval, y_eval = _input_fn(fn_args.eval_files, fn_args.data_accessor,
                             schema)
  score = model.score(x_eval.ravel(), y_eval)
  absl.logging.info('Accuracy: %f', score)

  os.makedirs(fn_args.serving_model_dir)
  model_path = os.path.join(fn_args.serving_model_dir, 'model.pkl')
  with fileio.open(model_path, 'wb+') as f:
    pickle.dump(model, f)
def run_fn(fn_args: executor.TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  training_spec = _trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    fn_args.serving_model_dir)

  # Export an eval savedmodel for TFMA
  # NOTE: When trained in distributed training cluster, eval_savedmodel must
  # be exported only by the chief worker (check TF_CONFIG).
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  eval_export_dir = path_utils.eval_model_dir(fn_args.model_run_dir)
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_export_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  absl.logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)

  # TODO(b/160795287): Deprecate estimator based executor.
  # Copy serving and eval model from model_run to model artifact directory.
  serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(serving_source, fn_args.serving_model_dir)
  absl.logging.info('Serving model copied to: %s.', fn_args.serving_model_dir)

  eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(eval_source, fn_args.eval_model_dir)
  absl.logging.info('Eval model copied to: %s.', fn_args.eval_model_dir)
def run_fn(fn_args):
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  train_and_eval_spec = _create_train_and_eval_spec(fn_args, schema)

  logging.info('Training model.')
  tf.estimator.train_and_evaluate(train_and_eval_spec['estimator'],
                                  train_and_eval_spec['train_spec'],
                                  train_and_eval_spec['eval_spec'])
  logging.info('Training complete. Model written to %s',
               fn_args.serving_model_dir)

  logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=train_and_eval_spec['estimator'],
      export_dir_base=fn_args.eval_model_dir,
      eval_input_receiver_fn=train_and_eval_spec['eval_input_receiver_fn'])
  logging.info('Exported eval_savedmodel to %s.', fn_args.eval_model_dir)
def testDo(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  eval_stats_artifact = standard_artifacts.ExampleStatistics()
  eval_stats_artifact.uri = os.path.join(source_data_dir, 'statistics_gen')
  eval_stats_artifact.split_names = artifact_utils.encode_split_names(
      ['eval'])

  schema_artifact = standard_artifacts.Schema()
  schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen')

  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  validation_output = standard_artifacts.ExampleAnomalies()
  validation_output.uri = os.path.join(output_data_dir, 'output')

  input_dict = {
      executor.STATISTICS_KEY: [eval_stats_artifact],
      executor.SCHEMA_KEY: [schema_artifact],
  }
  output_dict = {
      executor.ANOMALIES_KEY: [validation_output],
  }
  exec_properties = {}

  example_validator_executor = executor.Executor()
  example_validator_executor.Do(input_dict, output_dict, exec_properties)

  self.assertEqual(['anomalies.pbtxt'],
                   tf.io.gfile.listdir(validation_output.uri))
  anomalies = io_utils.parse_pbtxt_file(
      os.path.join(validation_output.uri, 'anomalies.pbtxt'),
      anomalies_pb2.Anomalies())
  self.assertNotEqual(0, len(anomalies.anomaly_info))
def test_do(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  eval_stats_artifact = types.Artifact('ExampleStatsPath', split='eval')
  eval_stats_artifact.uri = os.path.join(source_data_dir,
                                         'statistics_gen/eval/')

  schema_artifact = standard_artifacts.Schema()
  schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen/')

  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  validation_output = standard_artifacts.ExampleValidationResult()
  validation_output.uri = os.path.join(output_data_dir, 'output')

  input_dict = {
      'stats': [eval_stats_artifact],
      'schema': [schema_artifact],
  }
  output_dict = {
      'output': [validation_output],
  }
  exec_properties = {}

  example_validator_executor = executor.Executor()
  example_validator_executor.Do(input_dict, output_dict, exec_properties)

  self.assertEqual(['anomalies.pbtxt'],
                   tf.gfile.ListDirectory(validation_output.uri))
  anomalies = io_utils.parse_pbtxt_file(
      os.path.join(validation_output.uri, 'anomalies.pbtxt'),
      anomalies_pb2.Anomalies())
  self.assertNotEqual(0, len(anomalies.anomaly_info))
def test_do(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  eval_stats_artifact = types.TfxType('ExampleStatsPath', split='eval')
  eval_stats_artifact.uri = os.path.join(source_data_dir,
                                         'statistics_gen/eval/')

  schema_artifact = types.TfxType('SchemaPath')
  schema_artifact.uri = os.path.join(source_data_dir, 'schema_gen/')

  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  validation_output = types.TfxType('ExampleValidationPath')
  validation_output.uri = os.path.join(output_data_dir, 'output')

  input_dict = {
      'stats': [eval_stats_artifact],
      'schema': [schema_artifact],
  }
  output_dict = {
      'output': [validation_output],
  }
  exec_properties = {}

  example_validator_executor = executor.Executor()
  example_validator_executor.Do(input_dict, output_dict, exec_properties)

  self.assertEqual(['anomalies.pbtxt'],
                   tf.gfile.ListDirectory(validation_output.uri))
  anomalies = io_utils.parse_pbtxt_file(
      os.path.join(validation_output.uri, 'anomalies.pbtxt'),
      anomalies_pb2.Anomalies())
  self.assertNotEqual(0, len(anomalies.anomaly_info))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor invokes a training_fn callback function provided by
  the user via the module_file parameter. With the tf.estimator returned by
  this function, the Trainer Executor then builds a TensorFlow model using
  the user-provided tf.estimator.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.

  Returns:
    None

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(zhitaoli): Deprecate this in a future version.
  if exec_properties.get('custom_config', None):
    cmle_args = exec_properties.get('custom_config',
                                    {}).get('cmle_training_args')
    if cmle_args:
      executor_class_path = '.'.join([Executor.__module__, Executor.__name__])
      absl.logging.warn(
          'Passing \'cmle_training_args\' to trainer directly is deprecated, '
          'please use extension executor at '
          'tfx.extensions.google_cloud_ai_platform.trainer.executor instead')
      return runner.start_cmle_training(input_dict, output_dict,
                                        exec_properties, executor_class_path,
                                        cmle_args)

  trainer_fn = self._GetTrainerFn(exec_properties)

  # Set up training parameters
  train_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'train'))
  ]
  transform_output = artifact_utils.get_single_uri(
      input_dict['transform_output']) if input_dict.get(
          'transform_output', None) else None
  eval_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'eval'))
  ]
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict['schema']))

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = artifact_utils.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # Assemble warm start path if needed.
  warm_start_from = None
  if exec_properties.get('warm_starting') and exec_properties.get(
      'warm_start_from'):
    previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                      path_utils.SERVING_MODEL_DIR)
    if previous_model_dir and tf.io.gfile.exists(
        os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_from = previous_model_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = _HParamWrapper(
      # A list of uris for train files.
      train_files=train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=transform_output,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A list of uris for eval files.
      eval_files=eval_files,
      # A single uri for schema file.
      schema_file=schema_file,
      # Number of train steps.
      train_steps=train_steps,
      # Number of eval steps.
      eval_steps=eval_steps,
      # A single uri for the model directory to warm start from.
      warm_start_from=warm_start_from)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(hparams, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    serving_model_dir)

  # Export an eval savedmodel for TFMA
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  absl.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
def load_proto_from_text(
    self, file_name: Text,
    proto_message: message.Message) -> message.Message:
  """Loads proto message from serialized text."""
  path = os.path.join(os.path.dirname(__file__), 'testdata', file_name)
  return io_utils.parse_pbtxt_file(path, proto_message)
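# Usage sketch: parsing a test asset with the helper above. The file name is
# borrowed from the _test_pipeline snippet earlier in this section and is
# assumed to exist under testdata/.
pipeline = self.load_proto_from_text('sync_pipeline.pbtxt',
                                     pipeline_pb2.Pipeline())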
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Re-applies a previously computed transform graph to input examples.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - examples: examples to analyze and transform, with 'train' and 'eval'
        splits.
      - schema: schema of the raw input data.
      - transform_graph: transform graph from a previous Transform run, whose
        transform_raw_features is re-analyzed here.
    output_dict: Output dict from key to a list of artifacts, including:
      - transformed_examples: materialized transformed examples, with 'train'
        and 'eval' splits.
    exec_properties: A dict of execution properties.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  transform_graph_uri = artifact_utils.get_single_uri(
      input_dict[TRANSFORM_GRAPH_KEY])
  temp_path = os.path.join(transform_graph_uri, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
  # transformed_schema_file = os.path.join(
  #     transform_graph_uri,
  #     tft.TFTransformOutput.TRANSFORMED_METADATA_DIR,
  #     'schema.pbtxt')
  # transformed_schema_proto = io_utils.parse_pbtxt_file(
  #     transformed_schema_file, schema_pb2.Schema())
  transformed_train_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'train')
  transformed_eval_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'eval')

  tf_transform_output = tft.TFTransformOutput(transform_graph_uri)
  # transform_output_dataset_metadata = dataset_metadata.DatasetMetadata(
  #     schema=transformed_schema_proto)

  # transform_fn = (tf_transform_output.transform_raw_features,
  #                 transform_output_dataset_metadata)
  # feature_spec = schema_utils.schema_as_feature_spec(
  #     schema_proto).feature_spec
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))
  schema_proto = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_proto)

  train_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                                'train')
  eval_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                               'eval')
  analyze_data_paths = [io_utils.all_files_pattern(train_data_uri)]
  transform_data_paths = [
      io_utils.all_files_pattern(train_data_uri),
      io_utils.all_files_pattern(eval_data_uri),
  ]
  materialize_output_paths = [
      os.path.join(transformed_train_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
  ]
  transform_data_list = self._MakeDatasetList(transform_data_paths,
                                              materialize_output_paths)
  analyze_data_list = self._MakeDatasetList(analyze_data_paths)

  with self._make_beam_pipeline() as pipeline:
    with tft_beam.Context(temp_dir=temp_path):
      # NOTE: Unclear if there is a difference between input_dataset_metadata
      # and transform_input_dataset_metadata. Look at Transform executor.
      decode_fn = tft.coders.ExampleProtoCoder(
          schema_proto, serialized=True).decode

      input_analysis_data = {}
      for dataset in analyze_data_list:
        infix = 'AnalysisIndex{}'.format(dataset.index)
        dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
        dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix) >> self._DecodeInputs(decode_fn))
        input_analysis_data[dataset.dataset_key] = dataset.decoded

      if not hasattr(tft_beam.analyzer_cache, 'DatasetKey'):
        input_analysis_data = (
            [
                dataset for dataset in input_analysis_data.values()
                if dataset is not None
            ]
            | 'FlattenAnalysisDatasetsBecauseItIsRequired' >>
            beam.Flatten(pipeline=pipeline))

      transform_fn = (
          (input_analysis_data, transform_input_dataset_metadata)
          | 'Analyze' >> tft_beam.AnalyzeDataset(
              tf_transform_output.transform_raw_features, pipeline=pipeline))

      for dataset in transform_data_list:
        infix = 'TransformIndex{}'.format(dataset.index)
        dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
        dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix) >> self._DecodeInputs(decode_fn))
        dataset.transformed, metadata = (
            ((dataset.decoded, transform_input_dataset_metadata),
             transform_fn)
            | 'Transform[{}]'.format(infix) >> tft_beam.TransformDataset())
        dataset.transformed_and_serialized = (
            dataset.transformed
            | 'EncodeAndSerialize[{}]'.format(infix) >> beam.ParDo(
                self._EncodeAsSerializedExamples(),
                _GetSchemaProto(metadata)))
        _ = (
            dataset.transformed_and_serialized
            | 'Materialize[{}]'.format(infix) >> self._WriteExamples(
                dataset.materialize_output_path))
def Do(self, input_dict, output_dict, exec_properties):
  """Runs a trainer job over the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - transformed_examples: Transformed example.
      - transform_output: Input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(khaas): Move this to tfx/extensions.
  if exec_properties.get('custom_config', None):
    cmle_args = exec_properties.get('custom_config',
                                    {}).get('cmle_training_args')
    if cmle_args:
      return cmle_runner.start_cmle_training(input_dict, output_dict,
                                             exec_properties, cmle_args)

  trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                    'trainer_fn')

  # Set up training parameters
  train_files = [
      _all_files_pattern(
          types.get_split_uri(input_dict['transformed_examples'], 'train'))
  ]
  transform_output = types.get_single_uri(input_dict['transform_output'])
  eval_files = _all_files_pattern(
      types.get_split_uri(input_dict['transformed_examples'], 'eval'))
  schema_file = io_utils.get_only_uri_in_dir(
      types.get_single_uri(input_dict['schema']))

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = types.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # Assemble warm start path if needed.
  warm_start_from = None
  if exec_properties.get('warm_starting') and exec_properties.get(
      'warm_start_from'):
    previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                      path_utils.SERVING_MODEL_DIR)
    if previous_model_dir and tf.gfile.Exists(
        os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_from = previous_model_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = tf.contrib.training.HParams(
      train_files=train_files,
      transform_output=transform_output,
      output_dir=output_path,
      serving_model_dir=serving_model_dir,
      eval_files=eval_files,
      schema_file=schema_file,
      train_steps=train_steps,
      eval_steps=eval_steps,
      warm_start_from=warm_start_from)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(hparams, schema)

  # Train the model
  tf.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  tf.logging.info('Training complete. Model written to %s',
                  serving_model_dir)

  # Export an eval savedmodel for TFMA
  tf.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
def get_proto_from_test_data(filename: Text,
                             pb_message: message.Message) -> message.Message:
  """Helper function that gets proto from testdata."""
  filepath = os.path.join(os.path.dirname(__file__), 'testdata', filename)
  return io_utils.parse_pbtxt_file(filepath, pb_message)
def test_preprocessing_fn(self):
  schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  feature_spec = taxi_utils._get_raw_feature_spec(schema)
  working_dir = self.get_temp_dir()
  transform_output_path = os.path.join(working_dir, 'transform_output')
  transformed_examples_path = os.path.join(working_dir,
                                           'transformed_examples')

  # Run very simplified version of executor logic.
  # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
  # Generate legacy `DatasetMetadata` object. Future version of Transform
  # will accept the `Schema` proto directly.
  legacy_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  decoder = tft.coders.ExampleProtoCoder(legacy_metadata.schema)
  with beam.Pipeline() as p:
    with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
      examples = (
          p
          | 'ReadTrainData' >> beam.io.ReadFromTFRecord(
              os.path.join(self._testdata_path, 'csv_example_gen/train/*'),
              coder=beam.coders.BytesCoder(),
              # TODO(b/114938612): Eventually remove this override.
              validate=False)
          | 'DecodeTrainData' >> beam.Map(decoder.decode))
      (transformed_examples, transformed_metadata), transform_fn = (
          (examples, legacy_metadata)
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              taxi_utils.preprocessing_fn))

      # WriteTransformFn writes transform_fn and metadata to subdirectories
      # tensorflow_transform.SAVED_MODEL_DIR and
      # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
      # pylint: disable=expression-not-assigned
      (transform_fn
       | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
           transform_output_path))

      encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
      (transformed_examples
       | 'EncodeTrainData' >> beam.Map(encoder.encode)
       | 'WriteTrainData' >> beam.io.WriteToTFRecord(
           os.path.join(transformed_examples_path,
                        'train/transformed_examples.gz'),
           coder=beam.coders.BytesCoder()))
      # pylint: enable=expression-not-assigned

  # Verify the output matches golden output.
  # NOTE: we don't verify that transformed examples match golden output.
  expected_transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(
          self._testdata_path,
          'transform/transform_output/transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  transformed_schema = io_utils.parse_pbtxt_file(
      os.path.join(transform_output_path,
                   'transformed_metadata/schema.pbtxt'),
      schema_pb2.Schema())
  # Clear annotations so we only have to test main schema.
  for feature in transformed_schema.feature:
    feature.ClearField('annotation')
  self.assertEqual(transformed_schema, expected_transformed_schema)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor invokes a training_fn callback function provided by
  the user via the module_file parameter. With the tf.estimator returned by
  this function, the Trainer Executor then builds a TensorFlow model using
  the user-provided tf.estimator.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - model: Exported model.
      - model_run: Model training related outputs (e.g., Tensorboard logs)
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.
      - custom_config: Optional. JSON-serialized dict of additional
        parameters to pass to trainer function.

  Returns:
    None

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  fn_args = self._GetFnArgs(input_dict, output_dict, exec_properties)
  trainer_fn = udf_utils.get_fn(exec_properties, 'trainer_fn')

  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  # TODO(b/160795287): Deprecate estimator based executor.
  # Provide user with a modified fn_args, with model_run given as
  # the working directory. Executor will then copy user models to
  # model artifact directory.
  serving_dest = fn_args.serving_model_dir
  eval_dest = fn_args.eval_model_dir

  working_dir = fn_args.model_run_dir
  fn_args.serving_model_dir = path_utils.serving_model_dir(working_dir)
  fn_args.eval_model_dir = path_utils.eval_model_dir(working_dir)

  training_spec = trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])

  absl.logging.info(
      'Training complete. Model written to %s. ModelRun written to %s',
      fn_args.serving_model_dir, fn_args.model_run_dir)

  # Export an eval savedmodel for TFMA. If distributed training, it must only
  # be written by the chief worker, as would be done for serving savedmodel.
  if _is_chief():
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=fn_args.eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    absl.logging.info('Exported eval_savedmodel to %s.',
                      fn_args.eval_model_dir)

    # TODO(b/160795287): Deprecate estimator based executor.
    # Copy serving and eval model from model_run to model artifact directory.
    serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
    io_utils.copy_dir(serving_source, serving_dest)
    absl.logging.info('Serving model copied to: %s.', serving_dest)

    eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
    io_utils.copy_dir(eval_source, eval_dest)
    absl.logging.info('Eval model copied to: %s.', eval_dest)

  else:
    absl.logging.info(
        'Model export is skipped because this is not the chief worker.')
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor invokes a training_fn callback function provided by
  the user via the module_file parameter. With the tf.estimator returned by
  this function, the Trainer Executor then builds a TensorFlow model using
  the user-provided tf.estimator.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.
      - custom_config: Optional. Additional parameters to pass to trainer
        function.

  Returns:
    None

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  fn_args = self._GetFnArgs(input_dict, output_dict, exec_properties)
  trainer_fn = self._GetFn(exec_properties, 'trainer_fn')

  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete. Model written to %s',
                    fn_args.serving_model_dir)

  # Export an eval savedmodel for TFMA
  # For distributed training, master and worker(s) try to export multiple
  # eval_savedmodels (b/147378113). To avoid that, only export
  # eval_savedmodel if eval_model_dir does not exist as an intermediate
  # solution until b/147378113 is resolved.
  if not tf.io.gfile.exists(fn_args.eval_model_dir):
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=fn_args.eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
    absl.logging.info('Exported eval_savedmodel to %s.',
                      fn_args.eval_model_dir)