def test_get_from_single_list(self):
  """Exercises the retrieval helpers on a one-element list of TfxType.

  The list holds a single 'eval' artifact. The single-instance helpers and
  the 'eval' split helpers must return it (or its uri); asking for the
  absent 'train' split must raise ValueError.
  """
  eval_artifact = types.TfxType('MyTypeName', split='eval')
  single_list = [eval_artifact]
  single_list[0].uri = '/tmp/evaluri'

  # Helpers that expect exactly one artifact in the list.
  found_instance = types.get_single_instance(single_list)
  self.assertEqual(single_list[0], found_instance)
  found_uri = types.get_single_uri(single_list)
  self.assertEqual('/tmp/evaluri', found_uri)

  # Helpers that look an artifact up by split name.
  found_eval = types._get_split_instance(single_list, 'eval')
  self.assertEqual(single_list[0], found_eval)
  found_eval_uri = types.get_split_uri(single_list, 'eval')
  self.assertEqual('/tmp/evaluri', found_eval_uri)

  # No artifact carries the 'train' split, so both lookups must fail.
  with self.assertRaises(ValueError):
    types._get_split_instance(single_list, 'train')
  with self.assertRaises(ValueError):
    types.get_split_uri(single_list, 'train')
def Do(self, input_dict, output_dict, exec_properties):
  """Take BigQuery sql and generate train and eval tf examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts. Only
      forwarded to the startup logger in this method.
    output_dict: Output dict from output key to a list of Artifacts.
      - examples: train and eval split of tf examples.
    exec_properties: A dict of execution properties.
      - query: BigQuery sql string.

  Returns:
    None

  Raises:
    RuntimeError: if query is missing in exec_properties.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Output locations are resolved before the query is validated.
  train_dir = types.get_split_uri(output_dict['examples'], 'train')
  eval_dir = types.get_split_uri(output_dict['examples'], 'eval')

  if 'query' not in exec_properties:
    raise RuntimeError('Missing query.')
  query = exec_properties['query']

  tf.logging.info('Generating examples from BigQuery.')
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    converter = _BigQueryConverter(query)
    rows = pipeline | 'QueryTable' >> self._big_query_ptransform(query)
    serialized = rows | 'ToSerializedTFExample' >> beam.Map(
        converter.row_to_serialized_example)
    example_splits = serialized | 'SplitData' >> beam.Partition(
        _partition_fn, 2)

    # TODO(jyzhao): make shuffle optional.
    # Partition 0 is written under the train uri, partition 1 under eval.
    train_shuffled = (
        example_splits[0]
        | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle())
    _ = train_shuffled | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
        os.path.join(train_dir, DEFAULT_FILE_NAME), file_name_suffix='.gz')

    eval_shuffled = (
        example_splits[1]
        | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle())
    _ = eval_shuffled | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
        os.path.join(eval_dir, DEFAULT_FILE_NAME), file_name_suffix='.gz')

  tf.logging.info('Examples generated.')
def Do(self, input_dict, output_dict, exec_properties):
  """Take input csv data and generate train and eval tf examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input-base: input dir that contains csv data. csv files must have
        header line.
    output_dict: Output dict from output key to a list of Artifacts.
      - examples: train and eval split of tf examples.
    exec_properties: A dict of execution properties. Only forwarded to the
      startup logger in this method.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  train_dir = types.get_split_uri(output_dict['examples'], 'train')
  eval_dir = types.get_split_uri(output_dict['examples'], 'eval')

  input_base_uri = types.get_single_instance(input_dict['input-base']).uri

  tf.logging.info('Generating examples.')

  # The input directory is resolved to the single uri it contains.
  raw_data = io_utils.get_only_uri_in_dir(input_base_uri)
  tf.logging.info('No split {}.'.format(raw_data))

  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    serialized = (
        pipeline
        | 'CsvToSerializedExample' >> _CsvToSerializedExample(raw_data))  # pylint: disable=no-value-for-parameter
    example_splits = serialized | 'SplitData' >> beam.Partition(
        _partition_fn, 2)

    # TODO(jyzhao): make shuffle optional.
    # Partition 0 is written under the train uri, partition 1 under eval.
    train_shuffled = (
        example_splits[0]
        | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle())
    _ = train_shuffled | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
        os.path.join(train_dir, DEFAULT_FILE_NAME), file_name_suffix='.gz')

    eval_shuffled = (
        example_splits[1]
        | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle())
    _ = eval_shuffled | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
        os.path.join(eval_dir, DEFAULT_FILE_NAME), file_name_suffix='.gz')

  tf.logging.info('Examples generated.')
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow SchemaGen executor entrypoint.

  Infers a schema with tensorflow_data_validation from the precomputed
  statistics of the 'train' split and writes it as a pbtxt file.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of 'ExampleStatisticsPath' type which should contain
        split 'train'. Stats on other splits are ignored.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'SchemaPath' artifact of size one.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  # TODO(zhitaoli): Move constants between this file and component.py to a
  # constants.py.
  train_stats_dir = types.get_split_uri(input_dict['stats'], 'train')
  train_stats_uri = io_utils.get_only_uri_in_dir(train_stats_dir)
  schema_dir = types.get_single_uri(output_dict['output'])
  output_uri = os.path.join(schema_dir, _DEFAULT_FILE_NAME)

  infer_feature_shape = False
  tf.logging.info('Infering schema from statistics.')
  train_stats = tfdv.load_statistics(train_stats_uri)
  schema = tfdv.infer_schema(train_stats, infer_feature_shape)
  io_utils.write_pbtxt_file(output_uri, schema)
  tf.logging.info('Schema written to {}.'.format(output_uri))
def Do(self, input_dict, output_dict, exec_properties):
  """Take input data source and generate the configured tf example splits.

  Args:
    input_dict: Input dict from input key to a list of Artifacts. Depends on
      detailed example gen implementation.
    output_dict: Output dict from output key to a list of Artifacts.
      - examples: one artifact per configured split of tf examples.
    exec_properties: A dict of execution properties. Depends on detailed
      example gen implementation.
      - output: JSON string of example_gen_pb2.Output instance, providing
        output configuration.

  Returns:
    None

  Raises:
    RuntimeError: if output split config is not specified (the check is
      delegated to `_check_split_config`).
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Get output split information.
  output_config = example_gen_pb2.Output()
  json_format.Parse(exec_properties['output'], output_config)
  self._check_split_config(output_config.split_config)
  splits = output_config.split_config.splits

  # Calculate split buckets: entry i is the cumulative hash bucket count up
  # to and including split i.
  buckets = []
  running_total = 0
  for split in splits:
    running_total += split.hash_buckets
    buckets.append(running_total)

  tf.logging.info('Generating examples.')
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    input_to_example = self.GetInputSourceToExamplePTransform()
    examples = pipeline | 'InputSourceToExample' >> input_to_example(
        input_dict, exec_properties)
    # Returns deterministic string as partition is based on it.
    serialized = examples | 'SerializeDeterministically' >> beam.Map(
        lambda x: x.SerializeToString(deterministic=True))
    example_splits = serialized | 'SplitData' >> beam.Partition(
        _partition_fn, len(buckets), buckets)

    # TODO(jyzhao): make shuffle optional.
    for index, example_split in enumerate(example_splits):
      split_name = splits[index].name
      shuffled = (
          example_split
          | 'ShuffleSplit' + split_name >> beam.transforms.Reshuffle())
      split_uri = types.get_split_uri(output_dict['examples'], split_name)
      _ = shuffled | 'OutputSplit' + split_name >> beam.io.WriteToTFRecord(
          os.path.join(split_uri, DEFAULT_FILE_NAME), file_name_suffix='.gz')

  tf.logging.info('Examples generated.')
def Do(self, input_dict: Dict[Text, List[types.TfxType]],
       output_dict: Dict[Text, List[types.TfxType]],
       exec_properties: Dict[Text, Any]) -> None:
  """Take input data source and generate TF Example splits.

  The splits are produced by `GenerateExamplesByBeam` as a mapping from
  split name to PCollection; each one is written under the uri of the
  matching 'examples' output artifact.

  Args:
    input_dict: Input dict from input key to a list of Artifacts. Depends on
      detailed example gen implementation.
    output_dict: Output dict from output key to a list of Artifacts.
      - examples: splits of tf examples.
    exec_properties: A dict of execution properties. Depends on detailed
      example gen implementation.
      - input: JSON string of example_gen_pb2.Input instance, providing
        input configuration.
      - output: JSON string of example_gen_pb2.Output instance, providing
        output configuration.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  tf.logging.info('Generating examples.')
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    example_splits = self.GenerateExamplesByBeam(pipeline, input_dict,
                                                 exec_properties)

    for split_name, example_split in example_splits.items():
      step_label = 'WriteSplit' + split_name
      split_uri = types.get_split_uri(output_dict['examples'], split_name)
      _ = example_split | step_label >> _WriteSplit(split_uri)  # pylint: disable=no-value-for-parameter

  tf.logging.info('Examples generated.')
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow ExampleValidator executor entrypoint.

  Validates the statistics of the 'eval' split against the schema and writes
  the anomalies found to a pbtxt file under the output uri.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of 'ExampleStatisticsPath' type which should contain
        split 'eval'. Stats on other splits are ignored.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  tf.logging.info('Validating schema against the computed statistics.')
  schema_reader = io_utils.SchemaReader()
  schema_path = io_utils.get_only_uri_in_dir(
      types.get_single_uri(input_dict['schema']))
  schema = schema_reader.read(schema_path)

  eval_stats_path = io_utils.get_only_uri_in_dir(
      types.get_split_uri(input_dict['stats'], 'eval'))
  stats = tfdv.load_statistics(eval_stats_path)

  output_uri = types.get_single_uri(output_dict['output'])
  anomalies = tfdv.validate_statistics(stats, schema)
  anomalies_path = os.path.join(output_uri, DEFAULT_FILE_NAME)
  io_utils.write_pbtxt_file(anomalies_path, anomalies)
  tf.logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def Do(self, input_dict, output_dict, exec_properties):
  """Take input data source and generate train and eval tf examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts. Depends on
      detailed example gen implementation.
    output_dict: Output dict from output key to a list of Artifacts.
      - examples: train and eval split of tf examples.
    exec_properties: A dict of execution properties. Depends on detailed
      example gen implementation.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  train_dir = types.get_split_uri(output_dict['examples'], 'train')
  eval_dir = types.get_split_uri(output_dict['examples'], 'eval')

  tf.logging.info('Generating examples.')
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    input_to_example = self.GetInputSourceToExamplePTransform()
    examples = pipeline | 'InputSourceToExample' >> input_to_example(
        input_dict, exec_properties)
    # Returns deterministic string as partition is based on it.
    serialized = examples | 'SerializeDeterministically' >> beam.Map(
        lambda x: x.SerializeToString(deterministic=True))
    example_splits = serialized | 'SplitData' >> beam.Partition(
        _partition_fn, 2)

    # TODO(jyzhao): make shuffle optional.
    # Partition 0 is written under the train uri, partition 1 under eval.
    train_shuffled = (
        example_splits[0]
        | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle())
    _ = train_shuffled | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
        os.path.join(train_dir, DEFAULT_FILE_NAME), file_name_suffix='.gz')

    eval_shuffled = (
        example_splits[1]
        | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle())
    _ = eval_shuffled | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
        os.path.join(eval_dir, DEFAULT_FILE_NAME), file_name_suffix='.gz')

  tf.logging.info('Examples generated.')
def Do(self, input_dict, output_dict, exec_properties):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for eval the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - feature_slicing_spec: JSON string of
        evaluator_pb2.FeatureSlicingSpec instance, providing the way to
        slice the data.

  Returns:
    None

  Raises:
    ValueError: if 'model_exports' or 'examples' is missing from input_dict,
      or 'output' is missing from output_dict.
  """
  # Validate the required keys up front, before anything is logged.
  if 'model_exports' not in input_dict:
    raise ValueError("'model_exports' is missing in input dict.")
  if 'examples' not in input_dict:
    raise ValueError("'examples' is missing in input dict.")
  if 'output' not in output_dict:
    raise ValueError("'output' is missing in output dict.")

  self._log_startup(input_dict, output_dict, exec_properties)

  # Extract input artifacts
  model_exports_uri = types.get_single_uri(input_dict['model_exports'])

  feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
  json_format.Parse(exec_properties['feature_slicing_spec'],
                    feature_slicing_spec)
  slice_spec = self._get_slice_spec_from_feature_slicing_spec(
      feature_slicing_spec)

  output_uri = types.get_single_uri(output_dict['output'])

  eval_model_path = path_utils.eval_model_path(model_exports_uri)

  tf.logging.info('Using {} for model eval.'.format(eval_model_path))
  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_path)

  tf.logging.info('Evaluating model.')
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    # Only the 'eval' split of the examples is read.
    eval_examples = pipeline | 'ReadData' >> beam.io.ReadFromTFRecord(
        file_pattern=io_utils.all_files_pattern(
            types.get_split_uri(input_dict['examples'], 'eval')))
    _ = (
        eval_examples
        | 'ExtractEvaluateAndWriteResults' >>
        tfma.ExtractEvaluateAndWriteResults(
            eval_shared_model=eval_shared_model,
            slice_spec=slice_spec,
            output_path=output_uri))

  tf.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))
def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
       output_dict: Dict[Text, List[types.TfxArtifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for eval the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - feature_slicing_spec: JSON string of
        evaluator_pb2.FeatureSlicingSpec instance, providing the way to
        slice the data.

  Returns:
    None

  Raises:
    ValueError: if 'model_exports' or 'examples' is missing from input_dict,
      or 'output' is missing from output_dict.
  """
  # Validate the required keys up front, before anything is logged.
  if 'model_exports' not in input_dict:
    raise ValueError("'model_exports' is missing in input dict.")
  if 'examples' not in input_dict:
    raise ValueError("'examples' is missing in input dict.")
  if 'output' not in output_dict:
    raise ValueError("'output' is missing in output dict.")

  self._log_startup(input_dict, output_dict, exec_properties)

  # Extract input artifacts
  model_exports_uri = types.get_single_uri(input_dict['model_exports'])

  feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
  json_format.Parse(exec_properties['feature_slicing_spec'],
                    feature_slicing_spec)
  slice_spec = self._get_slice_spec_from_feature_slicing_spec(
      feature_slicing_spec)

  output_uri = types.get_single_uri(output_dict['output'])

  eval_model_path = path_utils.eval_model_path(model_exports_uri)

  tf.logging.info('Using {} for model eval.'.format(eval_model_path))
  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_path)

  tf.logging.info('Evaluating model.')
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    # Only the 'eval' split of the examples is read.
    eval_examples = pipeline | 'ReadData' >> beam.io.ReadFromTFRecord(
        file_pattern=io_utils.all_files_pattern(
            types.get_split_uri(input_dict['examples'], 'eval')))
    _ = (
        eval_examples
        | 'ExtractEvaluateAndWriteResults' >>
        tfma.ExtractEvaluateAndWriteResults(
            eval_shared_model=eval_shared_model,
            slice_spec=slice_spec,
            output_path=output_uri))

  tf.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))
def test_get_from_split_list(self):
  """Exercises the retrieval helpers on a list with two split TfxTypes.

  With both a 'train' and an 'eval' artifact in the list, the
  single-instance helpers must raise ValueError while the split helpers must
  return the artifact (or uri) of the requested split.
  """
  train_artifact = types.TfxType('MyTypeName', split='train')
  train_artifact.uri = '/tmp/train'
  eval_artifact = types.TfxType('MyTypeName', split='eval')
  eval_artifact.uri = '/tmp/eval'
  split_list = [train_artifact, eval_artifact]

  # Two artifacts are present, so "single" lookups are rejected.
  with self.assertRaises(ValueError):
    types.get_single_instance(split_list)
  with self.assertRaises(ValueError):
    types.get_single_uri(split_list)

  found_train = types._get_split_instance(split_list, 'train')
  self.assertEqual(split_list[0], found_train)
  found_train_uri = types.get_split_uri(split_list, 'train')
  self.assertEqual('/tmp/train', found_train_uri)

  found_eval = types._get_split_instance(split_list, 'eval')
  self.assertEqual(split_list[1], found_eval)
  found_eval_uri = types.get_split_uri(split_list, 'eval')
  self.assertEqual('/tmp/eval', found_eval_uri)
def test_get_from_split_list(self):
  """Checks split-aware retrieval on a list of 'train' and 'eval' TfxTypes.

  The single-instance helpers must raise ValueError for the two-element
  list; the split helpers must resolve each split to its own artifact and
  uri.
  """
  split_names = ('train', 'eval')
  split_list = []
  for name in split_names:
    artifact_for_split = types.TfxType('MyTypeName', split=name)
    artifact_for_split.uri = '/tmp/' + name
    split_list.append(artifact_for_split)

  # More than one artifact: the "single" accessors must refuse the list.
  with self.assertRaises(ValueError):
    types.get_single_instance(split_list)
  with self.assertRaises(ValueError):
    types.get_single_uri(split_list)

  # 'train' was appended first, 'eval' second.
  self.assertEqual(
      split_list[0], types._get_split_instance(split_list, 'train'))
  self.assertEqual(
      '/tmp/train', types.get_split_uri(split_list, 'train'))
  self.assertEqual(
      split_list[1], types._get_split_instance(split_list, 'eval'))
  self.assertEqual(
      '/tmp/eval', types.get_split_uri(split_list, 'eval'))
def testGetSplitUriDeprecated(self):
  """Checks that types.get_split_uri still resolves and logs a rename notice.

  tf_logging.warning is replaced with a mock for the duration of the test so
  the logged call can be inspected.
  """
  warn_mock = mock.MagicMock()
  with mock.patch.object(tf_logging, 'warning', warn_mock):
    my_artifact = artifact.Artifact('TestType')
    my_artifact.uri = '123'
    my_artifact.split = 'train'

    resolved_uri = types.get_split_uri([my_artifact], 'train')
    self.assertEqual('123', resolved_uri)

    warn_mock.assert_called_once()
    # The rename text is expected in positional argument 5 of the logged
    # call.
    positional_args = warn_mock.call_args[0]
    self.assertIn('tfx.utils.types.get_split_uri has been renamed to',
                  positional_args[5])
def Do(self, input_dict: Dict[Text, List[types.TfxType]],
       output_dict: Dict[Text, List[types.TfxType]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  One branch per input split is added to a single Beam pipeline; each branch
  reads the split's TFRecords, decodes them, generates statistics and writes
  them under the uri of the output artifact with the same split name.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of 'ExamplesPath' type. This should contain both
        'train' and 'eval' split.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of 'ExampleStatisticsPath' type. This should contain
        both 'train' and 'eval' split.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Index the input artifacts by split name.
  split_to_instance = {}
  for input_artifact in input_dict['input_data']:
    split_to_instance[input_artifact.split] = input_artifact

  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, instance in split_to_instance.items():
      tf.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(instance.uri)
      output_uri = types.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)

      # Step labels carry the split name so they stay unique per branch.
      records = p | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
          file_pattern=input_uri)
      decoded = records | 'DecodeData.' + split >> (
          tf_example_decoder.DecodeTFExample())
      stats = decoded | 'GenerateStatistics.' + split >> (
          stats_api.GenerateStatistics(stats_options))
      _ = stats | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
          output_path,
          shard_name_template='',
          coder=beam.coders.ProtoCoder(
              statistics_pb2.DatasetFeatureStatisticsList))
      tf.logging.info('Statistics written to {}.'.format(output_uri))
def Do(self, input_dict, output_dict, exec_properties):
  """Computes stats for each split of input using tensorflow_data_validation.

  One branch per input split is added to a single Beam pipeline; each branch
  reads the split's TFRecords, decodes them, generates statistics and writes
  them under the uri of the output artifact with the same split name.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of 'ExamplesPath' type. This should contain both
        'train' and 'eval' split.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of 'ExampleStatisticsPath' type. This should contain
        both 'train' and 'eval' split.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Index the input artifacts by split name.
  split_to_instance = {}
  for input_artifact in input_dict['input_data']:
    split_to_instance[input_artifact.split] = input_artifact

  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, instance in split_to_instance.items():
      tf.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(instance.uri)
      output_uri = types.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)

      # Step labels carry the split name so they stay unique per branch.
      records = p | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
          file_pattern=input_uri)
      decoded = records | 'DecodeData.' + split >> (
          tf_example_decoder.DecodeTFExample())
      stats = decoded | 'GenerateStatistics.' + split >> (
          stats_api.GenerateStatistics(stats_options))
      _ = stats | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
          output_path,
          shard_name_template='',
          coder=beam.coders.ProtoCoder(
              statistics_pb2.DatasetFeatureStatisticsList))
      tf.logging.info('Statistics written to {}.'.format(output_uri))
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow ExampleValidator executor entrypoint.

  Loads the schema and the 'eval' split statistics, validates the
  statistics against the schema, and writes the resulting anomalies as a
  pbtxt file under the output uri.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of 'ExampleStatisticsPath' type which should contain
        split 'eval'. Stats on other splits are ignored.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  tf.logging.info('Validating schema against the computed statistics.')

  # Schema: the single file found in the schema artifact's directory.
  reader = io_utils.SchemaReader()
  schema_dir = types.get_single_uri(input_dict['schema'])
  schema = reader.read(io_utils.get_only_uri_in_dir(schema_dir))

  # Statistics: the single file found in the 'eval' split directory.
  eval_stats_dir = types.get_split_uri(input_dict['stats'], 'eval')
  stats = tfdv.load_statistics(io_utils.get_only_uri_in_dir(eval_stats_dir))

  output_uri = types.get_single_uri(output_dict['output'])
  anomalies = tfdv.validate_statistics(stats, schema)
  io_utils.write_pbtxt_file(
      os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
  tf.logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
       output_dict: Dict[Text, List[types.TfxArtifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor imports a `trainer_fn` callback from the file named
  by the module_file property. The callback returns the estimator and the
  train/eval specs, which are passed to tf.estimator.train_and_evaluate;
  afterwards an eval savedmodel is exported for TFMA.

  When exec_properties carries a 'custom_config' with 'cmle_training_args',
  the call is handed to cmle_runner.start_cmle_training instead and its
  result is returned (deprecated path).

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - transformed_examples: Transformed example.
      - transform_output: Input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(zhitaoli): Deprecate this in a future version.
  custom_config = exec_properties.get('custom_config', None)
  if custom_config:
    cmle_args = custom_config.get('cmle_training_args')
    if cmle_args:
      executor_class_path = '.'.join(
          (Executor.__module__, Executor.__name__))
      tf.logging.warn(
          'Passing \'cmle_training_args\' to trainer directly is deprecated, '
          'please use extension executor at '
          'tfx.extensions.google_cloud_ai_platform.trainer.executor instead')
      return cmle_runner.start_cmle_training(input_dict, output_dict,
                                             exec_properties,
                                             executor_class_path, cmle_args)

  trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                    'trainer_fn')

  # Set up training parameters
  train_uri = types.get_split_uri(input_dict['transformed_examples'],
                                  'train')
  train_files = [_all_files_pattern(train_uri)]
  transform_output = types.get_single_uri(input_dict['transform_output'])
  eval_uri = types.get_split_uri(input_dict['transformed_examples'], 'eval')
  eval_files = [_all_files_pattern(eval_uri)]
  schema_dir = types.get_single_uri(input_dict['schema'])
  schema_file = io_utils.get_only_uri_in_dir(schema_dir)

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = types.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # Assemble warm start path if needed. It is only used when the previous
  # serving model directory actually contains the checkpoint file.
  warm_start_from = None
  wants_warm_start = (
      exec_properties.get('warm_starting') and
      exec_properties.get('warm_start_from'))
  if wants_warm_start:
    previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                      path_utils.SERVING_MODEL_DIR)
    if previous_model_dir and tf.gfile.Exists(
        os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_from = previous_model_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = tf.contrib.training.HParams(
      # A list of uris for train files.
      train_files=train_files,
      # A single uri for transform graph produced by TFT.
      transform_output=transform_output,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A list of uris for eval files.
      eval_files=eval_files,
      # A single uri for schema file.
      schema_file=schema_file,
      # Number of train steps.
      train_steps=train_steps,
      # Number of eval steps.
      eval_steps=eval_steps,
      # A single uri for the model directory to warm start from.
      warm_start_from=warm_start_from)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(hparams, schema)
  estimator = training_spec['estimator']

  # Train the model
  tf.logging.info('Training model.')
  tf.estimator.train_and_evaluate(estimator, training_spec['train_spec'],
                                  training_spec['eval_spec'])
  tf.logging.info('Training complete.  Model written to %s',
                  serving_model_dir)

  # Export an eval savedmodel for TFMA
  tf.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

  tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow Transform executor entrypoint.

  This implements BaseExecutor.Do() and is invoked by orchestration systems.
  This is not intended for manual usage or further customization. Please use
  the Transform() function which takes an input format with no artifact
  dependency.

  Any existing content at the three output locations is deleted before
  Transform() runs, and the temp directory inside the transform output is
  removed after it returns.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - input_data: A list of 'ExamplesPath' type which should contain two
        splits 'train' and 'eval'.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - transform_output: Output of 'tf.Transform', which includes an
        exported Tensorflow graph suitable for both training and serving;
      - transformed_examples: Materialized transformed examples, which
        includes both 'train' and 'eval' splits.
    exec_properties: A dict of execution properties, including:
      - module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Inputs.
  train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
  eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
  schema_dir = types.get_single_uri(input_dict['schema'])
  schema_file = io_utils.get_only_uri_in_dir(schema_dir)

  # Outputs: each location is resolved and then cleared if it already
  # exists, one after the other.
  transform_output = types.get_single_uri(output_dict['transform_output'])
  if tf.gfile.Exists(transform_output):
    io_utils.delete_dir(transform_output)

  transformed_train_output = types.get_split_uri(
      output_dict['transformed_examples'], 'train')
  if tf.gfile.Exists(transformed_train_output):
    io_utils.delete_dir(transformed_train_output)

  transformed_eval_output = types.get_split_uri(
      output_dict['transformed_examples'], 'eval')
  if tf.gfile.Exists(transformed_eval_output):
    io_utils.delete_dir(transformed_eval_output)

  temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
  tf.logging.debug('Using temp path %s for tft.beam', temp_path)

  # The train split is analyzed and transformed; the eval split is only
  # transformed.
  analyze_and_transform_paths = io_utils.all_files_pattern(train_data_uri)
  transform_only_paths = io_utils.all_files_pattern(eval_data_uri)
  label_inputs = {
      labels.COMPUTE_STATISTICS_LABEL: False,
      labels.SCHEMA_PATH_LABEL: schema_file,
      labels.EXAMPLES_DATA_FORMAT_LABEL: labels.FORMAT_TF_EXAMPLE,
      labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
          analyze_and_transform_paths,
      labels.TRANSFORM_ONLY_DATA_PATHS_LABEL: transform_only_paths,
      labels.TFT_STATISTICS_USE_TFDV_LABEL: True,
      labels.PREPROCESSING_FN: exec_properties['module_file'],
  }

  materialize_output_paths = [
      os.path.join(transformed_train_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
  ]
  label_outputs = {
      labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
      labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL:
          materialize_output_paths,
      labels.TEMP_OUTPUT_LABEL: str(temp_path),
  }

  status_file = 'status_file'  # Unused
  self.Transform(label_inputs, label_outputs, status_file)
  tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
  io_utils.delete_dir(temp_path)
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow Transform executor entrypoint.

  This implements BaseExecutor.Do() and is invoked by orchestration systems.
  This is not intended for manual usage or further customization. Please use
  the Transform() function which takes an input format with no artifact
  dependency.

  Existing content at the output locations is deleted first; the temp
  directory created inside the transform output is deleted once Transform()
  has returned.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - input_data: A list of 'ExamplesPath' type which should contain two
        splits 'train' and 'eval'.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - transform_output: Output of 'tf.Transform', which includes an
        exported Tensorflow graph suitable for both training and serving;
      - transformed_examples: Materialized transformed examples, which
        includes both 'train' and 'eval' splits.
    exec_properties: A dict of execution properties, including:
      - module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  input_data = input_dict['input_data']
  train_data_uri = types.get_split_uri(input_data, 'train')
  eval_data_uri = types.get_split_uri(input_data, 'eval')
  schema_file = io_utils.get_only_uri_in_dir(
      types.get_single_uri(input_dict['schema']))

  # Resolve each output location and clear it if something is already
  # there. Resolution and deletion stay interleaved per location.
  transform_output = types.get_single_uri(output_dict['transform_output'])
  if tf.gfile.Exists(transform_output):
    io_utils.delete_dir(transform_output)

  transformed_train_output = types.get_split_uri(
      output_dict['transformed_examples'], 'train')
  if tf.gfile.Exists(transformed_train_output):
    io_utils.delete_dir(transformed_train_output)

  transformed_eval_output = types.get_split_uri(
      output_dict['transformed_examples'], 'eval')
  if tf.gfile.Exists(transformed_eval_output):
    io_utils.delete_dir(transformed_eval_output)

  temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
  tf.logging.debug('Using temp path %s for tft.beam', temp_path)

  # Build the label dictionaries key by key, in the same order as a literal
  # would evaluate them.
  label_inputs = {}
  label_inputs[labels.COMPUTE_STATISTICS_LABEL] = False
  label_inputs[labels.SCHEMA_PATH_LABEL] = schema_file
  label_inputs[labels.EXAMPLES_DATA_FORMAT_LABEL] = labels.FORMAT_TF_EXAMPLE
  # Train data is analyzed and transformed.
  label_inputs[labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL] = (
      io_utils.all_files_pattern(train_data_uri))
  # Eval data is transformed only.
  label_inputs[labels.TRANSFORM_ONLY_DATA_PATHS_LABEL] = (
      io_utils.all_files_pattern(eval_data_uri))
  label_inputs[labels.TFT_STATISTICS_USE_TFDV_LABEL] = True
  label_inputs[labels.PREPROCESSING_FN] = exec_properties['module_file']

  label_outputs = {}
  label_outputs[labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL] = (
      transform_output)
  label_outputs[labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL] = [
      os.path.join(transformed_train_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
  ]
  label_outputs[labels.TEMP_OUTPUT_LABEL] = str(temp_path)

  status_file = 'status_file'  # Unused
  self.Transform(label_inputs, label_outputs, status_file)
  tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
  io_utils.delete_dir(temp_path)
def Do(self, input_dict, output_dict, exec_properties):
  """Train a model from transformed examples and export it.

  When exec_properties carries a 'custom_config' containing
  'cmle_training_args', the whole job is handed to
  cmle_runner.start_cmle_training and nothing else in this method runs.
  Otherwise the user-supplied 'trainer_fn' is loaded from the module file,
  the estimator it returns is trained and evaluated, and an eval savedmodel
  is exported for TFMA.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - transformed_examples: Transformed examples; the 'train' and 'eval'
        splits are read.
      - transform_output: Input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.
      - custom_config: Optional. A dict; if it holds a truthy
        'cmle_training_args' entry, training is delegated to cmle_runner.

  Returns:
    None when training runs in this method. When training is delegated, the
    return value of cmle_runner.start_cmle_training.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(khaas): Move this to tfx/extensions.
  cmle_args = (exec_properties.get('custom_config') or {}).get(
      'cmle_training_args')
  if cmle_args:
    return cmle_runner.start_cmle_training(input_dict, output_dict,
                                           exec_properties, cmle_args)

  trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                    'trainer_fn')

  # Set up training parameters. Note that train_files is a one-element list
  # while eval_files is a single pattern.
  transformed_examples = input_dict['transformed_examples']
  train_uri = types.get_split_uri(transformed_examples, 'train')
  train_files = [_all_files_pattern(train_uri)]
  transform_output = types.get_single_uri(input_dict['transform_output'])
  eval_uri = types.get_split_uri(transformed_examples, 'eval')
  eval_files = _all_files_pattern(eval_uri)
  schema_dir = types.get_single_uri(input_dict['schema'])
  schema_file = io_utils.get_only_uri_in_dir(schema_dir)

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: an unset int field in the
  # proto reads back as 0 in python, and Tensorflow raises an error when
  # num_steps <= 0, so a zero step count is replaced with None here.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = types.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # Assemble warm start path if needed. It is only used when the previous
  # model's serving directory actually contains the checkpoint file.
  warm_start_from = None
  warm_start_root = exec_properties.get('warm_start_from')
  if exec_properties.get('warm_starting') and warm_start_root:
    candidate_dir = os.path.join(warm_start_root,
                                 path_utils.SERVING_MODEL_DIR)
    checkpoint_path = os.path.join(candidate_dir,
                                   self._CHECKPOINT_FILE_NAME)
    if candidate_dir and tf.gfile.Exists(checkpoint_path):
      warm_start_from = candidate_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = tf.contrib.training.HParams(
      train_files=train_files,
      transform_output=transform_output,
      output_dir=output_path,
      serving_model_dir=serving_model_dir,
      eval_files=eval_files,
      schema_file=schema_file,
      train_steps=train_steps,
      eval_steps=eval_steps,
      warm_start_from=warm_start_from)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  training_spec = trainer_fn(hparams, schema)

  # Train the model.
  tf.logging.info('Training model.')
  estimator = training_spec['estimator']
  tf.estimator.train_and_evaluate(estimator, training_spec['train_spec'],
                                  training_spec['eval_spec'])
  tf.logging.info('Training complete.  Model written to %s',
                  serving_model_dir)

  # Export an eval savedmodel for TFMA.
  tf.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
  tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
def Do(self, input_dict, output_dict, exec_properties):
  """Validate the current model against the last blessed model.

  The outcome is recorded twice on the blessing artifact: as an empty marker
  file named 'BLESSED' or 'NOT_BLESSED' under its uri, and as the int custom
  property 'blessed' (1 or 0). The uris and ids of the models involved are
  stored as custom properties as well.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: examples for eval the model; the 'eval' split is used.
      - model: current model for validation.
    output_dict: Output dict from output key to a list of Artifacts.
      - blessing: model blessing result.
      - results: model validation results; a '.temp' directory under its
        uri serves as the temporary path and is deleted at the end.
    exec_properties: A dict of execution properties.
      - blessed_model: last blessed model for validation.
      - blessed_model_id: last blessed model id.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(b/125451545): Provide a safe temp path from base executor instead.
  results_uri = types.get_single_uri(output_dict['results'])
  self._temp_path = os.path.join(results_uri, '.temp')
  tf.logging.info(
      'Using temp path {} for tft.beam'.format(self._temp_path))

  eval_examples_uri = types.get_split_uri(input_dict['examples'], 'eval')
  blessing = types.get_single_instance(output_dict['blessing'])

  # Record which model is being validated.
  current_model = types.get_single_instance(input_dict['model'])
  current_model_uri = current_model.uri
  tf.logging.info('Using {} as current model.'.format(current_model_uri))
  blessing.set_string_custom_property('current_model', current_model_uri)
  blessing.set_int_custom_property('current_model_id', current_model.id)

  # Record the previously blessed model, but only when one was supplied.
  blessed_model_dir = exec_properties['blessed_model']
  blessed_model_id = exec_properties['blessed_model_id']
  tf.logging.info('Using {} as blessed model.'.format(blessed_model_dir))
  if blessed_model_dir:
    blessing.set_string_custom_property('blessed_model', blessed_model_dir)
    blessing.set_int_custom_property('blessed_model_id', blessed_model_id)

  tf.logging.info('Validating model.')
  # TODO(b/125853306): support customized slice spec.
  blessed = self._generate_blessing_result(
      eval_examples_uri=eval_examples_uri,
      slice_spec=[tfma.slicer.slicer.SingleSliceSpec()],
      current_model_dir=current_model.uri,
      blessed_model_dir=blessed_model_dir)

  # Choose the marker file name and the property value together so the two
  # records of the outcome cannot disagree.
  marker_name, blessed_flag = ('BLESSED', 1) if blessed else ('NOT_BLESSED',
                                                              0)
  io_utils.write_string_file(os.path.join(blessing.uri, marker_name), '')
  blessing.set_int_custom_property('blessed', blessed_flag)
  tf.logging.info('Blessing result {} written to {}.'.format(
      blessed, blessing.uri))

  io_utils.delete_dir(self._temp_path)
  tf.logging.info('Cleaned up temp path {} on executor success.'.format(
      self._temp_path))