def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  # KerasTuner generates tuning state (e.g., oracle, trials) to working dir.
  working_dir = self._get_tmp_dir()

  train_path = artifact_utils.get_split_uri(input_dict['examples'], 'train')
  eval_path = artifact_utils.get_split_uri(input_dict['examples'], 'eval')
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict['schema']))
  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  tuner_fn = self._GetTunerFn(exec_properties)
  tuner_spec = tuner_fn(working_dir, io_utils.all_files_pattern(train_path),
                        io_utils.all_files_pattern(eval_path), schema)
  tuner = tuner_spec.tuner

  tuner.search_space_summary()
  # TODO(jyzhao): assert v2 behavior as KerasTuner doesn't work in v1.
  # TODO(jyzhao): make epochs configurable.
  tuner.search(
      tuner_spec.train_dataset,
      epochs=5,
      validation_data=tuner_spec.eval_dataset)
  tuner.results_summary()

  best_hparams = tuner.oracle.get_best_trials(
      1)[0].hyperparameters.get_config()
  best_hparams_path = os.path.join(
      artifact_utils.get_single_uri(output_dict['study_best_hparams_path']),
      _DEFAULT_FILE_NAME)
  io_utils.write_string_file(best_hparams_path, json.dumps(best_hparams))
  absl.logging.info('Best HParams is written to %s.' % best_hparams_path)
def get_common_fn_args(input_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text, Any],
                       working_dir: Text = None) -> FnArgs:
  """Get common args of training and tuning."""
  train_files = [
      io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                       'train'))
  ]
  eval_files = [
      io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                       'eval'))
  ]

  if input_dict.get(constants.TRANSFORM_GRAPH_KEY):
    transform_graph_path = artifact_utils.get_single_uri(
        input_dict[constants.TRANSFORM_GRAPH_KEY])
  else:
    transform_graph_path = None

  if input_dict.get(constants.SCHEMA_KEY):
    schema_path = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[constants.SCHEMA_KEY]))
  else:
    schema_path = None

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties[constants.TRAIN_ARGS_KEY], train_args)
  json_format.Parse(exec_properties[constants.EVAL_ARGS_KEY], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  # TODO(b/156929910): Refactor Trainer to be consistent with empty or None
  # custom_config handling.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null'))

  return FnArgs(
      working_dir=working_dir,
      train_files=train_files,
      eval_files=eval_files,
      train_steps=train_steps,
      eval_steps=eval_steps,
      schema_path=schema_path,
      transform_graph_path=transform_graph_path,
      custom_config=custom_config,
  )
def _generate_blessing_result(self, eval_examples_uri: Text,
                              slice_spec: List[tfma.slicer.SingleSliceSpec],
                              current_model_dir: Text,
                              blessed_model_dir: Text) -> bool:
  current_model_eval_result_path = os.path.join(
      self._temp_path, constants.CURRENT_MODEL_EVAL_RESULT_PATH)
  blessed_model_eval_result_path = os.path.join(
      self._temp_path, constants.BLESSED_MODEL_EVAL_RESULT_PATH)

  with self._make_beam_pipeline() as pipeline:
    eval_data = (
        pipeline
        | 'ReadData' >> beam.io.ReadFromTFRecord(
            file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

    current_model = tfma.default_eval_shared_model(
        eval_saved_model_path=path_utils.eval_model_path(current_model_dir))
    (eval_data
     | 'EvalCurrentModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
         eval_shared_model=current_model,
         slice_spec=slice_spec,
         output_path=current_model_eval_result_path))

    if blessed_model_dir is not None:
      blessed_model = tfma.default_eval_shared_model(
          eval_saved_model_path=path_utils.eval_model_path(blessed_model_dir))
      (eval_data
       | 'EvalBlessedModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
           eval_shared_model=blessed_model,
           slice_spec=slice_spec,
           output_path=blessed_model_eval_result_path))

  absl.logging.info('all files in current_model_eval_result_path: [%s]',
                    str(tf.io.gfile.listdir(current_model_eval_result_path)))
  current_model_eval_result = tfma.load_eval_result(
      output_path=current_model_eval_result_path)

  if not self._pass_threshold(current_model_eval_result):
    absl.logging.info('Current model does not pass threshold.')
    return False
  absl.logging.info('Current model passes threshold.')

  if blessed_model_dir is None:
    absl.logging.info('No blessed model yet.')
    return True

  absl.logging.info('all files in blessed_model_eval_result: [%s]',
                    str(tf.io.gfile.listdir(blessed_model_eval_result_path)))
  blessed_model_eval_result = tfma.load_eval_result(
      output_path=blessed_model_eval_result_path)

  if (self._compare_eval_result(current_model_eval_result,
                                blessed_model_eval_result)):
    absl.logging.info('Current model better than blessed model.')
    return True
  else:
    absl.logging.info('Current model worse than blessed model.')
    return False
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for evaluating the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data.

  Returns:
    None
  """
  if 'model_exports' not in input_dict:
    raise ValueError('\'model_exports\' is missing in input dict.')
  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'output' not in output_dict:
    raise ValueError('\'output\' is missing in output dict.')

  self._log_startup(input_dict, output_dict, exec_properties)

  # Extract input artifacts
  model_exports_uri = artifact_utils.get_single_uri(
      input_dict['model_exports'])

  feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
  json_format.Parse(exec_properties['feature_slicing_spec'],
                    feature_slicing_spec)
  slice_spec = self._get_slice_spec_from_feature_slicing_spec(
      feature_slicing_spec)

  output_uri = artifact_utils.get_single_uri(output_dict['output'])

  eval_model_path = path_utils.eval_model_path(model_exports_uri)

  tf.logging.info('Using {} for model eval.'.format(eval_model_path))
  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_path)

  tf.logging.info('Evaluating model.')
  with self._make_beam_pipeline() as pipeline:
    # pylint: disable=expression-not-assigned
    (pipeline
     | 'ReadData' >> beam.io.ReadFromTFRecord(
         file_pattern=io_utils.all_files_pattern(
             artifact_utils.get_split_uri(input_dict['examples'], 'eval')))
     | 'ExtractEvaluateAndWriteResults' >>
     tfma.ExtractEvaluateAndWriteResults(
         eval_shared_model=eval_shared_model,
         slice_spec=slice_spec,
         output_path=output_uri))
  tf.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))
def Do(self, input_dict, output_dict, exec_properties):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for evaluating the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data.

  Returns:
    None
  """
  if 'model_exports' not in input_dict:
    raise ValueError('\'model_exports\' is missing in input dict.')
  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'output' not in output_dict:
    raise ValueError('\'output\' is missing in output dict.')

  self._log_startup(input_dict, output_dict, exec_properties)

  # Extract input artifacts
  model_exports_uri = types.get_single_uri(input_dict['model_exports'])

  feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
  json_format.Parse(exec_properties['feature_slicing_spec'],
                    feature_slicing_spec)
  slice_spec = self._get_slice_spec_from_feature_slicing_spec(
      feature_slicing_spec)

  output_uri = types.get_single_uri(output_dict['output'])

  eval_model_path = path_utils.eval_model_path(model_exports_uri)

  tf.logging.info('Using {} for model eval.'.format(eval_model_path))
  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_path)

  tf.logging.info('Evaluating model.')
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    # pylint: disable=expression-not-assigned
    (pipeline
     | 'ReadData' >> beam.io.ReadFromTFRecord(
         file_pattern=io_utils.all_files_pattern(
             types.get_split_uri(input_dict['examples'], 'eval')))
     | 'ExtractEvaluateAndWriteResults' >>
     tfma.ExtractEvaluateAndWriteResults(
         eval_shared_model=eval_shared_model,
         slice_spec=slice_spec,
         output_path=output_uri))
  tf.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))
def _run_model_inference(
    self, data_spec: bulk_inferrer_pb2.DataSpec,
    examples: List[types.Artifact], output_uri: Text,
    inference_endpoint: model_spec_pb2.InferenceSpecType) -> bool:
  """Runs model inference on given example data.

  Args:
    data_spec: bulk_inferrer_pb2.DataSpec instance.
    examples: List of example artifacts.
    output_uri: Output artifact uri.
    inference_endpoint: Model inference endpoint.

  Returns:
    Whether the inference job succeeded.
  """
  example_uris = {}
  if data_spec.example_splits:
    for example in examples:
      for split in artifact_utils.decode_split_names(example.split_names):
        if split in data_spec.example_splits:
          example_uris[split] = os.path.join(example.uri, split)
  else:
    for example in examples:
      for split in artifact_utils.decode_split_names(example.split_names):
        example_uris[split] = os.path.join(example.uri, split)

  output_path = os.path.join(output_uri, _PREDICTION_LOGS_DIR_NAME)
  logging.info('BulkInferrer generates prediction log to %s', output_path)

  with self._make_beam_pipeline() as pipeline:
    data_list = []
    for split, example_uri in example_uris.items():
      data = (
          pipeline
          | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
              file_pattern=io_utils.all_files_pattern(example_uri)))
      data_list.append(data)
    _ = (
        data_list
        | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
        # TODO(b/131873699): Use the correct Example type here, which
        # is either Example or SequenceExample.
        | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
        | 'RunInference' >> run_inference.RunInference(inference_endpoint)
        | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
            output_path,
            file_name_suffix='.gz',
            coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))
  logging.info('Inference result written to %s.', output_path)
def _run_model_inference(self, model_path: Text,
                         example_uris: Mapping[Text, Text],
                         output_path: Text,
                         model_spec: bulk_inferrer_pb2.ModelSpec) -> None:
  """Runs model inference on given example data.

  Args:
    model_path: Path to model.
    example_uris: Mapping of example split name to example uri.
    output_path: Path to output generated prediction logs.
    model_spec: bulk_inferrer_pb2.ModelSpec instance.

  Returns:
    None
  """
  try:
    from tfx_bsl.public.beam import run_inference
    from tfx_bsl.public.proto import model_spec_pb2
  except ImportError:
    # TODO(b/151468119): Remove this branch after next release.
    run_inference = importlib.import_module('tfx_bsl.beam.run_inference')
    model_spec_pb2 = importlib.import_module('tfx_bsl.proto.model_spec_pb2')
  saved_model_spec = model_spec_pb2.SavedModelSpec(
      model_path=model_path,
      tag=model_spec.tag,
      signature_name=model_spec.model_signature_name)
  # TODO(b/151468119): Remove this branch after next release.
  if getattr(model_spec_pb2, 'InferenceEndpoint', False):
    inference_endpoint = getattr(model_spec_pb2, 'InferenceEndpoint')()
  else:
    inference_endpoint = model_spec_pb2.InferenceSpecType()
  inference_endpoint.saved_model_spec.CopyFrom(saved_model_spec)
  with self._make_beam_pipeline() as pipeline:
    data_list = []
    for split, example_uri in example_uris.items():
      data = (
          pipeline
          | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
              file_pattern=io_utils.all_files_pattern(example_uri)))
      data_list.append(data)
    _ = ([data for data in data_list]
         | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
         | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
         | 'RunInference' >> run_inference.RunInference(inference_endpoint)
         | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
             output_path,
             file_name_suffix='.gz',
             coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))
  logging.info('Inference result written to %s.', output_path)
def _generate_blessing_result(self, eval_examples_uri, slice_spec,
                              current_model_dir, blessed_model_dir):
  current_model_eval_result_path = os.path.join(
      self._temp_path, CURRENT_MODEL_EVAL_RESULT_PATH)
  blessed_model_eval_result_path = os.path.join(
      self._temp_path, BLESSED_MODEL_EVAL_RESULT_PATH)

  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
    eval_data = (
        pipeline
        | 'ReadData' >> beam.io.ReadFromTFRecord(
            file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

    current_model = tfma.default_eval_shared_model(
        eval_saved_model_path=path_utils.eval_model_path(current_model_dir))
    (eval_data
     | 'EvalCurrentModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
         eval_shared_model=current_model,
         slice_spec=slice_spec,
         output_path=current_model_eval_result_path))

    if blessed_model_dir is not None:
      blessed_model = tfma.default_eval_shared_model(
          eval_saved_model_path=path_utils.eval_model_path(blessed_model_dir))
      (eval_data
       | 'EvalBlessedModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
           eval_shared_model=blessed_model,
           slice_spec=slice_spec,
           output_path=blessed_model_eval_result_path))

  current_model_eval_result = tfma.load_eval_result(
      output_path=current_model_eval_result_path)

  if not self._pass_threshold(current_model_eval_result):
    tf.logging.info('Current model does not pass threshold.')
    return False
  tf.logging.info('Current model passes threshold.')

  if blessed_model_dir is None:
    tf.logging.info('No blessed model yet.')
    return True

  blessed_model_eval_result = tfma.load_eval_result(
      output_path=blessed_model_eval_result_path)

  if (self._compare_eval_result(current_model_eval_result,
                                blessed_model_eval_result)):
    tf.logging.info('Current model better than blessed model.')
    return True
  else:
    tf.logging.info('Current model worse than blessed model.')
    return False
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  source = exec_properties[StepKeys.SOURCE]
  args = exec_properties[StepKeys.ARGS]

  c = source_utils.load_source_path_class(source)
  tokenizer_step: BaseTokenizer = c(**args)

  tokenizer_location = artifact_utils.get_single_uri(
      output_dict["tokenizer"])

  split_uris, split_names, all_files = [], [], []
  for artifact in input_dict["examples"]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      split_names.append(split)
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
      all_files += path_utils.list_dir(uri)

  # Get output split path
  output_examples = artifact_utils.get_single_instance(
      output_dict["output_examples"])
  output_examples.split_names = artifact_utils.encode_split_names(
      split_names)

  if not tokenizer_step.skip_training:
    tokenizer_step.train(files=all_files)

  tokenizer_step.save(output_dir=tokenizer_location)

  with self._make_beam_pipeline() as p:
    for split, uri in split_uris:
      input_uri = io_utils.all_files_pattern(uri)

      _ = (p
           | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
               file_pattern=input_uri)
           | "ParseTFExFromString." + split >> beam.Map(
               tf.train.Example.FromString)
           | "AddTokens." + split >> beam.Map(
               append_tf_example, tokenizer_step=tokenizer_step)
           | 'Serialize.' + split >> beam.Map(lambda x: x.SerializeToString())
           | 'WriteSplit.' + split >> WriteSplit(
               get_split_uri(output_dict["output_examples"], split)))
def _run_sampling(self, example_uris: Mapping[Text, Text], to_key_fn: Text,
                  output_artifact: Artifact, samples_per_key: int) -> None:
  """Runs stratified sampling on given example data.

  Args:
    example_uris: Mapping of example split name to example uri.
    to_key_fn: function to convert an example to a key
    output_artifact: Output artifact.
    samples_per_key: number of examples to keep per value of the key.

  Returns:
    None
  """
  d = {}
  exec(to_key_fn, globals(), d)  # how ugly is that?
  to_key = d['to_key']

  def to_keyed_value(m):
    return to_key(m), m

  with self._make_beam_pipeline() as pipeline:
    for split_name, example_uri in example_uris.items():
      data_list = [
          (pipeline
           | 'ReadData[{}]'.format(split_name) >> beam.io.ReadFromTFRecord(
               file_pattern=io_utils.all_files_pattern(example_uri)))
      ]
      dest_path = os.path.join(
          artifact_utils.get_split_uri([output_artifact], split_name),
          _STRATIFIED_EXAMPLES_FILE_PREFIX)

      _ = ([data for data in data_list]
           | 'FlattenExamples ({})'.format(split_name) >>
           beam.Flatten(pipeline=pipeline)
           | 'ParseExamples ({})'.format(split_name) >> beam.Map(
               tf.train.Example.FromString)
           | 'Key ({})'.format(split_name) >> beam.Map(to_keyed_value)
           | 'Sample per key ({})'.format(split_name) >>
           beam.combiners.Sample.FixedSizePerKey(samples_per_key)
           | 'Values ({})'.format(split_name) >> beam.Values()
           | 'Flatten lists ({})'.format(split_name) >>
           beam.FlatMap(lambda elements: elements)
           | 'WriteStratifiedSamples ({})'.format(split_name) >>
           beam.io.WriteToTFRecord(
               dest_path,
               file_name_suffix='.gz',
               coder=beam.coders.ProtoCoder(tf.train.Example)))
      logging.info('Sampling result written to %s.', dest_path)
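The `to_key_fn` string above is executed as Python source and must define a function named `to_key` that maps a `tf.train.Example` to its stratification key; a minimal, hypothetical example (the feature name is made up for illustration):

to_key_fn = """
def to_key(example):
  # Bucket each example by a (hypothetical) float feature, in 10-unit steps.
  return int(example.features.feature['trip_miles'].float_list.value[0] // 10)
"""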
def _RunInference(
    pipeline: beam.Pipeline, example_uri: Text,
    inference_endpoint: model_spec_pb2.InferenceSpecType
) -> beam.pvalue.PCollection:
  """Runs model inference on given examples data."""
  # TODO(b/174703893): adopt standardized input.
  return (
      pipeline
      | 'ReadData' >> beam.io.ReadFromTFRecord(
          file_pattern=io_utils.all_files_pattern(example_uri))
      # TODO(b/131873699): Use the correct Example type here, which
      # is either Example or SequenceExample.
      | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
      | 'RunInference' >> run_inference.RunInference(inference_endpoint))
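A hedged usage sketch for `_RunInference` above: the model path, example uri and output prefix are placeholders, and the write step simply mirrors the WritePredictionLogs pattern used by the bulk inferrer snippets in this file.

# Build an InferenceSpecType pointing at a saved model (path is hypothetical).
saved_model_spec = model_spec_pb2.SavedModelSpec(
    model_path='/tmp/serving_model')
inference_endpoint = model_spec_pb2.InferenceSpecType(
    saved_model_spec=saved_model_spec)

with beam.Pipeline() as pipeline:
  _ = (
      _RunInference(pipeline, '/tmp/examples/eval', inference_endpoint)
      | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
          '/tmp/prediction_logs',  # hypothetical output prefix
          file_name_suffix='.gz',
          coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))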
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  split_uris = []
  for artifact in input_dict['input_data']:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      output_uri = artifact_utils.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  self._log_startup(input_dict, output_dict, exec_properties)

  example_uris = {}
  for example in input_dict['examples']:
    for split in artifact_utils.decode_split_names(example.split_names):
      example_uris[split] = os.path.join(example.uri, split)

  model = artifact_utils.get_single_instance(input_dict['model'])
  model_path = path_utils.serving_model_path(model.uri)
  absl.logging.info('Using {} as current model.'.format(model_path))

  output_uri = os.path.join(
      artifact_utils.get_single_uri(output_dict['output_data']), 'pred.csv')

  with self._make_beam_pipeline() as pipeline:
    test_data = []
    for split, example_uri in example_uris.items():
      test_data.append(
          pipeline
          | 'ReadFromTFRecord_{}'.format(split) >> beam.io.ReadFromTFRecord(
              file_pattern=io_utils.all_files_pattern(example_uri)))

    (test_data
     | 'Flattern' >> beam.Flatten()
     | 'ParseToExample' >> beam.Map(tf.train.Example.FromString)
     | 'Prediction' >> beam.ParDo(
         RunModel(model_path, 'serving_default', 'PassengerId'))
     | 'ParseToKVPair' >> beam.Map(lambda x: ParseResultToKV(x))
     | 'AddSameKey' >> beam.Map(lambda x: (1, x))
     | 'Window' >> beam.WindowInto(beam.window.GlobalWindows())
     | 'GroupByKey' >> beam.GroupByKey()
     | 'Sort' >> beam.Map(
         lambda group_data: sorted(group_data[1], key=lambda x: x[0]))
     | 'Flatten' >> beam.FlatMap(lambda x: x)
     | 'ToStr' >> beam.Map(
         lambda x: '{},{}'.format(x[0], '0' if x[1] < 0.5 else '1'))
     | 'WriteToFile' >> beam.io.WriteToText(
         output_uri,
         num_shards=1,
         shard_name_template='',
         header='PassengerId,Survived'))
  absl.logging.info('TestPredComponent result written to %s.', output_uri)
def Do(self, input_dict: Dict[Text, List[types.TfxType]],
       output_dict: Dict[Text, List[types.TfxType]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of 'ExamplesPath' type. This should contain both
        'train' and 'eval' split.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of 'ExampleStatisticsPath' type. This should contain
        both 'train' and 'eval' split.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  split_to_instance = {x.split: x for x in input_dict['input_data']}
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, instance in split_to_instance.items():
      tf.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(instance.uri)
      output_uri = types.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      tf.logging.info('Statistics written to {}.'.format(output_uri))
def _run_model_inference(self, model_path: Text,
                         example_uris: Mapping[Text, Text],
                         output_path: Text,
                         model_spec: bulk_inferrer_pb2.ModelSpec) -> None:
  """Runs model inference on given example data.

  Args:
    model_path: Path to model.
    example_uris: Mapping of example split name to example uri.
    output_path: Path to output generated prediction logs.
    model_spec: bulk_inferrer_pb2.ModelSpec instance.

  Returns:
    None
  """
  saved_model_spec = model_spec_pb2.SavedModelSpec(
      model_path=model_path,
      tag=model_spec.tag,
      signature_name=model_spec.model_signature_name)
  inference_endpoint = model_spec_pb2.InferenceEndpoint()
  inference_endpoint.saved_model_spec.CopyFrom(saved_model_spec)
  with self._make_beam_pipeline() as pipeline:
    data_list = []
    for split, example_uri in example_uris.items():
      data = (
          pipeline
          | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
              file_pattern=io_utils.all_files_pattern(example_uri)))
      data_list.append(data)
    _ = ([data for data in data_list]
         | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
         | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
         | 'RunInference' >> run_inference.RunInference(inference_endpoint)
         | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
             output_path,
             file_name_suffix='.gz',
             coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))
  logging.info('Inference result written to %s.', output_path)
def Do(self, input_dict, output_dict, exec_properties):
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of 'ExamplesPath' type. This should contain both
        'train' and 'eval' split.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of 'ExampleStatisticsPath' type. This should contain
        both 'train' and 'eval' split.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  split_to_instance = {x.split: x for x in input_dict['input_data']}
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, instance in split_to_instance.items():
      tf.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(instance.uri)
      output_uri = types.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      tf.logging.info('Statistics written to {}.'.format(output_uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  self._log_startup(input_dict, output_dict, exec_properties)
  absl.logging.info('Hello Component - Executor - Do Start')

  assert (len(input_dict['input_data']) == 1)
  for artifact in input_dict['input_data']:
    input_dir = artifact.uri
    output_dir = artifact_utils.get_single_uri(output_dict['output_data'])

    input_uri = io_utils.all_files_pattern(input_dir)
    output_uri = os.path.join(output_dir, 'result.csv')

    with self._make_beam_pipeline() as p:
      intrim = p | 'ReadData' >> beam.io.ReadFromTFRecord(
          file_pattern=input_uri,
          coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog))
      intrim = intrim | 'Process' >> beam.Map(process_item)
      intrim = intrim | 'SameKey' >> beam.Map(lambda it: (0, it))
      intrim = intrim | 'SameWindow' >> beam.WindowInto(
          beam.window.GlobalWindows())
      intrim = intrim | 'GroupAll' >> GroupByKey()
      intrim = intrim | 'RemoveDummyKey' >> beam.Map(lambda item: item[1])
      intrim = intrim | 'SortAll' >> beam.Map(sort_data)
      intrim = intrim | 'InMemorySink' >> beam.Map(
          lambda item: write_data(item, output_uri))
      # intrim | 'Sink' >> beam.io.WriteToText(file_path_prefix=output_uri,
      #                                        file_name_suffix='.csv',
      #                                        num_shards=1,
      #                                        # CompressionTypes.UNCOMPRESSED,
      #                                        header='ID_code,target')

  absl.logging.info('Hello Component - Executor - Do End')
from tensorflow_serving.apis import prediction_log_pb2
import apache_beam as beam
import tensorflow as tf
from tfx.utils import io_utils  # needed for all_files_pattern below


def print_item(item, file):
  example_bytes = item.predict_log.request.inputs[
      'input_example_tensor'].string_val[0]
  # parsed = tf.train.Example.FromString(example_bytes)
  # parsed is tf.Example (list of feature)
  features = {'ID_code': tf.io.FixedLenFeature((), tf.string)}
  parsed = tf.io.parse_single_example(example_bytes, features=features)
  # parsed['ID_code'] is a Tensor with a string value; .numpy() gets the
  # value, e.g. b'id1'
  id_string = parsed['ID_code'].numpy().decode()
  output = item.predict_log.response.outputs['output_0'].float_val[0]
  file.write('{0},{1}\n'.format(id_string, 1 if output >= 0.5 else 0))


input_dir = '/var/tmp/santander/keras-tft/HelloComponent.HelloWorld/output_data/10'
input_uri = io_utils.all_files_pattern(input_dir)

with tf.io.gfile.GFile('/var/tmp/output.csv', 'w') as file:
  file.write('ID_code,target\n')
  p = beam.Pipeline()
  out = p | 'ReadExamples' >> beam.io.ReadFromTFRecord(
      file_pattern=input_uri,
      coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog))
  out = out | 'Print' >> beam.Map(lambda item: print_item(item, file))
  result = p.run()
  # Make sure the pipeline has finished before the file is closed.
  result.wait_until_finish()

print('done')
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  if STATS_OPTIONS_JSON_KEY in exec_properties:
    stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  split_uris = []
  for artifact in input_dict[EXAMPLES_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      tfxio_kwargs = {'file_pattern': input_uri}
      # TODO(b/151624179): clean this up after tfx_bsl is released with the
      # below flag.
      if getattr(tfxio, 'TFXIO_HAS_TELEMETRY', False):
        tfxio_kwargs['telemetry_descriptors'] = _TELEMETRY_DESCRIPTORS
      input_tfxio = tf_example_record.TFExampleRecord(**tfxio_kwargs)
      output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      data = p | 'TFXIORead[{}]'.format(split) >> input_tfxio.BeamSource()
      # TODO(b/153368237): Clean this up after a release post tfx 0.21.
      if not getattr(tfdv, 'TFDV_ACCEPT_RECORD_BATCH', False):
        data |= 'RecordBatchToTable[{}]'.format(split) >> beam.Map(
            lambda rb: pa.Table.from_batches([rb]))
      _ = (
          data
          | 'GenerateStatistics[{}]'.format(split) >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[{}]'.format(split) >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def testAllFilesPattern(self):
  self.assertEqual('model*', io_utils.all_files_pattern('model'))
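For reference, an `all_files_pattern` implementation consistent with this test simply appends a glob wildcard to the given directory or file prefix; a minimal sketch (not necessarily the actual TFX source):

def all_files_pattern(file_dir: Text) -> Text:
  """Returns a glob pattern matching every file under `file_dir`."""
  return '{}*'.format(file_dir)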
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """TensorFlow Transform executor entrypoint.

  This implements BaseExecutor.Do() and is invoked by orchestration systems.
  This is not intended for manual usage or further customization. Please use
  the Transform() function which takes an input format with no artifact
  dependency.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - input_data: A list of 'ExamplesPath' type which should contain two
        splits 'train' and 'eval'.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - transform_output: Output of 'tf.Transform', which includes an exported
        Tensorflow graph suitable for both training and serving;
      - transformed_examples: Materialized transformed examples, which
        includes both 'train' and 'eval' splits.
    exec_properties: A dict of execution properties, including:
      - module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  train_data_uri = artifact_utils.get_split_uri(input_dict['input_data'],
                                                'train')
  eval_data_uri = artifact_utils.get_split_uri(input_dict['input_data'],
                                               'eval')
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict['schema']))
  transform_output = artifact_utils.get_single_uri(
      output_dict['transform_output'])
  transformed_train_output = artifact_utils.get_split_uri(
      output_dict['transformed_examples'], 'train')
  transformed_eval_output = artifact_utils.get_split_uri(
      output_dict['transformed_examples'], 'eval')
  temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
  tf.logging.debug('Using temp path %s for tft.beam', temp_path)

  def _GetCachePath(label, params_dict):
    if label not in params_dict:
      return None
    else:
      return artifact_utils.get_single_uri(params_dict[label])

  label_inputs = {
      labels.COMPUTE_STATISTICS_LABEL:
          False,
      labels.SCHEMA_PATH_LABEL:
          schema_file,
      labels.EXAMPLES_DATA_FORMAT_LABEL:
          labels.FORMAT_TF_EXAMPLE,
      labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
          io_utils.all_files_pattern(train_data_uri),
      labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
          io_utils.all_files_pattern(eval_data_uri),
      labels.TFT_STATISTICS_USE_TFDV_LABEL:
          True,
      labels.PREPROCESSING_FN:
          exec_properties['module_file'],
  }
  cache_input = _GetCachePath('cache_input_path', input_dict)
  if cache_input is not None:
    label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

  label_outputs = {
      labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
      labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
          os.path.join(transformed_train_output,
                       _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
          os.path.join(transformed_eval_output,
                       _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      ],
      labels.TEMP_OUTPUT_LABEL: str(temp_path),
  }
  cache_output = _GetCachePath('cache_output_path', output_dict)
  if cache_output is not None:
    label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output
  status_file = 'status_file'  # Unused
  self.Transform(label_inputs, label_outputs, status_file)
  tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
  io_utils.delete_dir(temp_path)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Partitions input examples into output splits using a user-defined split step.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including
      the input examples plus their schema and statistics.
    output_dict: Output dict from output key to a list of artifacts, including
      the output examples annotated with the new split names.
    exec_properties: A dict of execution properties, including the source and
      arguments of the split step to load.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  schema = parse_schema(input_dict=input_dict)

  statistics = parse_statistics(
      split_name=DATA_SPLIT_NAME,
      statistics=input_dict[constants.STATISTICS])

  source = exec_properties[StepKeys.SOURCE]
  args = exec_properties[StepKeys.ARGS]

  # pass the schema and stats straight to the Step
  args[constants.SCHEMA] = schema
  args[constants.STATISTICS] = statistics

  c = source_utils.load_source_path_class(source)
  split_step: BaseSplit = c(**args)

  # infer the names of the splits from the config
  split_names = split_step.get_split_names()

  # Get output split path
  examples_artifact = artifact_utils.get_single_instance(
      output_dict[constants.OUTPUT_EXAMPLES])
  if SKIP in split_names:
    sanitized_names = [name for name in split_names if name != SKIP]
    examples_artifact.split_names = artifact_utils.encode_split_names(
        sanitized_names)
  else:
    examples_artifact.split_names = artifact_utils.encode_split_names(
        split_names)

  split_uris = []
  for artifact in input_dict[constants.INPUT_EXAMPLES]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))

  with self._make_beam_pipeline() as p:
    # The outer loop will for now only run once
    for split, uri in split_uris:
      input_uri = io_utils.all_files_pattern(uri)

      new_splits = (
          p
          | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
              file_pattern=input_uri)
          | beam.Map(tf.train.Example.FromString)
          | 'Split' >> beam.Partition(split_step.partition_fn()[0],
                                      split_step.get_num_splits(),
                                      **split_step.partition_fn()[1]))

      for split_name, new_split in zip(split_names, list(new_splits)):
        if split_name != SKIP:
          # WriteSplit function writes to TFRecord again
          (new_split
           | 'Serialize.' + split_name >> beam.Map(
               lambda x: x.SerializeToString())
           | 'WriteSplit_' + split_name >> WriteSplit(
               get_split_uri(output_dict[constants.OUTPUT_EXAMPLES],
                             split_name)))
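The split step loaded above is only exercised through `get_split_names()`, `get_num_splits()` and `partition_fn()`; a minimal, hypothetical `BaseSplit` subclass satisfying that contract (the class name, hashing logic and `eval_fraction` parameter are illustrative, not part of the original code) might look like:

class RandomHashSplit(BaseSplit):  # hypothetical subclass for illustration
  """Routes examples into 'train' and 'eval' partitions by a stable hash."""

  def get_split_names(self):
    return ['train', 'eval']

  def get_num_splits(self):
    return 2

  def partition_fn(self):
    def _partition(example, num_partitions, eval_fraction):
      # beam.Partition calls this as fn(element, num_partitions, **kwargs).
      bucket = hash(example.SerializeToString()) % 100
      return 1 if bucket < eval_fraction * 100 else 0
    return _partition, {'eval_fraction': 0.25}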
def _run_model_inference(
    self,
    data_spec: bulk_inferrer_pb2.DataSpec,
    output_example_spec: bulk_inferrer_pb2.OutputExampleSpec,
    examples: List[types.Artifact],
    output_examples: Optional[types.Artifact],
    inference_result: Optional[types.Artifact],
    inference_endpoint: model_spec_pb2.InferenceSpecType,
) -> None:
  """Runs model inference on given examples data.

  Args:
    data_spec: bulk_inferrer_pb2.DataSpec instance.
    output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
    examples: List of `standard_artifacts.Examples` artifacts.
    output_examples: Optional output `standard_artifacts.Examples` artifact.
    inference_result: Optional output `standard_artifacts.InferenceResult`
      artifact.
    inference_endpoint: Model inference endpoint.
  """
  example_uris = {}
  for example_artifact in examples:
    for split in artifact_utils.decode_split_names(
        example_artifact.split_names):
      if data_spec.example_splits:
        if split in data_spec.example_splits:
          example_uris[split] = artifact_utils.get_split_uri(
              [example_artifact], split)
      else:
        example_uris[split] = artifact_utils.get_split_uri(
            [example_artifact], split)

  payload_format, _ = tfxio_utils.resolve_payload_format_and_data_view_uri(
      examples)

  tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
      examples,
      _TELEMETRY_DESCRIPTORS,
      schema=None,
      read_as_raw_records=True,
      # We have to specify this parameter in order to create a RawRecord TFXIO
      # but we won't use the RecordBatches so the column name of the raw
      # records does not matter.
      raw_record_column_name='unused')

  if output_examples:
    output_examples.split_names = artifact_utils.encode_split_names(
        sorted(example_uris.keys()))

  with self._make_beam_pipeline() as pipeline:
    data_list = []
    for split, example_uri in example_uris.items():
      tfxio = tfxio_factory([io_utils.all_files_pattern(example_uri)])
      assert isinstance(tfxio, record_based_tfxio.RecordBasedTFXIO), (
          'Unable to use TFXIO {} as it does not support reading raw records.'
          .format(type(tfxio)))
      # pylint: disable=no-value-for-parameter
      data = (
          pipeline
          | 'ReadData[{}]'.format(split) >> tfxio.RawRecordBeamSource()
          | 'RunInference[{}]'.format(split) >> _RunInference(
              payload_format, inference_endpoint))
      if output_examples:
        output_examples_split_uri = artifact_utils.get_split_uri(
            [output_examples], split)
        logging.info('Path of output examples split `%s` is %s.', split,
                     output_examples_split_uri)
        _ = (
            data
            | 'WriteExamples[{}]'.format(split) >> _WriteExamples(
                output_example_spec, output_examples_split_uri))
      # pylint: enable=no-value-for-parameter

      data_list.append(data)

    if inference_result:
      _ = (
          data_list
          | 'FlattenInferenceResult' >> beam.Flatten(pipeline=pipeline)
          | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
              os.path.join(inference_result.uri, _PREDICTION_LOGS_FILE_NAME),
              file_name_suffix='.gz',
              coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))

  if output_examples:
    logging.info('Output examples written to %s.', output_examples.uri)
  if inference_result:
    logging.info('Inference result written to %s.', inference_result.uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for evaluating the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - eval_config: JSON string of tfma.EvalConfig.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data. Deprecated, use
        eval_config.slicing_specs instead.

  Returns:
    None
  """
  if constants.EXAMPLES_KEY not in input_dict:
    raise ValueError('EXAMPLES_KEY is missing from input dict.')
  if constants.MODEL_KEY not in input_dict:
    raise ValueError('MODEL_KEY is missing from input dict.')
  if constants.EVALUATION_KEY not in output_dict:
    raise ValueError('EVALUATION_KEY is missing from output dict.')
  if len(input_dict[constants.MODEL_KEY]) > 1:
    raise ValueError('There can be only one candidate model, there are {}.'
                     .format(len(input_dict[constants.MODEL_KEY])))
  if constants.BASELINE_MODEL_KEY in input_dict and len(
      input_dict[constants.BASELINE_MODEL_KEY]) > 1:
    raise ValueError('There can be only one baseline model, there are {}.'
                     .format(len(input_dict[constants.BASELINE_MODEL_KEY])))

  self._log_startup(input_dict, output_dict, exec_properties)

  # Add fairness indicator metric callback if necessary.
  fairness_indicator_thresholds = exec_properties.get(
      'fairness_indicator_thresholds', None)
  add_metrics_callbacks = None
  if fairness_indicator_thresholds:
    # Need to import the following module so that the fairness indicator
    # post-export metric is registered.
    import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
    add_metrics_callbacks = [
        tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
            thresholds=fairness_indicator_thresholds),
    ]

  def _get_eval_saved_model(artifact: List[types.Artifact],
                            tags=None) -> tfma.EvalSharedModel:
    model_uri = artifact_utils.get_single_uri(artifact)
    if tags and tf.saved_model.SERVING in tags:
      model_path = path_utils.serving_model_path(model_uri)
    else:
      model_path = path_utils.eval_model_path(model_uri)
    return tfma.default_eval_shared_model(
        eval_saved_model_path=model_path,
        tags=tags,
        add_metrics_callbacks=add_metrics_callbacks)

  output_uri = artifact_utils.get_single_uri(
      output_dict[constants.EVALUATION_KEY])

  run_validation = False
  if 'eval_config' in exec_properties and exec_properties['eval_config']:
    slice_spec = None
    eval_config = tfma.EvalConfig()
    json_format.Parse(exec_properties['eval_config'], eval_config)
    # Do not validate model when there is no thresholds configured. This is to
    # avoid accidentally blessing models when users forget to set thresholds.
    for metrics_spec in eval_config.metrics_specs:
      if (metrics_spec.thresholds or any(
          metric.HasField('threshold') for metric in metrics_spec.metrics)):
        run_validation = True
        break
    if len(eval_config.model_specs) > 2:
      raise ValueError(
          """Cannot support more than two models. There are {} models in this
          eval_config.""".format(len(eval_config.model_specs)))
    if not eval_config.model_specs:
      eval_config.model_specs.add()
    # Remove baseline model_spec and all change thresholds if there is no
    # baseline model provided.
    if not input_dict.get(constants.BASELINE_MODEL_KEY):
      tmp_model_specs = []
      for model_spec in eval_config.model_specs:
        if not model_spec.is_baseline:
          tmp_model_specs.append(model_spec)
      del eval_config.model_specs[:]
      eval_config.model_specs.extend(tmp_model_specs)
      absl.logging.info("""No baseline model provided, ignoring all
          baseline model_spec.""")
      for metrics_spec in eval_config.metrics_specs:
        for metric in metrics_spec.metrics:
          metric.threshold.ClearField('change_threshold')
        for threshold in metrics_spec.thresholds.values():
          threshold.ClearField('change_threshold')
      absl.logging.info("""No baseline model provided, ignoring all
          change thresholds.""")
    # Extract model artifacts.
    models = {}
    for model_spec in eval_config.model_specs:
      if model_spec.signature_name != 'eval':
        tags = [tf.saved_model.SERVING]
      if model_spec.is_baseline:
        models[model_spec.name] = _get_eval_saved_model(
            input_dict[constants.BASELINE_MODEL_KEY], tags)
        absl.logging.info('Using {} as baseline model.'.format(
            models[model_spec.name].model_path))
      else:
        models[model_spec.name] = _get_eval_saved_model(
            input_dict[constants.MODEL_KEY], tags)
        absl.logging.info('Using {} for model eval.'.format(
            models[model_spec.name].model_path))
  else:
    eval_config = None
    assert ('feature_slicing_spec' in exec_properties and
            exec_properties['feature_slicing_spec']
           ), 'both eval_config and feature_slicing_spec are unset.'
    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)
    models = _get_eval_saved_model(input_dict[constants.MODEL_KEY])
    absl.logging.info('Using {} for model eval.'.format(models.model_path))

  absl.logging.info('Evaluating model.')
  with self._make_beam_pipeline() as pipeline:
    # pylint: disable=expression-not-assigned
    (pipeline
     | 'ReadData' >> beam.io.ReadFromTFRecord(
         file_pattern=io_utils.all_files_pattern(
             artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                          'eval')))
     | 'ExtractEvaluateAndWriteResults' >>
     tfma.ExtractEvaluateAndWriteResults(
         eval_shared_model=models,
         eval_config=eval_config,
         output_path=output_uri,
         slice_spec=slice_spec))
  absl.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))

  if not run_validation:
    # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
    absl.logging.info('No threshold configured, will not validate model.')
    return
  # Set up blessing artifact
  blessing = artifact_utils.get_single_instance(
      output_dict[constants.BLESSING_KEY])
  blessing.set_string_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
      artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
  blessing.set_int_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
      input_dict[constants.MODEL_KEY][0].id)
  if input_dict.get(constants.BASELINE_MODEL_KEY):
    baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
    blessing.set_string_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
        baseline_model.uri)
    blessing.set_int_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY, baseline_model.id)
  if 'current_component_id' in exec_properties:
    blessing.set_string_custom_property(
        'component_id', exec_properties['current_component_id'])
  # Check validation result and write BLESSED file accordingly.
  validation_file = os.path.join(output_uri, tfma.constants.VALIDATIONS_KEY)
  absl.logging.info('Checking validation results.')
  validation_result = tfma.load_validation_result(validation_file)
  if validation_result.validation_ok:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.BLESSED_VALUE)
  else:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.NOT_BLESSED_VALUE)
  absl.logging.info('Blessing result {} written to {}.'.format(
      validation_result.validation_ok, blessing.uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for evaluating the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - eval_config: JSON string of tfma.EvalConfig.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data. Deprecated, use
        eval_config.slicing_specs instead.

  Returns:
    None
  """
  if constants.EXAMPLES_KEY not in input_dict:
    raise ValueError('EXAMPLES_KEY is missing from input dict.')
  if constants.MODEL_KEY not in input_dict:
    raise ValueError('MODEL_KEY is missing from input dict.')
  if constants.EVALUATION_KEY not in output_dict:
    raise ValueError('EVALUATION_KEY is missing from output dict.')
  if len(input_dict[constants.MODEL_KEY]) > 1:
    raise ValueError('There can be only one candidate model, there are {}.'
                     .format(len(input_dict[constants.MODEL_KEY])))
  if constants.BASELINE_MODEL_KEY in input_dict and len(
      input_dict[constants.BASELINE_MODEL_KEY]) > 1:
    raise ValueError('There can be only one baseline model, there are {}.'
                     .format(len(input_dict[constants.BASELINE_MODEL_KEY])))

  self._log_startup(input_dict, output_dict, exec_properties)

  # Add fairness indicator metric callback if necessary.
  fairness_indicator_thresholds = exec_properties.get(
      'fairness_indicator_thresholds', None)
  add_metrics_callbacks = None
  if fairness_indicator_thresholds:
    # Need to import the following module so that the fairness indicator
    # post-export metric is registered.
    import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
    add_metrics_callbacks = [
        tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
            thresholds=fairness_indicator_thresholds),
    ]

  output_uri = artifact_utils.get_single_uri(
      output_dict[constants.EVALUATION_KEY])

  run_validation = False
  models = []
  if 'eval_config' in exec_properties and exec_properties['eval_config']:
    slice_spec = None
    has_baseline = bool(input_dict.get(constants.BASELINE_MODEL_KEY))
    eval_config = tfma.EvalConfig()
    json_format.Parse(exec_properties['eval_config'], eval_config)
    eval_config = tfma.update_eval_config_with_defaults(
        eval_config,
        maybe_add_baseline=has_baseline,
        maybe_remove_baseline=not has_baseline)
    tfma.verify_eval_config(eval_config)
    # Do not validate model when there is no thresholds configured. This is to
    # avoid accidentally blessing models when users forget to set thresholds.
    run_validation = bool(
        tfma.metrics.metric_thresholds_from_metrics_specs(
            eval_config.metrics_specs))
    if len(eval_config.model_specs) > 2:
      raise ValueError(
          """Cannot support more than two models. There are {} models in this
          eval_config.""".format(len(eval_config.model_specs)))
    # Extract model artifacts.
    for model_spec in eval_config.model_specs:
      if model_spec.is_baseline:
        model_uri = artifact_utils.get_single_uri(
            input_dict[constants.BASELINE_MODEL_KEY])
      else:
        model_uri = artifact_utils.get_single_uri(
            input_dict[constants.MODEL_KEY])
      if tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR:
        model_path = path_utils.eval_model_path(model_uri)
      else:
        model_path = path_utils.serving_model_path(model_uri)
      absl.logging.info('Using {} as {} model.'.format(
          model_path, model_spec.name))
      models.append(
          tfma.default_eval_shared_model(
              model_name=model_spec.name,
              eval_saved_model_path=model_path,
              add_metrics_callbacks=add_metrics_callbacks,
              eval_config=eval_config))
  else:
    eval_config = None
    assert ('feature_slicing_spec' in exec_properties and
            exec_properties['feature_slicing_spec']
           ), 'both eval_config and feature_slicing_spec are unset.'
    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)
    model_uri = artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY])
    model_path = path_utils.eval_model_path(model_uri)
    absl.logging.info('Using {} for model eval.'.format(model_path))
    models.append(
        tfma.default_eval_shared_model(
            eval_saved_model_path=model_path,
            add_metrics_callbacks=add_metrics_callbacks))

  file_pattern = io_utils.all_files_pattern(
      artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                   'eval'))
  eval_shared_model = models[0] if len(models) == 1 else models
  schema = None
  if constants.SCHEMA_KEY in input_dict:
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[constants.SCHEMA_KEY])))

  absl.logging.info('Evaluating model.')
  with self._make_beam_pipeline() as pipeline:
    # pylint: disable=expression-not-assigned
    if _USE_TFXIO:
      tensor_adapter_config = None
      if tfma.is_batched_input(eval_shared_model, eval_config):
        tfxio = tf_example_record.TFExampleRecord(
            file_pattern=file_pattern,
            schema=schema,
            raw_record_column_name=tfma.BATCHED_INPUT_KEY)
        if schema is not None:
          tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
              arrow_schema=tfxio.ArrowSchema(),
              tensor_representations=tfxio.TensorRepresentations())
        data = pipeline | 'ReadFromTFRecordToArrow' >> tfxio.BeamSource()
      else:
        data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
            file_pattern=file_pattern)
      (data
       | 'ExtractEvaluateAndWriteResults' >>
       tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=models[0] if len(models) == 1 else models,
           eval_config=eval_config,
           output_path=output_uri,
           slice_spec=slice_spec,
           tensor_adapter_config=tensor_adapter_config))
    else:
      data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
          file_pattern=file_pattern)
      (data
       | 'ExtractEvaluateAndWriteResults' >>
       tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=models[0] if len(models) == 1 else models,
           eval_config=eval_config,
           output_path=output_uri,
           slice_spec=slice_spec))
  absl.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))

  if not run_validation:
    # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
    absl.logging.info('No threshold configured, will not validate model.')
    return
  # Set up blessing artifact
  blessing = artifact_utils.get_single_instance(
      output_dict[constants.BLESSING_KEY])
  blessing.set_string_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
      artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
  blessing.set_int_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
      input_dict[constants.MODEL_KEY][0].id)
  if input_dict.get(constants.BASELINE_MODEL_KEY):
    baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
    blessing.set_string_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
        baseline_model.uri)
    blessing.set_int_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY, baseline_model.id)
  if 'current_component_id' in exec_properties:
    blessing.set_string_custom_property(
        'component_id', exec_properties['current_component_id'])
  # Check validation result and write BLESSED file accordingly.
  absl.logging.info('Checking validation results.')
  validation_result = tfma.load_validation_result(output_uri)
  if validation_result.validation_ok:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.BLESSED_VALUE)
  else:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.NOT_BLESSED_VALUE)
  absl.logging.info('Blessing result {} written to {}.'.format(
      validation_result.validation_ok, blessing.uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Applies an existing tf.Transform graph to new examples.

  Reads a previously generated transform graph and schema, re-applies the
  graph's transform_raw_features to the raw 'train' and 'eval' example
  splits, and materializes the transformed examples.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including
      the transform graph, the schema, and the raw examples.
    output_dict: Output dict from key to a list of artifacts, including the
      transformed examples ('train' and 'eval' splits).
    exec_properties: A dict of execution properties.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  transform_graph_uri = artifact_utils.get_single_uri(
      input_dict[TRANSFORM_GRAPH_KEY])
  temp_path = os.path.join(transform_graph_uri,
                           _TEMP_DIR_IN_TRANSFORM_OUTPUT)
  # transformed_schema_file = os.path.join(
  #     transform_graph_uri,
  #     tft.TFTransformOutput.TRANSFORMED_METADATA_DIR,
  #     'schema.pbtxt'
  # )
  # transformed_schema_proto = io_utils.parse_pbtxt_file(
  #     transformed_schema_file,
  #     schema_pb2.Schema()
  # )
  transformed_train_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'train')
  transformed_eval_output = artifact_utils.get_split_uri(
      output_dict[TRANSFORMED_EXAMPLES_KEY], 'eval')

  tf_transform_output = tft.TFTransformOutput(transform_graph_uri)
  # transform_output_dataset_metadata = dataset_metadata.DatasetMetadata(
  #     schema=transformed_schema_proto
  # )
  # transform_fn = (tf_transform_output.transform_raw_features,
  #                 transform_output_dataset_metadata)
  # feature_spec = schema_utils.schema_as_feature_spec(
  #     schema_proto).feature_spec
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[SCHEMA_KEY]))
  schema_proto = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
  transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_proto)

  train_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                                'train')
  eval_data_uri = artifact_utils.get_split_uri(input_dict[EXAMPLES_KEY],
                                               'eval')
  analyze_data_paths = [io_utils.all_files_pattern(train_data_uri)]
  transform_data_paths = [
      io_utils.all_files_pattern(train_data_uri),
      io_utils.all_files_pattern(eval_data_uri),
  ]
  materialize_output_paths = [
      os.path.join(transformed_train_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      os.path.join(transformed_eval_output,
                   _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
  ]
  transform_data_list = self._MakeDatasetList(transform_data_paths,
                                              materialize_output_paths)
  analyze_data_list = self._MakeDatasetList(analyze_data_paths)

  with self._make_beam_pipeline() as pipeline:
    with tft_beam.Context(temp_dir=temp_path):
      # NOTE: Unclear if there is a difference between input_dataset_metadata
      # and transform_input_dataset_metadata. Look at Transform executor.
      decode_fn = tft.coders.ExampleProtoCoder(
          schema_proto, serialized=True).decode

      input_analysis_data = {}
      for dataset in analyze_data_list:
        infix = 'AnalysisIndex{}'.format(dataset.index)
        dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
        dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix) >> self._DecodeInputs(decode_fn))
        input_analysis_data[dataset.dataset_key] = dataset.decoded

      if not hasattr(tft_beam.analyzer_cache, 'DatasetKey'):
        input_analysis_data = (
            [
                dataset for dataset in input_analysis_data.values()
                if dataset is not None
            ]
            | 'FlattenAnalysisDatasetsBecauseItIsRequired' >>
            beam.Flatten(pipeline=pipeline))

      transform_fn = (
          (input_analysis_data, transform_input_dataset_metadata)
          | 'Analyze' >> tft_beam.AnalyzeDataset(
              tf_transform_output.transform_raw_features, pipeline=pipeline))

      for dataset in transform_data_list:
        infix = 'TransformIndex{}'.format(dataset.index)
        dataset.serialized = (
            pipeline
            | 'ReadDataset[{}]'.format(infix) >> self._ReadExamples(
                dataset, transform_input_dataset_metadata))
        dataset.decoded = (
            dataset.serialized
            | 'Decode[{}]'.format(infix) >> self._DecodeInputs(decode_fn))
        dataset.transformed, metadata = (
            ((dataset.decoded, transform_input_dataset_metadata),
             transform_fn)
            | 'Transform[{}]'.format(infix) >> tft_beam.TransformDataset())
        dataset.transformed_and_serialized = (
            dataset.transformed
            | 'EncodeAndSerialize[{}]'.format(infix) >> beam.ParDo(
                self._EncodeAsSerializedExamples(),
                _GetSchemaProto(metadata)))
        _ = (
            dataset.transformed_and_serialized
            | 'Materialize[{}]'.format(infix) >> self._WriteExamples(
                dataset.materialize_output_path))
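# The executor above splits tf.Transform into a separate Analyze step and a
# Transform step. A hedged, self-contained sketch of the same split on
# in-memory data, using a hand-written preprocessing_fn instead of
# transform_raw_features; the feature name and transform are illustrative.
import tempfile

import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

raw_data = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))


def preprocessing_fn(inputs):
  return {'x_scaled': tft.scale_to_0_1(inputs['x'])}


with beam.Pipeline() as pipeline:
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    raw_pcoll = pipeline | 'CreateRaw' >> beam.Create(raw_data)
    # Analyze: produce the transform_fn (a SavedModel plus metadata).
    transform_fn = (
        (raw_pcoll, raw_metadata)
        | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn))
    # Transform: apply the transform_fn to the (same) raw dataset.
    transformed_data, transformed_metadata = (
        ((raw_pcoll, raw_metadata), transform_fn)
        | 'Transform' >> tft_beam.TransformDataset())
    _ = transformed_data | 'Print' >> beam.Map(print)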
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples used to evaluate the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data.

  Returns:
    None
  """
  if 'model_exports' not in input_dict:
    raise ValueError('\'model_exports\' is missing in input dict.')
  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'output' not in output_dict:
    raise ValueError('\'output\' is missing in output dict.')

  self._log_startup(input_dict, output_dict, exec_properties)

  # Extract input artifacts
  model_exports_uri = artifact_utils.get_single_uri(
      input_dict['model_exports'])

  feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
  json_format.Parse(exec_properties['feature_slicing_spec'],
                    feature_slicing_spec)
  slice_spec = self._get_slice_spec_from_feature_slicing_spec(
      feature_slicing_spec)

  output_uri = artifact_utils.get_single_uri(output_dict['output'])

  eval_model_path = path_utils.eval_model_path(model_exports_uri)

  # Add fairness indicator metric callback if necessary.
  fairness_indicator_thresholds = exec_properties.get(
      'fairness_indicator_thresholds', None)
  add_metrics_callbacks = None
  if fairness_indicator_thresholds:
    # Need to import the following module so that the fairness indicator
    # post-export metric is registered.
    import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
    add_metrics_callbacks = [
        tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
            thresholds=fairness_indicator_thresholds),
    ]

  absl.logging.info('Using {} for model eval.'.format(eval_model_path))
  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_path,
      add_metrics_callbacks=add_metrics_callbacks)

  absl.logging.info('Evaluating model.')
  with self._make_beam_pipeline() as pipeline:
    # pylint: disable=expression-not-assigned
    (pipeline
     | 'ReadData' >> beam.io.ReadFromTFRecord(
         file_pattern=io_utils.all_files_pattern(
             artifact_utils.get_split_uri(input_dict['examples'], 'eval')))
     | 'ExtractEvaluateAndWriteResults' >>
     tfma.ExtractEvaluateAndWriteResults(
         eval_shared_model=eval_shared_model,
         slice_spec=slice_spec,
         output_path=output_uri))
  absl.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))
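# A hedged sketch of how the 'feature_slicing_spec' exec property consumed
# above is typically produced: an evaluator_pb2.FeatureSlicingSpec proto
# serialized to JSON. The column name is an illustrative assumption.
from google.protobuf import json_format
from tfx.proto import evaluator_pb2

feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
    evaluator_pb2.SingleSlicingSpec(),  # empty spec => overall slice
    evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour']),
])
exec_properties = {
    'feature_slicing_spec': json_format.MessageToJson(feature_slicing_spec),
}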
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - statistics: A list of type `standard_artifacts.ExampleStatistics`.
        This should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.
      - exclude_splits: JSON-serialized list of names of splits where
        statistics and sample should not be generated.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                          'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))
  # Setup output splits.
  examples = artifact_utils.get_single_instance(
      input_dict[standard_component_specs.EXAMPLES_KEY])
  examples_split_names = artifact_utils.decode_split_names(
      examples.split_names)
  split_names = [
      split for split in examples_split_names if split not in exclude_splits
  ]
  statistics_artifact = artifact_utils.get_single_instance(
      output_dict[standard_component_specs.STATISTICS_KEY])
  statistics_artifact.split_names = artifact_utils.encode_split_names(
      split_names)

  stats_options = options.StatsOptions()
  stats_options_json = exec_properties.get(
      standard_component_specs.STATS_OPTIONS_JSON_KEY)
  if stats_options_json:
    # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
    # json_utils.
    stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(standard_component_specs.SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(
                  input_dict[standard_component_specs.SCHEMA_KEY])))
      stats_options.schema = schema

  split_and_tfxio = []
  tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
      examples=[examples], telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
  for split in artifact_utils.decode_split_names(examples.split_names):
    if split in exclude_splits:
      continue
    uri = artifact_utils.get_split_uri([examples], split)
    split_and_tfxio.append(
        (split, tfxio_factory(io_utils.all_files_pattern(uri))))
  with self._make_beam_pipeline() as p:
    for split, tfxio in split_and_tfxio:
      logging.info('Generating statistics for split %s.', split)
      output_uri = artifact_utils.get_split_uri(
          output_dict[standard_component_specs.STATISTICS_KEY], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      data = p | 'TFXIORead[%s]' % split >> tfxio.BeamSource()
      _ = (
          data
          | 'GenerateStatistics[%s]' % split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[%s]' % split >>
          stats_api.WriteStatisticsToBinaryFile(output_path))
      logging.info('Statistics for split %s written to %s.', split,
                   output_uri)
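# A hedged sketch of the two exec properties this executor reads. StatsOptions
# carries the TFDV tuning knobs and round-trips through JSON via
# to_json()/from_json(); exclude_splits is a JSON-encoded list of split names.
# The specific option values and excluded split are illustrative assumptions.
import json

import tensorflow_data_validation as tfdv

stats_options = tfdv.StatsOptions(num_top_values=20,
                                  num_histogram_buckets=10)
exec_properties = {
    'stats_options_json': stats_options.to_json(),
    'exclude_splits': json.dumps(['eval']),
}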
def Do(self, input_dict: Dict[Text, List[Artifact]],
       output_dict: Dict[Text, List[Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  split_uris: List[Text] = []
  for artifact in input_dict[executor.EXAMPLES_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      split_uris.append(split)

  self._log_startup(input_dict, output_dict, exec_properties)
  data_uris = []
  for split in split_uris:
    data_uris.append(
        artifact_utils.get_split_uri(input_dict[executor.EXAMPLES_KEY],
                                     split))

  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[executor.SCHEMA_KEY]))
  transform_output = artifact_utils.get_single_uri(
      output_dict[executor.TRANSFORM_GRAPH_KEY])
  transformed_data_uris = []
  for split in split_uris:
    transformed_data_uris.append(
        artifact_utils.get_split_uri(
            output_dict[executor.TRANSFORMED_EXAMPLES_KEY], split))
  temp_path = os.path.join(transform_output,
                           executor._TEMP_DIR_IN_TRANSFORM_OUTPUT)
  logging.debug('Using temp path %s for tft.beam', temp_path)

  def _GetCachePath(label, params_dict):
    if label not in params_dict:
      return None
    else:
      return artifact_utils.get_single_uri(params_dict[label])

  label_inputs = {
      labels.COMPUTE_STATISTICS_LABEL: False,
      labels.SCHEMA_PATH_LABEL: schema_file,
      labels.EXAMPLES_DATA_FORMAT_LABEL: labels.FORMAT_TF_EXAMPLE,
      labels.ANALYZE_DATA_PATHS_LABEL:
          io_utils.all_files_pattern(data_uris[0]),
      labels.ANALYZE_PATHS_FILE_FORMATS_LABEL: labels.FORMAT_TFRECORD,
      labels.TRANSFORM_DATA_PATHS_LABEL:
          [io_utils.all_files_pattern(uri) for uri in data_uris],
      labels.TRANSFORM_PATHS_FILE_FORMATS_LABEL:
          [labels.FORMAT_TFRECORD for uri in data_uris],
      labels.TFT_STATISTICS_USE_TFDV_LABEL: True,
      labels.MODULE_FILE: exec_properties.get('module_file', None),
      labels.PREPROCESSING_FN: exec_properties.get('preprocessing_fn', None),
      # TODO(b/149754658): switch to True once the TFXIO integration is
      # complete.
      labels.USE_TFXIO_LABEL: False,
  }
  cache_input = _GetCachePath('cache_input_path', input_dict)
  if cache_input is not None:
    label_inputs[labels.CACHE_INPUT_PATH_LABEL] = cache_input

  label_outputs = {
      labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
      labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
          os.path.join(uri, executor._DEFAULT_TRANSFORMED_EXAMPLES_PREFIX)
          for uri in transformed_data_uris
      ],
      labels.TEMP_OUTPUT_LABEL: str(temp_path),
  }
  cache_output = _GetCachePath('cache_output_path', output_dict)
  if cache_output is not None:
    label_outputs[labels.CACHE_OUTPUT_PATH_LABEL] = cache_output

  status_file = 'status_file'  # Unused
  self.Transform(label_inputs, label_outputs, status_file)
  logging.debug('Cleaning up temp path %s on executor success', temp_path)
  io_utils.delete_dir(temp_path)
def get_common_fn_args(input_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text, Any],
                       working_dir: Text = None) -> FnArgs:
  """Get common args of training and tuning."""
  if input_dict.get(standard_component_specs.TRANSFORM_GRAPH_KEY):
    transform_graph_path = artifact_utils.get_single_uri(
        input_dict[standard_component_specs.TRANSFORM_GRAPH_KEY])
  else:
    transform_graph_path = None

  if input_dict.get(standard_component_specs.SCHEMA_KEY):
    schema_path = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[standard_component_specs.SCHEMA_KEY]))
  else:
    schema_path = None

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.TRAIN_ARGS_KEY], train_args)
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.EVAL_ARGS_KEY], eval_args)

  # Default behavior is train on `train` split (when splits is empty in train
  # args) and evaluate on `eval` split (when splits is empty in eval args).
  if not train_args.splits:
    train_args.splits.append('train')
    absl.logging.info("Train on the 'train' split when train_args.splits is "
                      'not set.')
  if not eval_args.splits:
    eval_args.splits.append('eval')
    absl.logging.info("Evaluate on the 'eval' split when eval_args.splits is "
                      'not set.')

  train_files = []
  for train_split in train_args.splits:
    train_files.extend([
        io_utils.all_files_pattern(uri)
        for uri in artifact_utils.get_split_uris(
            input_dict[standard_component_specs.EXAMPLES_KEY], train_split)
    ])
  eval_files = []
  for eval_split in eval_args.splits:
    eval_files.extend([
        io_utils.all_files_pattern(uri)
        for uri in artifact_utils.get_split_uris(
            input_dict[standard_component_specs.EXAMPLES_KEY], eval_split)
    ])

  data_accessor = DataAccessor(
      tf_dataset_factory=tfxio_utils.get_tf_dataset_factory_from_artifact(
          input_dict[standard_component_specs.EXAMPLES_KEY],
          _TELEMETRY_DESCRIPTORS),
      record_batch_factory=tfxio_utils.get_record_batch_factory_from_artifact(
          input_dict[standard_component_specs.EXAMPLES_KEY],
          _TELEMETRY_DESCRIPTORS),
      data_view_decode_fn=tfxio_utils.get_data_view_decode_fn_from_artifact(
          input_dict[standard_component_specs.EXAMPLES_KEY],
          _TELEMETRY_DESCRIPTORS))

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None. Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here. Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(standard_component_specs.CUSTOM_CONFIG_KEY,
                          'null'))

  return FnArgs(
      working_dir=working_dir,
      train_files=train_files,
      eval_files=eval_files,
      train_steps=train_steps,
      eval_steps=eval_steps,
      schema_path=schema_path,
      transform_graph_path=transform_graph_path,
      data_accessor=data_accessor,
      custom_config=custom_config,
  )
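# A hedged sketch of how user code might consume the FnArgs built above,
# in particular the DataAccessor's tf_dataset_factory. The batch size and
# 'label' key are illustrative assumptions, and the helper name _input_fn is
# hypothetical.
import tensorflow_transform as tft
from tfx_bsl.public import tfxio


def _input_fn(fn_args, file_patterns, batch_size=64):
  """Builds a batched tf.data.Dataset of transformed features and labels."""
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_graph_path)
  return fn_args.data_accessor.tf_dataset_factory(
      file_patterns,
      tfxio.TensorFlowDatasetOptions(batch_size=batch_size,
                                     label_key='label'),
      tf_transform_output.transformed_metadata.schema)

# e.g. inside a trainer run_fn:
#   train_dataset = _input_fn(fn_args, fn_args.train_files)
#   eval_dataset = _input_fn(fn_args, fn_args.eval_files)
#   model.fit(train_dataset, steps_per_epoch=fn_args.train_steps,
#             validation_data=eval_dataset, validation_steps=fn_args.eval_steps)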
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Main execution logic for the Sequencer component.

  :param input_dict: input channels
  :param output_dict: output channels
  :param exec_properties: the execution properties defined in the spec
  """
  source = exec_properties[StepKeys.SOURCE]
  args = exec_properties[StepKeys.ARGS]
  c = source_utils.load_source_path_class(source)

  # Get the schema
  schema_path = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[constants.SCHEMA]))
  schema = io_utils.SchemaReader().read(schema_path)

  # TODO: Getting the statistics might help the future implementations
  sequence_step: BaseSequencerStep = c(schema=schema,
                                       statistics=None,
                                       **args)

  # Get split names
  input_artifact = artifact_utils.get_single_instance(
      input_dict[constants.INPUT_EXAMPLES])
  split_names = artifact_utils.decode_split_names(
      input_artifact.split_names)

  # Create output artifact
  output_artifact = artifact_utils.get_single_instance(
      output_dict[constants.OUTPUT_EXAMPLES])
  output_artifact.split_names = artifact_utils.encode_split_names(
      split_names)

  with self._make_beam_pipeline() as p:
    for s in split_names:
      input_uri = io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.INPUT_EXAMPLES],
                                       s))
      output_uri = artifact_utils.get_split_uri(
          output_dict[constants.OUTPUT_EXAMPLES], s)
      output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

      # Read and decode the data
      data = \
          (p
           | 'Read_' + s >> beam.io.ReadFromTFRecord(file_pattern=input_uri)
           | 'Decode_' + s >> tf_example_decoder.DecodeTFExample()
           | 'ToDataFrame_' + s >> beam.ParDo(utils.ConvertToDataframe()))

      # Window into sessions
      s_data = \
          (data
           | 'AddCategory_' + s >> beam.ParDo(
               sequence_step.get_category_do_fn())
           | 'AddTimestamp_' + s >> beam.ParDo(
               sequence_step.get_timestamp_do_fn())
           | 'Sessions_' + s >> beam.WindowInto(sequence_step.get_window()))

      # Combine and transform
      p_data = \
          (s_data
           | 'Combine_' + s >> beam.CombinePerKey(
               sequence_step.get_combine_fn()))

      # Write the results
      _ = \
          (p_data
           | 'Global_' + s >> beam.WindowInto(GlobalWindows())
           | 'RemoveKey_' + s >> beam.ParDo(RemoveKey())
           | 'ToExample_' + s >> beam.Map(utils.df_to_example)
           | 'Serialize_' + s >> beam.Map(utils.serialize)
           | 'Write_' + s >> beam.io.WriteToTFRecord(
               output_path, file_name_suffix='.gz'))
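# The Sequencer above delegates category/timestamp extraction, windowing, and
# combining to a user-provided BaseSequencerStep. A hedged, self-contained
# Beam sketch of the same pattern (timestamped, keyed data windowed into
# sessions, combined per key, then re-windowed into the global window before
# writing) using made-up in-memory events instead of that project-specific
# step API:
import apache_beam as beam
from apache_beam.transforms import window

events = [
    {'user': 'a', 'ts': 10, 'value': 1},
    {'user': 'a', 'ts': 20, 'value': 2},
    {'user': 'b', 'ts': 1000, 'value': 3},
]

with beam.Pipeline() as p:
  _ = (
      p
      | 'Create' >> beam.Create(events)
      | 'AddTimestamp' >> beam.Map(
          lambda e: window.TimestampedValue(e, e['ts']))
      | 'KeyByCategory' >> beam.Map(lambda e: (e['user'], e))
      | 'Sessions' >> beam.WindowInto(window.Sessions(gap_size=60))
      | 'Combine' >> beam.CombinePerKey(beam.combiners.ToListCombineFn())
      | 'Global' >> beam.WindowInto(window.GlobalWindows())
      | 'DropKey' >> beam.Values()
      | 'Print' >> beam.Map(print))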
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  if STATS_OPTIONS_JSON_KEY in exec_properties:
    stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils.
      stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  split_uris = []
  for artifact in input_dict[EXAMPLES_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      output_uri = artifact_utils.get_split_uri(
          output_dict[STATISTICS_KEY], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
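# For comparison, the same statistics generation can be done outside an
# executor with TFDV's one-shot helpers. This is a hedged sketch; the data
# path is a placeholder and the StatsOptions value is illustrative.
import tensorflow_data_validation as tfdv

stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/path/to/eval/data*',
    stats_options=tfdv.StatsOptions(num_top_values=20))
schema = tfdv.infer_schema(statistics=stats)
anomalies = tfdv.validate_statistics(statistics=stats, schema=schema)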
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow Transform executor entrypoint.

  This implements BaseExecutor.Do() and is invoked by orchestration systems.
  This is not intended for manual usage or further customization. Please use
  the Transform() function which takes an input format with no artifact
  dependency.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - input_data: A list of 'ExamplesPath' type which should contain two
        splits 'train' and 'eval'.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - transform_output: Output of 'tf.Transform', which includes an
        exported Tensorflow graph suitable for both training and serving;
      - transformed_examples: Materialized transformed examples, which
        includes both 'train' and 'eval' splits.
    exec_properties: A dict of execution properties, including:
      - module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
  eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
  schema_file = io_utils.get_only_uri_in_dir(
      types.get_single_uri(input_dict['schema']))

  transform_output = types.get_single_uri(output_dict['transform_output'])
  if tf.gfile.Exists(transform_output):
    io_utils.delete_dir(transform_output)

  transformed_train_output = types.get_split_uri(
      output_dict['transformed_examples'], 'train')
  if tf.gfile.Exists(transformed_train_output):
    io_utils.delete_dir(transformed_train_output)

  transformed_eval_output = types.get_split_uri(
      output_dict['transformed_examples'], 'eval')
  if tf.gfile.Exists(transformed_eval_output):
    io_utils.delete_dir(transformed_eval_output)

  temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
  tf.logging.debug('Using temp path %s for tft.beam', temp_path)

  label_inputs = {
      labels.COMPUTE_STATISTICS_LABEL: False,
      labels.SCHEMA_PATH_LABEL: schema_file,
      labels.EXAMPLES_DATA_FORMAT_LABEL: labels.FORMAT_TF_EXAMPLE,
      labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
          io_utils.all_files_pattern(train_data_uri),
      labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
          io_utils.all_files_pattern(eval_data_uri),
      labels.TFT_STATISTICS_USE_TFDV_LABEL: True,
      labels.PREPROCESSING_FN: exec_properties['module_file'],
  }

  label_outputs = {
      labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
      labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
          os.path.join(transformed_train_output,
                       _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
          os.path.join(transformed_eval_output,
                       _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
      ],
      labels.TEMP_OUTPUT_LABEL: str(temp_path),
  }
  status_file = 'status_file'  # Unused
  self.Transform(label_inputs, label_outputs, status_file)
  tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
  io_utils.delete_dir(temp_path)
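# The 'module_file' exec property above points at a user module that defines
# preprocessing_fn. A hedged sketch of such a module; the feature names and
# the particular transforms are illustrative assumptions.
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  """tf.Transform callback: maps raw features to transformed features."""
  outputs = {}
  # Standardize a numeric feature using statistics computed in Analyze.
  outputs['fare_scaled'] = tft.scale_to_z_score(inputs['fare'])
  # Integerize a string feature with a vocabulary computed in Analyze.
  outputs['payment_type_id'] = tft.compute_and_apply_vocabulary(
      inputs['payment_type'])
  return outputs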