def test_decode_example_with_beam_pipeline(self):
  example_proto_text = """
  features {
    feature { key: "int_feature_1"
              value { int64_list { value: [ 0 ] } } }
    feature { key: "int_feature_2"
              value { int64_list { value: [ 1, 2, 3 ] } } }
    feature { key: "float_feature_1"
              value { float_list { value: [ 4.0 ] } } }
    feature { key: "float_feature_2"
              value { float_list { value: [ 5.0, 6.0 ] } } }
    feature { key: "str_feature_1"
              value { bytes_list { value: [ 'female' ] } } }
    feature { key: "str_feature_2"
              value { bytes_list { value: [ 'string', 'list' ] } } }
  }
  """
  expected_decoded = {
      'int_feature_1': np.array([0], dtype=np.integer),
      'int_feature_2': np.array([1, 2, 3], dtype=np.integer),
      'float_feature_1': np.array([4.0], dtype=np.floating),
      'float_feature_2': np.array([5.0, 6.0], dtype=np.floating),
      'str_feature_1': np.array([b'female'], dtype=np.object),
      'str_feature_2': np.array([b'string', b'list'], dtype=np.object),
  }
  example = tf.train.Example()
  text_format.Merge(example_proto_text, example)
  with beam.Pipeline() as p:
    result = (p
              | beam.Create([example.SerializeToString()])
              | tf_example_decoder.DecodeTFExample())
    util.assert_that(
        result,
        test_util.make_example_dict_equal_fn(self, [expected_decoded]))
def generate_statistics_from_tfrecord(
    data_location,
    output_path=None,
    stats_options=options.StatsOptions(),
    pipeline_options=None,
):
  """Computes data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and returns the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to
  create their own Beam pipelines, need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None,
      we use a temporary directory. It will be a TFRecord file containing a
      single data statistics proto, and can be read with the 'load_statistics'
      API. If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id,
      etc. See
      https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.gfile.Exists(output_dir_path):
    tf.gfile.MakeDirs(output_dir_path)
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> beam.io.ReadFromTFRecord(file_pattern=data_location)
        | 'DecodeData' >> tf_example_decoder.DecodeTFExample()
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        # TODO(b/112014711) Implement a custom sink to write the stats proto.
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))
  return load_statistics(output_path)
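# A minimal usage sketch for the convenience function above, assuming it is
# exposed as tfdv.generate_statistics_from_tfrecord (as in the TFDV public
# API). The data and output paths are hypothetical.
import tensorflow_data_validation as tfdv

stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/tmp/examples/train*.tfrecord',     # hypothetical path
    output_path='/tmp/stats/train_stats.tfrecord')     # hypothetical path
# The returned DatasetFeatureStatisticsList proto can be inspected directly.
print(len(stats.datasets))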
def test_decode_example_with_beam_pipeline(self, example_proto_text,
                                           decoded_table):
  example = tf.train.Example()
  text_format.Merge(example_proto_text, example)
  with beam.Pipeline() as p:
    result = (p
              | beam.Create([example.SerializeToString()])
              | tf_example_decoder.DecodeTFExample())
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, [decoded_table]))
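# A hedged sketch of one (example_proto_text, decoded_table) pair the
# parameterized test above might receive. The Arrow column layout (one list
# value per example) is an assumption about the decoder's output format.
import pyarrow as pa

EXAMPLE_PROTO_TEXT = '''
features {
  feature { key: "x" value { int64_list { value: [ 1, 2 ] } } }
}
'''
DECODED_TABLE = pa.Table.from_arrays(
    [pa.array([[1, 2]], type=pa.list_(pa.int64()))], ['x'])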
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' splits.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  split_uris = []
  for artifact in input_dict['input_data']:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      output_uri = artifact_utils.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def Do(self, input_dict: Dict[Text, List[types.TfxType]],
       output_dict: Dict[Text, List[types.TfxType]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of 'ExamplesPath' type. This should contain both
        'train' and 'eval' splits.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of 'ExampleStatisticsPath' type. This should contain
        both 'train' and 'eval' splits.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  split_to_instance = {x.split: x for x in input_dict['input_data']}
  with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, instance in split_to_instance.items():
      tf.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(instance.uri)
      output_uri = types.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      tf.logging.info('Statistics written to {}.'.format(output_uri))
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    # TODO(b/131719250): Add option to output a sample of anomalous examples
    # for each anomaly reason.
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics
  regarding the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to
  create their own Beam pipelines, need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This
      must contain a schema.
    output_path: The file path to output data statistics result to. If None,
      the function uses a temporary directory. The output will be a TFRecord
      file containing a single data statistics list proto, and can be read
      with the 'load_statistics' function. If you run this function on Google
      Cloud, you must specify an output_path. Specifying None may cause an
      error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id,
      etc. See
      https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto in which each dataset consists of
    the set of examples that exhibit a particular anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
  if stats_options.schema is None:
    raise ValueError('The specified stats_options must include a schema.')
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'anomaly_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (
        p
        | 'ReadData' >> beam.io.ReadFromTFRecord(file_pattern=data_location)
        | 'DecodeData' >>
        tf_example_decoder.DecodeTFExample(desired_batch_size=1)
        | 'DetectAnomalies' >>
        validation_api.IdentifyAnomalousExamples(stats_options)
        | 'GenerateSummaryStatistics' >>
        stats_impl.GenerateSlicedStatisticsImpl(
            stats_options, is_slicing_enabled=True)
        # TODO(b/112014711) Implement a custom sink to write the stats proto.
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))
  return stats_gen_lib.load_statistics(output_path)
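# A minimal usage sketch for validate_examples_in_tfrecord, assuming the
# TFDV public API (tfdv.StatsOptions, tfdv.load_schema_text,
# tfdv.validate_examples_in_tfrecord). Paths are hypothetical.
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')      # hypothetical path
anomaly_stats = tfdv.validate_examples_in_tfrecord(
    data_location='/tmp/examples/eval*.tfrecord',        # hypothetical path
    stats_options=tfdv.StatsOptions(schema=schema))
# Each dataset in the result summarizes the examples exhibiting one anomaly.
for dataset in anomaly_stats.datasets:
  print(dataset.name, dataset.num_examples)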
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' splits.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of
        StatsOptions. When a schema is provided as an input, the StatsOptions
        value should not also contain a schema.

  Raises:
    ValueError: When a schema is provided both as an input and as part of the
      StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  if STATS_OPTIONS_JSON_KEY in exec_properties:
    stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils.
      stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  split_uris = []
  for artifact in input_dict[EXAMPLES_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      output_uri = artifact_utils.get_split_uri(
          output_dict[STATISTICS_KEY], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
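# A hedged sketch of how the stats_options_json exec_property consumed by the
# executor above might be produced, using StatsOptions.to_json() (the
# counterpart of the from_json() call in the code). The literal key name and
# the option values are assumptions for illustration.
from tensorflow_data_validation import StatsOptions

stats_options = StatsOptions(num_top_values=20, num_rank_histogram_buckets=10)
exec_properties = {
    'stats_options_json': stats_options.to_json(),  # read back via from_json
}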
def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[bytes] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
    compression_type: Text = CompressionTypes.AUTO,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Computes data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and returns the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to
  create their own Beam pipelines, need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None,
      we use a temporary directory. It will be a TFRecord file containing a
      single data statistics proto, and can be read with the 'load_statistics'
      API. If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id,
      etc. See
      https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.
    compression_type: Used to handle compressed input files. Default value is
      CompressionTypes.AUTO, in which case the file_path's extension will be
      used to detect the compression.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  batch_size = (
      stats_options.desired_batch_size if stats_options.desired_batch_size
      and stats_options.desired_batch_size > 0 else
      constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> beam.io.ReadFromTFRecord(
            file_pattern=data_location, compression_type=compression_type)
        | 'DecodeData' >>
        tf_example_decoder.DecodeTFExample(desired_batch_size=batch_size)
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        | 'WriteStatsOutput' >>
        stats_api.WriteStatisticsToTFRecord(output_path))
  return stats_util.load_statistics(output_path)
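# A minimal sketch exercising the compression_type parameter of the variant
# above; GZIP and AUTO are values defined on
# apache_beam.io.filesystem.CompressionTypes. The data path is hypothetical,
# and the function is assumed to be exposed via the tfdv package.
from apache_beam.io.filesystem import CompressionTypes
import tensorflow_data_validation as tfdv

stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/tmp/examples/train*.tfrecord.gz',  # hypothetical path
    compression_type=CompressionTypes.GZIP)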
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Main execution logic for the Sequencer component.

  :param input_dict: input channels
  :param output_dict: output channels
  :param exec_properties: the execution properties defined in the spec
  """
  source = exec_properties[StepKeys.SOURCE]
  args = exec_properties[StepKeys.ARGS]

  c = source_utils.load_source_path_class(source)

  # Get the schema
  schema_path = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[constants.SCHEMA]))
  schema = io_utils.SchemaReader().read(schema_path)

  # TODO: Getting the statistics might help the future implementations
  sequence_step: BaseSequencerStep = c(schema=schema,
                                       statistics=None,
                                       **args)

  # Get split names
  input_artifact = artifact_utils.get_single_instance(
      input_dict[constants.INPUT_EXAMPLES])
  split_names = artifact_utils.decode_split_names(
      input_artifact.split_names)

  # Create output artifact
  output_artifact = artifact_utils.get_single_instance(
      output_dict[constants.OUTPUT_EXAMPLES])
  output_artifact.split_names = artifact_utils.encode_split_names(
      split_names)

  with self._make_beam_pipeline() as p:
    for s in split_names:
      input_uri = io_utils.all_files_pattern(
          artifact_utils.get_split_uri(
              input_dict[constants.INPUT_EXAMPLES], s))
      output_uri = artifact_utils.get_split_uri(
          output_dict[constants.OUTPUT_EXAMPLES], s)
      output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

      # Read and decode the data
      data = (
          p
          | 'Read_' + s >> beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'Decode_' + s >> tf_example_decoder.DecodeTFExample()
          | 'ToDataFrame_' + s >> beam.ParDo(utils.ConvertToDataframe()))

      # Window into sessions
      s_data = (
          data
          | 'AddCategory_' + s >> beam.ParDo(
              sequence_step.get_category_do_fn())
          | 'AddTimestamp_' + s >> beam.ParDo(
              sequence_step.get_timestamp_do_fn())
          | 'Sessions_' + s >> beam.WindowInto(sequence_step.get_window()))

      # Combine and transform
      p_data = (
          s_data
          | 'Combine_' + s >> beam.CombinePerKey(
              sequence_step.get_combine_fn()))

      # Write the results
      _ = (
          p_data
          | 'Global_' + s >> beam.WindowInto(GlobalWindows())
          | 'RemoveKey_' + s >> beam.ParDo(RemoveKey())
          | 'ToExample_' + s >> beam.Map(utils.df_to_example)
          | 'Serialize_' + s >> beam.Map(utils.serialize)
          | 'Write_' + s >> beam.io.WriteToTFRecord(
              output_path, file_name_suffix='.gz'))
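# A hedged sketch of the sequencer-step contract the executor above relies
# on: the class loaded via load_source_path_class must provide the four
# getters called in the pipeline. BaseSequencerStep's real constructor
# signature, the keying scheme, and the 30-minute session gap are all
# assumptions for illustration.
from apache_beam.transforms.window import Sessions

class MySessionStep(BaseSequencerStep):  # hypothetical subclass

  def get_category_do_fn(self):
    # DoFn keying each row, e.g. by a user-id column (assumed).
    ...

  def get_timestamp_do_fn(self):
    # DoFn attaching event-time timestamps to rows (assumed).
    ...

  def get_window(self):
    # Session windows with a hypothetical 30-minute inactivity gap.
    return Sessions(gap_size=30 * 60)

  def get_combine_fn(self):
    # CombineFn merging a session's rows into one sequence (assumed).
    ...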