Example #1
 def test_decode_example_with_beam_pipeline(self):
     example_proto_text = """
 features {
   feature { key: "int_feature_1"
             value { int64_list { value: [ 0 ] } } }
   feature { key: "int_feature_2"
             value { int64_list { value: [ 1, 2, 3 ] } } }
   feature { key: "float_feature_1"
             value { float_list { value: [ 4.0 ] } } }
   feature { key: "float_feature_2"
             value { float_list { value: [ 5.0, 6.0 ] } } }
   feature { key: "str_feature_1"
             value { bytes_list { value: [ 'female' ] } } }
   feature { key: "str_feature_2"
             value { bytes_list { value: [ 'string', 'list' ] } } }
 }
 """
     expected_decoded = {
         'int_feature_1': np.array([0], dtype=np.int64),
         'int_feature_2': np.array([1, 2, 3], dtype=np.int64),
         'float_feature_1': np.array([4.0], dtype=np.float32),
         'float_feature_2': np.array([5.0, 6.0], dtype=np.float32),
         'str_feature_1': np.array([b'female'], dtype=np.object_),
         'str_feature_2': np.array([b'string', b'list'], dtype=np.object_),
     }
     example = tf.train.Example()
     text_format.Merge(example_proto_text, example)
     with beam.Pipeline() as p:
         result = (p
                   | beam.Create([example.SerializeToString()])
                   | tf_example_decoder.DecodeTFExample())
         util.assert_that(
             result,
             test_util.make_example_dict_equal_fn(self, [expected_decoded]))
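The same Example can also be constructed programmatically rather than via text_format; a minimal sketch using the standard tf.train proto API, mirroring three of the features above:

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'int_feature_1': tf.train.Feature(
        int64_list=tf.train.Int64List(value=[0])),
    'float_feature_1': tf.train.Feature(
        float_list=tf.train.FloatList(value=[4.0])),
    'str_feature_1': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[b'female'])),
}))
serialized = example.SerializeToString()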
Example #2
def generate_statistics_from_tfrecord(
        data_location,
        output_path=None,
        stats_options=options.StatsOptions(),
        pipeline_options=None,
):
    """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and returns the resulting
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or who wish to create
  their own Beam pipelines, should use the 'GenerateStatistics' PTransform
  API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to write the data statistics result to. If None,
      a temporary directory is used. The output will be a TFRecord file
      containing a single data statistics proto, which can be read with the
      'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.gfile.Exists(output_dir_path):
        tf.gfile.MakeDirs(output_dir_path)

    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # Auto detect tfrecord file compression format based on input data
        # path suffix.
        _ = (
            p
            | 'ReadData' >> beam.io.ReadFromTFRecord(file_pattern=data_location)
            | 'DecodeData' >> tf_example_decoder.DecodeTFExample()
            | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
            # TODO(b/112014711) Implement a custom sink to write the stats proto.
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
    return load_statistics(output_path)
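A minimal usage sketch for the function above, run locally with the DirectRunner (the input pattern here is hypothetical):

from apache_beam.options.pipeline_options import PipelineOptions

stats = generate_statistics_from_tfrecord(
    data_location='/tmp/train_data/*.tfrecord',  # hypothetical input pattern
    pipeline_options=PipelineOptions(['--runner=DirectRunner']))
print(stats.datasets[0].num_examples)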
Example #3
 def test_decode_example_with_beam_pipeline(self, example_proto_text,
                                            decoded_table):
     example = tf.train.Example()
     text_format.Merge(example_proto_text, example)
     with beam.Pipeline() as p:
         result = (p
                   | beam.Create([example.SerializeToString()])
                   | tf_example_decoder.DecodeTFExample())
         util.assert_that(
             result,
             test_util.make_arrow_tables_equal_fn(self, [decoded_table]))
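This variant of the test is parameterized over (example_proto_text, decoded_table) pairs; a hedged sketch of how the cases might be supplied via absl's parameterized API (the case constants are hypothetical placeholders for text protos and their expected pyarrow tables):

from absl.testing import parameterized

# Hypothetical case table; each entry supplies (name, proto text, expected table).
@parameterized.named_parameters(
    ('int_features', _INT_PROTO_TEXT, _INT_EXPECTED_TABLE),
    ('float_features', _FLOAT_PROTO_TEXT, _FLOAT_EXPECTED_TABLE),
)
def test_decode_example_with_beam_pipeline(self, example_proto_text,
                                           decoded_table):
    ...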
Example #4
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both the 'train' and 'eval' splits.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        split_uris = []
        for artifact in input_dict['input_data']:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
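This executor is normally not invoked directly; it runs when a StatisticsGen component executes inside a TFX pipeline. A minimal wiring sketch, assuming a recent TFX API and a standard ExampleGen component upstream:

from tfx.components import StatisticsGen

# example_gen is an assumed upstream ExampleGen component instance.
statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])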
Example #5
    def Do(self, input_dict: Dict[Text, List[types.TfxType]],
           output_dict: Dict[Text, List[types.TfxType]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of 'ExamplesPath' type. This should contain both
          the 'train' and 'eval' splits.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of 'ExampleStatisticsPath' type. This should contain
          both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        split_to_instance = {x.split: x for x in input_dict['input_data']}
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, instance in split_to_instance.items():
                tf.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(instance.uri)
                output_uri = types.get_split_uri(output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
            tf.logging.info('Statistics written to {}.'.format(output_uri))
Example #6
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    # TODO(b/131719250): Add option to output a sample of anomalous examples for
    # each anomaly reason.
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics regarding
  the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or who wish to create
  their own Beam pipelines, should use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This must
      contain a schema.
    output_path: The file path to write the data statistics result to. If None,
      the function uses a temporary directory. The output will be a TFRecord
      file containing a single data statistics list proto, which can be read
      with the 'load_statistics' function.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto in which each dataset consists of the
      set of examples that exhibit a particular anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
    if stats_options.schema is None:
        raise ValueError('The specified stats_options must include a schema.')
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(),
                                   'anomaly_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)

    with beam.Pipeline(options=pipeline_options) as p:
        _ = (
            p
            | 'ReadData' >> beam.io.ReadFromTFRecord(file_pattern=data_location)
            | 'DecodeData' >> tf_example_decoder.DecodeTFExample(
                desired_batch_size=1)
            | 'DetectAnomalies' >> validation_api.IdentifyAnomalousExamples(
                stats_options)
            | 'GenerateSummaryStatistics' >>
            stats_impl.GenerateSlicedStatisticsImpl(
                stats_options, is_slicing_enabled=True)
            # TODO(b/112014711) Implement a custom sink to write the stats proto.
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))

    return stats_gen_lib.load_statistics(output_path)
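A minimal usage sketch for the function above; the schema must be supplied through stats_options (the schema variable and input pattern are hypothetical):

import tensorflow_data_validation as tfdv

stats_options = tfdv.StatsOptions(schema=schema)  # schema: a schema_pb2.Schema
anomalous_stats = validate_examples_in_tfrecord(
    data_location='/tmp/train_data/*.tfrecord',  # hypothetical input pattern
    stats_options=stats_options)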
Example #7
File: executor.py Project: vibhatha/tfx
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both the 'train' and 'eval' splits.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. When
          the stats_options exec_property also contains a schema, this input
          should not be provided.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          When a schema is provided as an input, the StatsOptions value should
          not also contain a schema.

    Raises:
      ValueError: If a schema is provided both as an input and as part of the
        StatsOptions exec_property.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        stats_options = options.StatsOptions()
        if STATS_OPTIONS_JSON_KEY in exec_properties:
            stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
            if stats_options_json:
                # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
                # json_utils
                stats_options = options.StatsOptions.from_json(
                    stats_options_json)
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            else:
                schema = io_utils.SchemaReader().read(
                    io_utils.get_only_uri_in_dir(
                        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
                stats_options.schema = schema

        split_uris = []
        for artifact in input_dict[EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
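The stats_options_json exec property consumed above can be produced with StatsOptions' JSON round-trip; a hedged sketch, assuming to_json is the counterpart of the from_json call used in the executor (num_top_values is one example knob):

stats_options = options.StatsOptions(num_top_values=50)
exec_properties = {STATS_OPTIONS_JSON_KEY: stats_options.to_json()}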
Example #8
def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[Text] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
    compression_type: Text = CompressionTypes.AUTO,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and returns the resulting
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or who wish to create
  their own Beam pipelines, should use the 'GenerateStatistics' PTransform
  API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to write the data statistics result to. If None,
      a temporary directory is used. The output will be a TFRecord file
      containing a single data statistics proto, which can be read with the
      'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.
    compression_type: Used to handle compressed input files. Default value is
      CompressionTypes.AUTO, in which case the file_path's extension will be
      used to detect the compression.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)

    batch_size = (stats_options.desired_batch_size
                  if stats_options.desired_batch_size
                  and stats_options.desired_batch_size > 0 else
                  constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # Auto detect tfrecord file compression format based on input data
        # path suffix.
        _ = (
            p
            | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=data_location, compression_type=compression_type)
            | 'DecodeData' >> tf_example_decoder.DecodeTFExample(
                desired_batch_size=batch_size)
            | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput' >> stats_api.WriteStatisticsToTFRecord(
                output_path))
    return stats_util.load_statistics(output_path)
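A usage sketch exercising the compression_type parameter added in this version, for gzipped TFRecords (the input pattern is hypothetical):

from apache_beam.io.filesystem import CompressionTypes

stats = generate_statistics_from_tfrecord(
    data_location='/tmp/train_data/*.tfrecord.gz',  # hypothetical input pattern
    compression_type=CompressionTypes.GZIP)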
Example #9
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """
        Main execution logic for the Sequencer component

        :param input_dict: input channels
        :param output_dict: output channels
        :param exec_properties: the execution properties defined in the spec
        """

        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        c = source_utils.load_source_path_class(source)

        # Get the schema
        schema_path = io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[constants.SCHEMA]))
        schema = io_utils.SchemaReader().read(schema_path)

        # TODO: Getting the statistics might help the future implementations

        sequence_step: BaseSequencerStep = c(schema=schema,
                                             statistics=None,
                                             **args)

        # Get split names
        input_artifact = artifact_utils.get_single_instance(
            input_dict[constants.INPUT_EXAMPLES])
        split_names = artifact_utils.decode_split_names(
            input_artifact.split_names)

        # Create output artifact
        output_artifact = artifact_utils.get_single_instance(
            output_dict[constants.OUTPUT_EXAMPLES])
        output_artifact.split_names = artifact_utils.encode_split_names(
            split_names)

        with self._make_beam_pipeline() as p:
            for s in split_names:
                input_uri = io_utils.all_files_pattern(
                    artifact_utils.get_split_uri(
                        input_dict[constants.INPUT_EXAMPLES], s))

                output_uri = artifact_utils.get_split_uri(
                    output_dict[constants.OUTPUT_EXAMPLES], s)
                output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

                # Read and decode the data
                data = (
                    p
                    | 'Read_' + s >> beam.io.ReadFromTFRecord(
                        file_pattern=input_uri)
                    | 'Decode_' + s >> tf_example_decoder.DecodeTFExample()
                    | 'ToDataFrame_' + s >> beam.ParDo(
                        utils.ConvertToDataframe()))

                # Window into sessions
                s_data = (
                    data
                    | 'AddCategory_' + s >> beam.ParDo(
                        sequence_step.get_category_do_fn())
                    | 'AddTimestamp_' + s >> beam.ParDo(
                        sequence_step.get_timestamp_do_fn())
                    | 'Sessions_' + s >> beam.WindowInto(
                        sequence_step.get_window()))

                # Combine and transform
                p_data = (
                    s_data
                    | 'Combine_' + s >> beam.CombinePerKey(
                        sequence_step.get_combine_fn()))

                # Write the results
                _ = (
                    p_data
                    | 'Global_' + s >> beam.WindowInto(GlobalWindows())
                    | 'RemoveKey_' + s >> beam.ParDo(RemoveKey())
                    | 'ToExample_' + s >> beam.Map(utils.df_to_example)
                    | 'Serialize_' + s >> beam.Map(utils.serialize)
                    | 'Write_' + s >> beam.io.WriteToTFRecord(
                        output_path,
                        file_name_suffix='.gz'))
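RemoveKey and GlobalWindows are imported elsewhere in this project; GlobalWindows ships with apache_beam.transforms.window, and a plausible minimal RemoveKey (an assumption, not the project's actual code) would be:

import apache_beam as beam

class RemoveKey(beam.DoFn):
    """Hypothetical reconstruction: drops the key used for CombinePerKey."""

    def process(self, element):
        key, value = element  # element is a (key, combined_value) pair
        yield value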