Python GenerateStatistics示例，tensorflow_data_validation.GenerateStatistics Python示例

示例#1

0

显示文件

文件： sequence_example_e2e_test.py 项目： stjordanis/data-validation

    def test_e2e(self, stats_options, expected_stats_pbtxt,
                 expected_schema_pbtxt):
        """Runs the statistics pipeline end to end and checks its outputs.

        Reads `self._input_file` through a `TFSequenceExampleRecord` TFXIO,
        generates statistics in a Beam pipeline, writes them to a `stats`
        file under `self._output_dir`, and then compares both the loaded
        statistics and the schema inferred from them with the expectations.

        Args:
          stats_options: options handed to `tfdv.GenerateStatistics`.
          expected_stats_pbtxt: text-format `DatasetFeatureStatisticsList`.
          expected_schema_pbtxt: text-format `Schema`.
        """
        source = tf_sequence_example_record.TFSequenceExampleRecord(
            self._input_file, ['tfdv', 'test'])
        stats_file = os.path.join(self._output_dir, 'stats')

        with beam.Pipeline() as pipeline:
            records = pipeline | 'TFXIORead' >> source.BeamSource()
            stats = (records
                     | 'GenerateStats' >> tfdv.GenerateStatistics(
                         stats_options))
            _ = stats | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(
                stats_file)

        # The pipeline has finished once the `with` block exits, so the
        # statistics can be read back from disk.
        stats_from_disk = tfdv.load_statistics(stats_file)
        check_stats = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self,
            text_format.Parse(expected_stats_pbtxt,
                              statistics_pb2.DatasetFeatureStatisticsList()))
        check_stats([stats_from_disk])

        inferred_schema = tfdv.infer_schema(stats_from_disk,
                                            infer_feature_shape=True)

        # Clear the legacy field, when the schema object exposes it, before
        # comparing against the expected schema.
        if hasattr(inferred_schema, 'generate_legacy_feature_spec'):
            inferred_schema.ClearField('generate_legacy_feature_spec')

        expected_schema = text_format.Parse(expected_schema_pbtxt,
                                            schema_pb2.Schema())
        self._assert_schema_equal(inferred_schema, expected_schema)

示例#2

0

显示文件

def compute_stats(bq_table,
                  step,
                  stats_path,
                  max_rows=None,
                  pipeline_args=None):
    """Computes TFDV statistics over rows queried from BigQuery.

    Args:
        bq_table: BigQuery table the SQL query is generated for.
        step: (test, train)
        stats_path: File path prefix handed to `WriteToTFRecord` for the
          serialized statistics.
        max_rows: Forwarded to the SQL query builder; presumably caps the
          number of rows read (confirm against `sql_queries`).
        pipeline_args: additional DataflowRunner or DirectRunner args passed
          to the beam pipeline.
    """

    def _row_to_table(row):
        # Every non-null column becomes a single-row list column; null
        # columns are left out of the table altogether.
        columns = {
            name: [[row[name]]]
            for name in row if row[name] is not None
        }
        return pa.Table.from_pydict(columns)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        sql = sql_queries.get_train_test_sql_query(bq_table, step, max_rows)
        bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

        tables = (
            pipeline
            | 'ReadBigQuery' >> beam.io.Read(bq_source)
            | 'ConvertToTFDVInput' >> beam.Map(_row_to_table))

        stats = tables | 'GenerateStatistics' >> tfdv.GenerateStatistics()

        stats_coder = beam.coders.ProtoCoder(
            statistics_pb2.DatasetFeatureStatisticsList)
        _ = stats | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            stats_path, shard_name_template='', coder=stats_coder)

示例#3

0

显示文件

文件： tfdv_analyze_and_validate.py 项目： AfterShip/aftership-beam

def compute_stats(
    input_handle,
    stats_path,
    max_rows=None,
    for_eval=False,
    pipeline_args=None,
    publish_to_bq=None,
    metrics_dataset=None,
    metrics_table=None,
    project=None):
  """Computes statistics on BigQuery data and optionally publishes metrics.

  Args:
    input_handle: BigQuery table name handed to `taxi.make_sql`.
    stats_path: File path prefix the statistics TFRecord is written to.
    max_rows: Number of rows to query from BigQuery
    for_eval: Query for eval set rows from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: When truthy, a `MetricsReader` is created and the pipeline
      result is published through it after the run finishes.
    metrics_dataset: BigQuery dataset given to the `MetricsReader`.
    metrics_table: BigQuery table given to the `MetricsReader`; also used as
      the metrics namespace.
    project: Project used for the BigQuery read and the `MetricsReader`.
  """

  def _to_tfdv_input(row):
    # Wrap each non-null value in a one-element array; drop null columns.
    return {
        key: np.asarray([row[key]])
        for key in row if row[key] is not None
    }

  namespace = metrics_table
  pipeline = beam.Pipeline(argv=pipeline_args)

  metrics_monitor = None
  if publish_to_bq:
    namespace_filter = MetricsFilter().with_namespace(namespace)
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=namespace_filter,
    )

  sql = taxi.make_sql(
      table_name=input_handle, max_rows=max_rows, for_eval=for_eval)

  rows = pipeline | 'ReadBigQuery' >> ReadFromBigQuery(
      query=sql, project=project, use_standard_sql=True)
  timed_rows = rows | 'Measure time: Start' >> beam.ParDo(
      MeasureTime(namespace))
  raw_data = timed_rows | 'ConvertToTFDVInput' >> beam.Map(_to_tfdv_input)

  stats = raw_data | 'GenerateStatistics' >> tfdv.GenerateStatistics()
  timed_stats = stats | 'Measure time: End' >> beam.ParDo(
      MeasureTime(namespace))
  _ = timed_stats | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
      stats_path,
      shard_name_template='',
      coder=beam.coders.ProtoCoder(
          statistics_pb2.DatasetFeatureStatisticsList))

  # The pipeline is run explicitly (no `with` block) because the run result
  # is needed afterwards for publishing metrics.
  run_result = pipeline.run()
  run_result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(run_result)

示例#4

0

显示文件

  def _ComputeTFDVStats(pcollection: beam.pvalue.PCollection,
                        schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
    """Computes Statistics with TFDV.

    Args:
      pcollection: pcollection of examples.
      schema: schema.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """
    feature_specs_from_schema = schema_utils.schema_as_feature_spec(
        schema).feature_spec

    def EncodeTFDV(element, feature_specs):
      """Encodes element in an in-memory format that TFDV expects.

      Args:
        element: dict-like example; must contain the internal key feature.
        feature_specs: mapping of feature name to feature spec; only the
          features listed here are emitted.

      Returns:
        Dict mapping every feature name in `feature_specs` to a numpy array
        of the spec's dtype, or to None when the element has no value.

      Raises:
        ValueError: if the internal key feature is missing from `element`.
      """
      if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
        raise ValueError(
            'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
            'input but not found.'.format(_TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

      # TODO(b/123549935): Obviate the numpy array conversions by
      # allowing TFDV to accept primitives in general, and TFT's
      # input/output format in particular.
      result = {}
      for feature_name, feature_spec in six.iteritems(feature_specs):
        feature_value = element.get(feature_name)
        if feature_value is None:
          result[feature_name] = None
        elif isinstance(feature_value, (np.ndarray, list)):
          result[feature_name] = np.asarray(
              feature_value, feature_spec.dtype.as_numpy_dtype)
        else:
          # Scalars are wrapped so that every feature is a 1-D array.
          result[feature_name] = np.asarray(
              [feature_value], dtype=feature_spec.dtype.as_numpy_dtype)

      return result

    result = (pcollection
              # TODO(kestert): Remove encoding and batching steps once TFT
              # supports Arrow tables.
              | 'EncodeTFDV' >> beam.Map(
                  EncodeTFDV, feature_specs=feature_specs_from_schema))

    # TODO(pachristopher): Remove this once TFDV 0.14 is released.
    # Only the first two components are inspected. Unpacking exactly three
    # parts raised ValueError for version strings with extra components
    # (for example '0.15.0.dev').
    major, minor = tfdv.__version__.split('.')[:2]
    if int(major) > 0 or int(minor) >= 14:
      result |= ('BatchExamplesToArrowTables' >>
                 batch_util.BatchExamplesToArrowTables())

    return (result
            | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
                tfdv.StatsOptions(schema=schema)))

示例#5

0

显示文件

文件： tfdv_analyze_and_validate.py 项目： anitameh/tfx-1

def compute_stats(input_handle,
                  stats_path,
                  max_rows=None,
                  for_eval=False,
                  pipeline_args=None):
    """Computes statistics on the input data.

    Args:
      input_handle: BigQuery table name to process specified as DATASET.TABLE
        or path to csv file with input data. Anything whose lower-cased name
        ends in 'csv' is read as a CSV file.
      stats_path: File path prefix the statistics TFRecord is written to.
      max_rows: Number of rows to query from BigQuery
      for_eval: Query for eval set rows from BigQuery
      pipeline_args: additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
    """

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        if input_handle.lower().endswith('csv'):
            raw_data = (pipeline
                        | 'ReadData' >> beam.io.textio.ReadFromText(
                            file_pattern=input_handle, skip_header_lines=1)
                        | 'DecodeData' >> csv_decoder.DecodeCSV(
                            column_names=taxi.CSV_COLUMN_NAMES))
        else:
            query = taxi.make_sql(table_name=input_handle,
                                  max_rows=max_rows,
                                  for_eval=for_eval)
            raw_data = (
                pipeline
                | 'ReadBigQuery' >> beam.io.Read(
                    beam.io.BigQuerySource(query=query, use_standard_sql=True))
                | 'ConvertToTFDVInput' >> beam.Map(
                    lambda x: {
                        key: np.asarray([x[key]])  # pylint: disable=g-long-lambda
                        for key in x if x[key] is not None
                    }))
            # TODO(pachristopher): Remove this once TFDV 0.14 is released.
            # Only the first two components are inspected. Unpacking exactly
            # three parts raised ValueError for version strings with extra
            # components (for example '0.15.0.dev').
            major, minor = tfdv.__version__.split('.')[:2]
            if int(major) > 0 or int(minor) >= 14:
                raw_data |= ('BatchExamplesToArrowTables' >>
                             batch_util.BatchExamplesToArrowTables())

        _ = (raw_data
             | 'GenerateStatistics' >> tfdv.GenerateStatistics()
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 stats_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))

示例#6

0

显示文件

文件： executor.py 项目： zorrock/tfx

    def _ComputeTFDVStats(
            pcollection: beam.pvalue.PCollection,
            schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
        """Computes statistics with TFDV.

        Args:
          pcollection: pcollection of examples.
          schema: schema.

        Returns:
          PCollection of `DatasetFeatureStatisticsList`.
        """
        feature_specs_from_schema = schema_utils.schema_as_feature_spec(
            schema).feature_spec

        def EncodeTFDV(element, feature_specs):
            """Encodes element in an in-memory format that TFDV expects."""
            if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
                raise ValueError(
                    'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
                    'input but not found.'.format(
                        _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

            # TODO(b/123549935): Obviate the numpy array conversions by
            # allowing TFDV to accept primitives in general, and TFT's
            # input/output format in particular.
            encoded = {}
            for name, spec in six.iteritems(feature_specs):
                value = element.get(name)
                if value is None:
                    # Features absent from the element are kept as None.
                    encoded[name] = None
                    continue
                if not isinstance(value, (np.ndarray, list)):
                    # Wrap scalars so every feature becomes a 1-D array.
                    value = [value]
                encoded[name] = np.asarray(
                    value, dtype=spec.dtype.as_numpy_dtype)

            return encoded

        encoded_examples = pcollection | 'EncodeTFDV' >> beam.Map(
            EncodeTFDV, feature_specs=feature_specs_from_schema)
        return (encoded_examples
                | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
                    tfdv.StatsOptions(schema=schema)))

示例#7

0

显示文件

def run_pipeline(flags, pipeline_option):
    """Reads CSV text, computes TFDV statistics and writes them as TFRecord.

    Args:
        flags: object exposing `input_path` (file pattern to read) and
          `output_path` (file path prefix for the statistics output).
        pipeline_option: options handed to `beam.Pipeline`.
    """
    source_pattern = flags.input_path
    destination = flags.output_path
    header = CSV_COLUMNS

    # If a header is not provided, assume the first line in a file
    # to be the header.
    lines_to_skip = 0 if header is not None else 1

    with beam.Pipeline(options=pipeline_option) as pipeline:
        lines = pipeline | 'ReadData' >> beam.io.textio.ReadFromText(
            file_pattern=source_pattern, skip_header_lines=lines_to_skip)
        decoded = lines | 'DecodeData' >> csv_decoder.DecodeCSV(
            column_names=header)
        stats = decoded | 'GenerateStatistics' >> tfdv.GenerateStatistics()
        _ = stats | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            destination,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList))

示例#8

0

显示文件

    def _ComputeTFDVStats(pcollection, schema):
        """Computes statistics with TFDV.

        Args:
          pcollection: pcollection of examples.
          schema: schema.

        Returns:
          PCollection of `DatasetFeatureStatisticsList`.
        """

        def EncodeTFDV(element):
            """Encodes element in an in-memory format that TFDV expects."""
            assert _TRANSFORM_INTERNAL_FEATURE_FOR_KEY in element

            # TODO(b/123549935): Obviate the numpy array conversions by
            # allowing TFDV to accept primitives in general, and TFT's
            # input/output format in particular.
            # TODO(kestert): Iterate through schema instead of element.items and
            # encode missing elements of `element` as None.
            encoded = {}
            for name, value in element.items():
                if name == _TRANSFORM_INTERNAL_FEATURE_FOR_KEY:
                    # Make sure the synthetic key feature doesn't get encoded.
                    continue
                if value is None or isinstance(value, np.ndarray):
                    # Already in the expected form; pass through untouched.
                    encoded[name] = value
                elif not isinstance(value, list):
                    # Scalars are wrapped into a one-element array.
                    encoded[name] = np.asarray([value])
                elif value:
                    encoded[name] = np.asarray(value)
                # An empty list is dropped: its dtype cannot be determined.
                # TODO(kestert): Use Metadata to determine the dtype and emit
                # np.asarray([], dtype=...) instead.
            return encoded

        encoded_examples = pcollection | 'EncodeTFDV' >> beam.Map(EncodeTFDV)
        return (encoded_examples
                | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
                    tfdv.StatsOptions(schema=schema)))

示例#9

0

显示文件

文件： drift_reports.py 项目： jarokaz/tfx-sandbox

def generate_drift_reports(
        request_response_log_table: str,
        instance_type: InstanceType,
        feature_names: List[str],
        start_time: datetime.datetime,
        end_time: datetime.datetime,
        output_path: GCSPath,
        schema: schema_pb2.Schema,
        baseline_stats: statistics_pb2.DatasetFeatureStatisticsList,
        stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
        pipeline_options: Optional[PipelineOptions] = None,
):
    """Computes statistics and anomalies for a time window in AI Platform Prediction
    request-response log.

    Args:
      request_response_log_table: A full name of a BigQuery table
        with the request_response_log
      instance_type: The type of instances logged in the request_response_log_table.
        Currently, the only supported instance types are: a simple list
        (InstanceType.SIMPLE_LIST) and a JSON object (InstanceType.JSON_OBJECT)
      feature_names: A list of feature names. Must be provided if the instance_type is
        InstanceType.SIMPLE_LIST
      start_time: The beginning of a time window.
      end_time: The end of a time window.
      output_path: The GCS location to output the statistics and anomalies
        proto buffers to. The file names will be `stats.pb` and `anomalies.pbtxt`.
      schema: A Schema protobuf describing the expected schema.
      baseline_stats: Baseline statistics. They are forwarded to
        `tfdv.validate_statistics` as `previous_statistics`, so that drift
        against the baseline can be evaluated.
      stats_options: `tfdv.StatsOptions` for generating data statistics.
        NOTE: the default instance is created once, when the function is
        defined, and is shared by every call that omits this argument.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline runner
        (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.

    Raises:
      TypeError: if `instance_type` is not one of the supported types.
    """

    query = _generate_query(request_response_log_table, start_time, end_time)
    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    with beam.Pipeline(options=pipeline_options) as p:
        raw_examples = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

        if instance_type == InstanceType.SIMPLE_LIST:
            examples = (
                raw_examples
                | 'SimpleInstancesToBeamExamples' >> beam.ParDo(
                    SimpleListCoder(feature_names)))
        elif instance_type == InstanceType.JSON_OBJECT:
            examples = (
                raw_examples
                | 'JSONObjectInstancesToBeamExamples' >> beam.ParDo(
                    JSONObjectCoder()))
        else:
            raise TypeError("Unsupported instance type")

        stats = (
            examples
            | 'BeamExamplesToArrow' >> batch_util.BatchExamplesToArrowTables()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(stats_options))

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))

        # The baseline statistics were previously accepted but never used;
        # without them validate_statistics has nothing to compare against.
        _ = (stats
             | 'ValidateStatistics' >> beam.Map(
                 tfdv.validate_statistics,
                 schema=schema,
                 previous_statistics=baseline_stats)
             | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                 file_path_prefix=anomalies_output_path,
                 shard_name_template='',
                 append_trailing_newlines=False))

示例#10

0

显示文件

文件： log_analyzer.py 项目： zoltanszekely21/mlops-on-gcp

def analyze_log_records(
    request_response_log_table: str,
    model: str,
    version: str,
    start_time: datetime,
    end_time: datetime,
    output_path: str,
    schema: schema_pb2.Schema,
    baseline_stats: Optional[
        statistics_pb2.DatasetFeatureStatisticsList] = None,
    time_window: Optional[timedelta] = None,
    pipeline_options: Optional[PipelineOptions] = None,
):
    """
    Computes statistics and detects anomalies for a time series of records
    in an AI Platform Prediction request-response log.

    The function runs an Apache Beam job that calculates statistics and
    detects data anomalies in a time series of records retrieved from an
    AI Platform Prediction request-response log. Optionally, statistics are
    also calculated per time slice within the series. The job produces a
    statistics_pb2.DatasetFeatureStatisticsList protobuf with descriptive
    statistics and an anomaly report produced by `tfdv.validate_statistics`.
    Both are written under `output_path`.

    Args:
      request_response_log_table: A full name of a BigQuery table
        with the request_response_log
      model: Model name passed to the query generator.
      version: Model version passed to the query generator.
      start_time: The start of the time series. Seconds and microseconds are
        dropped (the value is truncated to whole minutes).
      end_time: The end of the time series. Seconds and microseconds are
        dropped (the value is truncated to whole minutes).
      output_path: The GCS location to output the statistics and anomaly
        proto buffers to. The file names will be `stats.pb` and `anomalies.pbtxt`.
      schema: A Schema protobuf describing the expected schema. NOTE: when
        slicing is enabled, a slicing feature is appended to this object in
        place.
      baseline_stats: If provided, the baseline statistics will be used to detect
        distribution anomalies.
      time_window: If provided, it is truncated to whole minutes and, when the
        time series is longer than the window, statistics are additionally
        sliced on the slicing column.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline runner
        (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.
    """

    # Truncate both ends of the series to whole minutes and build the query.
    end_time = end_time.replace(second=0, microsecond=0)
    start_time = start_time.replace(second=0, microsecond=0)
    timestamp_format = '%Y-%m-%dT%H:%M:%S'
    query = _generate_query(
        table_name=request_response_log_table,
        model=model,
        version=version,
        start_time=start_time.strftime(timestamp_format),
        end_time=end_time.strftime(timestamp_format))

    # Slicing is only enabled when a window is given and the series spans
    # more than one window.
    stats_options = tfdv.StatsOptions(schema=schema)
    slicing_column = None
    if time_window:
        whole_minute_seconds = (time_window.seconds // 60) * 60
        time_window = timedelta(days=time_window.days,
                                seconds=whole_minute_seconds)

        series_length = end_time - start_time
        if series_length > time_window:
            slicer = tfdv.get_feature_value_slicer(
                features={_SLICING_COLUMN_NAME: None})
            stats_options.slice_functions = [slicer]
            slicing_column = _SLICING_COLUMN_NAME
            # Register the slicing column in the schema (mutates `schema`).
            slicing_feature = schema.feature.add()
            slicing_feature.name = _SLICING_COLUMN_NAME
            slicing_feature.type = _SLICING_COLUMN_TYPE

    # Output locations for the statistics and the anomaly report.
    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    # Define and start the pipeline.
    with beam.Pipeline(options=pipeline_options) as pipeline:
        raw_examples = pipeline | 'GetData' >> beam.io.Read(
            beam.io.BigQuerySource(query=query, use_standard_sql=True))

        examples = raw_examples | 'InstancesToBeamExamples' >> beam.ParDo(
            InstanceCoder(schema, end_time, time_window, slicing_column))

        record_batches = (
            examples
            | 'BeamExamplesToArrow' >>
            tfdv.utils.batch_util.BatchExamplesToArrowRecordBatches())
        stats = record_batches | 'GenerateStatistics' >> (
            tfdv.GenerateStatistics(options=stats_options))

        _ = stats | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            file_path_prefix=stats_output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList))

        anomalies = stats | 'ValidateStatistics' >> beam.Map(
            tfdv.validate_statistics,
            schema=schema,
            previous_statistics=baseline_stats)

        alerted = anomalies | 'AlertIfAnomalies' >> beam.Map(
            _alert_if_anomalies, anomalies_output_path)
        _ = alerted | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
            file_path_prefix=anomalies_output_path,
            shard_name_template='',
            append_trailing_newlines=False)

示例#11

0

显示文件

文件： gen_stats-Copy1.py 项目： jarokaz/tfx-sandbox

def generate_statistics_from_bq(
    query: Text,
    output_path: Text,
    schema: schema_pb2.Schema,
    stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Computes data statistics and anomalies from a BigQuery query result.

    Args:
      query: The BigQuery query.
      output_path: The location to write the results to. The statistics
        (a TFRecord file containing a single data statistics proto, readable
        with the 'load_statistics' API) and the anomalies are written to
        separate files under this path.
        If you run this function on Google Cloud, you must specify an
        output_path. Specifying None may cause an error.
      schema: A Schema protobuf to use for data validation
      stats_options: `tfdv.StatsOptions` for generating data statistics.
        NOTE: the default instance is created once, when the function is
        defined, and is shared by every call that omits this argument.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline runner
        (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.

    Returns:
      The DatasetFeatureStatisticsList proto read back from the statistics
      file written by the pipeline.

    Raises:
      ValueError: if the query result contains unsupported BigQuery types.
    """

    # Resolve the column specs once and reuse them for validation.
    column_specs = _get_column_specs(query)
    if not validate_bq_types(column_specs.values()):
        raise ValueError("Unsupported BigQuery data types.")

    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter

    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    with beam.Pipeline(options=pipeline_options) as p:
        stats = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
            # Honour the caller-supplied options instead of the defaults.
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(stats_options))

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))
        _ = (stats
             | 'ValidateStatistics' >> beam.Map(tfdv.validate_statistics,
                                                schema=schema)
             | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                 file_path_prefix=anomalies_output_path,
                 shard_name_template='',
                 append_trailing_newlines=False))

    # The pipeline has completed once the `with` block exits, so the written
    # statistics can be loaded and returned as the signature promises.
    return tfdv.load_statistics(stats_output_path)

示例#12

0

显示文件

        'f1': [3],
        'f3': ['bbb'],
        'f4': [1]
    }]

    with beam.Pipeline(options=pipeline_options) as p:
        stats = (
            p
            | 'GetData' >> beam.Create(instances)
            #           | 'BatchDictionaries' >> beam.BatchElements(
            #                 min_batch_size = desired_batch_size,
            #                 max_batch_size = desired_batch_size)
            #           | 'CovertToArrowTables' >> beam.ParDo(
            #               BatchedDictsToArrowTable())
            | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics())

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))

    # _ = (stats
    #| 'ValidateStatistics' >> beam.Map(tfdv.validate_statistics, schema=schema)
    #    | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
    #                                  file_path_prefix=anomalies_output_path,
    #                                   shard_name_template='',
    #                                 append_trailing_newlines=True))