def test_batch_examples(self):
  examples = [{
      'a': np.array([1.0, 2.0], dtype=np.float32),
      'b': np.array(['a', 'b', 'c', 'e'])
  }, {
      'a': np.array([3.0, 4.0, 5.0], dtype=np.float32),
  }, {
      'b': np.array(['d', 'e', 'f']),
      'd': np.array([10, 20, 30], dtype=np.int64),
  }, {
      'b': np.array(['a', 'b', 'c'])
  }, {
      'c': np.array(['d', 'e', 'f'])
  }]
  expected_tables = [
      pa.Table.from_arrays([
          pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]], type=pa.list_(pa.float32())),
          pa.array([['a', 'b', 'c', 'e'], None])
      ], ['a', 'b']),
      pa.Table.from_arrays([
          pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
          pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
      ], ['b', 'd']),
      pa.Table.from_arrays([pa.array([['d', 'e', 'f']])], ['c']),
  ]
  with beam.Pipeline() as p:
    result = (
        p
        | beam.Create(examples)
        | batch_util.BatchExamplesToArrowTables(desired_batch_size=2))
    util.assert_that(
        result,
        test_util.make_arrow_tables_equal_fn(self, expected_tables))
def _ComputeTFDVStats(pcollection: beam.pvalue.PCollection,
                      schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
  """Computes statistics with TFDV.

  Args:
    pcollection: A PCollection of examples.
    schema: A Schema proto describing the examples.

  Returns:
    PCollection of `DatasetFeatureStatisticsList`.
  """
  feature_specs_from_schema = schema_utils.schema_as_feature_spec(
      schema).feature_spec

  def EncodeTFDV(element, feature_specs):
    """Encodes element in an in-memory format that TFDV expects."""
    if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
      raise ValueError(
          'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
          'input but not found.'.format(_TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

    # TODO(b/123549935): Obviate the numpy array conversions by
    # allowing TFDV to accept primitives in general, and TFT's
    # input/output format in particular.
    result = {}
    for feature_name, feature_spec in six.iteritems(feature_specs):
      feature_value = element.get(feature_name)
      if feature_value is None:
        result[feature_name] = None
      elif isinstance(feature_value, (np.ndarray, list)):
        result[feature_name] = np.asarray(
            feature_value, feature_spec.dtype.as_numpy_dtype)
      else:
        result[feature_name] = np.asarray(
            [feature_value], dtype=feature_spec.dtype.as_numpy_dtype)
    return result

  result = (
      pcollection
      # TODO(kestert): Remove encoding and batching steps once TFT
      # supports Arrow tables.
      | 'EncodeTFDV' >> beam.Map(
          EncodeTFDV, feature_specs=feature_specs_from_schema))

  # TODO(pachristopher): Remove this once TFDV 0.14 is released.
  (major, minor, _) = tfdv.__version__.split('.')
  if int(major) > 0 or int(minor) >= 14:
    result |= ('BatchExamplesToArrowTables' >>
               batch_util.BatchExamplesToArrowTables())

  return (result
          | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
              tfdv.StatsOptions(schema=schema)))
def compute_stats(input_handle,
                  stats_path,
                  max_rows=None,
                  for_eval=False,
                  pipeline_args=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery.
    for_eval: Query for eval set rows from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_handle.lower().endswith('csv'):
      raw_data = (
          pipeline
          | 'ReadData' >> beam.io.textio.ReadFromText(
              file_pattern=input_handle, skip_header_lines=1)
          | 'DecodeData' >> csv_decoder.DecodeCSV(
              column_names=taxi.CSV_COLUMN_NAMES))
    else:
      query = taxi.make_sql(
          table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'ConvertToTFDVInput' >> beam.Map(
              lambda x: {
                  key: np.asarray([x[key]])  # pylint: disable=g-long-lambda
                  for key in x if x[key] is not None
              }))

    # TODO(pachristopher): Remove this once TFDV 0.14 is released.
    (major, minor, _) = tfdv.__version__.split('.')
    if int(major) > 0 or int(minor) >= 14:
      raw_data |= ('BatchExamplesToArrowTables' >>
                   batch_util.BatchExamplesToArrowTables())

    _ = (
        raw_data
        | 'GenerateStatistics' >> tfdv.GenerateStatistics()
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            stats_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))
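# A minimal sketch of calling compute_stats. The file paths below are
# placeholders (not from the original code); pipeline_args are ordinary Beam
# flags. With a .csv input_handle the CSV branch above is taken; a
# DATASET.TABLE name would take the BigQuery branch instead.
compute_stats(
    input_handle='/tmp/taxi_data/train.csv',
    stats_path='/tmp/taxi_stats/train_stats.tfrecord',
    pipeline_args=['--runner=DirectRunner'])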
def expand(self, lines):
  """Decodes the input CSV records into Arrow tables.

  Args:
    lines: A PCollection of strings representing the lines in the CSV file.

  Returns:
    A PCollection of Arrow tables.
  """
  return (
      lines
      | 'DecodeCSVToDict' >> DecodeCSVToDict(
          column_names=self._column_names,
          delimiter=self._delimiter,
          skip_blank_lines=self._skip_blank_lines,
          schema=self._schema,
          infer_type_from_schema=self._infer_type_from_schema)
      | 'BatchExamplesToArrowTables' >> batch_util.BatchExamplesToArrowTables(
          desired_batch_size=self._desired_batch_size))
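# A sketch of driving a CSV-decoding PTransform whose expand() looks like the
# one above. The transform is assumed here to be csv_decoder.DecodeCSV (the
# same transform used in compute_stats earlier); the column names and input
# lines are made up for illustration.
with beam.Pipeline() as p:
  arrow_tables = (
      p
      | 'CreateLines' >> beam.Create(['1.0,hello', '2.5,world'])
      | 'DecodeCSV' >> csv_decoder.DecodeCSV(column_names=['f1', 'f2']))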
def DecodeTFExample(
    examples,
    desired_batch_size=constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE):  # pylint: disable=invalid-name
  """Decodes serialized TF examples into Arrow tables.

  Args:
    examples: A PCollection of strings representing serialized TF examples.
    desired_batch_size: Batch size. The output Arrow tables will have as many
      rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow tables.
  """
  decoder = TFExampleDecoder()
  return (
      examples
      | 'ParseTFExamples' >> beam.Map(decoder.decode)
      | 'BatchExamplesToArrowTables' >> batch_util.BatchExamplesToArrowTables(
          desired_batch_size=desired_batch_size))
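# A minimal usage sketch, assuming DecodeTFExample is exposed as a Beam
# PTransform (e.g. via @beam.ptransform_fn, which the PCollection-valued first
# argument and the invalid-name pylint suppression suggest). The tf.Example
# below is synthetic.
import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'f1': tf.train.Feature(float_list=tf.train.FloatList(value=[1.0])),
}))
with beam.Pipeline() as p:
  tables = (
      p
      | 'CreateExamples' >> beam.Create([example.SerializeToString()])
      | 'DecodeTFExample' >> DecodeTFExample(desired_batch_size=100))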
def generate_drift_reports(
    request_response_log_table: str,
    instance_type: InstanceType,
    feature_names: List[str],
    start_time: datetime.datetime,
    end_time: datetime.datetime,
    output_path: GCSPath,
    schema: schema_pb2.Schema,
    baseline_stats: statistics_pb2.DatasetFeatureStatisticsList,
    stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
):
  """Computes statistics and anomalies for a time window in an AI Platform
  Prediction request-response log.

  Args:
    request_response_log_table: The full name of a BigQuery table containing
      the request-response log.
    instance_type: The type of instances logged in the
      request_response_log_table. Currently, the only supported instance types
      are a simple list (InstanceType.SIMPLE_LIST) and a JSON object
      (InstanceType.JSON_OBJECT).
    feature_names: A list of feature names. Must be provided if the
      instance_type is InstanceType.SIMPLE_LIST.
    start_time: The beginning of the time window.
    end_time: The end of the time window.
    output_path: The GCS location to output the statistics and anomalies
      protocol buffers to. The file names will be `stats.pb` and
      `anomalies.pbtxt`.
    schema: A Schema protobuf describing the expected schema.
    baseline_stats: A `DatasetFeatureStatisticsList` proto with baseline
      statistics.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.
  """
  query = _generate_query(request_response_log_table, start_time, end_time)

  stats_output_path = os.path.join(output_path, _STATS_FILENAME)
  anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

  with beam.Pipeline(options=pipeline_options) as p:
    raw_examples = (
        p
        | 'GetData' >> beam.io.Read(
            beam.io.BigQuerySource(query=query, use_standard_sql=True)))

    if instance_type == InstanceType.SIMPLE_LIST:
      examples = (
          raw_examples
          | 'SimpleInstancesToBeamExamples' >> beam.ParDo(
              SimpleListCoder(feature_names)))
    elif instance_type == InstanceType.JSON_OBJECT:
      examples = (
          raw_examples
          | 'JSONObjectInstancesToBeamExamples' >> beam.ParDo(
              JSONObjectCoder()))
    else:
      raise TypeError('Unsupported instance type.')

    stats = (
        examples
        | 'BeamExamplesToArrow' >> batch_util.BatchExamplesToArrowTables()
        | 'GenerateStatistics' >> tfdv.GenerateStatistics(stats_options))

    _ = (
        stats
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            file_path_prefix=stats_output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))

    _ = (
        stats
        | 'ValidateStatistics' >> beam.Map(
            tfdv.validate_statistics, schema=schema)
        | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
            file_path_prefix=anomalies_output_path,
            shard_name_template='',
            append_trailing_newlines=False))
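# A hypothetical invocation of generate_drift_reports. The table name, time
# window, and GCS paths are placeholders; the schema and baseline statistics
# are loaded with standard TFDV utilities.
schema = tfdv.load_schema_text('gs://my-bucket/serving_schema.pbtxt')
baseline_stats = tfdv.load_statistics('gs://my-bucket/baseline_stats.pb')

generate_drift_reports(
    request_response_log_table='my-project.logging.request_response_log',
    instance_type=InstanceType.JSON_OBJECT,
    feature_names=[],  # only required for InstanceType.SIMPLE_LIST
    start_time=datetime.datetime(2020, 1, 1),
    end_time=datetime.datetime(2020, 1, 2),
    output_path='gs://my-bucket/drift_reports/2020-01-01',
    schema=schema,
    baseline_stats=baseline_stats,
    pipeline_options=PipelineOptions(['--runner=DirectRunner']))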
def generate_statistics_from_bq(
    query: Text,
    output_path: Text,
    schema: schema_pb2.Schema,
    stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Computes data statistics from a BigQuery query result.

  Args:
    query: The BigQuery query.
    output_path: The file path to output data statistics result to. It will be
      a TFRecord file containing a single data statistics proto, and can be
      read with the 'load_statistics' API. If you run this function on Google
      Cloud, you must specify an output_path. Specifying None may cause an
      error.
    schema: A Schema protobuf to use for data validation.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  column_specs = _get_column_specs(query)
  if not validate_bq_types(column_specs.values()):
    raise ValueError('Unsupported BigQuery data types.')

  batch_size = (
      stats_options.desired_batch_size
      if stats_options.desired_batch_size and
      stats_options.desired_batch_size > 0
      else tfdv.constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)

  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter

  stats_output_path = os.path.join(output_path, _STATS_FILENAME)
  anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

  with beam.Pipeline(options=pipeline_options) as p:
    stats = (
        p
        | 'GetData' >> beam.io.Read(
            beam.io.BigQuerySource(query=query, use_standard_sql=True))
        # | 'DecodeData' >> DecodeBigQuery(column_specs,
        #                                  desired_batch_size=batch_size)
        | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
        | 'GenerateStatistics' >> tfdv.GenerateStatistics())

    _ = (
        stats
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            file_path_prefix=stats_output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))

    _ = (
        stats
        | 'ValidateStatistics' >> beam.Map(
            tfdv.validate_statistics, schema=schema)
        | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
            file_path_prefix=anomalies_output_path,
            shard_name_template='',
            append_trailing_newlines=False))
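# A hypothetical invocation of generate_statistics_from_bq. The query, output
# location, and schema path are placeholders; the schema is loaded with the
# standard TFDV utility.
schema = tfdv.load_schema_text('gs://my-bucket/schema.pbtxt')
generate_statistics_from_bq(
    query='SELECT * FROM `my-project.my_dataset.my_table`',
    output_path='gs://my-bucket/stats_output',
    schema=schema,
    pipeline_options=PipelineOptions(['--runner=DirectRunner']))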
  }, {
      'f1': [3],
      'f3': ['bbb'],
      'f4': [1]
  }]

  with beam.Pipeline(options=pipeline_options) as p:
    stats = (
        p
        | 'GetData' >> beam.Create(instances)
        # | 'BatchDictionaries' >> beam.BatchElements(
        #     min_batch_size=desired_batch_size,
        #     max_batch_size=desired_batch_size)
        # | 'CovertToArrowTables' >> beam.ParDo(
        #     BatchedDictsToArrowTable())
        | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
        | 'GenerateStatistics' >> tfdv.GenerateStatistics())

    _ = (
        stats
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            file_path_prefix=stats_output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))

    # _ = (
    #     stats
    #     | 'ValidateStatistics' >> beam.Map(
    #         tfdv.validate_statistics, schema=schema)
    #     | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
    #         file_path_prefix=anomalies_output_path,
    #         shard_name_template='',
    #         append_trailing_newlines=True))