def test_e2e(self, stats_options, expected_stats_pbtxt, expected_schema_pbtxt):
    """Checks generated statistics and the inferred schema end to end.

    Reads `self._input_file` through a `TFSequenceExampleRecord` TFXIO source,
    generates statistics with `stats_options`, writes them under
    `self._output_dir`, and compares both the reloaded statistics and the
    schema inferred from them with the expected text-format protos.

    Args:
      stats_options: Options handed to `tfdv.GenerateStatistics`.
      expected_stats_pbtxt: Expected `DatasetFeatureStatisticsList`, in proto
        text format.
      expected_schema_pbtxt: Expected `Schema`, in proto text format.
    """
    source_tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
        self._input_file, ['tfdv', 'test'])
    stats_path = os.path.join(self._output_dir, 'stats')

    # The pipeline runs when the `with` block exits, so the statistics file
    # is only read back afterwards.
    with beam.Pipeline() as pipeline:
        records = pipeline | 'TFXIORead' >> source_tfxio.BeamSource()
        generated = (
            records
            | 'GenerateStats' >> tfdv.GenerateStatistics(stats_options))
        _ = (
            generated
            | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_path))

    # Compare the materialized statistics with the expectation.
    actual_stats = tfdv.load_statistics(stats_path)
    expected_stats = text_format.Parse(
        expected_stats_pbtxt, statistics_pb2.DatasetFeatureStatisticsList())
    check_stats = test_util.make_dataset_feature_stats_list_proto_equal_fn(
        self, expected_stats)
    check_stats([actual_stats])

    # Infer a schema from the statistics and compare it as well. The legacy
    # flag is cleared first (when the Schema proto defines it) so that it
    # does not take part in the comparison.
    inferred_schema = tfdv.infer_schema(actual_stats, infer_feature_shape=True)
    legacy_field = 'generate_legacy_feature_spec'
    if hasattr(inferred_schema, legacy_field):
        inferred_schema.ClearField(legacy_field)
    expected_schema = text_format.Parse(
        expected_schema_pbtxt, schema_pb2.Schema())
    self._assert_schema_equal(inferred_schema, expected_schema)
def compute_stats(bq_table, step, stats_path, max_rows=None,
                  pipeline_args=None):
    """Computes TFDV statistics over rows queried from BigQuery.

    Args:
      bq_table: BigQuery table; forwarded to
        `sql_queries.get_train_test_sql_query`.
      step: Which split to query (documented upstream as "test" or "train");
        forwarded to the query builder.
      stats_path: Output location passed to `WriteToTFRecord`; the statistics
        proto is written there with an empty shard-name template.
      max_rows: Forwarded to the query builder.
      pipeline_args: additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
    """

    def _row_to_table(row):
        """Turns one BigQuery row dict into an Arrow table, dropping nulls."""
        columns = {
            name: [[row[name]]] for name in row if row[name] is not None
        }
        return pa.Table.from_pydict(columns)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        query = sql_queries.get_train_test_sql_query(bq_table, step, max_rows)
        bq_source = beam.io.BigQuerySource(query=query, use_standard_sql=True)

        rows = pipeline | 'ReadBigQuery' >> beam.io.Read(bq_source)
        raw_data = rows | 'ConvertToTFDVInput' >> beam.Map(_row_to_table)

        stats = raw_data | 'GenerateStatistics' >> tfdv.GenerateStatistics()
        _ = (
            stats
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                stats_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
def compute_stats(
        input_handle,
        stats_path,
        max_rows=None,
        for_eval=False,
        pipeline_args=None,
        publish_to_bq=None,
        metrics_dataset=None,
        metrics_table=None,
        project=None):
    """Computes statistics on BigQuery data, optionally publishing metrics.

    Args:
      input_handle: Table name handed to `taxi.make_sql` to build the query.
      stats_path: Output location passed to `WriteToTFRecord` for the
        statistics proto.
      max_rows: Number of rows to query from BigQuery.
      for_eval: Query for eval set rows from BigQuery.
      pipeline_args: additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
      publish_to_bq: When truthy, a `MetricsReader` is created and the
        pipeline result is published through it once the run has finished.
      metrics_dataset: BigQuery dataset handed to the `MetricsReader`.
      metrics_table: BigQuery table handed to the `MetricsReader`; also used
        as the metrics namespace for the `MeasureTime` steps.
      project: Project used for the BigQuery read and the `MetricsReader`.
    """

    def _to_tfdv_input(row):
        """Wraps every non-null value of a row in a one-element array."""
        return {
            name: np.asarray([row[name]])
            for name in row
            if row[name] is not None
        }

    # The metrics table name doubles as the metrics namespace.
    namespace = metrics_table
    pipeline = beam.Pipeline(argv=pipeline_args)

    metrics_monitor = None
    if publish_to_bq:
        namespace_filter = MetricsFilter().with_namespace(namespace)
        metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=project,
            bq_table=metrics_table,
            bq_dataset=metrics_dataset,
            namespace=namespace,
            filters=namespace_filter,
        )

    query = taxi.make_sql(
        table_name=input_handle, max_rows=max_rows, for_eval=for_eval)

    rows = pipeline | 'ReadBigQuery' >> ReadFromBigQuery(
        query=query, project=project, use_standard_sql=True)
    timed_rows = rows | 'Measure time: Start' >> beam.ParDo(
        MeasureTime(namespace))
    raw_data = timed_rows | 'ConvertToTFDVInput' >> beam.Map(_to_tfdv_input)

    stats = raw_data | 'GenerateStatistics' >> tfdv.GenerateStatistics()
    timed_stats = stats | 'Measure time: End' >> beam.ParDo(
        MeasureTime(namespace))
    _ = timed_stats | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
        stats_path,
        shard_name_template='',
        coder=beam.coders.ProtoCoder(
            statistics_pb2.DatasetFeatureStatisticsList))

    # Run explicitly (no context manager) so the result object is available
    # for metrics publishing.
    result = pipeline.run()
    result.wait_until_finish()

    if metrics_monitor:
        metrics_monitor.publish_metrics(result)
def _ComputeTFDVStats(pcollection: beam.pvalue.PCollection,
                      schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
    """Computes statistics with TFDV.

    Args:
      pcollection: pcollection of examples.
      schema: schema.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """
    feature_specs_from_schema = schema_utils.schema_as_feature_spec(
        schema).feature_spec

    def EncodeTFDV(element, feature_specs):
        """Encodes element in an in-memory format that TFDV expects.

        Only the features named in `feature_specs` are emitted; features
        missing from `element` (or set to None) are encoded as None.

        Raises:
          ValueError: If the internal key feature is absent from `element`.
        """
        if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
            raise ValueError(
                'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
                'input but not found.'.format(
                    _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

        # TODO(b/123549935): Obviate the numpy array conversions by
        # allowing TFDV to accept primitives in general, and TFT's
        # input/output format in particular.
        result = {}
        for feature_name, feature_spec in six.iteritems(feature_specs):
            feature_value = element.get(feature_name)
            if feature_value is None:
                result[feature_name] = None
            elif isinstance(feature_value, (np.ndarray, list)):
                result[feature_name] = np.asarray(
                    feature_value, feature_spec.dtype.as_numpy_dtype)
            else:
                # Scalars are wrapped so that every value is an array.
                result[feature_name] = np.asarray(
                    [feature_value], dtype=feature_spec.dtype.as_numpy_dtype)
        return result

    result = (
        pcollection
        # TODO(kestert): Remove encoding and batching steps once TFT
        # supports Arrow tables.
        | 'EncodeTFDV' >> beam.Map(
            EncodeTFDV, feature_specs=feature_specs_from_schema))

    # TODO(pachristopher): Remove this once TFDV 0.14 is released.
    # Only the first two components are parsed: unpacking exactly three parts
    # fails on version strings with extra components such as a `.dev` suffix.
    major, minor = tfdv.__version__.split('.')[:2]
    if int(major) > 0 or int(minor) >= 14:
        result |= ('BatchExamplesToArrowTables' >>
                   batch_util.BatchExamplesToArrowTables())

    return (
        result
        | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
            tfdv.StatsOptions(schema=schema)))
def compute_stats(input_handle, stats_path, max_rows=None, for_eval=False,
                  pipeline_args=None):
    """Computes statistics on the input data.

    Args:
      input_handle: Path to a csv file with input data (detected by a
        case-insensitive 'csv' suffix); any other value is treated as a
        BigQuery table name and passed to `taxi.make_sql`.
      stats_path: Output location passed to `WriteToTFRecord` for the
        statistics proto.
      max_rows: Number of rows to query from BigQuery (BigQuery input only).
      for_eval: Query for eval set rows from BigQuery (BigQuery input only).
      pipeline_args: additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
    """

    def _to_tfdv_input(row):
        """Wraps every non-null value of a BigQuery row in a 1-element array."""
        return {
            key: np.asarray([row[key]])
            for key in row
            if row[key] is not None
        }

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        if input_handle.lower().endswith('csv'):
            raw_data = (
                pipeline
                | 'ReadData' >> beam.io.textio.ReadFromText(
                    file_pattern=input_handle, skip_header_lines=1)
                | 'DecodeData' >> csv_decoder.DecodeCSV(
                    column_names=taxi.CSV_COLUMN_NAMES))
        else:
            query = taxi.make_sql(
                table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
            raw_data = (
                pipeline
                | 'ReadBigQuery' >> beam.io.Read(
                    beam.io.BigQuerySource(query=query, use_standard_sql=True))
                | 'ConvertToTFDVInput' >> beam.Map(_to_tfdv_input))

            # TODO(pachristopher): Remove this once TFDV 0.14 is released.
            # NOTE(review): batching is applied to the BigQuery dicts only, on
            # the assumption that DecodeCSV already emits what
            # GenerateStatistics expects - confirm.
            # Only the first two version components are parsed: unpacking
            # exactly three parts fails on versions with extra components
            # such as a `.dev` suffix.
            major, minor = tfdv.__version__.split('.')[:2]
            if int(major) > 0 or int(minor) >= 14:
                raw_data |= ('BatchExamplesToArrowTables' >>
                             batch_util.BatchExamplesToArrowTables())

        _ = (
            raw_data
            | 'GenerateStatistics' >> tfdv.GenerateStatistics()
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                stats_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
def _ComputeTFDVStats(
        pcollection: beam.pvalue.PCollection,
        schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
    """Computes statistics with TFDV.

    Args:
      pcollection: pcollection of examples.
      schema: schema; it supplies the feature specs used for encoding and is
        also passed to TFDV through `StatsOptions`.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """

    def EncodeTFDV(element, feature_specs):
        """Encodes element in an in-memory format that TFDV expects."""
        if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
            raise ValueError(
                'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
                'input but not found.'.format(
                    _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

        # TODO(b/123549935): Obviate the numpy array conversions by
        # allowing TFDV to accept primitives in general, and TFT's
        # input/output format in particular.
        encoded = {}
        for name, spec in six.iteritems(feature_specs):
            value = element.get(name)
            if value is None:
                # Absent (or None) features are kept as explicit None entries.
                encoded[name] = None
                continue
            if not isinstance(value, (np.ndarray, list)):
                # Scalars are wrapped so that every value becomes an array.
                value = [value]
            encoded[name] = np.asarray(value, dtype=spec.dtype.as_numpy_dtype)
        return encoded

    specs_by_feature = schema_utils.schema_as_feature_spec(schema).feature_spec
    encoded_examples = pcollection | 'EncodeTFDV' >> beam.Map(
        EncodeTFDV, feature_specs=specs_by_feature)
    return encoded_examples | (
        'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
            tfdv.StatsOptions(schema=schema)))
def run_pipeline(flags, pipeline_option):
    """Generates TFDV statistics for CSV input and writes them as a TFRecord.

    Args:
      flags: Object exposing `input_path` (file pattern of the CSV input) and
        `output_path` (location passed to `WriteToTFRecord`).
      pipeline_option: Beam pipeline options used to construct the pipeline.
    """
    source_pattern = flags.input_path
    stats_destination = flags.output_path
    column_names = CSV_COLUMNS

    # If a header is not provided, assume the first line in a file
    # to be the header.
    if column_names is None:
        header_lines = 1
    else:
        header_lines = 0

    with beam.Pipeline(options=pipeline_option) as pipeline:
        lines = pipeline | 'ReadData' >> beam.io.textio.ReadFromText(
            file_pattern=source_pattern, skip_header_lines=header_lines)
        decoded = lines | 'DecodeData' >> csv_decoder.DecodeCSV(
            column_names=column_names)
        stats = decoded | 'GenerateStatistics' >> tfdv.GenerateStatistics()
        _ = stats | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            stats_destination,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList))
def _ComputeTFDVStats(pcollection, schema):
    """Computes statistics with TFDV.

    Args:
      pcollection: pcollection of examples.
      schema: schema.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """

    def EncodeTFDV(element):
        """Encodes element in an in-memory format that TFDV expects.

        Raises:
          ValueError: If the internal key feature is absent from `element`.
        """
        # Raise explicitly rather than `assert`: assertions are stripped when
        # Python runs with -O, which would silently disable this check.
        if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
            raise ValueError(
                'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
                'input but not found.'.format(
                    _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

        # TODO(b/123549935): Obviate the numpy array conversions by
        # allowing TFDV to accept primitives in general, and TFT's
        # input/output format in particular.
        # TODO(kestert): Iterate through schema instead of element.items and
        # encode missing elements of `element` as None.
        result = {}
        for k, v in element.items():
            if k == _TRANSFORM_INTERNAL_FEATURE_FOR_KEY:
                # Make sure the synthetic key feature doesn't get encoded.
                continue
            elif isinstance(v, np.ndarray) or v is None:
                result[k] = v
            elif isinstance(v, list):
                if v:
                    result[k] = np.asarray(v)
                else:
                    # An empty list carries no dtype information, so the
                    # feature is dropped for this element.
                    # TODO(kestert): Use Metadata to determine the dtype.
                    # Instead want: result[k] = np.asarray([], dtype=...)
                    continue
            else:
                # Scalars are wrapped so that every value is an array.
                result[k] = np.asarray([v])
        return result

    return (
        pcollection
        | 'EncodeTFDV' >> beam.Map(EncodeTFDV)
        | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
            tfdv.StatsOptions(schema=schema)))
def generate_drift_reports(
        request_response_log_table: str,
        instance_type: InstanceType,
        feature_names: List[str],
        start_time: datetime.datetime,
        end_time: datetime.datetime,
        output_path: GCSPath,
        schema: schema_pb2.Schema,
        baseline_stats: statistics_pb2.DatasetFeatureStatisticsList,
        stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
        pipeline_options: Optional[PipelineOptions] = None,
):
    """Computes statistics and anomalies for a time window in AI Platform
    Prediction request-response log.

    Args:
      request_response_log_table: A full name of a BigQuery table
        with the request_response_log
      instance_type: The type of instances logged in the
        request_response_log_table. Currently, the only supported instance
        types are: a simple list (InstanceType.SIMPLE_LIST) and
        a JSON object (InstanceType.JSON_OBJECT)
      feature_names: A list of feature names. Must be provided if the
        instance_type is InstanceType.SIMPLE_LIST
      start_time: The beginning of a time window.
      end_time: The end of a time window.
      output_path: The GCS location to output the statistics and anomalies
        proto buffers to. The file names will be `stats.pb` and
        `anomalies.pbtxt`.
      schema: A Schema protobuf describing the expected schema.
      baseline_stats: Baseline statistics handed to
        `tfdv.validate_statistics` as `previous_statistics`.
      stats_options: `tfdv.StatsOptions` for generating data statistics.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline
        runner (DirectRunner or DataflowRunner), cloud dataflow service
        project id, etc. See
        https://cloud.google.com/dataflow/pipelines/specifying-exec-params
        for more details.

    Raises:
      TypeError: If `instance_type` is not one of the supported types.
    """
    query = _generate_query(request_response_log_table, start_time, end_time)

    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    with beam.Pipeline(options=pipeline_options) as p:
        raw_examples = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

        # Pick the decoder that matches how instances were logged.
        if instance_type == InstanceType.SIMPLE_LIST:
            examples = (
                raw_examples
                | 'SimpleInstancesToBeamExamples' >> beam.ParDo(
                    SimpleListCoder(feature_names)))
        elif instance_type == InstanceType.JSON_OBJECT:
            examples = (
                raw_examples
                | 'JSONObjectInstancesToBeamExamples' >> beam.ParDo(
                    JSONObjectCoder()))
        else:
            raise TypeError("Unsupported instance type")

        stats = (
            examples
            | 'BeamExamplesToArrow' >> batch_util.BatchExamplesToArrowTables()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(stats_options))

        _ = (
            stats
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                file_path_prefix=stats_output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))

        # The baseline is passed as `previous_statistics`; previously the
        # required `baseline_stats` argument was accepted but never used.
        _ = (
            stats
            | 'ValidateStatistics' >> beam.Map(
                tfdv.validate_statistics,
                schema=schema,
                previous_statistics=baseline_stats)
            | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                file_path_prefix=anomalies_output_path,
                shard_name_template='',
                append_trailing_newlines=False))
def analyze_log_records(
        request_response_log_table: str,
        model: str,
        version: str,
        start_time: datetime,
        end_time: datetime,
        output_path: str,
        schema: schema_pb2.Schema,
        baseline_stats: Optional[
            statistics_pb2.DatasetFeatureStatisticsList] = None,
        time_window: Optional[timedelta] = None,
        pipeline_options: Optional[PipelineOptions] = None,
):
    """Computes statistics and detects anomalies for a time series of records
    in an AI Platform Prediction request-response log.

    The function starts an Apache Beam job that calculates statistics and
    detects data anomalies in a time series of records retrieved from an
    AI Platform Prediction request-response log. Optionally, the function can
    also calculate statistics for a set of time slices within the time series.

    The output of the job is a statistics_pb2.DatasetFeatureStatisticsList
    protobuf with descriptive statistics and an anomalies_pb2.Anomalies
    protobuf with anomaly reports. The protobufs are stored to a GCS location.

    The caller's `schema` is never modified: when slicing is enabled the
    slicing column is added to a private copy.

    Args:
      request_response_log_table: A full name of a BigQuery table
        with the request_response_log
      model: Model name passed to the log query.
      version: Model version passed to the log query.
      start_time: The start of the time series. Seconds and microseconds are
        dropped (truncated to whole minutes).
      end_time: The end of the time series. Seconds and microseconds are
        dropped (truncated to whole minutes).
      output_path: The GCS location to output the statistics and anomaly
        proto buffers to. The file names will be `stats.pb` and
        `anomalies.pbtxt`.
      schema: A Schema protobuf describing the expected schema.
      baseline_stats: If provided, the baseline statistics will be used to
        detect distribution anomalies.
      time_window: If provided the time series of records will be divided
        into a set of consecutive time slices of the time_window width and
        the stats will be calculated for each slice. The width is truncated
        to whole minutes; slicing only happens when the time series is longer
        than one window.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline
        runner (DirectRunner or DataflowRunner), cloud dataflow service
        project id, etc. See
        https://cloud.google.com/dataflow/pipelines/specifying-exec-params
        for more details.
    """
    # Work on a private copy: enabling slicing appends a feature to the
    # schema, and doing that on the caller's proto would leak the slicing
    # column to the caller and duplicate it on repeated calls.
    working_schema = schema_pb2.Schema()
    working_schema.CopyFrom(schema)

    # Generate a BigQuery query
    end_time = end_time.replace(second=0, microsecond=0)
    start_time = start_time.replace(second=0, microsecond=0)
    query = _generate_query(
        table_name=request_response_log_table,
        model=model,
        version=version,
        start_time=start_time.strftime('%Y-%m-%dT%H:%M:%S'),
        end_time=end_time.strftime('%Y-%m-%dT%H:%M:%S'))

    # Configure slicing for statistics calculations
    stats_options = tfdv.StatsOptions(schema=working_schema)
    slicing_column = None
    if time_window:
        # Truncate the window width to whole minutes.
        time_window = timedelta(
            days=time_window.days,
            seconds=(time_window.seconds // 60) * 60)
        if end_time - start_time > time_window:
            slice_fn = tfdv.get_feature_value_slicer(
                features={_SLICING_COLUMN_NAME: None})
            stats_options.slice_functions = [slice_fn]
            slicing_column = _SLICING_COLUMN_NAME
            # Declare the slicing column in the (copied) schema.
            slicing_feature = working_schema.feature.add()
            slicing_feature.name = _SLICING_COLUMN_NAME
            slicing_feature.type = _SLICING_COLUMN_TYPE

    # Configure output paths
    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    # Define and start the pipeline
    with beam.Pipeline(options=pipeline_options) as p:
        raw_examples = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

        examples = (
            raw_examples
            | 'InstancesToBeamExamples' >> beam.ParDo(
                InstanceCoder(
                    working_schema, end_time, time_window, slicing_column)))

        stats = (
            examples
            | 'BeamExamplesToArrow' >>
            tfdv.utils.batch_util.BatchExamplesToArrowRecordBatches()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(
                options=stats_options))

        _ = (
            stats
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                file_path_prefix=stats_output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))

        anomalies = (
            stats
            | 'ValidateStatistics' >> beam.Map(
                tfdv.validate_statistics,
                schema=working_schema,
                previous_statistics=baseline_stats))

        _ = (
            anomalies
            | 'AlertIfAnomalies' >> beam.Map(
                _alert_if_anomalies, anomalies_output_path)
            | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                file_path_prefix=anomalies_output_path,
                shard_name_template='',
                append_trailing_newlines=False))
def generate_statistics_from_bq(
        query: Text,
        output_path: Text,
        schema: schema_pb2.Schema,
        stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
        pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Computes data statistics from a BigQuery query result.

    Besides the statistics, the anomalies found by validating them against
    `schema` are written as text next to the statistics file.

    Args:
      query: The BigQuery query.
      output_path: Location under which the results are written. The
        statistics go to a TFRecord file containing a single data statistics
        proto, which can be read with the 'load_statistics' API. Required:
        the output file names are joined onto it.
      schema: A Schema protobuf to use for data validation
      stats_options: `tfdv.StatsOptions` for generating data statistics.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline
        runner (DirectRunner or DataflowRunner), cloud dataflow service
        project id, etc. See
        https://cloud.google.com/dataflow/pipelines/specifying-exec-params
        for more details.

    Returns:
      A DatasetFeatureStatisticsList proto.

    Raises:
      ValueError: If the query result contains unsupported BigQuery types.
    """
    # Resolve the column specs once and reuse them for validation.
    column_specs = _get_column_specs(query)
    if not validate_bq_types(column_specs.values()):
        raise ValueError("Unsupported BigQuery data types.")

    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # NOTE: a dedicated `DecodeBigQuery(column_specs, desired_batch_size)`
        # decoding step was commented out in the original; rows go straight
        # to the generic Arrow batching transform.
        stats = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
            # Honour the caller's options; they were previously ignored.
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(stats_options))

        _ = (
            stats
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                file_path_prefix=stats_output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))

        _ = (
            stats
            | 'ValidateStatistics' >> beam.Map(
                tfdv.validate_statistics, schema=schema)
            | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                file_path_prefix=anomalies_output_path,
                shard_name_template='',
                append_trailing_newlines=False))

    # The pipeline has finished once the `with` block exits, so the
    # statistics file can be read back to honour the declared return value.
    return tfdv.load_statistics(stats_output_path)
'f1': [3], 'f3': ['bbb'], 'f4': [1] }] with beam.Pipeline(options=pipeline_options) as p: stats = ( p | 'GetData' >> beam.Create(instances) # | 'BatchDictionaries' >> beam.BatchElements( # min_batch_size = desired_batch_size, # max_batch_size = desired_batch_size) # | 'CovertToArrowTables' >> beam.ParDo( # BatchedDictsToArrowTable()) | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables() | 'GenerateStatistics' >> tfdv.GenerateStatistics()) _ = (stats | 'WriteStatsOutput' >> beam.io.WriteToTFRecord( file_path_prefix=stats_output_path, shard_name_template='', coder=beam.coders.ProtoCoder( statistics_pb2.DatasetFeatureStatisticsList))) # _ = (stats #| 'ValidateStatistics' >> beam.Map(tfdv.validate_statistics, schema=schema) # | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText( # file_path_prefix=anomalies_output_path, # shard_name_template='', # append_trailing_newlines=True))