def Load_TFDV(df):
    """Assign a TFDV-derived type code to every column of `df`.

    Each column is analysed on its own: statistics are generated with
    semantic-domain stats enabled and a schema is inferred from them.

    The code for a column is:
      0 - default, none of the conditions below matched;
      1 - `get_categorical_features` returned at least one feature;
      3 - the text form of a schema feature contains
          'natural_language_domain';
      2 - the text form of a schema feature contains 'time_domain'
          (checked last, so it wins over 1 and 3).

    Args:
      df: DataFrame whose columns are classified.

    Returns:
      A list of integer codes, one per column, in column order. Each code is
      also printed as it is computed.
    """
    column_codes = []
    for column_name in df.columns:
        single_column = df[[column_name]]
        options = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        column_stats = tfdv.generate_statistics_from_dataframe(
            single_column, stats_options=options)
        schema = tfdv.infer_schema(statistics=column_stats)

        code = 0
        # Any categorical feature at all marks the column as categorical.
        for _ in get_categorical_features(schema):
            code = 1
            break

        # Semantic domains are detected from the textual form of each feature.
        for feature in schema.feature:
            feature_text = str(feature)
            if 'natural_language_domain' in feature_text:
                code = 3
            if 'time_domain' in feature_text:
                code = 2

        print(code)
        column_codes.append(code)
    return column_codes
def stats_list():
    """Generate statistics from a fixed local TFRecord file.

    A feature-value slicer on the 'time_window' feature is passed to TFDV
    through the stats options.

    Returns:
      The result of `tfdv.generate_statistics_from_tfrecord`.
    """
    workspace_dir = '/home/jarekk/workspace/analysis'
    tfrecords_path = '{}/log_records.tfrecords'.format(workspace_dir)

    window_slicer = tfdv.get_feature_value_slicer(
        features={'time_window': None})
    return tfdv.generate_statistics_from_tfrecord(
        data_location=tfrecords_path,
        stats_options=tfdv.StatsOptions(slice_functions=[window_slicer]))
def _ComputeTFDVStats(pcollection: beam.pvalue.PCollection,
                      schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
  """Computes Statistics with TFDV.

  Args:
    pcollection: pcollection of examples.
    schema: schema.

  Returns:
    PCollection of `DatasetFeatureStatisticsList`.
  """
  feature_specs_from_schema = schema_utils.schema_as_feature_spec(
      schema).feature_spec

  def EncodeTFDV(element, feature_specs):
    """Encodes element in an in-memory format that TFDV expects.

    Args:
      element: mapping from feature name to value; must contain
        _TRANSFORM_INTERNAL_FEATURE_FOR_KEY.
      feature_specs: mapping from feature name to feature spec; only these
        features are emitted.

    Returns:
      Dict with one entry per feature in `feature_specs`: None when the
      feature is missing (or None) in `element`, otherwise a numpy array of
      the spec's dtype.

    Raises:
      ValueError: if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY is not in `element`.
    """
    if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
      raise ValueError(
          'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
          'input but not found.'.format(_TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

    # TODO(b/123549935): Obviate the numpy array conversions by
    # allowing TFDV to accept primitives in general, and TFT's
    # input/output format in particular.
    result = {}
    for feature_name, feature_spec in six.iteritems(feature_specs):
      feature_value = element.get(feature_name)
      if feature_value is None:
        result[feature_name] = None
      elif isinstance(feature_value, (np.ndarray, list)):
        result[feature_name] = np.asarray(
            feature_value, feature_spec.dtype.as_numpy_dtype)
      else:
        # Scalars are wrapped so that every value is a 1-D array.
        result[feature_name] = np.asarray(
            [feature_value], dtype=feature_spec.dtype.as_numpy_dtype)

    return result

  result = (pcollection
            # TODO(kestert): Remove encoding and batching steps once TFT
            # supports Arrow tables.
            | 'EncodeTFDV' >> beam.Map(
                EncodeTFDV, feature_specs=feature_specs_from_schema))

  # TODO(pachristopher): Remove this once TFDV 0.14 is released.
  # Only the first two components are needed. Slicing (rather than unpacking
  # exactly three parts) keeps this working for version strings that carry
  # extra dot-separated segments, e.g. '0.14.0.dev0'.
  major, minor = tfdv.__version__.split('.')[:2]
  if int(major) > 0 or int(minor) >= 14:
    result |= ('BatchExamplesToArrowTables' >>
               batch_util.BatchExamplesToArrowTables())

  return (result
          | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
              tfdv.StatsOptions(schema=schema)))
def update_schema(csv_file, schema):
    """Update `schema` using statistics computed from a CSV file.

    Statistics are generated with `schema` supplied and
    `infer_type_from_schema=True`, then passed to `tfdv.update_schema`.

    Args:
      csv_file: location of the CSV data, forwarded as `data_location`.
      schema: the existing schema to update.

    Returns:
      The schema returned by `tfdv.update_schema`.
    """
    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)

    # Column names are deliberately not passed, so TFDV takes them from the
    # first line of the CSV. An earlier `[f.name for f in ...].sort()`
    # expression here always evaluated to None (list.sort() sorts in place),
    # so this has always been the effective behaviour. Passing an
    # alphabetically sorted list instead would make TFDV treat the header row
    # as data and mislabel columns.
    new_batch_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_file,
        column_names=None,
        stats_options=options)

    # Update the schema with what was observed in the new batch.
    updated_schema = tfdv.update_schema(schema, new_batch_stats)
    return updated_schema
def testConstructWithSchemaAndStatsOptions(self):
  """Builds StatisticsGen with a schema and stats options; checks output type."""
  examples_artifact = standard_artifacts.Examples()
  examples_artifact.split_names = artifact_utils.encode_split_names(
      ['train', 'eval'])
  schema_artifact = standard_artifacts.Schema()

  statistics_gen = component.StatisticsGen(
      examples=channel_utils.as_channel([examples_artifact]),
      schema=channel_utils.as_channel([schema_artifact]),
      stats_options=tfdv.StatsOptions(weight_feature='weight'))

  statistics_output = statistics_gen.outputs[
      standard_component_specs.STATISTICS_KEY]
  self.assertEqual(standard_artifacts.ExampleStatistics.TYPE_NAME,
                   statistics_output.type_name)
def get_num_anomalies(csv_file, schema, environment='TRAINING'):
    """Count the anomalies TFDV reports for a CSV file against `schema`.

    Statistics are generated with `schema` supplied and
    `infer_type_from_schema=True`, then validated against the same schema.

    Args:
      csv_file: location of the CSV data, forwarded as `data_location`.
      schema: the schema to validate the data against.
      environment: forwarded to `tfdv.validate_statistics`.

    Returns:
      The number of entries in `anomaly_info` of the validation result.
    """
    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)

    # Column names are deliberately not passed, so TFDV takes them from the
    # first line of the CSV. An earlier `[f.name for f in ...].sort()`
    # expression here always evaluated to None (list.sort() sorts in place),
    # so this has always been the effective behaviour. Passing an
    # alphabetically sorted list instead would make TFDV treat the header row
    # as data and mislabel columns.
    data_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_file,
        column_names=None,
        stats_options=options)

    # Validate the freshly computed statistics against the given schema.
    anomalies = tfdv.validate_statistics(
        statistics=data_stats, schema=schema, environment=environment)
    return len(anomalies.anomaly_info)
def testDoWithSchemaAndStatsOptions(self):
  """Runs the executor with a schema and stats options; validates both splits."""
  package_dir = os.path.dirname(os.path.dirname(__file__))
  source_data_dir = os.path.join(package_dir, 'testdata')
  output_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                               self.get_temp_dir())
  output_data_dir = os.path.join(output_root, self._testMethodName)
  tf.io.gfile.makedirs(output_data_dir)

  # Input artifacts.
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
  examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(source_data_dir, 'schema_gen')

  input_dict = {
      executor.EXAMPLES_KEY: [examples],
      executor.SCHEMA_KEY: [schema],
  }

  stats_options_json = tfdv.StatsOptions(label_feature='company').to_json()
  exec_properties = {executor.STATS_OPTIONS_JSON_KEY: stats_options_json}

  # Output artifact.
  stats = standard_artifacts.ExampleStatistics()
  stats.uri = output_data_dir
  stats.split_names = artifact_utils.encode_split_names(['train', 'eval'])
  output_dict = {executor.STATISTICS_KEY: [stats]}

  # Run the executor.
  executor.Executor().Do(
      input_dict, output_dict, exec_properties=exec_properties)

  # Every split must have produced a statistics file.
  for split_name in ('train', 'eval'):
    self._validate_stats_output(
        os.path.join(stats.uri, split_name, 'stats_tfrecord'))
def _ComputeTFDVStats(
    pcollection: beam.pvalue.PCollection,
    schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
  """Computes statistics with TFDV.

  Args:
    pcollection: pcollection of examples.
    schema: schema.

  Returns:
    PCollection of `DatasetFeatureStatisticsList`.
  """
  specs = schema_utils.schema_as_feature_spec(schema).feature_spec

  def EncodeTFDV(element, feature_specs):
    """Converts `element` into the in-memory format that TFDV expects."""
    if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
      raise ValueError(
          'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
          'input but not found.'.format(
              _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

    # TODO(b/123549935): Obviate the numpy array conversions by
    # allowing TFDV to accept primitives in general, and TFT's
    # input/output format in particular.
    encoded = {}
    for name, spec in six.iteritems(feature_specs):
      value = element.get(name)
      if value is None:
        encoded[name] = None
        continue
      numpy_dtype = spec.dtype.as_numpy_dtype
      if isinstance(value, (np.ndarray, list)):
        encoded[name] = np.asarray(value, numpy_dtype)
      else:
        # A scalar becomes a one-element array.
        encoded[name] = np.asarray([value], dtype=numpy_dtype)
    return encoded

  encode_step = 'EncodeTFDV' >> beam.Map(EncodeTFDV, feature_specs=specs)
  stats_step = 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
      tfdv.StatsOptions(schema=schema))
  return pcollection | encode_step | stats_step
def testConstructWithSchemaAndStatsOptions(self):
  """Builds StatisticsGen with stats options that include a generator."""
  examples_artifact = standard_artifacts.Examples()
  examples_artifact.split_names = artifact_utils.encode_split_names(
      ['train', 'eval'])
  schema_artifact = standard_artifacts.Schema()

  lift_generator = tfdv.LiftStatsGenerator(
      schema=None,
      y_path=tfdv.FeaturePath(['label']),
      x_paths=[tfdv.FeaturePath(['feature'])])
  # generators should be dropped
  stats_options = tfdv.StatsOptions(
      weight_feature='weight', generators=[lift_generator])

  statistics_gen = component.StatisticsGen(
      examples=channel_utils.as_channel([examples_artifact]),
      schema=channel_utils.as_channel([schema_artifact]),
      stats_options=stats_options)

  statistics_output = statistics_gen.outputs['statistics']
  self.assertEqual(standard_artifacts.ExampleStatistics.TYPE_NAME,
                   statistics_output.type_name)
def testDoWithTwoSchemas(self):
  """Expects ValueError when a schema is given both as input and in options."""
  package_dir = os.path.dirname(os.path.dirname(__file__))
  source_data_dir = os.path.join(package_dir, 'testdata')
  output_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                               self.get_temp_dir())
  output_data_dir = os.path.join(output_root, self._testMethodName)
  tf.io.gfile.makedirs(output_data_dir)

  # Input artifacts.
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
  examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(source_data_dir, 'schema_gen')

  input_dict = {
      executor.EXAMPLES_KEY: [examples],
      executor.SCHEMA_KEY: [schema],
  }

  # The stats options carry a second schema in addition to the input one.
  stats_options_json = tfdv.StatsOptions(
      label_feature='company', schema=schema_pb2.Schema()).to_json()
  exec_properties = {
      executor.STATS_OPTIONS_JSON_KEY: stats_options_json,
      executor.EXCLUDE_SPLITS_KEY: json_utils.dumps([]),
  }

  # Output artifact.
  stats = standard_artifacts.ExampleStatistics()
  stats.uri = output_data_dir
  stats.split_names = artifact_utils.encode_split_names(['train', 'eval'])
  output_dict = {executor.STATISTICS_KEY: [stats]}

  # Run the executor.
  stats_gen_executor = executor.Executor()
  with self.assertRaises(ValueError):
    stats_gen_executor.Do(input_dict, output_dict, exec_properties)
def _ComputeTFDVStats(pcollection, schema):
  """Computes Statistics with TFDV.

  Args:
    pcollection: pcollection of examples.
    schema: schema.

  Returns:
    PCollection of `DatasetFeatureStatisticsList`.
  """

  def EncodeTFDV(element):
    """Encodes element in an in-memory format that TFDV expects.

    Args:
      element: mapping from feature name to value; must contain
        _TRANSFORM_INTERNAL_FEATURE_FOR_KEY.

    Returns:
      Dict of the element's features, without the synthetic key feature and
      without empty-list features. ndarray and None values are passed
      through, other values are converted to numpy arrays.

    Raises:
      ValueError: if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY is not in `element`.
    """
    # Raise explicitly instead of asserting, so the check still runs under
    # `python -O`.
    if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
      raise ValueError(
          'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
          'input but not found.'.format(_TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

    # TODO(b/123549935): Obviate the numpy array conversions by
    # allowing TFDV to accept primitives in general, and TFT's
    # input/output format in particular.
    # TODO(kestert): Iterate through schema instead of element.items and
    # encode missing elements of `element` as None.
    result = {}
    for k, v in element.items():
      if k == _TRANSFORM_INTERNAL_FEATURE_FOR_KEY:
        continue  # Make sure the synthetic key feature doesn't get encoded.
      elif isinstance(v, np.ndarray) or v is None:
        result[k] = v
      elif isinstance(v, list):
        if v:
          result[k] = np.asarray(v)
        else:  # An empty list.
          # TODO(kestert): Use Metadata to determine the dtype.
          continue  # Instead want: result[k] = np.asarray([], dtype=...)
      else:
        # Scalars are wrapped into a one-element array.
        result[k] = np.asarray([v])
    return result

  return (pcollection
          | 'EncodeTFDV' >> beam.Map(EncodeTFDV)
          | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
              tfdv.StatsOptions(schema=schema)))
def analyze_log_records(
        request_response_log_table: str,
        model: str,
        version: str,
        start_time: datetime,
        end_time: datetime,
        output_path: str,
        schema: schema_pb2.Schema,
        baseline_stats: Optional[
            statistics_pb2.DatasetFeatureStatisticsList] = None,
        time_window: Optional[timedelta] = None,
        pipeline_options: Optional[PipelineOptions] = None,
):
    """
    Computes statistics and detects anomalies for a time series of records
    in an AI Platform Prediction request-response log.

    The function starts an Apache Beam job that calculates statistics and
    detects data anomalies in a time series of records retrieved from an
    AI Platform Prediction request-response log. Optionally, the function
    can also calculate statistics for a set of time slices within the time
    series.

    The output of the job is a statistics_pb2.DatasetFeatureStatisticsList
    protobuf with descriptive statistics and an anomalies_pb2.Anomalies
    protobuf with anomaly reports. The protobufs are stored to a GCS
    location.

    Args:
      request_response_log_table: A full name of a BigQuery table
        with the request_response_log
      model: Forwarded to the query generator; presumably the name of the
        model whose log records are selected - confirm against
        `_generate_query`.
      version: Forwarded to the query generator; presumably the model
        version whose log records are selected - confirm against
        `_generate_query`.
      start_time: The start of the time series. Seconds and microseconds
        are dropped, i.e. the value is truncated to whole minutes.
      end_time: The end of the time series. Seconds and microseconds are
        dropped, i.e. the value is truncated to whole minutes.
      output_path: The GCS location to output the statistics and anomaly
        proto buffers to. The file names are given by `_STATS_FILENAME`
        and `_ANOMALIES_FILENAME`.
      schema: A Schema protobuf describing the expected schema. It is not
        modified; the function works on a private copy.
      baseline_stats: If provided, the baseline statistics will be used to
        detect distribution anomalies.
      time_window: If provided the time series of records will be divided
        into a set of consecutive time slices of the time_window width and
        the stats will be calculated for each slice. The value is truncated
        to whole minutes, and slicing is only configured when the time
        series is longer than one window.
      pipeline_options: Optional beam pipeline options. This allows users
        to specify various beam pipeline execution parameters like pipeline
        runner (DirectRunner or DataflowRunner), cloud dataflow service
        project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
        for more details.
    """

    # Work on a private copy of the schema. The slicing feature added below
    # would otherwise leak into the caller's object, and a second call with
    # the same schema would add a duplicate feature.
    working_schema = schema_pb2.Schema()
    working_schema.CopyFrom(schema)
    schema = working_schema

    # Generate a BigQuery query
    end_time = end_time.replace(second=0, microsecond=0)
    start_time = start_time.replace(second=0, microsecond=0)
    query = _generate_query(
        table_name=request_response_log_table,
        model=model,
        version=version,
        start_time=start_time.strftime('%Y-%m-%dT%H:%M:%S'),
        end_time=end_time.strftime('%Y-%m-%dT%H:%M:%S'))

    # Configure slicing for statistics calculations
    stats_options = tfdv.StatsOptions(schema=schema)
    slicing_column = None
    if time_window:
        # Truncate the window to whole minutes.
        # NOTE(review): a window shorter than one minute becomes a zero
        # timedelta here and still enables slicing below - confirm that
        # InstanceCoder copes with that.
        time_window = timedelta(
            days=time_window.days,
            seconds=(time_window.seconds // 60) * 60)

        if end_time - start_time > time_window:
            slice_fn = tfdv.get_feature_value_slicer(
                features={_SLICING_COLUMN_NAME: None})
            stats_options.slice_functions = [slice_fn]
            slicing_column = _SLICING_COLUMN_NAME
            # Declare the synthetic slicing column in the (copied) schema.
            slicing_feature = schema.feature.add()
            slicing_feature.name = _SLICING_COLUMN_NAME
            slicing_feature.type = _SLICING_COLUMN_TYPE

    # Configure output paths
    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    # Define and start the pipeline
    with beam.Pipeline(options=pipeline_options) as p:
        raw_examples = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

        examples = (
            raw_examples
            | 'InstancesToBeamExamples' >> beam.ParDo(
                InstanceCoder(schema, end_time, time_window, slicing_column)))

        stats = (
            examples
            | 'BeamExamplesToArrow' >>
            tfdv.utils.batch_util.BatchExamplesToArrowRecordBatches()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(
                options=stats_options))

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))

        anomalies = (
            stats
            | 'ValidateStatistics' >> beam.Map(
                tfdv.validate_statistics,
                schema=schema,
                previous_statistics=baseline_stats))

        _ = (anomalies
             | 'AlertIfAnomalies' >> beam.Map(
                 _alert_if_anomalies, anomalies_output_path)
             | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                 file_path_prefix=anomalies_output_path,
                 shard_name_template='',
                 append_trailing_newlines=False))
import tensorflow as tf

# Raise the TensorFlow logger level to ERROR before TFDV is imported.
tf.get_logger().setLevel('ERROR')

import tensorflow_data_validation as tfdv

tfrecord_path = '/home/segmind/Desktop/test/tfdv/bikes_persons_dataset/dataset.tfrecord'

# Generate statistics with semantic-domain stats enabled, then visualize them.
stats = tfdv.generate_statistics_from_tfrecord(
    data_location=tfrecord_path,
    stats_options=tfdv.StatsOptions(enable_semantic_domain_stats=True))
tfdv.visualize_statistics(stats)
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime

from tensorflow_data_validation.utils import slicing_util

data_location = '/home/jarekk/workspace/test.csv'
output_location = '/home/jarekk/workspace/stats.pb'

# Compute statistics for the CSV with a feature-value slicer on
# 'time_window', and write the result to `output_location`.
stats = tfdv.generate_statistics_from_csv(
    data_location,
    stats_options=tfdv.StatsOptions(
        slice_functions=[
            slicing_util.get_feature_value_slicer(
                features={'time_window': None}),
        ]),
    output_path=output_location)
# Runs TFDV statistics generation over CSV data with the Dataflow runner.
# PROJECT_ID, JOB_NAME and the GCS_* locations are defined elsewhere.

# Local path of the TFDV wheel; only referenced by the commented-out
# `extra_packages` line below.
PATH_TO_WHL_FILE = './tfdv.whl'

# Column names passed to TFDV as the CSV header.
# NOTE(review): the name looks like a typo for COLUMNS; left unchanged
# because other code may reference it.
CLOUMNS = "FL_DATE, MKT_UNIQUE_CARRIER, ORIGIN_AIRPORT_ID, DEST_AIRPORT_ID, DEP_TIME, DEP_DELAY, ARR_DELAY, DISTANCE, dep_lat, dep_lng, arr_lat, arr_lng".split(
    ', ')

# Create and set your PipelineOptions.
options = PipelineOptions()

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = GCS_STAGING_LOCATION
google_cloud_options.temp_location = GCS_TMP_LOCATION
options.view_as(StandardOptions).runner = 'DataflowRunner'

setup_options = options.view_as(SetupOptions)
# PATH_TO_WHL_FILE should point to the downloaded tfdv wheel file.
# setup_options.extra_packages = [PATH_TO_WHL_FILE]
# A setup file in the current working directory is used instead of the
# wheel above.
setup_options.setup_file = os.path.join(os.getcwd(), 'tfdv_setup.py')

# sample_rate=0.01 presumably makes TFDV compute statistics on roughly 1%
# of the examples - confirm against the StatsOptions documentation.
stat_options = tfdv.StatsOptions(sample_rate=0.01)

# The statistics are written to GCS_STATS_OUTPUT_PATH; the return value is
# not kept.
tfdv.generate_statistics_from_csv(
    GCS_DATA_LOCATION,
    column_names=CLOUMNS,
    output_path=GCS_STATS_OUTPUT_PATH,
    stats_options=stat_options,
    pipeline_options=options,
)
def get_post_transform_stats_options() -> tfdv.StatsOptions:
  """Returns the configured post-transform stats options, or fresh defaults.

  Returns:
    `_POST_TRANSFORM_STATS_OPTIONS` when it is not None, otherwise a newly
    constructed `tfdv.StatsOptions()`.
  """
  if _POST_TRANSFORM_STATS_OPTIONS is not None:
    return _POST_TRANSFORM_STATS_OPTIONS
  return tfdv.StatsOptions()
max: 3 } } } } } """ # Do not inline the goldens in _TEST_CASES. This way indentation is easier to # manage. The rule is to have no first level indent for goldens. _TEST_CASES = [ dict( testcase_name='basic', stats_options=tfdv.StatsOptions(num_rank_histogram_buckets=3, num_values_histogram_buckets=3, num_histogram_buckets=3, num_quantiles_histogram_buckets=3, enable_semantic_domain_stats=True), expected_stats_pbtxt=_BASIC_GOLDEN_STATS, expected_inferred_schema_pbtxt=_BASIC_GOLDEN_INFERRED_SCHEMA, schema_for_validation_pbtxt=_BASIC_SCHEMA_FOR_VALIDATION, expected_anomalies_pbtxt=_BASIC_GOLDEN_ANOMALIES, expected_updated_schema_pbtxt=_BASIC_SCHEMA_FROM_UPDATE, ), dict( testcase_name='weight_and_label', stats_options=tfdv.StatsOptions(label_feature=_LABEL, weight_feature=_EXAMPLE_WEIGHT, num_rank_histogram_buckets=3, num_values_histogram_buckets=3, num_histogram_buckets=3,
from tfx.dsl.experimental import latest_blessed_model_resolver from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor from tfx.extensions.google_cloud_big_query.example_gen import component as big_query_example_gen_component # pylint: disable=unused-import from tfx.orchestration import pipeline from tfx.proto import example_gen_pb2, pusher_pb2 from tfx.proto import trainer_pb2 from tfx.types import Channel from tfx.types.standard_artifacts import Model from tfx.types.standard_artifacts import ModelBlessing from tfx.utils.dsl_utils import external_input import tensorflow_data_validation as tfdv CURATED_SCHEMA_GEN = "gs://raw_data_layer/schema/schema.pbtxt" STATS_OPTIONS = tfdv.StatsOptions(feature_whitelist=[ 'image/height', 'image/width', 'image/filename', 'image/source_id', 'image/format' ]) def create_pipeline( pipeline_name: Text, pipeline_root: Text, data_path: Text, # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source. # query: Text, preprocessing_fn: Text, run_fn: Text, train_args: trainer_pb2.TrainArgs, eval_args: trainer_pb2.EvalArgs, eval_accuracy_threshold: float, serving_model_dir: Text,