def skew_drift_validator(mode, gcp_bucket, control_set_path, treatment_set_path,
                         feature_list_str, Linf_value):
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:{}'.format(mode))
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('control_set_path:{}'.format(control_set_path))
    logging.info('treatment_set_path:{}'.format(treatment_set_path))
    logging.info('Linf_value:{}'.format(Linf_value))

    # NOTE: feature_list_str is evaluated as Python; ast.literal_eval would be
    # the safer choice for untrusted input.
    feature_list = eval(feature_list_str)

    control_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + control_set_path, sep=',')
    treat_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + treatment_set_path, sep=',')

    control_stats = tfdv.generate_statistics_from_dataframe(dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(dataframe=treat_set_df)

    control_schema = tfdv.infer_schema(control_stats)
    treat_schema = tfdv.infer_schema(treat_stats)

    for feature in feature_list:
        if mode == "skew":
            # A feature with a domain is treated as a categorical variable.
            if tfdv.get_feature(control_schema, feature).domain:
                tfdv.get_feature(
                    control_schema,
                    feature).skew_comparator.infinity_norm.threshold = Linf_value
            else:
                logging.critical("feature: {} is not categorical".format(feature))
                sys.exit(1)
        elif mode == "drift":
            tfdv.get_feature(
                control_schema,
                feature).drift_comparator.infinity_norm.threshold = Linf_value
        else:
            logging.critical("mode: {} not supported".format(mode))
            sys.exit(1)

    # Skew checks compare training statistics against serving statistics;
    # drift checks compare the new statistics against the previous span, so
    # the control stats are passed as previous_statistics in that case.
    if mode == "skew":
        anomalies = tfdv.validate_statistics(statistics=control_stats,
                                             schema=control_schema,
                                             serving_statistics=treat_stats)
    else:
        anomalies = tfdv.validate_statistics(statistics=treat_stats,
                                             schema=control_schema,
                                             previous_statistics=control_stats)

    if anomalies.anomaly_info:
        logging.info("Data-{} detected:".format(mode))
        return anomalies
    logging.info("No data-{} detected".format(mode))
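# A minimal usage sketch for the validator above; the bucket, paths and
# feature names are hypothetical, and the component's imports (logging, sys,
# pandas, tfdv) are assumed to be in scope. In "skew" mode the treatment set
# is compared against the control set as serving data using the given
# L-infinity threshold.
anomalies = skew_drift_validator(
    mode="skew",
    gcp_bucket="my-bucket",                      # hypothetical
    control_set_path="data/control.csv",         # hypothetical
    treatment_set_path="data/treatment.csv",     # hypothetical
    feature_list_str="['country', 'device']",    # hypothetical categorical features
    Linf_value=0.01,
)
if anomalies:
    tfdv.display_anomalies(anomalies)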
def Load_TFDV(df):
    lencols = len(df.columns)
    y_tfdv = [0] * lencols
    i = 0
    for col in df.columns:
        df_col = df[[col]]
        st_option = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        stats = tfdv.generate_statistics_from_dataframe(df_col, stats_options=st_option)
        schema = tfdv.infer_schema(statistics=stats)

        # Mark the column as categorical if TFDV inferred any categorical feature.
        categ_lst = get_categorical_features(schema)
        if categ_lst:
            y_tfdv[i] = 1

        # Semantic domains take precedence: 3 = natural language, 2 = time.
        for x in schema.feature:
            if str(x).count('natural_language_domain'):
                y_tfdv[i] = 3
            if str(x).count('time_domain'):
                y_tfdv[i] = 2

        print(y_tfdv[i])
        i = i + 1
    return y_tfdv
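# Hypothetical example of the encoding Load_TFDV returns: 0 for plain numeric
# columns, 1 for categorical columns, 2 for time-domain columns, 3 for
# natural-language columns (subject to what TFDV's semantic-domain inference
# actually detects on the data).
example_df = pd.DataFrame({
    "age": [23, 41, 35, 52],
    "city": ["Berlin", "Paris", "Rome", "Berlin"],
    "created_at": ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04"],
})
print(Load_TFDV(example_df))  # e.g. [0, 1, 2] if the domains are detected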
def compute_training_stats(self, stats_path=None):
    """Compute statistics over the training data."""
    dataset = self.fs.create_dataset(self.train_feature_set, "2009-01-01",
                                     "2016-01-01")
    training_df = self.fs.download_dataset_to_df(dataset, self.STAGING_LOCATION)
    training_stats = tfdv.generate_statistics_from_dataframe(training_df)

    if stats_path:
        logging.info("Saving training stats to %s", stats_path)
        demo_util.save_proto(training_stats, stats_path)
    else:
        logging.info("No stats_path provided; not saving stats")

    return training_stats
def feature_stats_dataset_agg(client, feature_stats_feature_set):
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    start_date = time_offset - timedelta(days=10)
    end_date = time_offset - timedelta(days=7)

    df1 = pd.DataFrame({
        "datetime": [start_date] * 5,
        "entity_id": [i for i in range(5)],
        "strings": ["a", "b", "b", "b", "a"],
        "ints": [4, 3, 2, 6, 3],
        "floats": [2.1, 5.2, 4.3, 0.6, 0.1],
    })
    ingestion_id_1 = client.ingest(feature_stats_feature_set, df1)

    df2 = pd.DataFrame({
        "datetime": [start_date + timedelta(days=1)] * 3,
        "entity_id": [i for i in range(3)],
        "strings": ["a", "b", "c"],
        "ints": [2, 6, 7],
        "floats": [1.6, 2.4, 2],
    })
    ingestion_id_2 = client.ingest(feature_stats_feature_set, df2)

    combined_df = pd.concat([df1, df2])[["strings", "ints", "floats"]]
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_agg_fields(expected_stats)

    # TFDV computes the population std dev; overwrite it with pandas' sample
    # std dev so the expected stats match what the store reports.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            std = combined_df[name].std()
            feature.num_stats.std_dev = std

    time.sleep(10)

    return {
        "ids": [ingestion_id_1, ingestion_id_2],
        "start_date": datetime(start_date.year, start_date.month,
                               start_date.day).replace(tzinfo=pytz.utc),
        "end_date": datetime(end_date.year, end_date.month,
                             end_date.day).replace(tzinfo=pytz.utc),
        "stats": expected_stats,
    }
def tfdv_and_additional_anomalies(
        df: pd.DataFrame,
        tfdv_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Get TFDV and additional anomalies.

    Args:
        df {pandas.DataFrame}: dataframe
        tfdv_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}:
            TFDV statistics

    Returns:
        Tuple[bool, Dict]: True if anomalies are detected, otherwise False, and a
        dictionary with the structure:
        {
            <column_name>: {
                'description': <description>,
                'severity': 'ERROR',
                'shortDescription': <short description>,
                'reason': [{'type': <error_type>,
                            'shortDescription': <short description>,
                            'description': <description>}],
                'path': {'step': [<column_name>]}
            }
        }
    """
    df_statistics = tfdv.generate_statistics_from_dataframe(df)

    interval_anomalies_detected = False
    tfdv_anomalies_detected, tfdv_anomalies = tfdv_statistics_anomalies(
        tfdv_statistics, df_statistics)
    interval_anomalies = {}

    if len(tfdv_anomalies) > 0:
        tfdv_anomalies_detected = True

    if os.getenv('CHECK_NUMERIC_INTERVALS_ON_PREDICT') == 'true':
        interval_anomalies_detected, interval_anomalies = data_intervals_anomalies(
            df, tfdv_statistics)

    anomalies_detected = tfdv_anomalies_detected or interval_anomalies_detected
    anomalies = {**tfdv_anomalies, **interval_anomalies}

    return anomalies_detected, anomalies
def evaluate(self, model: BaseEstimator, num_repetitions: int,
             *corruptions: DataCorruption):
    schema = self.schema_from_train_data()

    baseline_predictions = model.predict_proba(self._task.test_data)
    baseline_score = self._task.score_on_test_data(baseline_predictions)

    results = []

    # Repeatedly corrupt the test data
    for corruption in corruptions:
        corrupted_scores = []
        anomalies = []
        for _ in range(0, num_repetitions):
            test_data_copy = self._task.test_data.copy(deep=True)
            corrupted_data = corruption.transform(test_data_copy)

            # Determine whether tfdv finds anomalies in the data
            corrupted_data_stats = tfdv.generate_statistics_from_dataframe(
                corrupted_data)
            tfdv_anomalies = tfdv.validate_statistics(
                statistics=corrupted_data_stats, schema=schema)
            schema_anomalies = tfdv_anomalies.anomaly_info

            # Compute the prediction score on the test data
            corrupted_predictions = model.predict_proba(corrupted_data)
            corrupted_score = self._task.score_on_test_data(corrupted_predictions)

            anomalies.append(schema_anomalies)
            corrupted_scores.append(corrupted_score)

        results.append(
            SchemaValidationResult(corruption, anomalies, baseline_score,
                                   corrupted_scores))

    return results
def feature_stats_dataset_basic(client, feature_stats_feature_set):
    N_ROWS = 20
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": [i for i in range(N_ROWS)],
        "strings": ["a", "b"] * int(N_ROWS / 2),
        "ints": [int(i) for i in range(N_ROWS)],
        "floats": [10.5 - i for i in range(N_ROWS)],
    })

    expected_stats = tfdv.generate_statistics_from_dataframe(
        df[["strings", "ints", "floats"]])
    clear_unsupported_fields(expected_stats)

    # TFDV computes the population std dev; overwrite it with pandas' sample
    # std dev so the expected stats match what the store reports.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            std = df[name].std()
            feature.num_stats.std_dev = std

    ingestion_id = client.ingest(feature_stats_feature_set, df)
    time.sleep(10)
    return {
        "df": df,
        "id": ingestion_id,
        "date": datetime(time_offset.year, time_offset.month,
                         time_offset.day).replace(tzinfo=pytz.utc),
        "stats": expected_stats,
    }
def test_feature_stats_force_refresh(client, feature_stats_dataset_basic,
                                     feature_stats_feature_set):
    df = feature_stats_dataset_basic["df"]

    df2 = pd.DataFrame({
        "datetime": [df.iloc[0].datetime],
        "entity_id": [10],
        "strings": ["c"],
        "ints": [2],
        "floats": [1.3],
    })
    client.ingest(feature_stats_feature_set, df2)
    time.sleep(10)

    actual_stats = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store="historical",
        start_date=feature_stats_dataset_basic["date"],
        end_date=feature_stats_dataset_basic["date"] + timedelta(days=1),
        force_refresh=True,
    )

    combined_df = pd.concat([df, df2])
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_fields(expected_stats)

    # TFDV computes the population std dev; overwrite it with pandas' sample
    # std dev so the expected stats match what the store reports.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            std = combined_df[name].std()
            feature.num_stats.std_dev = std

    assert_stats_equal(expected_stats, actual_stats)
# MAGIC %md
# MAGIC ## Use the TensorFlow Data Validation library
# MAGIC - check the schema between the training and serving periods of time
# MAGIC - check for data drift and skew between training and serving

# COMMAND ----------

from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings

warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

stats_train = tfdv.generate_statistics_from_dataframe(dataframe=train_df.toPandas())
stats_serve = tfdv.generate_statistics_from_dataframe(dataframe=fdf.toPandas())

schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare serving data with training data
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------
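# MAGIC %md A sketch (added here, not part of the original notebook) of the
# MAGIC skew check promised above: set an L-infinity threshold on a categorical
# MAGIC feature in the inferred schema and validate the training statistics
# MAGIC against the serving statistics.

# COMMAND ----------

# 'origin' is a hypothetical stand-in for any categorical column in train_df.
tfdv.get_feature(schema, 'origin').skew_comparator.infinity_norm.threshold = 0.05
skew_anomalies = tfdv.validate_statistics(statistics=stats_train,
                                          schema=schema,
                                          serving_statistics=stats_serve)
tfdv.display_anomalies(skew_anomalies)

# COMMAND ----------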
we are using TensorFlow Data Validation, since Azure data validation seems to
reside in a separate service
'''

# %%
import pyarrow
import apache_beam as beam
import apache_beam.io.iobase
import tensorflow
import tensorflow_data_validation as tfdv

# %%
# train_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_train)

# %%
train_stats = tfdv.generate_statistics_from_dataframe(train)

# %%
'''
## visualize statistics of train data
'''

# %%
tfdv.visualize_statistics(train_stats)

# %%
def get_validation_report(deployment_id: int, timestamp_from: float,
                          timestamp_to: float) -> JSONResponse:
    conf = Config()
    connection = psycopg2.connect(
        database=conf.get('PROJECTS_DB_NAME'),
        host=conf.get('DB_HOST'),
        port=conf.get('DB_PORT'),
        user=conf.get('DB_USER'),
        password=conf.get('DB_PASSWORD')
    )
    cursor = connection.cursor()

    cursor.execute(
        f'SELECT model_uri FROM {DeployDbSchema.DEPLOYMENTS_TABLE} '
        f'WHERE id = {deployment_id}'
    )
    try:
        model_uri = cursor.fetchone()[0]
    except TypeError:
        raise DeploymentNotFoundError(f'Deployment with ID {deployment_id} not found')

    cursor.execute(
        f'SELECT incoming_data FROM {DeployDbSchema.INCOMING_DATA_TABLE} '
        f'WHERE deployment_id = {deployment_id} AND '
        f'  timestamp >= {timestamp_from} AND timestamp <= {timestamp_to}'
    )

    schema_file_path = get_schema_file_path(model_uri)
    if not schema_file_exists(schema_file_path):
        return JSONResponse({})

    tfdv_statistics = read_tfdv_statistics(schema_file_path)
    tfdv_statistics_dict = tfdv_object_to_dict(tfdv_statistics)

    data_batches = cursor.fetchall()
    dataframes = []
    for batch in data_batches:
        df = load_data(batch[0])
        dataframes.append(df)

    if len(dataframes) == 0:
        return JSONResponse({})

    incoming_data_df = pd.concat(dataframes, ignore_index=False)

    # Convert object columns to string to avoid errors when generating TFDV
    # statistics. If a column mixes string and numeric values its dtype is
    # object, but it still holds values of different types, and TFDV then
    # tries to convert the string values to numeric (integer, float), which
    # raises an error.
    object_columns = incoming_data_df.select_dtypes(include='object').columns.tolist()
    incoming_data_df[object_columns] = incoming_data_df[object_columns].astype('str')

    incoming_data_statistics = tfdv.generate_statistics_from_dataframe(incoming_data_df)
    incoming_data_statistics_dict = tfdv_object_to_dict(incoming_data_statistics)

    tfdv_anomalies_detected, tfdv_anomalies = tfdv_statistics_anomalies(
        tfdv_statistics, incoming_data_statistics
    )
    interval_anomalies_detected, interval_anomalies = data_intervals_anomalies(
        incoming_data_df, tfdv_statistics
    )

    anomalies_detected = tfdv_anomalies_detected or interval_anomalies_detected
    anomalies = {**tfdv_anomalies, **interval_anomalies}

    return JSONResponse({
        'timestamp_from': timestamp_from,
        'timestamp_to': timestamp_to,
        'model_statistics': tfdv_statistics_dict,
        'incoming_data_statistics': incoming_data_statistics_dict,
        'anomalies_detected': anomalies_detected,
        'anomalies_info': anomalies
    })
def test_batch_dataset_statistics(client):
    fs1 = client.get_feature_set(name="feature_set_1")
    fs2 = client.get_feature_set(name="feature_set_2")

    id_offset = 20
    n_rows = 21
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)

    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "feature_value6": ["a" for i in range(n_rows)],
    })
    ingestion_id1 = client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "other_entity_id": [id_offset + i for i in range(n_rows)],
        "other_feature_value7": [int(i) % 10 for i in range(0, n_rows)],
    })
    ingestion_id2 = client.ingest(fs2, features_2_df)

    entity_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "other_entity_id": [id_offset + i for i in range(n_rows)],
    })

    time.sleep(15)  # wait for rows to get written to bq
    while True:
        rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1)
        rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2)
        if rows_ingested1 == len(features_1_df) and rows_ingested2 == len(features_2_df):
            print(f"Number of rows successfully ingested: {rows_ingested1}, "
                  f"{rows_ingested2}. Continuing.")
            break
        time.sleep(30)

    feature_retrieval_job = client.get_historical_features(
        entity_rows=entity_df,
        feature_refs=["feature_value6", "feature_set_2:other_feature_value7"],
        project=PROJECT_NAME,
        compute_statistics=True,
    )
    output = feature_retrieval_job.to_dataframe(timeout_sec=180)
    print(output.head(10))
    stats = feature_retrieval_job.statistics(timeout_sec=180)
    clear_unsupported_fields(stats)

    expected_stats = tfdv.generate_statistics_from_dataframe(
        output[["feature_value6", "feature_set_2__other_feature_value7"]])
    clear_unsupported_fields(expected_stats)

    # TFDV computes the population std dev; overwrite it with pandas' sample
    # std dev so the expected stats match what the retrieval job reports.
    for feature in expected_stats.datasets[0].features:
        if feature.HasField("num_stats"):
            name = feature.path.step[0]
            std = output[name].std()
            feature.num_stats.std_dev = std

    assert_stats_equal(expected_stats, stats)
    clean_up_remote_files(feature_retrieval_job.get_avro_files())
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv

# Simple dataset analysis
dataset = pd.read_csv("data/pollution-small.csv")
print(dataset.shape)

training_data = dataset[:1600]
print(training_data.describe())

test_set = dataset[1600:]
print(test_set.describe())

# Generate statistics and infer a schema from the data
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=dataset)
schema = tfdv.infer_schema(statistics=train_stats)
print(tfdv.display_schema(schema))

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the schema
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display all detected anomalies, for example:
# - Integer larger than 10
# - STRING type when expected INT type
# - FLOAT type when expected INT type
# - Integer smaller than 0
print(tfdv.display_anomalies(anomalies))

# New data WITH anomalies
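# Illustrative sketch only (the original continuation is not shown): force a
# type mismatch in a copy of the test set and validate it against the schema
# to reproduce anomalies like the ones listed above. Assumes the second
# column of the CSV is numeric.
bad_data = test_set.copy()
numeric_col = bad_data.columns[1]
bad_data[numeric_col] = bad_data[numeric_col].astype(str)  # STRING where INT/FLOAT expected
bad_stats = tfdv.generate_statistics_from_dataframe(dataframe=bad_data)
bad_anomalies = tfdv.validate_statistics(statistics=bad_stats, schema=schema)
print(tfdv.display_anomalies(bad_anomalies))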
display(weatherDF['avg_temp_f', 'tot_precip_mm', 'avg_wnd_mps', 'avg_vis_m',
                  'avg_slp_hpa', 'avg_dewpt_f'].summary())

# COMMAND ----------

# MAGIC %md visualize statistics using the TensorFlow Data Validation library

# COMMAND ----------

import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

stats = tfdv.generate_statistics_from_dataframe(dataframe=weatherDF.toPandas())
tfdv.visualize_statistics(stats)
displayHTML(get_statistics_html(stats))

# tot_precip_mm has about 90% zeros!, avg_wnd_mps has about 16%
# No missing data

# COMMAND ----------

# MAGIC %md infer schema

# COMMAND ----------

weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------
args = parser.parse_args()

experiment_name = 'IrisSVC'
mlflow.set_experiment(experiment_name)

DATASET = 'data/iris.csv'
TARGET_LABELED_DATASET = 'data/labeled_iris.csv'
TARGET_COLUMN = 'species'
IRIS_STATISTICS = '/tmp/stats.tfdv'

with mlflow.start_run() as run:
    dataset = pd.read_csv(DATASET)
    dataset[TARGET_COLUMN] = LabelEncoder().fit_transform(dataset[TARGET_COLUMN])
    dataset.to_csv(TARGET_LABELED_DATASET, index=False)

    statistics = tfdv.generate_statistics_from_dataframe(
        dataset.drop(TARGET_COLUMN, axis=1))
    with open(IRIS_STATISTICS, 'wb') as out_stats:
        out_stats.write(statistics.SerializeToString())

    mlflow.log_artifact(DATASET)
    mlflow.log_artifact(TARGET_LABELED_DATASET)
    mlflow.log_artifact(IRIS_STATISTICS)

    train, test = train_test_split(dataset, test_size=0.2, random_state=42)
    X_train = train.drop(TARGET_COLUMN, axis=1).astype('float32')
    y_train = train[TARGET_COLUMN].astype('int32')
    X_test = test.drop(TARGET_COLUMN, axis=1).astype('float32')
    y_test = test[TARGET_COLUMN].astype('int32')
parser.add_argument('-o', '--output_dir', type=str, required=False,
                    default='./output',
                    help='Path to where stats must be saved')
parser.add_argument('-f', '--file_name', type=str, required=False,
                    default='stats.txt',
                    help='Name of the stats file')
args = parser.parse_args()

# tfdv doesn't support generating stats directly from parquet, so read the
# data through the pandas parquet reader. Ideally, this should be an
# accelerated parquet reader and the stats computation should happen on GPU.
df = pd.read_parquet(args.data_dir)
stats = tfdv.generate_statistics_from_dataframe(df)

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

output_path = os.path.join(args.output_dir, args.file_name)
tfdv.write_stats_text(stats, output_path=output_path)
def run(self, task, model, schema, num_corruptions, performance_threshold):
    # Make sure the schema works on the clean test data
    assert not self.has_anomaly(self.validate(schema, task.test_data))

    baseline_predictions = model.predict_proba(task.test_data)
    baseline_score = task.score_on_test_data(baseline_predictions)

    random_corruptions = set()

    for _ in range(0, num_corruptions):
        num_columns = len(task.numerical_columns + task.categorical_columns +
                          task.text_columns)
        p_numerical_column_affected = float(len(task.numerical_columns)) / num_columns
        p_categorical_column_affected = float(len(task.categorical_columns)) / num_columns
        p_text_column_affected = float(len(task.text_columns)) / num_columns

        affected_column_type = np.random.choice(
            ['numerical', 'categorical', 'text'], 1,
            p=[p_numerical_column_affected,
               p_categorical_column_affected,
               p_text_column_affected])

        fraction = float(np.random.randint(100)) / 100

        if affected_column_type == 'numerical':
            if len(task.numerical_columns) >= 2 and np.random.uniform() < 0.1:
                affected_columns = np.random.choice(task.numerical_columns, 2)
                random_corruptions.add(
                    SwappedValues(affected_columns[0], affected_columns[1], fraction))
            else:
                corruption_type = np.random.choice(['missing', 'noise', 'scaling'])
                if corruption_type == 'missing':
                    missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
                    affected_column = np.random.choice(task.numerical_columns)
                    random_corruptions.add(
                        MissingValues(affected_column, fraction,
                                      na_value=np.nan, missingness=missingness))
                elif corruption_type == 'noise':
                    affected_column = np.random.choice(task.numerical_columns)
                    random_corruptions.add(GaussianNoise(affected_column, fraction))
                elif corruption_type == 'scaling':
                    affected_column = np.random.choice(task.numerical_columns)
                    random_corruptions.add(Scaling(affected_column, fraction))

        elif affected_column_type == 'categorical':
            if len(task.categorical_columns) >= 2 and np.random.uniform() < 0.1:
                affected_columns = np.random.choice(task.categorical_columns, 2)
                random_corruptions.add(
                    SwappedValues(affected_columns[0], affected_columns[1], fraction))
            else:
                corruption_type = np.random.choice(['missing', 'encoding'])
                if corruption_type == 'missing':
                    missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
                    affected_column = np.random.choice(task.categorical_columns)
                    random_corruptions.add(
                        MissingValues(affected_column, fraction,
                                      na_value='', missingness=missingness))
                elif corruption_type == 'encoding':
                    affected_column = np.random.choice(task.categorical_columns)
                    random_corruptions.add(BrokenCharacters(affected_column, fraction))

        elif affected_column_type == 'text':
            if len(task.text_columns) >= 2 and np.random.uniform() < 0.1:
                affected_columns = np.random.choice(task.text_columns, 2)
                random_corruptions.add(
                    SwappedValues(affected_columns[0], affected_columns[1], fraction))
            else:
                corruption_type = np.random.choice(['missing', 'encoding'])
                if corruption_type == 'missing':
                    missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
                    affected_column = np.random.choice(task.text_columns)
                    random_corruptions.add(
                        MissingValues(affected_column, fraction,
                                      na_value='', missingness=missingness))
                elif corruption_type == 'encoding':
                    affected_column = np.random.choice(task.text_columns)
                    random_corruptions.add(BrokenCharacters(affected_column, fraction))

    outcome = {
        'corruption': [],
        'status': [],
        'anomalies': [],
        'baseline_score': [],
        'corrupted_score': []
    }

    for corruption in random_corruptions:
        print(corruption)

        test_data_copy = task.test_data.copy(deep=True)
        corrupted_data = corruption.transform(test_data_copy)

        corrupted_data_stats = tfdv.generate_statistics_from_dataframe(corrupted_data)
        tfdv_anomalies = tfdv.validate_statistics(
            statistics=corrupted_data_stats, schema=schema)
        schema_anomalies = tfdv_anomalies.anomaly_info

        try:
            corrupted_predictions = model.predict_proba(corrupted_data)
            corrupted_score = task.score_on_test_data(corrupted_predictions)
            performance_drop = (baseline_score - corrupted_score) / baseline_score
            has_negative_impact = performance_drop > performance_threshold
        except Exception:
            corrupted_score = None
            has_negative_impact = True

        has_anomalies = len(tfdv_anomalies.anomaly_info) != 0

        # Label each corruption as TP/FP/TN/FN depending on whether tfdv flags
        # an anomaly and whether the model's score actually drops.
        if has_anomalies:
            status = 'TP' if has_negative_impact else 'FP'
        else:
            status = 'TN' if not has_negative_impact else 'FN'

        outcome['corruption'].append(str(corruption))
        outcome['status'].append(status)
        outcome['anomalies'].append(str(schema_anomalies))
        outcome['baseline_score'].append(baseline_score)
        outcome['corrupted_score'].append(corrupted_score)

    return pd.DataFrame.from_dict(outcome)
def validate(self, schema, data):
    stats = tfdv.generate_statistics_from_dataframe(data)
    return tfdv.validate_statistics(statistics=stats, schema=schema)
def schema_from_train_data(self):
    train_data_stats = tfdv.generate_statistics_from_dataframe(self._task.train_data)
    schema = tfdv.infer_schema(statistics=train_data_stats)
    return schema
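# How the two helpers above compose, as a sketch: infer a schema once from the
# training data and validate any new batch against it. 'validator' and
# 'new_batch_df' are hypothetical names for an instance exposing both methods
# and an incoming dataframe.
schema = validator.schema_from_train_data()
anomalies = validator.validate(schema, new_batch_df)
if anomalies.anomaly_info:
    print("schema anomalies detected in the new batch")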