示例#1
0
def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    """Check a treatment dataset for skew or drift against a control dataset.

    Both CSV files are read from ``gs://<gcp_bucket>/<path>``, TFDV statistics
    are generated for each, and a schema is inferred from the control
    statistics. For every listed feature the L-infinity threshold of the
    comparator selected by ``mode`` is set on that schema, after which
    ``tfdv.validate_statistics`` compares the two sets of statistics.

    Args:
        mode: ``"skew"`` or ``"drift"``; selects which comparator is
            configured and how the treatment statistics are passed to TFDV.
        gcp_bucket: Name of the GCS bucket holding both CSV files.
        control_set_path: Path of the control CSV inside the bucket.
        treatment_set_path: Path of the treatment CSV inside the bucket.
        feature_list_str: String containing a Python literal list of feature
            names, e.g. ``"['age', 'country']"``.
        Linf_value: Value assigned to ``infinity_norm.threshold``.

    Returns:
        The TFDV anomalies object when its ``anomaly_info`` is non-empty,
        otherwise ``None``.

    Raises:
        SystemExit: With status 1 when ``mode`` is not supported, or when
            ``mode`` is ``"skew"`` and a feature has no domain in the schema.
        ValueError, SyntaxError: When ``feature_list_str`` is not a valid
            Python literal.
    """
    import ast

    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:%s', mode)
    logging.info('gcp_bucket:%s', gcp_bucket)
    logging.info('control_set_path:%s', control_set_path)
    logging.info('treatment_set_path:%s', treatment_set_path)
    logging.info('Linf_value:%s', Linf_value)

    # Reject an unsupported mode before any data is downloaded; previously an
    # empty feature list let an invalid mode slip through unnoticed.
    if mode not in ("skew", "drift"):
        logging.critical("mode: %s not supported", mode)
        sys.exit(1)

    # SECURITY: this used to be eval(), which executes arbitrary code found in
    # the input string. literal_eval only accepts Python literals.
    feature_list = ast.literal_eval(feature_list_str)

    bucket_url = "gs://" + gcp_bucket + "/"
    control_set_df = pd.read_csv(bucket_url + control_set_path, sep=',')
    treat_set_df = pd.read_csv(bucket_url + treatment_set_path, sep=',')
    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    control_schema = tfdv.infer_schema(control_stats)

    for feature in feature_list:
        feature_schema = tfdv.get_feature(control_schema, feature)
        if mode == "skew":
            # if we have domain it is a categorical variable
            if not feature_schema.domain:
                logging.critical("feature: %s is not categorical", feature)
                sys.exit(1)
            feature_schema.skew_comparator.infinity_norm.threshold = Linf_value
        else:
            feature_schema.drift_comparator.infinity_norm.threshold = Linf_value

    # TFDV evaluates skew comparators against `serving_statistics` and drift
    # comparators against `previous_statistics`; passing the treatment stats
    # under the wrong keyword means the configured comparator is never checked.
    if mode == "drift":
        comparison = {"previous_statistics": treat_stats}
    else:
        comparison = {"serving_statistics": treat_stats}
    anomalies = tfdv.validate_statistics(statistics=control_stats,
                                         schema=control_schema,
                                         **comparison)

    if anomalies.anomaly_info:
        logging.info("Data-%s detected: %s", mode, anomalies)
        return anomalies

    logging.info("No data-%s detected", mode)
    return None
def Load_TFDV(df):
    """Assign a TFDV-derived type code to every column of ``df``.

    Each column is analysed on its own: statistics are generated with
    semantic domain stats enabled and a schema is inferred from them.

    Codes:
        0 -- none of the checks below matched
        1 -- ``get_categorical_features`` yielded at least one feature
        3 -- the text form of a schema feature mentions
             ``natural_language_domain``
        2 -- the text form of a schema feature mentions ``time_domain``

    The domain checks run after the categorical check and overwrite its
    result; within one feature ``time_domain`` wins over
    ``natural_language_domain``.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        List of integer codes, one per column, in column order. Each code is
        also printed as it is computed.
    """
    codes = []

    for column in df.columns:
        options = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        column_stats = tfdv.generate_statistics_from_dataframe(
            df[[column]], stats_options=options)
        column_schema = tfdv.infer_schema(statistics=column_stats)

        # Any categorical feature at all marks the column as categorical.
        categorical = get_categorical_features(column_schema)
        code = 1 if any(True for _ in categorical) else 0

        for feature in column_schema.feature:
            rendered = str(feature)
            if 'natural_language_domain' in rendered:
                code = 3
            if 'time_domain' in rendered:
                code = 2

        print(code)
        codes.append(code)

    return codes
示例#3
0
 def compute_training_stats(self, stats_path=None):
     """Generate TFDV statistics for the training feature set.

     A dataset is created from ``self.train_feature_set`` with the fixed date
     arguments "2009-01-01" and "2016-01-01", downloaded as a DataFrame via
     ``self.STAGING_LOCATION``, and passed to TFDV.

     Args:
         stats_path: Optional destination for the statistics proto. When
             falsy, nothing is saved.

     Returns:
         The generated TFDV statistics.
     """
     training_dataset = self.fs.create_dataset(
         self.train_feature_set, "2009-01-01", "2016-01-01")
     training_frame = self.fs.download_dataset_to_df(
         training_dataset, self.STAGING_LOCATION)
     stats = tfdv.generate_statistics_from_dataframe(training_frame)

     if not stats_path:
         logging.info("No stats_path provided; not saving stats")
         return stats

     logging.info("Saving training stats to %s", stats_path)
     demo_util.save_proto(stats, stats_path)
     return stats
示例#4
0
def feature_stats_dataset_agg(client, feature_stats_feature_set):
    """Ingest two small batches and build the statistics expected for them.

    The first batch (5 rows) is stamped ten days before now (UTC) and the
    second (3 rows) one day after that. Expected statistics are computed with
    TFDV over the ``strings``, ``ints`` and ``floats`` columns of both batches
    combined.

    Args:
        client: Client exposing ``ingest(feature_set, dataframe)``.
        feature_stats_feature_set: Feature set the batches are ingested into.

    Returns:
        Dict with keys ``ids`` (both ingestion ids), ``start_date`` and
        ``end_date`` (UTC midnights ten and seven days back) and ``stats``
        (the expected TFDV statistics).
    """
    now_utc = datetime.utcnow().replace(tzinfo=pytz.utc)
    start_date = now_utc - timedelta(days=10)
    end_date = now_utc - timedelta(days=7)

    def _utc_midnight(moment):
        # Keep only the calendar date, still tagged as UTC.
        return datetime(moment.year, moment.month,
                        moment.day).replace(tzinfo=pytz.utc)

    first_batch = pd.DataFrame({
        "datetime": [start_date] * 5,
        "entity_id": list(range(5)),
        "strings": ["a", "b", "b", "b", "a"],
        "ints": [4, 3, 2, 6, 3],
        "floats": [2.1, 5.2, 4.3, 0.6, 0.1],
    })
    ingestion_id_1 = client.ingest(feature_stats_feature_set, first_batch)

    second_batch = pd.DataFrame({
        "datetime": [start_date + timedelta(days=1)] * 3,
        "entity_id": list(range(3)),
        "strings": ["a", "b", "c"],
        "ints": [2, 6, 7],
        "floats": [1.6, 2.4, 2],
    })
    ingestion_id_2 = client.ingest(feature_stats_feature_set, second_batch)

    feature_columns = ["strings", "ints", "floats"]
    combined_df = pd.concat([first_batch, second_batch])[feature_columns]
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_agg_fields(expected_stats)

    # TFDV reports the population std dev; replace it with the value pandas'
    # default .std() gives (sample std dev) for every numeric feature.
    for feature in expected_stats.datasets[0].features:
        if not feature.HasField("num_stats"):
            continue
        column_name = feature.path.step[0]
        feature.num_stats.std_dev = combined_df[column_name].std()

    time.sleep(10)

    return {
        "ids": [ingestion_id_1, ingestion_id_2],
        "start_date": _utc_midnight(start_date),
        "end_date": _utc_midnight(end_date),
        "stats": expected_stats,
    }
示例#5
0
def tfdv_and_additional_anomalies(
        df: pd.DataFrame,
        tfdv_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Collect TFDV anomalies and, optionally, numeric-interval anomalies.

    Statistics are generated for ``df`` and compared with ``tfdv_statistics``
    through ``tfdv_statistics_anomalies``. The interval check
    (``data_intervals_anomalies``) only runs when the environment variable
    ``CHECK_NUMERIC_INTERVALS_ON_PREDICT`` equals ``'true'``.

    Args:
        df {pandas.DataFrame}: dataframe to validate
        tfdv_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}:
            reference TFDV statistics
    Returns:
        Tuple[bool, Dict]:
            True if any anomaly was detected, otherwise False, and the merged
            anomalies dictionary (interval entries override TFDV entries with
            the same key) with structure:
            {
                <column_name>: {
                    'description': <description>,
                    'severity': 'ERROR',
                    'shortDescription': <short description>,
                    'reason': [{'type': <error_type>,
                                'shortDescription': <short description>,
                                'description': <description>}],
                    'path': {'step': [<column_name>]}
                }
            }
    """
    incoming_statistics = tfdv.generate_statistics_from_dataframe(df)

    tfdv_flag, tfdv_anomalies = tfdv_statistics_anomalies(
        tfdv_statistics, incoming_statistics)
    # A non-empty anomalies mapping counts as "detected" whatever the flag says.
    if len(tfdv_anomalies) != 0:
        tfdv_flag = True

    if os.getenv('CHECK_NUMERIC_INTERVALS_ON_PREDICT') == 'true':
        interval_flag, interval_anomalies = data_intervals_anomalies(
            df, tfdv_statistics)
    else:
        interval_flag, interval_anomalies = False, {}

    return (tfdv_flag or interval_flag,
            {**tfdv_anomalies, **interval_anomalies})
示例#6
0
    def evaluate(self, model: BaseEstimator, num_repetitions: int,
                 *corruptions: DataCorruption):
        """Measure schema anomalies and model score under each corruption.

        A schema is obtained from ``self.schema_from_train_data()`` and a
        baseline score is computed on the untouched test data. Every
        corruption is then applied ``num_repetitions`` times to a fresh deep
        copy of the test data; each corrupted copy is validated against the
        schema with TFDV and scored with the model.

        Args:
            model: Estimator exposing ``predict_proba``.
            num_repetitions: How many times each corruption is applied.
            *corruptions: Corruptions exposing ``transform(dataframe)``.

        Returns:
            List with one ``SchemaValidationResult`` per corruption, holding
            the corruption, the per-repetition ``anomaly_info`` values, the
            baseline score and the per-repetition corrupted scores.
        """
        schema = self.schema_from_train_data()

        baseline_score = self._task.score_on_test_data(
            model.predict_proba(self._task.test_data))

        results = []

        for corruption in corruptions:
            anomalies_per_run = []
            scores_per_run = []

            for _ in range(num_repetitions):
                # Always corrupt a fresh copy so repetitions stay independent.
                corrupted_data = corruption.transform(
                    self._task.test_data.copy(deep=True))

                # Does TFDV flag the corrupted data against the schema?
                stats = tfdv.generate_statistics_from_dataframe(corrupted_data)
                validation = tfdv.validate_statistics(statistics=stats,
                                                      schema=schema)
                found_anomalies = validation.anomaly_info

                # How does the model score on the corrupted data?
                score = self._task.score_on_test_data(
                    model.predict_proba(corrupted_data))

                anomalies_per_run.append(found_anomalies)
                scores_per_run.append(score)

            results.append(
                SchemaValidationResult(corruption, anomalies_per_run,
                                       baseline_score, scores_per_run))

        return results
示例#7
0
def feature_stats_dataset_basic(client, feature_stats_feature_set):
    """Ingest a 20-row batch and build the statistics expected for it.

    Expected statistics are computed with TFDV over the ``strings``, ``ints``
    and ``floats`` columns before the batch is ingested.

    Args:
        client: Client exposing ``ingest(feature_set, dataframe)``.
        feature_stats_feature_set: Feature set the batch is ingested into.

    Returns:
        Dict with keys ``df`` (the ingested DataFrame), ``id`` (ingestion
        id), ``date`` (UTC midnight of the ingestion timestamp) and ``stats``
        (the expected TFDV statistics).
    """
    N_ROWS = 20
    feature_columns = ["strings", "ints", "floats"]

    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "entity_id": list(range(N_ROWS)),
        "strings": ["a", "b"] * (N_ROWS // 2),
        "ints": list(range(N_ROWS)),
        "floats": [10.5 - i for i in range(N_ROWS)],
    })

    expected_stats = tfdv.generate_statistics_from_dataframe(
        df[feature_columns])
    clear_unsupported_fields(expected_stats)

    # TFDV reports the population std dev; replace it with the value pandas'
    # default .std() gives (sample std dev) for every numeric feature.
    for feature in expected_stats.datasets[0].features:
        if not feature.HasField("num_stats"):
            continue
        column_name = feature.path.step[0]
        feature.num_stats.std_dev = df[column_name].std()

    ingestion_id = client.ingest(feature_stats_feature_set, df)
    time.sleep(10)

    ingestion_date = datetime(time_offset.year, time_offset.month,
                              time_offset.day).replace(tzinfo=pytz.utc)
    return {
        "df": df,
        "id": ingestion_id,
        "date": ingestion_date,
        "stats": expected_stats,
    }
示例#8
0
def test_feature_stats_force_refresh(client, feature_stats_dataset_basic,
                                     feature_stats_feature_set):
    """Statistics requested with ``force_refresh=True`` include new rows.

    One extra row is ingested on top of the basic dataset, statistics are
    requested for the dataset's day with ``force_refresh=True``, and the
    result is compared with TFDV statistics over both frames combined.

    NOTE(review): the expected statistics are generated from every column of
    the combined frame (including ``datetime`` and ``entity_id``) while the
    query asks for three features only -- confirm that
    ``clear_unsupported_fields`` / ``assert_stats_equal`` account for this.
    """
    base_df = feature_stats_dataset_basic["df"]
    day_start = feature_stats_dataset_basic["date"]

    extra_row = pd.DataFrame({
        "datetime": [base_df.iloc[0].datetime],
        "entity_id": [10],
        "strings": ["c"],
        "ints": [2],
        "floats": [1.3],
    })
    client.ingest(feature_stats_feature_set, extra_row)
    time.sleep(10)

    actual_stats = client.get_statistics(
        "feature_stats",
        features=["strings", "ints", "floats"],
        store="historical",
        start_date=day_start,
        end_date=day_start + timedelta(days=1),
        force_refresh=True,
    )

    combined_df = pd.concat([base_df, extra_row])
    expected_stats = tfdv.generate_statistics_from_dataframe(combined_df)
    clear_unsupported_fields(expected_stats)

    # TFDV reports the population std dev; replace it with the value pandas'
    # default .std() gives (sample std dev) for every numeric feature.
    for feature in expected_stats.datasets[0].features:
        if not feature.HasField("num_stats"):
            continue
        column_name = feature.path.step[0]
        feature.num_stats.std_dev = combined_df[column_name].std()

    assert_stats_equal(expected_stats, actual_stats)
# MAGIC %md
# MAGIC ## Use the TensorFlow Data Validation (TFDV) library
# MAGIC - check schema between the training and serving periods of time
# MAGIC - check for data drift and skew between training and serving

# COMMAND ----------

from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings

# Silence FutureWarnings whose message starts with "Passing".
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

# Compute TFDV statistics for the training and the serving data.
# NOTE(review): train_df and fdf are defined outside this cell; the
# .toPandas() calls suggest Spark DataFrames -- confirm.
stats_train = tfdv.generate_statistics_from_dataframe(
    dataframe=train_df.toPandas())
stats_serve = tfdv.generate_statistics_from_dataframe(dataframe=fdf.toPandas())

# The schema is inferred from the training statistics only, then displayed.
schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Render serving statistics (left-hand side) next to training statistics
# (right-hand side) for visual comparison.
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------
示例#10
0
we are using Tensorflow Data Validation since Azure Data validation seems to reside in a separate service
'''

# %%
import pyarrow
import apache_beam as beam
import apache_beam.io.iobase
import tensorflow
import tensorflow_data_validation as tfdv


# %%
# Alternative kept for reference: compute the statistics straight from the CSV.
#train_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_train)

# %%
# Compute TFDV statistics from the in-memory training data.
# NOTE(review): `train` is defined outside this snippet -- presumably a pandas
# DataFrame, since generate_statistics_from_dataframe is used; confirm.
train_stats = tfdv.generate_statistics_from_dataframe(train)

# %%


# %%
'''
## visualize statistics of train data
'''

# %%
# Show the interactive TFDV statistics view for the training data.
tfdv.visualize_statistics(train_stats)

# %%

示例#11
0
def get_validation_report(deployment_id: int,
                   timestamp_from: float,
                   timestamp_to: float) -> JSONResponse:
    """Build a validation report for data received by a deployment.

    The deployment's model URI and the data batches received between
    ``timestamp_from`` and ``timestamp_to`` (both inclusive) are read from the
    database. The batches are concatenated, TFDV statistics are generated for
    them and compared with the statistics stored alongside the model.

    Args:
        deployment_id: ID of the deployment to report on.
        timestamp_from: Lower bound of the time window (inclusive).
        timestamp_to: Upper bound of the time window (inclusive).

    Returns:
        JSONResponse with the window bounds, the model statistics, the
        incoming data statistics, the ``anomalies_detected`` flag and
        ``anomalies_info``. An empty JSON object is returned when the model
        has no statistics file or no data was received in the window.

    Raises:
        DeploymentNotFoundError: When no deployment has the given ID.
    """
    conf = Config()
    connection = psycopg2.connect(
        database=conf.get('PROJECTS_DB_NAME'),
        host=conf.get('DB_HOST'),
        port=conf.get('DB_PORT'),
        user=conf.get('DB_USER'),
        password=conf.get('DB_PASSWORD')
    )

    # Everything needed from the database is read up front so the connection
    # is released on every path, including the early returns and errors below.
    try:
        cursor = connection.cursor()
        try:
            # Table names come from project constants; values are passed as
            # query parameters rather than interpolated into the SQL text.
            cursor.execute(
                f'SELECT model_uri FROM {DeployDbSchema.DEPLOYMENTS_TABLE} '
                'WHERE id = %s',
                (deployment_id,)
            )
            deployment_row = cursor.fetchone()
            if deployment_row is None:
                raise DeploymentNotFoundError(
                    f'Deployment with ID {deployment_id} not found')
            model_uri = deployment_row[0]

            cursor.execute(
                f'SELECT incoming_data FROM {DeployDbSchema.INCOMING_DATA_TABLE} '
                'WHERE deployment_id = %s AND '
                '     timestamp >= %s AND timestamp <= %s',
                (deployment_id, timestamp_from, timestamp_to)
            )
            data_batches = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    schema_file_path = get_schema_file_path(model_uri)

    if not schema_file_exists(schema_file_path):
        return JSONResponse({})

    tfdv_statistics = read_tfdv_statistics(schema_file_path)
    tfdv_statistics_dict = tfdv_object_to_dict(tfdv_statistics)

    dataframes = [load_data(batch[0]) for batch in data_batches]

    if not dataframes:
        return JSONResponse({})

    incoming_data_df = pd.concat(dataframes, ignore_index=False)

    # Convert object columns to string to avoid errors when generating TFDV
    # statistics. A column holding both string and numeric values has dtype
    # object but still contains values of different types, and TFDV then
    # tries to convert the string values to numbers, which fails.
    object_columns = incoming_data_df.select_dtypes(include='object').columns.tolist()
    incoming_data_df[object_columns] = incoming_data_df[object_columns].astype('str')

    incoming_data_statistics = tfdv.generate_statistics_from_dataframe(incoming_data_df)
    incoming_data_statistics_dict = tfdv_object_to_dict(incoming_data_statistics)

    tfdv_anomalies_detected, tfdv_anomalies = tfdv_statistics_anomalies(
        tfdv_statistics, incoming_data_statistics
    )
    interval_anomalies_detected, interval_anomalies = data_intervals_anomalies(
        incoming_data_df, tfdv_statistics
    )
    anomalies_detected = tfdv_anomalies_detected or interval_anomalies_detected
    # Interval anomalies override TFDV anomalies reported under the same key.
    anomalies = {**tfdv_anomalies, **interval_anomalies}

    return JSONResponse({
        'timestamp_from': timestamp_from,
        'timestamp_to': timestamp_to,
        'model_statistics': tfdv_statistics_dict,
        'incoming_data_statistics': incoming_data_statistics_dict,
        'anomalies_detected': anomalies_detected,
        'anomalies_info': anomalies
    })
示例#12
0
def test_batch_dataset_statistics(client):
    """Statistics returned by a historical retrieval job match TFDV's.

    Rows are ingested into two feature sets, the test polls until both
    ingestions report the full row count, and a historical retrieval job is
    run with ``compute_statistics=True``. The job's statistics are compared
    with TFDV statistics computed over the retrieved DataFrame.
    """
    fs1 = client.get_feature_set(name="feature_set_1")
    fs2 = client.get_feature_set(name="feature_set_2")

    id_offset = 20
    n_rows = 21
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)

    features_1_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "feature_value6": ["a"] * n_rows,
    })
    ingestion_id1 = client.ingest(fs1, features_1_df)

    features_2_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "other_entity_id": [id_offset + i for i in range(n_rows)],
        "other_feature_value7": [i % 10 for i in range(n_rows)],
    })
    ingestion_id2 = client.ingest(fs2, features_2_df)

    entity_df = pd.DataFrame({
        "datetime": [time_offset] * n_rows,
        "entity_id": [id_offset + i for i in range(n_rows)],
        "other_entity_id": [id_offset + i for i in range(n_rows)],
    })

    time.sleep(15)  # wait for rows to get written to bq
    expected_counts = (len(features_1_df), len(features_2_df))
    # Poll every 30 seconds until both ingestions report every row.
    while True:
        rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1)
        rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2)
        if (rows_ingested1, rows_ingested2) != expected_counts:
            time.sleep(30)
            continue
        print(
            f"Number of rows successfully ingested: {rows_ingested1}, {rows_ingested2}. Continuing."
        )
        break

    feature_retrieval_job = client.get_historical_features(
        entity_rows=entity_df,
        feature_refs=["feature_value6", "feature_set_2:other_feature_value7"],
        project=PROJECT_NAME,
        compute_statistics=True,
    )
    output = feature_retrieval_job.to_dataframe(timeout_sec=180)
    print(output.head(10))
    stats = feature_retrieval_job.statistics(timeout_sec=180)
    clear_unsupported_fields(stats)

    retrieved_columns = ["feature_value6", "feature_set_2__other_feature_value7"]
    expected_stats = tfdv.generate_statistics_from_dataframe(
        output[retrieved_columns])
    clear_unsupported_fields(expected_stats)

    # TFDV reports the population std dev; replace it with the value pandas'
    # default .std() gives (sample std dev) for every numeric feature.
    for feature in expected_stats.datasets[0].features:
        if not feature.HasField("num_stats"):
            continue
        column_name = feature.path.step[0]
        feature.num_stats.std_dev = output[column_name].std()

    assert_stats_equal(expected_stats, stats)
    clean_up_remote_files(feature_retrieval_job.get_avro_files())
示例#13
0
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv

# Simple dataset analysis
dataset = pd.read_csv("data/pollution-small.csv")
print(dataset.shape)

# The first 1600 rows form the training split; the remaining rows are held
# out as the test set.
training_data = dataset[:1600]
print(training_data.describe())

test_set = dataset[1600:]
print(test_set.describe())

# Generate training data statistics.
# The statistics (and the schema inferred from them) must come from the
# training split only: building them from the full dataset would fold the
# test rows into the schema and hide the very anomalies checked for below.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)
schema = tfdv.infer_schema(statistics=train_stats)
print(tfdv.display_schema(schema))

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the Schema
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
# Display all detected anomalies. Kinds of anomaly this check is meant to
# surface:
# - Integer larger than 10
# - STRING type when expected INT type
# - FLOAT type when expected INT type
# - Integer smaller than 0
print(tfdv.display_anomalies(anomalies))

# New data WITH anomalies
# Show summary statistics for the selected weather columns.
# NOTE(review): weatherDF is defined outside this snippet; the later
# .toPandas() call suggests a Spark DataFrame -- confirm.
display(weatherDF['avg_temp_f', 'tot_precip_mm', 'avg_wnd_mps', 'avg_vis_m',
                  'avg_slp_hpa', 'avg_dewpt_f'].summary())

# COMMAND ----------

# MAGIC %md visualize statistics  using Tensorflow data validation lib

# COMMAND ----------

import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings
from sklearn.model_selection import train_test_split
# Silence FutureWarnings whose message starts with "Passing".
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

# Compute TFDV statistics over the whole weather dataset (converted to
# pandas) and render them.
stats = tfdv.generate_statistics_from_dataframe(dataframe=weatherDF.toPandas())
tfdv.visualize_statistics(stats)
displayHTML(
    get_statistics_html(stats)
)  # Observations from the report: tot_precip_mm is about 90% zeros, avg_wnd_mps about 16% (presumably zeros as well); no missing data

# COMMAND ----------

# MAGIC %md infer schema

# COMMAND ----------

# Infer a schema from the statistics computed above and display it.
weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------
示例#15
0
    # NOTE(review): `parser` is created above this fragment (not visible here).
    args = parser.parse_args()

    # All runs below are recorded under this MLflow experiment.
    experiment_name = 'IrisSVC'
    mlflow.set_experiment(experiment_name)

    DATASET = 'data/iris.csv'  # raw input data
    TARGET_LABELED_DATASET = 'data/labeled_iris.csv'  # copy with the encoded target
    TARGET_COLUMN = 'species'
    IRIS_STATISTICS = '/tmp/stats.tfdv'  # serialized TFDV statistics

    with mlflow.start_run() as run:
        dataset = pd.read_csv(DATASET)
        # Replace the target column with its label-encoded values and persist
        # the result (without the index column).
        dataset[TARGET_COLUMN] = LabelEncoder().fit_transform(dataset[TARGET_COLUMN])
        dataset.to_csv(TARGET_LABELED_DATASET, index=False)

        # Statistics cover the feature columns only; the target is dropped.
        statistics = tfdv.generate_statistics_from_dataframe(dataset.drop(TARGET_COLUMN, axis=1))

        # Store the statistics proto in its binary serialized form.
        with open(IRIS_STATISTICS, 'wb') as out_stats:
            out_stats.write(statistics.SerializeToString())

        # Attach both datasets and the statistics file to the run.
        mlflow.log_artifact(DATASET)
        mlflow.log_artifact(TARGET_LABELED_DATASET)
        mlflow.log_artifact(IRIS_STATISTICS)

        # 80/20 train/test split with a fixed seed.
        train, test = train_test_split(dataset, test_size=0.2, random_state=42)

        # Features are cast to float32 and labels to int32.
        X_train = train.drop(TARGET_COLUMN, axis=1).astype('float32')
        y_train = train[TARGET_COLUMN].astype('int32')

        X_test = test.drop(TARGET_COLUMN, axis=1).astype('float32')
        y_test = test[TARGET_COLUMN].astype('int32')
示例#16
0
    # Directory the statistics file is written to (defaults to ./output).
    parser.add_argument('-o',
                        '--output_dir',
                        type=str,
                        required=False,
                        default='./output',
                        help='Path to a where stats must be saved')

    # File name of the statistics file inside the output directory.
    parser.add_argument('-f',
                        '--file_name',
                        type=str,
                        required=False,
                        default='stats.txt',
                        help='Name of the stats file')

    args = parser.parse_args()

    # TFDV doesn't support generating stats directly from parquet,
    # so read the data through the pandas parquet reader.
    # Ideally, this should be an accelerated parquet reader and the stats
    # computation should happen on the GPU.
    # NOTE(review): args.data_dir is declared above this fragment (not
    # visible here).
    df = pd.read_parquet(args.data_dir)

    stats = tfdv.generate_statistics_from_dataframe(df)

    # Create the output directory when it does not exist yet.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    output_path = os.path.join(args.output_dir, args.file_name)

    # Write the statistics through TFDV's text writer.
    tfdv.write_stats_text(stats, output_path=output_path)
示例#17
0
    def run(self, task, model, schema, num_corruptions, performance_threshold):
        """Check how well TFDV schema validation flags harmful corruptions.

        ``num_corruptions`` random corruptions are drawn (duplicates collapse
        because they are collected in a set). Each one is applied to a deep
        copy of the test data, the corrupted data is validated against
        ``schema`` with TFDV and scored with ``model``. A corruption has a
        negative impact when the relative score drop exceeds
        ``performance_threshold`` or when scoring fails.

        Args:
            task: Task exposing ``test_data``, ``numerical_columns``,
                ``categorical_columns``, ``text_columns`` and
                ``score_on_test_data``.
            model: Estimator exposing ``predict_proba``.
            schema: TFDV schema to validate the corrupted data against.
            num_corruptions: Number of random corruptions to draw.
            performance_threshold: Relative score drop above which a
                corruption counts as harmful.

        Returns:
            pandas DataFrame with one row per distinct corruption and the
            columns ``corruption``, ``status`` (``TP``/``FP``/``TN``/``FN``:
            anomaly reported vs. negative impact), ``anomalies``,
            ``baseline_score`` and ``corrupted_score`` (``None`` when scoring
            failed).

        Raises:
            AssertionError: When the schema already reports anomalies on the
                clean test data.
        """
        # Make sure the schema works on the clean test data
        assert (not self.has_anomaly(self.validate(schema, task.test_data)))

        baseline_predictions = model.predict_proba(task.test_data)
        baseline_score = task.score_on_test_data(baseline_predictions)

        # Per column type: the columns, the single-column corruptions that
        # apply to them, and the value used to mark an entry as missing.
        column_groups = {
            'numerical': (task.numerical_columns,
                          ['missing', 'noise', 'scaling'], np.nan),
            'categorical': (task.categorical_columns,
                            ['missing', 'encoding'], ''),
            'text': (task.text_columns, ['missing', 'encoding'], ''),
        }
        group_names = list(column_groups)

        # A column type is picked with probability proportional to its number
        # of columns; this does not change between draws, so compute it once.
        num_columns = sum(
            len(columns) for columns, _, _ in column_groups.values())
        group_probabilities = [
            float(len(column_groups[name][0])) / num_columns
            for name in group_names
        ]

        random_corruptions = set()

        for _ in range(num_corruptions):
            affected_column_type = np.random.choice(
                group_names, 1, p=group_probabilities)[0]
            fraction = float(np.random.randint(100)) / 100

            columns, corruption_types, na_value = column_groups[str(
                affected_column_type)]

            # With two or more columns of this type, swap values between two
            # of them in 10% of the draws.
            # NOTE(review): the two columns are sampled with replacement, so
            # the same column can be drawn twice -- confirm this is intended.
            if len(columns) >= 2 and np.random.uniform() < 0.1:
                first_column, second_column = np.random.choice(columns, 2)
                random_corruptions.add(
                    SwappedValues(first_column, second_column, fraction))
                continue

            corruption_type = np.random.choice(corruption_types)

            if corruption_type == 'missing':
                missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
                affected_column = np.random.choice(columns)
                random_corruptions.add(
                    MissingValues(affected_column,
                                  fraction,
                                  na_value=na_value,
                                  missingness=missingness))
            elif corruption_type == 'noise':
                affected_column = np.random.choice(columns)
                random_corruptions.add(
                    GaussianNoise(affected_column, fraction))
            elif corruption_type == 'scaling':
                affected_column = np.random.choice(columns)
                random_corruptions.add(Scaling(affected_column, fraction))
            elif corruption_type == 'encoding':
                affected_column = np.random.choice(columns)
                random_corruptions.add(
                    BrokenCharacters(affected_column, fraction))

        outcome = {
            'corruption': [],
            'status': [],
            'anomalies': [],
            'baseline_score': [],
            'corrupted_score': []
        }

        for corruption in random_corruptions:
            print(corruption)
            test_data_copy = task.test_data.copy(deep=True)
            corrupted_data = corruption.transform(test_data_copy)

            corrupted_data_stats = tfdv.generate_statistics_from_dataframe(
                corrupted_data)
            tfdv_anomalies = tfdv.validate_statistics(
                statistics=corrupted_data_stats, schema=schema)

            schema_anomalies = tfdv_anomalies.anomaly_info

            try:
                corrupted_predictions = model.predict_proba(corrupted_data)
                corrupted_score = task.score_on_test_data(
                    corrupted_predictions)

                performance_drop = (baseline_score -
                                    corrupted_score) / baseline_score

                has_negative_impact = performance_drop > performance_threshold
            except Exception:
                # A model that cannot score the corrupted data counts as
                # harmed. Only ordinary errors are absorbed here, so
                # KeyboardInterrupt / SystemExit still stop the run.
                corrupted_score = None
                has_negative_impact = True

            has_anomalies = len(schema_anomalies) != 0

            # Confusion-matrix style label: anomaly reported vs. real impact.
            if has_anomalies:
                status = 'TP' if has_negative_impact else 'FP'
            else:
                status = 'FN' if has_negative_impact else 'TN'

            outcome['corruption'].append(str(corruption))
            outcome['status'].append(status)
            outcome['anomalies'].append(str(schema_anomalies))
            outcome['baseline_score'].append(baseline_score)
            outcome['corrupted_score'].append(corrupted_score)

        return pd.DataFrame.from_dict(outcome)
示例#18
0
 def validate(self, schema, data):
     """Validate ``data`` against ``schema`` with TFDV.

     Args:
         schema: TFDV schema to validate against.
         data: DataFrame whose statistics are generated and checked.

     Returns:
         The result of ``tfdv.validate_statistics`` for the statistics
         generated from ``data``.
     """
     return tfdv.validate_statistics(
         statistics=tfdv.generate_statistics_from_dataframe(data),
         schema=schema)
示例#19
0
 def schema_from_train_data(self):
     """Infer a TFDV schema from the task's training data.

     Returns:
         The schema ``tfdv.infer_schema`` derives from statistics generated
         over ``self._task.train_data``.
     """
     statistics = tfdv.generate_statistics_from_dataframe(
         self._task.train_data)
     return tfdv.infer_schema(statistics=statistics)