Пример #1
0
def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    """Compare a treatment CSV against a control CSV for skew or drift.

    Both CSV files are read from ``gs://<gcp_bucket>/<path>``, TFDV statistics
    are generated for each, a schema is inferred from the control statistics,
    and an L-infinity comparator threshold is set on every listed feature
    before the statistics are validated.

    Args:
        mode: ``"skew"`` or ``"drift"``. Any other value logs a critical
            message and exits the process with status 1.
        gcp_bucket: Name of the bucket holding both CSV files.
        control_set_path: Path of the control CSV inside the bucket.
        treatment_set_path: Path of the treatment CSV inside the bucket.
        feature_list_str: String holding a Python literal list of feature
            names, e.g. ``"['a', 'b']"``.
        Linf_value: Threshold assigned to the L-infinity comparator of each
            listed feature.

    Returns:
        The anomalies message returned by ``tfdv.validate_statistics`` when
        its ``anomaly_info`` is non-empty, otherwise ``None``.

    Raises:
        SystemExit: On an unsupported ``mode``, or in skew mode when a listed
            feature has no domain in the inferred schema.
        ValueError, SyntaxError: When ``feature_list_str`` is not a valid
            Python literal.
    """
    # Imported locally so the function does not depend on a file-level import.
    import ast

    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:%s', mode)
    logging.info('gcp_bucket:%s', gcp_bucket)
    logging.info('control_set_path:%s', control_set_path)
    logging.info('treatment_set_path:%s', treatment_set_path)
    logging.info('Linf_value:%s', Linf_value)

    # Reject an unsupported mode before doing any expensive work. Checking it
    # only inside the feature loop would let a bad mode slip through whenever
    # the feature list is empty.
    if mode not in ("skew", "drift"):
        logging.critical("mode: {} not supported".format(mode))
        sys.exit(1)

    # SECURITY: this used to be eval(), which executes arbitrary code found in
    # the argument. literal_eval only accepts Python literals, which is all a
    # list of feature names needs.
    feature_list = ast.literal_eval(feature_list_str)

    control_set_df = pd.read_csv(
        "gs://{}/{}".format(gcp_bucket, control_set_path), sep=',')
    treat_set_df = pd.read_csv(
        "gs://{}/{}".format(gcp_bucket, treatment_set_path), sep=',')
    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    control_schema = tfdv.infer_schema(control_stats)

    for feature in feature_list:
        feature_schema = tfdv.get_feature(control_schema, feature)
        if mode == "skew":
            # A feature with a domain is treated as categorical; the
            # L-infinity skew comparator is only configured for those.
            if not feature_schema.domain:
                logging.critical(
                    "feature: {} is not categorical".format(feature))
                sys.exit(1)
            feature_schema.skew_comparator.infinity_norm.threshold = Linf_value
        else:
            feature_schema.drift_comparator.infinity_norm.threshold = Linf_value

    # TFDV evaluates skew comparators against `serving_statistics` and drift
    # comparators against `previous_statistics`, so the treatment statistics
    # must be passed through the argument that matches the mode. The control
    # statistics stay in the `statistics` slot in both modes so the schema
    # validation itself is unchanged.
    if mode == "skew":
        comparison = {"serving_statistics": treat_stats}
    else:
        comparison = {"previous_statistics": treat_stats}
    anomalies = tfdv.validate_statistics(statistics=control_stats,
                                         schema=control_schema,
                                         **comparison)

    if anomalies.anomaly_info:
        logging.info("Data-%s detected: %s", mode, anomalies)
        return anomalies

    logging.info("No data-%s detected", mode)
    return None
Пример #2
0
def tfdv_skew_validator(feature_name, train_stats, serve_stats, schema, threshold):
    """
    Run a TFDV skew check for one feature and display the result.

    Sets the L-infinity skew threshold of `feature_name` on `schema` (the
    schema object is modified), validates `train_stats` against that schema
    with `serve_stats` as the serving statistics, displays the anomalies and
    returns them.
    """
    # NOTE (original author): this doesn't display skew anomalies as the book shows
    feature = tfdv.get_feature(schema, feature_name)
    feature.skew_comparator.infinity_norm.threshold = threshold

    result = tfdv.validate_statistics(
        statistics=train_stats,
        schema=schema,
        serving_statistics=serve_stats,
    )
    tfdv.display_anomalies(result)
    return result
Пример #3
0
def tfdv_drift_validator(feature_name, train_stats, previous_stats, schema, threshold):
    """
    Run a TFDV drift check for one feature and display the result.

    Sets the L-infinity drift threshold of `feature_name` on `schema` (the
    schema object is modified), validates `train_stats` against that schema
    with `previous_stats` as the previous statistics, displays the anomalies
    and returns them.
    """
    # NOTE (original author): this doesn't display drift anomalies as the book shows
    feature = tfdv.get_feature(schema, feature_name)
    feature.drift_comparator.infinity_norm.threshold = threshold

    result = tfdv.validate_statistics(
        statistics=train_stats,
        schema=schema,
        previous_statistics=previous_stats,
    )
    tfdv.display_anomalies(result)
    return result
Пример #4
0
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):
  """Decide whether the `duration` feature drifted between two statistics files.

  Returns a one-element tuple holding the string 'true' or 'false'. 'true' is
  returned immediately when `stats_older_path` is the string 'none'.
  Otherwise a schema is inferred from the older statistics, a Jensen-Shannon
  drift threshold of 0.01 is set on `duration`, and the first drift
  measurement reported by TFDV is compared with its threshold: 'false' when
  the measured value is below the threshold, 'true' otherwise.
  """
  import logging
  import time

  import tensorflow_data_validation as tfdv
  import tensorflow_data_validation.statistics.stats_impl

  root_logger = logging.getLogger()
  root_logger.setLevel(logging.INFO)
  logging.info('stats_older_path: %s', stats_older_path)
  logging.info('stats_new_path: %s', stats_new_path)

  # No older statistics to compare with: report drift unconditionally.
  if stats_older_path == 'none':
    return ('true', )

  older_stats = tfdv.load_statistics(stats_older_path)
  newer_stats = tfdv.load_statistics(stats_new_path)

  baseline_schema = tfdv.infer_schema(statistics=older_stats)
  duration = tfdv.get_feature(baseline_schema, 'duration')
  duration.drift_comparator.jensen_shannon_divergence.threshold = 0.01

  drift_anomalies = tfdv.validate_statistics(
      statistics=newer_stats,
      schema=baseline_schema,
      previous_statistics=older_stats)
  logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

  from google.protobuf.json_format import MessageToDict
  as_dict = MessageToDict(drift_anomalies)
  # Only the first measurement of the first drift/skew entry is inspected.
  measurement = as_dict['driftSkewInfo'][0]['driftMeasurements'][0]
  measured = measurement['value']
  limit = measurement['threshold']
  logging.info('value %s and threshold %s', measured, limit)

  decision = 'false' if measured < limit else 'true'
  logging.info('train decision: %s', decision)
  return (decision, )
# Compare serving data with training data: render a statistics view with the
# serving statistics on the left-hand side and the training statistics on the
# right-hand side.
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------

# Validate the serving statistics against the schema and show the anomalies.
anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema)
tfdv.display_anomalies(anomalies)

# COMMAND ----------

# Add skew and drift comparators.
# Both features get a Jensen-Shannon divergence threshold of 0 on their skew
# and drift comparators.
temp_f = tfdv.get_feature(schema, 'avg_temp_f')
temp_f.skew_comparator.jensen_shannon_divergence.threshold = 0
temp_f.drift_comparator.jensen_shannon_divergence.threshold = 0

precip_mm = tfdv.get_feature(schema, 'tot_precip_mm')
precip_mm.skew_comparator.jensen_shannon_divergence.threshold = 0
precip_mm.drift_comparator.jensen_shannon_divergence.threshold = 0

# Validate the training statistics against the schema, with the serving
# statistics supplied for comparison.
# NOTE(review): only `serving_statistics` is passed here; the drift comparators
# set above presumably need `previous_statistics` to take effect - confirm.
_anomalies = tfdv.validate_statistics(stats_train,
                                      schema,
                                      serving_statistics=stats_serve)

tfdv.display_anomalies(_anomalies)

# COMMAND ----------
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):
    """Validate the taxi-cab evaluation CSV against a schema inferred from the training CSV.

    The step reads the header row of the training CSV into `column_names`,
    generates TFDV statistics for the training and evaluation CSVs, infers a
    schema from the training statistics, logs every anomaly found in the
    evaluation statistics, relaxes two schema constraints, re-validates, and
    finally stores `column_names` and `schema` under the Kale marshal
    directory.

    None of the hyper-parameter arguments is used inside this step.
    """
    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    # exist_ok=True already makes this a no-op when the directory is present.
    os.makedirs(_kale_data_directory, exist_ok=True)

    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io

    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = [
        'trip_start_hour',
        'trip_start_day',
        'trip_start_month',
    ]

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform.
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = [
        'pickup_census_tract',
        'dropoff_census_tract',
        'payment_type',
        'company',
        'pickup_community_area',
        'dropoff_community_area',
    ]

    # NaN values are allowed in these features.
    OPTIONAL_FEATURES = [
        'dropoff_latitude',
        'dropoff_longitude',
        'pickup_census_tract',
        'dropoff_census_tract',
        'company',
        'trip_seconds',
        'dropoff_community_area',
    ]

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # Read the first line of the CSV to get an ordered list of column names
    # (the schema does not keep the features in file order).
    # `column_names` and `schema` must keep these exact names: the saving code
    # at the end of the step looks them up by name in locals().
    with open(TRAIN_DATA) as train_file:
        header_line = train_file.readline()
    column_names = header_line.strip().split(',')

    train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(train_stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    eval_anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log every anomaly found in the evaluation statistics.
    root_logger = logging.getLogger()
    for feature_name, anomaly_info in eval_anomalies.anomaly_info.items():
        root_logger.error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # Show the inferred schema.
    tfdv.display_schema(schema=schema)

    # Resolve anomalies: lower the required domain mass for `company` ...
    company_feature = tfdv.get_feature(schema, 'company')
    company_feature.distribution_constraints.min_domain_mass = 0.9

    # ... and add a new value to the domain of `payment_type`.
    payment_domain = tfdv.get_domain(schema, 'payment_type')
    payment_domain.value.append('Prcard')

    # Validate the evaluation statistics again with the updated schema.
    revalidated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(revalidated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" not in locals():
        print("_kale_resource_save: `column_names` not found.")
    else:
        _kale_resource_save(
            column_names,
            os.path.join(_kale_data_directory, "column_names"))

    if "schema" not in locals():
        print("_kale_resource_save: `schema` not found.")
    else:
        _kale_resource_save(
            schema,
            os.path.join(_kale_data_directory, "schema"))
Пример #7
0
    # Fragment of a larger function: `parser`, `tfdv` and `MessageToDict` are
    # defined outside the visible lines.
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s - %(message)s',
                        level=logging.INFO,
                        datefmt='%d-%m-%y %H:%M:%S')
    logging.info(f"Args: {args}")

    # Load the two statistics files given on the command line.
    stats1 = tfdv.load_stats_text(input_path=args.stats_file_1)
    stats2 = tfdv.load_stats_text(input_path=args.stats_file_2)

    # The schema is inferred from the first (reference) statistics only.
    schema1 = tfdv.infer_schema(statistics=stats1)

    # Custom rules, tweak this as required. This is just an example:
    # a Jensen-Shannon drift threshold of 0.06 on feature 'I1'.
    tfdv.get_feature(
        schema1,
        'I1').drift_comparator.jensen_shannon_divergence.threshold = 0.06

    # Calculate drift between the reference stats stats1, and the statistics from new data in stats2
    drift_anomalies = tfdv.validate_statistics(statistics=stats2,
                                               schema=schema1,
                                               previous_statistics=stats1)

    # Convert the anomalies protobuf message to a dict
    drift = MessageToDict(drift_anomalies)

    # Only the first measurement of the first drift/skew entry is read.
    # NOTE(review): these lookups raise if the result has no drift/skew
    # entries - confirm that cannot happen for the inputs used here.
    value = drift['driftSkewInfo'][0]['driftMeasurements'][0]['value']
    threshold = drift['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']
    logging.info(
        f"JS divergence value: {value}, and JS divergence threshold: {threshold}"
    )
Пример #8
0
# Display all anomalies detected in `anomalies`.
# Kinds of anomaly noted by the original author:
#   - Integer larger than 10
#   - STRING type when expected INT type
#   - FLOAT type when expected INT type
#   - Integer smaller than 0
# display_anomalies renders its output itself and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None".
tfdv.display_anomalies(anomalies)

# New data WITH anomalies: a copy of the test set without the "soot" column.
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)

# Statistics based on the data with anomalies.
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)
tfdv.display_anomalies(anomalies_new)

# Prepare the schema for the serving environment: declare both environments,
# then mark "soot" as absent from SERVING.
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")

tfdv.get_feature(schema, "soot").not_in_environment.append("SERVING")

# Validate the soot-less statistics again, this time in the SERVING environment.
serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats,
                                                 schema,
                                                 environment="SERVING")
tfdv.display_anomalies(serving_env_anomalies)

# Freeze the schema by writing it to a text file.
tfdv.write_schema_text(schema=schema, output_path='pollution_schema.pbtext')
Пример #9
0
# Detect anomalies in the new data. The original ran this identical
# validation twice in a row; once is enough.
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display all detected anomalies.
tfdv.display_anomalies(anomalies)

# New data with anomalies: a copy of the test set without the 'soot' column.
test_set_copy = test_set.copy()

test_set_copy.drop('soot', axis=1, inplace=True)

# Statistics based on the data with anomalies.
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set_copy)

anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats, schema=schema)

tfdv.display_anomalies(anomalies_new)

# Stage 5: prepare the schema for serving.
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

# Mark the target column as absent from the SERVING environment.
tfdv.get_feature(schema, 'soot').not_in_environment.append('SERVING')

# Check the new test set for anomalies in the SERVING environment.
serving_env_anomalies = tfdv.validate_statistics(test_set_copy_stats, schema, environment='SERVING')

tfdv.display_anomalies(serving_env_anomalies)
Пример #10
0
# Per-dataset limits for the 'exploratory' method:
# dataset key -> (distinct-count limit, completeness limit).
_EXPLORATORY_LIMITS = {
    'flight': (100, 0.2),
    'fb': (30, 0.25),
}


def _is_acceptable_baseline(train_batch, test_batch):
    """Accept when the test batch's percent-null value lies in the baseline range."""
    percent_null_range = determine_baseline_range(train_batch)
    test_batch_percent_null, __ = percent_null_dataframes(
        test_batch, test_batch)
    observed = test_batch_percent_null.values[0][0]
    lower = percent_null_range['min'].values[0]
    upper = percent_null_range['max'].values[0]
    return bool(lower <= observed <= upper)


def _is_acceptable_original(train_batch, test_batch):
    """Accept when more than 80% of rows fall inside both metric ranges."""
    completeness_range, distinct_range = determine_acceptable_metric_range(
        train_batch)
    test_batch_completeness, __ = completeness_dataframes(
        test_batch, test_batch)
    test_batch_distinct, __ = distinct_counts_dataframes(
        test_batch, test_batch)

    # Positional access through .values, as the baseline check does. The
    # previous `range[i:i + 1]['min'][0]` form is a label lookup, which fails
    # for i >= 1 when the frame has a default integer index.
    # NOTE(review): assumes both ranges are DataFrames with 'min'/'max'
    # columns, like the baseline range - confirm against the helper.
    distinct_min = distinct_range['min'].values
    distinct_max = distinct_range['max'].values
    completeness_min = completeness_range['min'].values
    completeness_max = completeness_range['max'].values
    completeness_values = test_batch_completeness.values

    completeness_within_range = 0
    distinct_within_range = 0
    for i, distinct_row in enumerate(test_batch_distinct.values):
        if distinct_min[i] <= distinct_row[0] <= distinct_max[i]:
            distinct_within_range += 1
        if completeness_min[i] <= completeness_values[i][0] <= completeness_max[i]:
            completeness_within_range += 1

    # Both ratios use the completeness row count as denominator.
    total = test_batch_completeness.shape[0]
    return bool(completeness_within_range / total > .8
                and distinct_within_range / total > .8)


def _is_acceptable_tfdv(train_batch, test_batch):
    """Accept when relaxing schema constraints lowers the TFDV anomaly count."""
    test_file = test_batch[0]
    # What has to be done with TFDV depends on the kind of data, which is read
    # off the name of the first test file.
    flights = 'FLIGHTS' in test_file
    fb = 'FBPosts' in test_file
    dirty = 'dirty' in test_file

    # Build the schema from all files in the train batch.
    updated_schema = acceptable_schema(train_batch)

    # Two environments, so a column can be declared absent from one of them.
    updated_schema.default_environment.append('TRAINING')
    updated_schema.default_environment.append('TESTING')

    # Dirty flights files lack the 'for_key' column, so they are validated in
    # the TESTING environment with that feature excluded. Everything else
    # (clean files, and dirty facebook files, which have no extra columns) is
    # validated in the default environment.
    anomaly_kwargs = {}
    if dirty and flights:
        tfdv.get_feature(updated_schema,
                         'for_key').not_in_environment.append('TESTING')
        anomaly_kwargs['environment'] = 'TESTING'

    # Anomaly count before relaxing any constraint.
    num_anomalies = get_num_anomalies(test_file, updated_schema,
                                      **anomaly_kwargs)

    # Relax the domain-mass constraints for each kind of data.
    if flights:
        tfdv.get_feature(
            updated_schema,
            'ArrivalGate').distribution_constraints.min_domain_mass = 0.8
        tfdv.get_feature(
            updated_schema,
            'DepartureGate').distribution_constraints.min_domain_mass = 0.8
    if fb:
        tfdv.get_feature(
            updated_schema,
            'outlet').distribution_constraints.min_domain_mass = 0.65
        tfdv.get_feature(
            updated_schema,
            'domain').distribution_constraints.min_domain_mass = 0.6

    # Anomaly count after relaxing the constraints.
    adjusted_num_anomalies = get_num_anomalies(test_file, updated_schema,
                                               **anomaly_kwargs)

    # "Acceptable" here means the relaxed constraints had an effect, i.e. they
    # removed at least one anomaly.
    return bool(adjusted_num_anomalies < num_anomalies)


def _is_acceptable_exploratory(test_batch):
    """Accept when no row exceeds both per-dataset limits at once."""
    test_file = test_batch[0]
    # A name matching both markers is treated as facebook data, as before.
    if 'FBPosts' in test_file:
        data = 'fb'
    elif 'FLIGHTS' in test_file:
        data = 'flight'
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(
            "cannot tell the dataset type from {!r}: expected 'FLIGHTS' or "
            "'FBPosts' in the file name".format(test_file))
    distinct_limit, completeness_limit = _EXPLORATORY_LIMITS[data]

    test_batch_completeness, __ = completeness_dataframes(
        test_batch, test_batch)
    test_batch_distinct, __ = distinct_counts_dataframes(
        test_batch, test_batch)

    completeness_values = test_batch_completeness.values
    for i, distinct_row in enumerate(test_batch_distinct.values):
        # A row is rejected only when both values exceed their limit.
        if (distinct_row[0] > distinct_limit
                and completeness_values[i][0] > completeness_limit):
            return False
    return True


def is_acceptable(train_batch, test_batch, method):
    """Decide whether `test_batch` is acceptable, using the given method.

    Args:
        train_batch: Training batch handed to the range/schema helpers. Not
            used by the 'exploratory' method.
        test_batch: Test batch. The 'tfdv' and 'exploratory' methods read
            `test_batch[0]` as a file name to detect the dataset type.
        method: One of 'baseline', 'original', 'tfdv' or 'exploratory'.

    Returns:
        True or False for a known method; None for an unknown method (kept
        for backward compatibility with the earlier fall-through).

    Raises:
        ValueError: With method 'exploratory', when the first test file name
            contains neither 'FLIGHTS' nor 'FBPosts'.
    """
    if method == 'baseline':
        return _is_acceptable_baseline(train_batch, test_batch)
    if method == 'original':
        return _is_acceptable_original(train_batch, test_batch)
    if method == 'tfdv':
        return _is_acceptable_tfdv(train_batch, test_batch)
    if method == 'exploratory':
        return _is_acceptable_exploratory(test_batch)
    return None
# Validate the test-set statistics against the schema.
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display the detected anomalies.
tfdv.display_anomalies(anomalies)

"""Three features have inconsistent values. Two of them have the wrong type (net_manager, type_conn_perc). The third, purchase_area, contains values outside the domain in the training schema, because they correspond to a different region of the Netherlands.

### Fixing the errors

The corrections depend on our knowledge of the data: we look at the description of each feature and decide what action to take. **net_manager** is the code of the regional network manager, **type_conn_perc** is the percentage of presence of the principal type of connection in the zipcode range, and **purchase_area** is the code of the area where the energy is purchased.

For type_conn_perc, let's convert it to BYTES, and let's append the missing values to the purchase_area domain.
"""

# Convert type_conn_perc to BYTES (feature type value 1 represents BYTES).
type_conn_perc = tfdv.get_feature(schema, 'type_conn_perc')
type_conn_perc.type = 1

# Add the missing values to the domain of feature purchase_area.
purchase_area = tfdv.get_domain(schema, 'purchase_area')
purchase_area.value.extend([
    'Stedin',
    'Stedin Delfland',
    'Stedin Elektriciteit Zuid-Kennemerland',
    'Stedin Midden-Holland',
    'Stedin Schiedam',
    'Stedin Utrecht',
    'Stedin Weert',
])

# Validate the test statistics again after updating the schema.
updated_anomalies = tfdv.validate_statistics(statistics=test_stats,
                                             schema=schema)
tfdv.display_anomalies(updated_anomalies)