Example #1
    def Do(self, input_dict, output_dict, exec_properties):
        """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        tf.logging.info('Validating schema against the computed statistics.')
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                types.get_single_uri(input_dict['schema'])))
        stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                types.get_split_uri(input_dict['stats'], 'eval')))
        output_uri = types.get_single_uri(output_dict['output'])
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(os.path.join(output_uri, DEFAULT_FILE_NAME),
                                  anomalies)
        tf.logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #2
def csv_statistics_validator(stats, schema):
    """
    Validate statistics from a csv dataset
    """
    stats_anomalies = tfdv.validate_statistics(statistics = stats, schema = schema)
    tfdv.display_anomalies(stats_anomalies)
    return stats_anomalies
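A minimal driver for this helper, assuming a local CSV at a hypothetical path data/train.csv:

import tensorflow_data_validation as tfdv

# Hypothetical input path; any CSV readable by TFDV works here.
stats = tfdv.generate_statistics_from_csv(data_location='data/train.csv')
schema = tfdv.infer_schema(statistics=stats)
anomalies = csv_statistics_validator(stats, schema)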
Example #3
def tfdv_statistics_anomalies(
        source_statistics: DatasetFeatureStatisticsList,
        input_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Compare two TFDV statistics and return anomalies.
    Args:
        source_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: source statistics
        input_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: input statistics
    Returns:
        Tuple[bool, Dict]:
            True if anomalies are detected, otherwise False,
            dictionary with structure:
            {
                <column_name>: {
                    'description': <description>,
                    'severity': 'ERROR',
                    'shortDescription': <short description>,
                    'reason': [{'type': <error_type>,
                                'shortDescription': <short description>,
                                'description': <description>}],
                    'path': {'step': [<column_name>]}
                }
            }
    """

    schema = tfdv.infer_schema(source_statistics)
    anomalies = tfdv_object_to_dict(
        tfdv.validate_statistics(statistics=input_statistics, schema=schema))
    tfdv_anomalies = anomalies.get('anomalyInfo', {})

    return len(tfdv_anomalies) > 0, tfdv_anomalies
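The helper tfdv_object_to_dict is not defined in this snippet. Since the result is queried with the camelCase key 'anomalyInfo', it is plausibly a thin wrapper around protobuf's JSON conversion; a sketch under that assumption:

from google.protobuf.json_format import MessageToDict

def tfdv_object_to_dict(proto_message):
    # Convert a TFDV proto (e.g. Anomalies) into a plain dict whose keys
    # are camelCase, such as 'anomalyInfo'.
    return MessageToDict(proto_message)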
Example #4
File: executor.py  Project: zvrr/tfx
    def _Validate(self, inputs: Dict[Text, Any],
                  outputs: Dict[Text, Any]) -> None:
        """Validate the inputs and put the validation result into outputs.

        This is the implementation part of the example validator executor. It is
        intended for using or extending the executor without artifact dependency.

        Args:
          inputs: A dictionary of labeled input values, including:
            - labels.STATS: the feature statistics to validate
            - labels.SCHEMA: the schema to respect
            - (Optional) labels.ENVIRONMENT: if an environment is specified, only
              validate the feature statistics of the fields in that environment.
              Otherwise, validate all fields.
            - (Optional) labels.PREV_SPAN_FEATURE_STATISTICS: the feature
              statistics of a previous span.
            - (Optional) labels.PREV_VERSION_FEATURE_STATISTICS: the feature
              statistics of a previous version.
            - (Optional) labels.FEATURES_NEEDED: the features that need to be
              validated.
            - (Optional) labels.VALIDATION_CONFIG: the configuration of this
              validation.
            - (Optional) labels.EXTERNAL_CONFIG_VERSION: the version number of
              the external config file.
          outputs: A dictionary of labeled output values, including:
            - labels.SCHEMA_DIFF_PATH: the path to write the schema diff to
        """
        schema = value_utils.GetSoleValue(inputs, labels.SCHEMA)
        stats = value_utils.GetSoleValue(inputs, labels.STATS)
        schema_diff_path = value_utils.GetSoleValue(outputs,
                                                    labels.SCHEMA_DIFF_PATH)
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(
            os.path.join(schema_diff_path, DEFAULT_FILE_NAME), anomalies)
Example #5
def detect_anomalies(stats_uri: Text, schema_uri: Text, split_name: Text):
    schema = get_schema_proto(schema_uri)
    stats = get_statistics_dataset_dict(stats_uri)
    if split_name not in stats:
        raise ValueError(f'{split_name} split not present!')
    anomalies = tfdv.validate_statistics(stats[split_name], schema)
    tfdv.display_anomalies(anomalies)
Example #6
    def validate_stats_against_schema(self):  # type: () -> bool
        stats = tfdv.load_statistics(self.stats_path)
        self.anomalies = tfdv.validate_statistics(stats, self.schema)
        if len(self.anomalies.anomaly_info.items()) > 0:
            logger.error("Anomalies found in training dataset...")
            logger.error(str(self.anomalies.anomaly_info.items()))
            self.upload_anomalies()
            return False
        else:
            logger.info("No anomalies found")
            return True
Example #7
def tfdv_skew_validator(feature_name, train_stats, serve_stats, schema, threshold):
    """
    Validate skew for the csv dataset
    """
    #this doesn't display skew anomalies as the book shows
    tfdv.get_feature(schema, feature_name).skew_comparator.infinity_norm.threshold = threshold
    skew_anomalies = tfdv.validate_statistics(statistics = train_stats,
                                                schema = schema,
                                                serving_statistics = serve_stats)
    tfdv.display_anomalies(skew_anomalies)
    return skew_anomalies
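Even when nothing is displayed, the measured skew is recorded on the returned Anomalies proto under drift_skew_info. A hedged usage sketch (the feature name and threshold here are hypothetical):

skew_anomalies = tfdv_skew_validator('payment_type', train_stats, serve_stats,
                                     schema, 0.01)
# Each entry carries the per-feature skew measurements, whether or not
# the threshold was crossed.
for info in skew_anomalies.drift_skew_info:
    print(info.path.step, info.skew_measurements)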
Example #8
def data_validation(data_path):
    train = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'train.csv'), delimiter=',')
    test = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'test.csv'), delimiter=',')
    schema = tfdv.infer_schema(train)
    # print(schema)
    # tfdv.display_schema(schema)
    anomalies = tfdv.validate_statistics(statistics=test, schema=schema)
    # print(anomalies)
    # tfdv.display_anomalies(anomalies)
    print(text_format.MessageToString(anomalies))
Example #9
def tfdv_drift_validator(feature_name, train_stats, previous_stats, schema, threshold):
    """
    Validate drift for the csv dataset
    """
    #this doesn't display drift anomalies as the book shows
    tfdv.get_feature(schema, feature_name).drift_comparator.infinity_norm.threshold = threshold
    drift_anomalies = tfdv.validate_statistics(statistics=train_stats, 
                                                schema=schema, 
                                                previous_statistics= previous_stats
                                                )
    tfdv.display_anomalies(drift_anomalies)
    return drift_anomalies
Example #10
def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:{}'.format(mode))
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('control_set_path:{}'.format(control_set_path))
    logging.info('treatment_set_path:{}'.format(treatment_set_path))
    logging.info('Linf_value:{}'.format(Linf_value))

    feature_list = ast.literal_eval(feature_list_str)  # safer than eval() for parsing a list literal
    control_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + control_set_path,
                                 sep=',')
    treat_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + treatment_set_path,
                               sep=',')
    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    control_schema = tfdv.infer_schema(control_stats)
    treat_schema = tfdv.infer_schema(treat_stats)

    for feature in feature_list:
        if mode == "skew":
            # A domain indicates a categorical variable.
            if tfdv.get_feature(control_schema, feature).domain:
                tfdv.get_feature(
                    control_schema, feature
                ).skew_comparator.infinity_norm.threshold = Linf_value
            else:
                logging.critical(
                    "feature: {} is not categorical".format(feature))
                sys.exit(1)
        elif mode == "drift":
            tfdv.get_feature(
                control_schema,
                feature).drift_comparator.infinity_norm.threshold = Linf_value
        else:
            logging.critical("mode: {} not supported".format(mode))
            sys.exit(1)
    anomalies = tfdv.validate_statistics(statistics=control_stats,
                                         schema=control_schema,
                                         serving_statistics=treat_stats)
    if anomalies.anomaly_info:
        logging.info("Data-{} detected:".format(mode))
        return anomalies
    else:
        logging.info("No data-{} detected".format(mode))
Example #11
def get_num_anomalies(csv_file, schema, environment='TRAINING'):
    # Get column names from the passed-in schema.
    cols = sorted(f.name for f in schema.feature)

    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    data_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                   column_names=cols,
                                                   stats_options=options)

    # Check eval data for errors by validating the eval data stats using the previously inferred schema
    anomalies = tfdv.validate_statistics(statistics=data_stats,
                                         schema=schema,
                                         environment=environment)

    #tfdv.display_anomalies(anomalies)

    return len(anomalies.anomaly_info)
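The environment argument only changes the result if the schema declares environments. A sketch of the usual setup, assuming a hypothetical label column named 'label' and a hypothetical serving CSV:

# Declare the environments the schema distinguishes.
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

# The label is absent from serving data, so exclude it from the SERVING
# environment; its absence there is then not flagged as an anomaly.
tfdv.get_feature(schema, 'label').not_in_environment.append('SERVING')

num_serving_anomalies = get_num_anomalies('serving.csv', schema,
                                          environment='SERVING')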
Example #12
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

    Args:
      stats_path: Location of the stats used to infer the schema.
      schema_path: Location of the schema to be used for validation.
      anomalies_path: Location where the detected anomalies are materialized.
    """
    # Validating schema against the computed statistics
    schema = my_metadata.read_schema(schema_path)

    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)

    # Write the anomalies to the anomalies path.
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))
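The materialized file is a text-format proto, so it can be parsed back into an Anomalies message later; a sketch assuming the anomalies_pb2 module from tensorflow_metadata:

from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import anomalies_pb2

def read_anomalies(anomalies_path):
    # Parse the materialized text-format proto back into an Anomalies message.
    with open(anomalies_path) as f:
        return text_format.Parse(f.read(), anomalies_pb2.Anomalies())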
Example #13
    def test_e2e(self, stats_options, expected_stats_pbtxt,
                 expected_inferred_schema_pbtxt, schema_for_validation_pbtxt,
                 expected_anomalies_pbtxt, expected_updated_schema_pbtxt):
        tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
            self._input_file, ['tfdv', 'test'])
        stats_file = os.path.join(self._output_dir, 'stats')
        with beam.Pipeline() as p:
            _ = (p
                 | 'TFXIORead' >> tfxio.BeamSource()
                 | 'GenerateStats' >> tfdv.GenerateStatistics(stats_options)
                 | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_file))

        actual_stats = tfdv.load_statistics(stats_file)
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self,
            text_format.Parse(
                expected_stats_pbtxt,
                statistics_pb2.DatasetFeatureStatisticsList()))([actual_stats])
        actual_inferred_schema = tfdv.infer_schema(actual_stats,
                                                   infer_feature_shape=True)

        if hasattr(actual_inferred_schema, 'generate_legacy_feature_spec'):
            actual_inferred_schema.ClearField('generate_legacy_feature_spec')
        self._assert_schema_equal(
            actual_inferred_schema,
            text_format.Parse(expected_inferred_schema_pbtxt,
                              schema_pb2.Schema()))

        schema_for_validation = text_format.Parse(schema_for_validation_pbtxt,
                                                  schema_pb2.Schema())
        actual_anomalies = tfdv.validate_statistics(actual_stats,
                                                    schema_for_validation)
        actual_anomalies.ClearField('baseline')
        self.assertEqual(
            actual_anomalies,
            text_format.Parse(expected_anomalies_pbtxt,
                              anomalies_pb2.Anomalies()))

        actual_updated_schema = tfdv.update_schema(schema_for_validation,
                                                   actual_stats,
                                                   infer_feature_shape=False)
        self._assert_schema_equal(
            actual_updated_schema,
            text_format.Parse(expected_updated_schema_pbtxt,
                              schema_pb2.Schema()))
Example #14
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

    Args:
      stats_path: Location of the stats used to infer the schema.
      schema_path: Location of the schema to be used for validation.
      anomalies_path: Location where the detected anomalies are materialized.
    """
    print('Validating schema against the computed statistics.')
    schema = tfdv.load_schema_text(schema_path)
    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)
    print('Detected following anomalies:')
    print(text_format.MessageToString(anomalies))

    print('Writing anomalies to anomalies path.')
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))
Example #15
def validate_stats(stats_path, schema_path, anomalies_path):
  """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
  print('Validating schema against the computed statistics.')
  schema = taxi.read_schema(schema_path)

  stats = tfdv.load_statistics(stats_path)
  anomalies = tfdv.validate_statistics(stats, schema)
  print('Detected following anomalies:')
  print(text_format.MessageToString(anomalies))

  print('Writing anomalies to anomalies path.')
  file_io.write_string_to_file(anomalies_path,
                               text_format.MessageToString(anomalies))
Example #16
    def evaluate(self, model: BaseEstimator, num_repetitions: int,
                 *corruptions: DataCorruption):

        schema = self.schema_from_train_data()

        baseline_predictions = model.predict_proba(self._task.test_data)
        baseline_score = self._task.score_on_test_data(baseline_predictions)

        results = []

        # Repeatedly corrupt the test data
        for corruption in corruptions:
            corrupted_scores = []
            anomalies = []
            for _ in range(0, num_repetitions):
                test_data_copy = self._task.test_data.copy(deep=True)
                corrupted_data = corruption.transform(test_data_copy)

                # Determine whether tfdv finds anomalies in the data
                corrupted_data_stats = tfdv.generate_statistics_from_dataframe(
                    corrupted_data)
                tfdv_anomalies = tfdv.validate_statistics(
                    statistics=corrupted_data_stats, schema=schema)

                schema_anomalies = tfdv_anomalies.anomaly_info

                # Compute the prediction score on the test data
                corrupted_predictions = model.predict_proba(corrupted_data)
                corrupted_score = self._task.score_on_test_data(
                    corrupted_predictions)

                anomalies.append(schema_anomalies)
                corrupted_scores.append(corrupted_score)

            results.append(
                SchemaValidationResult(corruption, anomalies, baseline_score,
                                       corrupted_scores))

        return results
Example #17
    def validate_stats_against_schema(self,
                                      environment=None,
                                      previous_statistics=None,
                                      serving_statistics=None,
                                      ):
        # type: (str, DatasetFeatureStatisticsList, DatasetFeatureStatisticsList) -> bool
        stats = tfdv.load_statistics(self.stats_path)
        self.anomalies = tfdv.validate_statistics(
            stats,
            self.schema,
            environment=environment,
            previous_statistics=previous_statistics,
            serving_statistics=serving_statistics,
        )
        if len(self.anomalies.anomaly_info.items()) > 0:
            logger.error("Anomalies found in training dataset...")
            logger.error(str(self.anomalies.anomaly_info.items()))
            self.upload_anomalies()
            return False
        else:
            logger.info("No anomalies found")
            return True
Example #18
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):

  import logging
  import time

  import tensorflow_data_validation as tfdv
  import tensorflow_data_validation.statistics.stats_impl

  logging.getLogger().setLevel(logging.INFO)
  logging.info('stats_older_path: %s', stats_older_path)
  logging.info('stats_new_path: %s', stats_new_path)

  if stats_older_path == 'none':
    return ('true', )

  stats1 = tfdv.load_statistics(stats_older_path)
  stats2 = tfdv.load_statistics(stats_new_path)

  schema1 = tfdv.infer_schema(statistics=stats1)
  tfdv.get_feature(schema1, 'duration').drift_comparator.jensen_shannon_divergence.threshold = 0.01
  drift_anomalies = tfdv.validate_statistics(
      statistics=stats2, schema=schema1, previous_statistics=stats1)
  logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

  from google.protobuf.json_format import MessageToDict
  d = MessageToDict(drift_anomalies)
  val = d['driftSkewInfo'][0]['driftMeasurements'][0]['value']
  thresh = d['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']
  logging.info('value %s and threshold %s', val, thresh)
  res = 'true'
  if val < thresh:
    res = 'false'
  logging.info('train decision: %s', res)
  return (res, )
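A hedged local invocation of this component function (the statistics URIs are hypothetical); the one-element tuple unpacks into the drift decision string:

drift, = tfdv_detect_drift(stats_older_path='gs://bucket/stats/day1',
                           stats_new_path='gs://bucket/stats/day2')
print('retrain:', drift)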
Example #19
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    tf.logging.info('Validating schema against the computed statistics.')
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema'])))
    stats = tfdv.load_statistics(
        io_utils.get_only_uri_in_dir(
            types.get_split_uri(input_dict['stats'], 'eval')))
    output_uri = types.get_single_uri(output_dict['output'])
    anomalies = tfdv.validate_statistics(stats, schema)
    io_utils.write_pbtxt_file(
        os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
    tf.logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #20
# %%
# Compute stats for evaluation data
test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test)

# Compare evaluation data with training data
tfdv.visualize_statistics(lhs_statistics=test_stats, rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')

# %%
'''
## Evaluate for anomalies related to the schema
'''

# %%
anomalies = tfdv.validate_statistics(statistics=train_stats, schema=schema)
tfdv.display_anomalies(anomalies)

# %%
## Check that the data is compliant with the schema of the train split

# %%
data_stats = tfdv.generate_statistics_from_dataframe(data)
serving_anomalies = tfdv.validate_statistics(data_stats, schema)
tfdv.display_anomalies(serving_anomalies)

Example #21
    logging.basicConfig(level=logging.INFO,
                        datefmt='%d-%m-%y %H:%M:%S')
    logging.info(f"Args: {args}")

    stats1 = tfdv.load_stats_text(input_path=args.stats_file_1)
    stats2 = tfdv.load_stats_text(input_path=args.stats_file_2)

    schema1 = tfdv.infer_schema(statistics=stats1)

    # Custom rules, tweak this as required. This is just an example
    tfdv.get_feature(
        schema1,
        'I1').drift_comparator.jensen_shannon_divergence.threshold = 0.06

    # Calculate drift between the reference stats stats1, and the statistics from new data in stats2
    drift_anomalies = tfdv.validate_statistics(statistics=stats2,
                                               schema=schema1,
                                               previous_statistics=stats1)

    # Convert the .pb2 to dict
    drift = MessageToDict(drift_anomalies)

    value = drift['driftSkewInfo'][0]['driftMeasurements'][0]['value']
    threshold = drift['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']
    logging.info(
        f"JS divergence value: {value}, and JS divergence threshold: {threshold}"
    )
    drift_detected = True
    if value < threshold:
        drift_detected = False
    logging.info(f"Drift detected: {drift_detected}")
Example #22
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate):
    """Writes a TFDV-generated schema.
    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
          the first line is treated as the column names.
      key_columns: list of the names for columns that should be
          treated as unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate
          against the schema.
    """
    logging.getLogger().info('running in local mode')
    pipeline_options = None

    logging.getLogger().info('starting stats on tfdv')
    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)

    logging.getLogger().info('writing output_schema.pb2')
    with open('{}/output_schema.pb2'.format(output_dir), 'wb') as f:
        f.write(schema.SerializeToString())

    logging.getLogger().info('writing schema.pb2 to [output_dir] {}'.format(output_dir))
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'wb') as f:
        logging.getLogger().info('writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())

    schema_json = convert_schema_proto_to_json(
        schema, column_names, key_columns)

    logging.getLogger().info('writing output_schema.json')
    with open('{}/output_schema.json'.format(output_dir), 'w+') as f:
        json.dump(schema_json, f)
    schema_json_file = os.path.join(output_dir, 'schema.json')

    with file_io.FileIO(schema_json_file, 'w+') as f:
        logging.getLogger().info('writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    with open('{}/schema.txt'.format(output_dir), 'w+') as f:
        logging.getLogger().info('writing schema.txt to {}'.format(f.name))
        f.write(schema_json_file)

    logging.getLogger().info('Schema Write Done...')

    if not csv_data_file_to_validate:
        logging.getLogger().info('No csv file to validate')
        return

    logging.getLogger().info('Validation Stats...')
    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)

    logging.getLogger().info('writing output validation results ...')
    with open('{}/output_validation_result.txt'.format(output_dir), 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    # logging.getLogger().info('logging anomalies result ...')
    # with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
    #     logging.getLogger().info('logging anomalies to {}'.format(f.name))
    #     f.write(anomalies.SerializeToString())
    #
    # for feature_name, anomaly_info in anomalies.anomaly_info.items():
    #     logging.getLogger().error(
    #         'Anomaly in feature "{}": {}'.format(
    #             feature_name, anomaly_info.description))
    return 0
Example #23

# COMMAND ----------

# MAGIC %md infer schema

# COMMAND ----------

weather_data_schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema=weather_data_schema)

# COMMAND ----------

# MAGIC %md check for anomalies

# COMMAND ----------

weather_anomalies = tfdv.validate_statistics(statistics=stats,
                                             schema=weather_data_schema)
tfdv.display_anomalies(weather_anomalies)

# COMMAND ----------

# MAGIC %md average monthly temperature

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT x.month,AVG(x.avg_temp_f) as avg_temp_f FROM (
# MAGIC SELECT MONTH(time) as month,YEAR(time) as year,SUM(avg_temp_f) as avg_temp_f
# MAGIC FROM dscc202_group05_db.weather_delta_bronze
# MAGIC GROUP BY MONTH(time), YEAR(time)
# MAGIC   ) x
# MAGIC GROUP BY x.month
Example #24
# Compare evaluation data with training data (the call head is truncated in
# the source; the lhs statistics are assumed to be the eval stats)
tfdv.visualize_statistics(lhs_statistics=eval_stats,
                          rhs_statistics=train_stats,
                          lhs_name='EVAL_DATASET',
                          rhs_name='TRAIN_DATASET')

"""In evalation of the data TFDV shows in the plot both the train and eval stats overlaid, it is possible to check the distribuition visually, it can speedy the change in the schema when necessary.

Looking in the plots we can consider both the train and eval has the same distribution, the difenrrence in the plots is just in the quantity of each value in the features.

### Anomalies

After generate stats for both train and eval dataset, it necessary to check if the schema define to each one is the same (range of values, type of feature and so on), TFDV has a function to look for anomalies, e.g, features define different for each schema.
"""

# Check eval data for errors by validating the Eval data stats using the 
# previously inferred schema.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)

# Display anomalies
tfdv.display_anomalies(anomalies)

"""We have just one feature with anomalies, look deeply we can check the values are correct but the problem there is a period and a space in the values, so the TFDV considered it as a new values.

### Fixing the erros

The corrections will depend of our knowlegde of the data, looking at each feature and decide what action should be take, for our case it is not necessary to ajusted the schema, but to test how to correct the errors we will add the values to the domain and check again the anomalies
"""

# Add new value to the domain of feature wage.
wage = tfdv.get_domain(schema, 'wage')
wage.value.append(' <=50K.')
wage.value.append(' >50K.')
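To finish the step the text describes, the eval statistics can be re-validated against the updated schema, following the same pattern as the other examples:

# Re-check the eval stats now that the domain accepts the variants with
# the extra space and trailing period.
updated_anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(updated_anomalies)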
Example #25
training_data = dataset[:1600]
print(training_data.describe())

test_set = dataset[1600:]
print(test_set.describe())

# Generate training data statistics
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=training_data)
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

test_stats = tfdv.generate_statistics_from_dataframe(dataframe=test_set)

# Compare test statistics with the schema
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
# Display all detected anomalies, e.g.:
#  - integer larger than 10
#  - STRING type when INT type was expected
#  - FLOAT type when INT type was expected
#  - integer smaller than 0
tfdv.display_anomalies(anomalies)

# New data WITH anomalies
test_set_copy = test_set.copy()
test_set_copy.drop("soot", axis=1, inplace=True)

# Statistics based on data with anomalies
test_set_copy_stats = tfdv.generate_statistics_from_dataframe(
    dataframe=test_set_copy)
anomalies_new = tfdv.validate_statistics(statistics=test_set_copy_stats,
                                         schema=schema)

Example #26

def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # read the first line of the csv to have an ordered list of column names
    # (the schema will scramble the features)
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)
    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
Example #27
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate, project, mode):
    """Writes a TFDV-generated schema.

    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
          the first line is treated as the column names.
      key_columns: list of the names for columns that should be
          treated as unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate
          against the schema.
      project: the project to run dataflow in.
      mode: whether the job should be `local` or `cloud`.
    """
    if mode == 'local':
        pipeline_options = None
    elif mode == 'cloud':
        temp_dir = os.path.join(output_dir, 'tmp')
        options = {
            'job_name': ('pipeline-tfdv-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'setup_file':
            './validation/setup.py',
            'project':
            project,
            'temp_location':
            temp_dir,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)
    with open('/output_schema.pb2', 'wb') as f:
        f.write(schema.SerializeToString())
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'wb') as f:
        logging.getLogger().info('Writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())
    schema_json = convert_schema_proto_to_json(schema, column_names,
                                               key_columns)
    with open('/output_schema.json', 'w+') as f:
        json.dump(schema_json, f)
    with file_io.FileIO(os.path.join(output_dir, 'schema.json'), 'w+') as f:
        logging.getLogger().info('Writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    if not csv_data_file_to_validate:
        return

    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)
    with open('/output_validation_result.txt', 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'wb') as f:
        logging.getLogger().info('Writing anomalies to {}'.format(f.name))
        f.write(anomalies.SerializeToString())
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error('Anomaly in feature "{}": {}'.format(
            feature_name, anomaly_info.description))
Example #28
    def run(self, task, model, schema, num_corruptions, performance_threshold):
        # Make sure the schema works on the clean test data
        assert (not self.has_anomaly(self.validate(schema, task.test_data)))

        baseline_predictions = model.predict_proba(task.test_data)
        baseline_score = task.score_on_test_data(baseline_predictions)

        random_corruptions = set()

        for _ in range(0, num_corruptions):
            num_columns = len(task.numerical_columns +
                              task.categorical_columns + task.text_columns)
            p_numerical_column_affected = float(len(
                task.numerical_columns)) / num_columns
            p_categorical_column_affected = float(len(
                task.categorical_columns)) / num_columns
            p_text_column_affected = float(len(
                task.text_columns)) / num_columns

            affected_column_type = np.random.choice(
                ['numerical', 'categorical', 'text'],
                1,
                p=[
                    p_numerical_column_affected, p_categorical_column_affected,
                    p_text_column_affected
                ])

            fraction = float(np.random.randint(100)) / 100

            if affected_column_type == 'numerical':

                if len(task.numerical_columns) >= 2 and np.random.uniform(
                ) < 0.1:
                    affected_columns = np.random.choice(
                        task.numerical_columns, 2)
                    random_corruptions.add(
                        SwappedValues(affected_columns[0], affected_columns[1],
                                      fraction))
                else:

                    corruption_type = np.random.choice(
                        ['missing', 'noise', 'scaling'])

                    if corruption_type == 'missing':
                        missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
                        affected_column = np.random.choice(
                            task.numerical_columns)
                        random_corruptions.add(
                            MissingValues(affected_column,
                                          fraction,
                                          na_value=np.nan,
                                          missingness=missingness))
                    elif corruption_type == 'noise':
                        affected_column = np.random.choice(
                            task.numerical_columns)
                        random_corruptions.add(
                            GaussianNoise(affected_column, fraction))
                    elif corruption_type == 'scaling':
                        affected_column = np.random.choice(
                            task.numerical_columns)
                        random_corruptions.add(
                            Scaling(affected_column, fraction))

            elif affected_column_type == 'categorical':

                if len(task.categorical_columns) >= 2 and np.random.uniform(
                ) < 0.1:
                    affected_columns = np.random.choice(
                        task.categorical_columns, 2)
                    random_corruptions.add(
                        SwappedValues(affected_columns[0], affected_columns[1],
                                      fraction))
                else:
                    corruption_type = np.random.choice(['missing', 'encoding'])

                    if corruption_type == 'missing':
                        missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
                        affected_column = np.random.choice(
                            task.categorical_columns)
                        random_corruptions.add(
                            MissingValues(affected_column,
                                          fraction,
                                          na_value='',
                                          missingness=missingness))

                    elif corruption_type == 'encoding':
                        affected_column = np.random.choice(
                            task.categorical_columns)
                        random_corruptions.add(
                            BrokenCharacters(affected_column, fraction))

            elif affected_column_type == 'text':

                if len(task.text_columns) >= 2 and np.random.uniform() < 0.1:
                    affected_columns = np.random.choice(task.text_columns, 2)
                    random_corruptions.add(
                        SwappedValues(affected_columns[0], affected_columns[1],
                                      fraction))
                else:
                    corruption_type = np.random.choice(['missing', 'encoding'])

                    if corruption_type == 'missing':
                        missingness = np.random.choice(['MCAR', 'MAR', 'MNAR'])
                        affected_column = np.random.choice(task.text_columns)
                        random_corruptions.add(
                            MissingValues(affected_column,
                                          fraction,
                                          na_value='',
                                          missingness=missingness))

                    elif corruption_type == 'encoding':
                        affected_column = np.random.choice(task.text_columns)
                        random_corruptions.add(
                            BrokenCharacters(affected_column, fraction))

        outcome = {
            'corruption': [],
            'status': [],
            'anomalies': [],
            'baseline_score': [],
            'corrupted_score': []
        }

        for corruption in random_corruptions:
            print(corruption)
            test_data_copy = task.test_data.copy(deep=True)
            corrupted_data = corruption.transform(test_data_copy)

            corrupted_data_stats = tfdv.generate_statistics_from_dataframe(
                corrupted_data)
            tfdv_anomalies = tfdv.validate_statistics(
                statistics=corrupted_data_stats, schema=schema)

            schema_anomalies = tfdv_anomalies.anomaly_info

            try:
                corrupted_predictions = model.predict_proba(corrupted_data)
                corrupted_score = task.score_on_test_data(
                    corrupted_predictions)

                performance_drop = (baseline_score -
                                    corrupted_score) / baseline_score

                has_negative_impact = performance_drop > performance_threshold
            except Exception:  # treat any scoring failure as negative impact
                corrupted_score = None
                has_negative_impact = True

            has_anomalies = len(tfdv_anomalies.anomaly_info) != 0

            if has_anomalies:
                if has_negative_impact:
                    status = 'TP'
                else:
                    status = 'FP'
            else:
                if not has_negative_impact:
                    status = 'TN'
                else:
                    status = 'FN'

            outcome['corruption'].append(str(corruption))
            outcome['status'].append(status)
            outcome['anomalies'].append(str(schema_anomalies))
            outcome['baseline_score'].append(baseline_score)
            outcome['corrupted_score'].append(corrupted_score)

        return pd.DataFrame.from_dict(outcome)
Example #29
    def validate(self, schema, data):
        stats = tfdv.generate_statistics_from_dataframe(data)
        return tfdv.validate_statistics(statistics=stats, schema=schema)
Example #30

# Compare test data with training data (the call head is truncated in the
# source; the lhs statistics are assumed to be the test stats)
tfdv.visualize_statistics(lhs_statistics=test_stats,
                          rhs_statistics=train_stats,
                          lhs_name='Test_Dataset',
                          rhs_name='Train_Dataset')

"""In evalation of the data TFDV shows in the plot both the train and test stats overlaid, it is possible to check the distribuition visually, it can speedy the change in the schema when necessary.

Looking in the results, we can infer easily the data from Liander region can't be use to predict the values on Stedin region, all the median, meadn, std, missing values are different between the dataset, but now we don't make any change and we will continue with the TFDV analysis.

### Anomalies

After generate stats for both train and test dataset, it necessary to check if the schema define to each one is the same (range of values, type of feature and so on), TFDV has a function to look for anomalies, e.g, features define different for each schema.
"""

# Check eval data for errors by validating the test data stats using the 
# previously inferred schema.
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Display anomalies
tfdv.display_anomalies(anomalies)

"""Three features has inconsistent values, two of them has wrong type (net_manager, type_conn_perc), the purchase_are it is a feature expected to be outside of the train schema, it is due the fact correspond to a different region in Netherlands.

### Fixing the erros

The corrections will depend of our knowlegde of the data, looking at each feature and decide what action should be take, looking in the description of each feature will to decide how to correct the erros, **net_manager** is code of the regional network manager, **type_conn_perc: percentage** of presence of the principal type of connection in the zipcode range, **purchase_area** code of the area where the energy is purchased.

For the type_conn_perc let's covert it to BYTES and append the missing values to the purcha_are.
"""

# Convert the type_conn_perc to Bytes
type_conn_perc = tfdv.get_feature(schema, 'type_conn_perc')
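The conversion described above is cut off in the source; a sketch of the likely continuation, assuming schema_pb2 from tensorflow_metadata and a hypothetical value for the purchase_area domain:

from tensorflow_metadata.proto.v0 import schema_pb2

# Relax the type check: treat type_conn_perc as BYTES.
type_conn_perc.type = schema_pb2.BYTES

# Append the value missing from the train schema (hypothetical code).
purchase_area = tfdv.get_domain(schema, 'purchase_area')
purchase_area.value.append('some_region_code')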
Example #31

schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare evaluation data with training data
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------

anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema)
tfdv.display_anomalies(anomalies)

# COMMAND ----------

# Add skew and drift comparators
temp_f = tfdv.get_feature(schema, 'avg_temp_f')
temp_f.skew_comparator.jensen_shannon_divergence.threshold = 0
temp_f.drift_comparator.jensen_shannon_divergence.threshold = 0

precip_mm = tfdv.get_feature(schema, 'tot_precip_mm')
precip_mm.skew_comparator.jensen_shannon_divergence.threshold = 0
precip_mm.drift_comparator.jensen_shannon_divergence.threshold = 0

# Validate with the comparators in place (the call is truncated in the
# source; the closing arguments are assumed)
_anomalies = tfdv.validate_statistics(stats_train,
                                      schema,
                                      serving_statistics=stats_serve)
Example #32
# imports
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.components import CsvExampleGen
from tfx.components import ImportExampleGen
from tfx.utils.dsl_utils import external_input
from tfx.proto import example_gen_pb2
import tensorflow as tf
import tensorflow_data_validation as tfdv

import pandas as pd
import csv
import re

# Set the interactive context
context = InteractiveContext(pipeline_root=_tfx_pipeline)

# Ingest the data and compute statistics

train_stats = tfdv.generate_statistics_from_tfrecord(data_location=complaints_tf_train_file)
val_stats = tfdv.generate_statistics_from_tfrecord(data_location=complaints_tf_val_file)

print(train_stats)

schema = tfdv.infer_schema(train_stats)

val_anomalies = tfdv.validate_statistics(statistics=val_stats, schema=schema)

print(val_anomalies)
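As in the earlier notebook examples, the anomalies render more readably with the display helper:

tfdv.display_anomalies(val_anomalies)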