Example #1
def skew_drift_validator(mode, gcp_bucket, control_set_path,
                         treatment_set_path, feature_list_str, Linf_value):
    logging.basicConfig(level=logging.INFO)
    logging.info('Starting skew drift validator ..')
    logging.info('Input data:')
    logging.info('mode:{}'.format(mode))
    logging.info('gcp_bucket:{}'.format(gcp_bucket))
    logging.info('control_set_path:{}'.format(control_set_path))
    logging.info('treatment_set_path:{}'.format(treatment_set_path))
    logging.info('Linf_value:{}'.format(Linf_value))

    # parse the feature list string; ast.literal_eval (import ast) is safer than eval here
    feature_list = ast.literal_eval(feature_list_str)
    control_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + control_set_path,
                                 sep=',')
    treat_set_df = pd.read_csv("gs://" + gcp_bucket + "/" + treatment_set_path,
                               sep=',')
    control_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=control_set_df)
    treat_stats = tfdv.generate_statistics_from_dataframe(
        dataframe=treat_set_df)
    control_schema = tfdv.infer_schema(control_stats)
    treat_schema = tfdv.infer_schema(treat_stats)

    for feature in feature_list:
        if (mode == "skew"):
            # if the feature has a domain it is a categorical variable
            if tfdv.get_feature(control_schema, feature).domain:
                tfdv.get_feature(
                    control_schema, feature
                ).skew_comparator.infinity_norm.threshold = Linf_value
            else:
                logging.critical(
                    "feature: {} is not categorical".format(feature))
                sys.exit(1)
        elif (mode == "drift"):
            tfdv.get_feature(
                control_schema,
                feature).drift_comparator.infinity_norm.threshold = Linf_value
        else:
            logging.critical("mode: {} not supported".format(mode))
            sys.exit(1)
    anomalies = tfdv.validate_statistics(statistics=control_stats,
                                         schema=control_schema,
                                         serving_statistics=treat_stats)
    if (anomalies.anomaly_info):
        logging.info("Data-{} detected:".format(mode))
        return anomalies
    else:
        logging.info("No data-{} detected".format(mode))
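# A minimal invocation sketch for the validator above; the bucket name, object paths,
# and feature names are hypothetical placeholders, not values from the original code.
drift_anomalies = skew_drift_validator(
    mode="drift",
    gcp_bucket="my-bucket",
    control_set_path="data/control.csv",
    treatment_set_path="data/treatment.csv",
    feature_list_str="['payment_type', 'company']",  # parsed into a list inside the validator
    Linf_value=0.01)
if drift_anomalies:
    tfdv.display_anomalies(drift_anomalies)  # renders in a notebook environment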
Example #2
 def run_tf_test(self):
     print("Running tf workflow test!")
     DATA_DIR = "./"
     TRAIN_DATA = os.path.join(DATA_DIR, 'cal_housing.csv')
     train_stats = tfdv.generate_statistics_from_csv(TRAIN_DATA,
                                                     delimiter=',')
     schema = tfdv.infer_schema(train_stats)
     enc_stats = json_format.MessageToJson(train_stats)
     enc_schema = json_format.MessageToJson(schema)
     data = pd.read_csv('cal_housing.csv')
     ds_info = {"name": "cal_housing_dataset",
                "description": "data about housing in California",
                "encoded_stats": enc_stats,
                "encoded_schema": enc_schema,
                "source": "UCI ML Repository"}
     ds_reg = self.ap.register_dataset(ds_info)
     featureset = data.dtypes.to_dict()
     featureset = {k: str(featureset[k]) for k in featureset}
     featureset["name"] = "wine_no_transformations"
     fs_reg = self.ap.register_featureset(featureset, ds_reg["_key"])
     dataset = self.ap.lookup_dataset("cal_housing_dataset")
     retrieved_stats = dataset["encoded_stats"]
     retrieved_schema = dataset["encoded_schema"]
     #print("Retrieved stats: " + str(retrieved_stats))
     print("Completed tf workflow test!")
     return
Example #3
def Load_TFDV(df):

    lencols = len(df.columns)
    # print(lencols)
    y_tfdv = [0] * lencols

    i = 0
    for col in df.columns:
        # print(col)
        df_col = df[[col]]
        st_option = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        stats = tfdv.generate_statistics_from_dataframe(
            df_col, stats_options=st_option)
        schema = tfdv.infer_schema(statistics=stats)
        # flag the column as categorical if the single-column schema has any categorical feature
        categ_lst = get_categorical_features(schema)
        if categ_lst:
            y_tfdv[i] = 1

        xc = schema.feature
        # print(xc)
        for x in xc:
            cnt_NLD = str(x).count('natural_language_domain')
            cnt_TD = str(x).count('time_domain')

            if cnt_NLD: y_tfdv[i] = 3
            if cnt_TD: y_tfdv[i] = 2
        print(y_tfdv[i])
        i = i + 1

    return y_tfdv
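# A usage sketch for Load_TFDV: it returns one code per column, where, by the logic
# above, 0 = other/numeric, 1 = categorical, 2 = time domain, 3 = natural-language
# domain. The sample DataFrame is illustrative only; the actual codes depend on what
# TFDV infers for each column.
sample_df = pd.DataFrame({
    "city": ["Austin", "Boston", "Austin"],  # a string column, typically inferred as categorical
    "price": [1.2, 3.4, 5.6],                # a numeric column
})
print(Load_TFDV(sample_df))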
Example #4
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'train'. Stats on other splits are ignored.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'SchemaPath' artifact of size one.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    train_stats_uri = io_utils.get_only_uri_in_dir(
        types.get_split_uri(input_dict['stats'], 'train'))
    output_uri = os.path.join(
        types.get_single_uri(output_dict['output']), _DEFAULT_FILE_NAME)

    infer_feature_shape = False
    tf.logging.info('Inferring schema from statistics.')
    schema = tfdv.infer_schema(
        tfdv.load_statistics(train_stats_uri), infer_feature_shape)
    io_utils.write_pbtxt_file(output_uri, schema)
    tf.logging.info('Schema written to {}.'.format(output_uri))
Example #5
def pandas_schema_anomalies(
        df: pd.DataFrame,
        tfdv_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Get dataframe schema anomalies by comparing the pandas and TFDV schemas.
    Args:
        df {pandas.DataFrame}: dataframe
        tfdv_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: TFDV statistics
    Returns:
        Tuple[bool, Dict]:
            True if anomalies are detected, otherwise False,
            dictionary with structure:
                {
                    <column_name>: {
                        'description': <description>,
                        'severity': 'ERROR',
                        'shortDescription': <short description>,
                        'reason': [{'type': <error_type>,
                                    'shortDescription': <short description>,
                                    'description': <description>}],
                        'path': {'step': [<column_name>]}
                    }
                }
    """

    tfdv_schema = tfdv.infer_schema(tfdv_statistics)
    pandas_schema = get_pandas_df_schema(df)

    return tfdv_pandas_schemas_anomalies(tfdv_schema, pandas_schema)
Example #6
File: executor.py Project: winsomexjs/tfx
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

        This infers the schema using tensorflow_data_validation on the precomputed
        stats of 'train' split.

        Args:
          input_dict: Input dict from input key to a list of artifacts, including:
            - 'stats': A list of 'ExampleStatistics' type which must contain
              split 'train'. Stats on other splits are ignored.
            - 'statistics': Synonym for 'stats'.
          output_dict: Output dict from key to a list of artifacts, including:
            - output: A list of 'Schema' artifact of size one.
          exec_properties: A dict of execution properties, includes:
            - infer_feature_shape: Whether or not to infer the shape of the feature.

        Returns:
          None
        """
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(input_dict['stats'], 'train'))
        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output']),
            _DEFAULT_FILE_NAME)

        infer_feature_shape = exec_properties['infer_feature_shape']
        absl.logging.info('Inferring schema from statistics.')
        schema = tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                   infer_feature_shape)
        io_utils.write_pbtxt_file(output_uri, schema)
        absl.logging.info('Schema written to %s.' % output_uri)
Example #7
def tfdv_statistics_anomalies(
        source_statistics: DatasetFeatureStatisticsList,
        input_statistics: DatasetFeatureStatisticsList) -> Tuple[bool, Dict]:
    """
    Compare two TFDV statistics and return anomalies.
    Args:
        source_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: source statistics
        input_statistics {tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatisticsList}: input statistics
    Returns:
        Tuple[bool, Dict]:
            True if anomalies are detected, otherwise False,
            dictionary with structure:
            {
                <column_name>: {
                    'description': <description>,
                    'severity': 'ERROR',
                    'shortDescription': <short description>,
                    'reason': [{'type': <error_type>,
                                'shortDescription': <short description>,
                                'description': <description>}],
                    'path': {'step': [<column_name>]}
                }
            }
    """

    schema = tfdv.infer_schema(source_statistics)
    anomalies = tfdv_object_to_dict(
        tfdv.validate_statistics(statistics=input_statistics, schema=schema))
    tfdv_anomalies = anomalies.get('anomalyInfo', {})

    return len(tfdv_anomalies) > 0, tfdv_anomalies
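# A sketch of driving the helper above from two DataFrames; reference_df and
# new_batch_df are hypothetical. generate_statistics_from_dataframe is a standard
# TFDV API, so only the frames themselves are assumptions here.
source_stats = tfdv.generate_statistics_from_dataframe(reference_df)
input_stats = tfdv.generate_statistics_from_dataframe(new_batch_df)
has_anomalies, per_column = tfdv_statistics_anomalies(source_stats, input_stats)
if has_anomalies:
    for column, info in per_column.items():
        print(column, info.get('shortDescription'))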
Example #8
    def _provide_schema(self, input_dict,
                        exec_properties) -> schema_pb2.Schema:
        """Generates schema from either schema or statistics."""
        # TODO(zhitaoli): Move constants between this file and component.py to a
        # constants.py.
        stats = input_dict.get('stats') or input_dict.get('statistics')
        schema = input_dict.get('schema')

        if bool(stats) == bool(schema):
            raise ValueError(
                'Exactly one of schema or stats must be provided')

        if schema:
            schema_uri = artifact_utils.get_single_uri(schema)
            absl.logging.info('Schema is provided. Reading from %s.' %
                              schema_uri)
            schema_reader = io_utils.SchemaReader()
            try:
                return schema_reader.read(
                    os.path.join(schema_uri, _DEFAULT_FILE_NAME))

            except tf.errors.NotFoundError:
                raise ValueError(
                    'Schema is provided, but failed to read from %s.' %
                    schema_uri)

        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(stats, 'train'))
        infer_feature_shape = exec_properties['infer_feature_shape']
        return tfdv.infer_schema(tfdv.load_statistics(train_stats_uri),
                                 infer_feature_shape)
Example #9
    def test_e2e(self, stats_options, expected_stats_pbtxt,
                 expected_schema_pbtxt):
        tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
            self._input_file, ['tfdv', 'test'])
        stats_file = os.path.join(self._output_dir, 'stats')
        with beam.Pipeline() as p:
            _ = (p
                 | 'TFXIORead' >> tfxio.BeamSource()
                 | 'GenerateStats' >> tfdv.GenerateStatistics(stats_options)
                 | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_file))

        actual_stats = tfdv.load_statistics(stats_file)
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self,
            text_format.Parse(
                expected_stats_pbtxt,
                statistics_pb2.DatasetFeatureStatisticsList()))([actual_stats])
        actual_schema = tfdv.infer_schema(actual_stats,
                                          infer_feature_shape=True)

        if hasattr(actual_schema, 'generate_legacy_feature_spec'):
            actual_schema.ClearField('generate_legacy_feature_spec')
        self._assert_schema_equal(
            actual_schema,
            text_format.Parse(expected_schema_pbtxt, schema_pb2.Schema()))
Example #10
def tfrecord_statis_generator(file_path):
    """
    Generate statistics for the tfrecord dataset
    """
    tfrecord_stats = tfdv.generate_statistics_from_tfrecord(data_location=file_path)
    tfrecord_schema = tfdv.infer_schema(tfrecord_stats)
    tfdv.display_schema(tfrecord_schema)
    return tfrecord_stats, tfrecord_schema
Example #11
def infer_schema_from_csv(csv_file, column_names):
    data_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                   column_names=column_names)

    #tfdv.visualize_statistics(data_stats)
    schema = tfdv.infer_schema(statistics=data_stats)

    return schema
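# A possible follow-up, assuming a second CSV with the same columns (both paths and
# column names below are placeholders): validate its statistics against the returned schema.
schema = infer_schema_from_csv('train.csv', column_names=['age', 'income', 'label'])
new_stats = tfdv.generate_statistics_from_csv(data_location='new_batch.csv',
                                              column_names=['age', 'income', 'label'])
anomalies = tfdv.validate_statistics(statistics=new_stats, schema=schema)
tfdv.display_anomalies(anomalies)  # notebook-only rendering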
Example #12
def csv_statistics_generator(file_path):
    """
    Generate statistics for the csv dataset
    """
    csv_stats = tfdv.generate_statistics_from_csv(data_location=file_path,
                                                  delimiter=',')
    csv_schema = tfdv.infer_schema(csv_stats)
    tfdv.display_schema(csv_schema)
    return csv_stats, csv_schema
Example #13
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

    # Load and deserialize exclude splits from execution properties.
    exclude_splits = json_utils.loads(
        exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')) or []
    if not isinstance(exclude_splits, list):
      raise ValueError('exclude_splits in execution properties needs to be a '
                       'list. Got %s instead.' % type(exclude_splits))

    # Only one schema is generated for all splits.
    schema = None
    stats_artifact = artifact_utils.get_single_instance(
        input_dict[STATISTICS_KEY])
    for split in artifact_utils.decode_split_names(stats_artifact.split_names):
      if split in exclude_splits:
        continue

      logging.info('Processing schema from statistics for split %s.', split)
      stats_uri = io_utils.get_only_uri_in_dir(
          os.path.join(stats_artifact.uri, split))
      if not schema:
        schema = tfdv.infer_schema(
            tfdv.load_statistics(stats_uri), infer_feature_shape)
      else:
        schema = tfdv.update_schema(schema, tfdv.load_statistics(stats_uri),
                                    infer_feature_shape)

    output_uri = os.path.join(
        artifact_utils.get_single_uri(output_dict[SCHEMA_KEY]),
        _DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
Example #14
def _infer_schema(output_dir, stats):
    inferred_schema_output_path = os.path.join(output_dir,
                                               'inferred_schema.pb2')
    inferred_schema = tfdv.infer_schema(stats)
    file_io.write_string_to_file(inferred_schema_output_path,
                                 inferred_schema.SerializeToString())
    file_io.write_string_to_file('/tmp/inferred_schema_output_path.txt',
                                 inferred_schema_output_path)
    return inferred_schema
Example #15
 def write_stats_and_schema(self, pipeline_args):  # type: (List[str]) -> None
     stats = self.write_stats(pipeline_args)
     if not self.schema:
         logger.warning(
             "Inferring a new schema for this dataset. If you want to use an existing schema, "
             "provide a value for schema_path in the constructor."
         )
         new_schema = tfdv.infer_schema(stats, infer_feature_shape=False)
         self.schema = new_schema
     self.upload_schema()
Example #16
def data_validation(data_path):
    train = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'train.csv'), delimiter=',')
    # statistics for the held-out split (the exact file name is an assumption)
    test = tfdv.generate_statistics_from_csv(
        os.path.join(data_path, 'test.csv'), delimiter=',')
    schema = tfdv.infer_schema(train)
    # print(schema)
    # tfdv.display_schema(schema)
    anomalies = tfdv.validate_statistics(statistics=test, schema=schema)
    # print(anomalies)
    # tfdv.display_anomalies(anomalies)
    print(text_format.MessageToString(anomalies))
Example #17
    def parse_schema_from_stats(cls, stats_path):
        # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
        """
        Returns TensorFlow Feature Spec and parsed tf.metadata Schema for given tf.metadata
        DatasetFeatureStatisticsList.

        :param stats_path: tf.metadata DatasetFeatureStatisticsList path
        """
        import tensorflow_data_validation as tfdv
        stats = tfdv.load_statistics(stats_path)
        schema = tfdv.infer_schema(stats)
        return schema_to_feature_spec(schema), schema
Example #18
def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  print('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)
  print(text_format.MessageToString(schema))

  print('Writing schema to output path.')
  file_io.write_string_to_file(schema_path, text_format.MessageToString(schema))
Example #19
def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  print('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)
  print(text_format.MessageToString(schema))

  print('Writing schema to output path.')
  file_io.write_string_to_file(schema_path, text_format.MessageToString(schema))
Example #20
    def display_stats_for_examples(self, examples_id):
        """Displays stats for `examples_id`.

        Args:
          examples_id: An `int` indicating the id of a `TFXArtifactTypes.EXAMPLES`
              artifact.
        """
        stats_artifact = self.get_dest_artifact_of_type(
            examples_id, TFXArtifactTypes.EXAMPLE_STATS)
        if stats_artifact:
            tfdv.visualize_statistics(
                tfdv.load_statistics(
                    os.path.join(stats_artifact.uri, 'stats_tfrecord')))
            print("display schema")
            tfdv.display_schema(
                tfdv.infer_schema(statistics=tfdv.load_statistics(
                    os.path.join(stats_artifact.uri, 'stats_tfrecord'))))
Example #21
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):

  import logging
  import time

  import tensorflow_data_validation as tfdv
  import tensorflow_data_validation.statistics.stats_impl

  logging.getLogger().setLevel(logging.INFO)
  logging.info('stats_older_path: %s', stats_older_path)
  logging.info('stats_new_path: %s', stats_new_path)

  if stats_older_path == 'none':
    return ('true', )

  stats1 = tfdv.load_statistics(stats_older_path)
  stats2 = tfdv.load_statistics(stats_new_path)

  schema1 = tfdv.infer_schema(statistics=stats1)
  tfdv.get_feature(schema1, 'duration').drift_comparator.jensen_shannon_divergence.threshold = 0.01
  drift_anomalies = tfdv.validate_statistics(
      statistics=stats2, schema=schema1, previous_statistics=stats1)
  logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

  from google.protobuf.json_format import MessageToDict
  d = MessageToDict(drift_anomalies)
  val = d['driftSkewInfo'][0]['driftMeasurements'][0]['value']
  thresh = d['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']
  logging.info('value %s and threshold %s', val, thresh)
  res = 'true'
  if val < thresh:
    res = 'false'
  logging.info('train decision: %s', res)
  return (res, )
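# The NamedTuple('Outputs', [('drift', str)]) return annotation suggests this function
# is meant to be wrapped as a Kubeflow Pipelines component. One plausible way to do so
# with the KFP v1 SDK; the base image choice is an assumption, not from the source.
import kfp.components as comp

detect_drift_op = comp.create_component_from_func(
    tfdv_detect_drift,
    base_image='python:3.8',  # hypothetical base image
    packages_to_install=['tensorflow-data-validation'])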
Example #22
def data_validation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    vldn_output = os.path.join(DATA_DIR, 'validation')

    # TODO: Understand why this was used in the conversion to the output json
    # key columns: list of the names for columns that should be treated as unique keys.
    key_columns = ['trip_start_timestamp']

    # read the first line of the csv to get an ordered list of column names
    # (the Schema will scramble the features)
    with open(TRAIN_DATA) as f:
        column_names = f.readline().strip().split(',')

    stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
    schema = tfdv.infer_schema(stats)

    eval_stats = tfdv.generate_statistics_from_csv(
        data_location=EVALUATION_DATA)
    anomalies = tfdv.validate_statistics(eval_stats, schema)

    # Log anomalies
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error(
            'Anomaly in feature "{}": {}'.format(
                feature_name, anomaly_info.description))

    # show inferred schema
    tfdv.display_schema(schema=schema)
    # Resolve anomalies
    company = tfdv.get_feature(schema, 'company')
    company.distribution_constraints.min_domain_mass = 0.9

    # Add new value to the domain of feature payment_type.
    payment_type_domain = tfdv.get_domain(schema, 'payment_type')
    payment_type_domain.value.append('Prcard')

    # Validate eval stats after updating the schema
    updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
    tfdv.display_anomalies(updated_anomalies)

    # -----------------------DATA SAVING START---------------------------------
    if "column_names" in locals():
        _kale_resource_save(column_names, os.path.join(
            _kale_data_directory, "column_names"))
    else:
        print("_kale_resource_save: `column_names` not found.")
    if "schema" in locals():
        _kale_resource_save(schema, os.path.join(
            _kale_data_directory, "schema"))
    else:
        print("_kale_resource_save: `schema` not found.")
Example #23
# %%


# %%
'''
## visualize statistics of train data
'''

# %%
tfdv.visualize_statistics(train_stats)

# %%


# %%
schema = tfdv.infer_schema(statistics=train_stats)

# %%


# %%
tfdv.display_schema(schema=schema)

# %%
'''
## check that test data complies with the schema inferred from train data!
'''

# %%
# Compute stats for evaluation data
test_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_test)
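# %%
'''
## a plausible next step: validate the test statistics computed above against the schema inferred from train data
'''

# %%
anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
tfdv.display_anomalies(anomalies)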
Example #24
                        type=str,
                        required=False,
                        default='./val_stats/stats.txt',
                        help='Path to the validation stats .txt file ')

    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s - %(message)s',
                        level=logging.INFO,
                        datefmt='%d-%m-%y %H:%M:%S')
    logging.info(f"Args: {args}")

    stats1 = tfdv.load_stats_text(input_path=args.stats_file_1)
    stats2 = tfdv.load_stats_text(input_path=args.stats_file_2)

    schema1 = tfdv.infer_schema(statistics=stats1)

    # Custom rules, tweak this as required. This is just an example
    tfdv.get_feature(
        schema1,
        'I1').drift_comparator.jensen_shannon_divergence.threshold = 0.06

    # Calculate drift between the reference stats stats1, and the statistics from new data in stats2
    drift_anomalies = tfdv.validate_statistics(statistics=stats2,
                                               schema=schema1,
                                               previous_statistics=stats1)

    # Convert the .pb2 to dict
    drift = MessageToDict(drift_anomalies)

    value = drift['driftSkewInfo'][0]['driftMeasurements'][0]['value']
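    # A hedged continuation: assuming the same driftMeasurements entry also carries its
    # 'threshold' (as in Example #21 above), compare the two to decide whether to flag drift.
    threshold = drift['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']
    drift_detected = value >= threshold
    logging.info(f"Drift value {value} vs threshold {threshold}: detected={drift_detected}")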
Example #25
def _gen_tfdv_schema(stats, args):
    schema = tfdv.infer_schema(stats)
    _save_tfdv_schema(schema, args)
    return schema
Example #26

data_folder = Path("../dataset")
# below paths should be relative to data_folder
users_file_glob = "AllUsers.csv" 
ads_file_glob = "AllAds.csv"
users_ads_ratings = "users-ads-without-gcp-ratings.csv"


users_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_file_glob}").as_posix())


tfdv.visualize_statistics(users_stats)


user_schema = tfdv.infer_schema(statistics=users_stats)
tfdv.display_schema(schema=user_schema)


ads_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{ads_file_glob}").as_posix())


tfdv.visualize_statistics(ads_stats)


ads_schema = tfdv.infer_schema(statistics=ads_stats)
tfdv.display_schema(schema=ads_schema)


users_ratings_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_ads_ratings}").as_posix())
Example #27
def run_validator(output_dir, column_names, key_columns, csv_data_file,
                  csv_data_file_to_validate, project, mode):
    """Writes a TFDV-generated schema.

    Args:
      output_dir: output folder
      column_names: list of names for the columns in the CSV file. If omitted,
          the first line is treated as the column names.
      key_columns: list of the names for columns that should be
          treated as unique keys.
      csv_data_file: name of the CSV file to analyze and generate a schema.
      csv_data_file_to_validate: name of a CSV file to validate
          against the schema.
      project: the project to run dataflow in.
      mode: whether the job should be `local` or `cloud`.
    """
    if mode == 'local':
        pipeline_options = None
    elif mode == 'cloud':
        temp_dir = os.path.join(output_dir, 'tmp')
        options = {
            'job_name': ('pipeline-tfdv-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'setup_file': './validation/setup.py',
            'project': project,
            'temp_location': temp_dir,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    schema = tfdv.infer_schema(stats)
    with open('/output_schema.pb2', 'wb+') as f:  # binary mode: SerializeToString() returns bytes
        f.write(schema.SerializeToString())
    with file_io.FileIO(os.path.join(output_dir, 'schema.pb2'), 'w+') as f:
        logging.getLogger().info('Writing schema to {}'.format(f.name))
        f.write(schema.SerializeToString())
    schema_json = convert_schema_proto_to_json(schema, column_names,
                                               key_columns)
    with open('/output_schema.json', 'w+') as f:
        json.dump(schema_json, f)
    with file_io.FileIO(os.path.join(output_dir, 'schema.json'), 'w+') as f:
        logging.getLogger().info('Writing JSON schema to {}'.format(f.name))
        json.dump(schema_json, f)

    if not csv_data_file_to_validate:
        return

    validation_stats = tfdv.generate_statistics_from_csv(
        data_location=csv_data_file_to_validate,
        column_names=column_names,
        delimiter=',',
        output_path=os.path.join(output_dir, 'validation_data_stats.tfrecord'),
        pipeline_options=pipeline_options)
    anomalies = tfdv.validate_statistics(validation_stats, schema)
    with open('/output_validation_result.txt', 'w+') as f:
        if len(anomalies.anomaly_info.items()) > 0:
            f.write('invalid')
        else:
            f.write('valid')
            return

    with file_io.FileIO(os.path.join(output_dir, 'anomalies.pb2'), 'w+') as f:
        logging.getLogger().info('Writing anomalies to {}'.format(f.name))
        f.write(anomalies.SerializeToString())
    for feature_name, anomaly_info in anomalies.anomaly_info.items():
        logging.getLogger().error('Anomaly in feature "{}": {}'.format(
            feature_name, anomaly_info.description))
Example #28
 def schema_from_train_data(self):
     train_data_stats = tfdv.generate_statistics_from_dataframe(
         self._task.train_data)
     schema = tfdv.infer_schema(statistics=train_data_stats)
     return schema
Example #29
    def Do(self, input_dict: Dict[str, List[types.Artifact]],
           output_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any]) -> None:
        """TensorFlow SchemaGen executor entrypoint.

        This infers the schema using tensorflow_data_validation on the precomputed
        stats of 'train' split.

        Args:
          input_dict: Input dict from input key to a list of artifacts, including:
            - 'statistics': A list of 'ExampleStatistics' type which must contain
              split 'train'.
          output_dict: Output dict from key to a list of artifacts, including:
            - schema: A list of 'Schema' artifact of size one.
          exec_properties: A dict of execution properties, includes:
            - infer_feature_shape: Whether or not to infer the shape of the feature.
            - exclude_splits: Names of splits that will not be taken into
              consideration when auto-generating a schema.

        Returns:
          None
        """
        infer_feature_shape = bool(
            exec_properties.get(
                standard_component_specs.INFER_FEATURE_SHAPE_KEY, True))

        # Load and deserialize exclude splits from execution properties.
        exclude_splits = json_utils.loads(
            exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                                'null')) or []
        if not isinstance(exclude_splits, list):
            raise ValueError(
                'exclude_splits in execution properties needs to be a '
                'list. Got %s instead.' % type(exclude_splits))

        # Only one schema is generated for all splits.
        schema = None
        stats_artifact = artifact_utils.get_single_instance(
            input_dict[standard_component_specs.STATISTICS_KEY])
        for split in artifact_utils.decode_split_names(
                stats_artifact.split_names):
            if split in exclude_splits:
                continue

            logging.info('Processing schema from statistics for split %s.',
                         split)
            stats_uri = io_utils.get_only_uri_in_dir(
                artifact_utils.get_split_uri([stats_artifact], split))
            if artifact_utils.is_artifact_version_older_than(
                    stats_artifact,
                    artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
                stats = tfdv.load_statistics(stats_uri)
            else:
                stats = tfdv.load_stats_binary(stats_uri)
            if not schema:
                schema = tfdv.infer_schema(stats, infer_feature_shape)
            else:
                schema = tfdv.update_schema(schema, stats, infer_feature_shape)

        output_uri = os.path.join(
            artifact_utils.get_single_uri(
                output_dict[standard_component_specs.SCHEMA_KEY]),
            DEFAULT_FILE_NAME)
        io_utils.write_pbtxt_file(output_uri, schema)
        logging.info('Schema written to %s.', output_uri)
Example #30
# MAGIC - check for data drift and skew between training and serving

# COMMAND ----------

from sklearn.model_selection import train_test_split
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils.display_util import get_statistics_html
import warnings

warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)

stats_train = tfdv.generate_statistics_from_dataframe(
    dataframe=train_df.toPandas())
stats_serve = tfdv.generate_statistics_from_dataframe(dataframe=fdf.toPandas())

schema = tfdv.infer_schema(statistics=stats_train)
tfdv.display_schema(schema=schema)

# COMMAND ----------

# Compare evaluation data with training data
displayHTML(
    get_statistics_html(lhs_statistics=stats_serve,
                        rhs_statistics=stats_train,
                        lhs_name='SERVE_DATASET',
                        rhs_name='TRAIN_DATASET'))

# COMMAND ----------

anomalies = tfdv.validate_statistics(statistics=stats_serve, schema=schema)
tfdv.display_anomalies(anomalies)
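# COMMAND ----------

# A hedged extension of the check above: the MAGIC cell mentions skew as well, so one
# could also set an L-infinity skew comparator on a feature and re-validate with
# serving statistics. 'example_feature' is a placeholder name, not a column from the source.
tfdv.get_feature(schema, 'example_feature').skew_comparator.infinity_norm.threshold = 0.01
skew_anomalies = tfdv.validate_statistics(statistics=stats_train,
                                          schema=schema,
                                          serving_statistics=stats_serve)
tfdv.display_anomalies(skew_anomalies)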
Example #31
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)
  schema_file = os.path.join(args.transform_dir, _SCHEMA_FILE)
  if args.runner == 'DataflowRunner':
    schema = utils.read_schema(schema_file)
    dataflow_options = {
        'job_name':
            args.job_name,
        'project':
            args.project,
        'service_account_email':
            '',
        'setup_file':
            os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'setup.py')),
        'temp_location':
            os.path.join(args.output_dir, 'tmp')
    }
    pipeline_options = beam.pipeline.PipelineOptions(
        flags=[], **dataflow_options)
  else:
    pipeline_options = beam.pipeline.PipelineOptions(None)

    if os.path.exists(args.transform_dir):
      logger.info('Removing existing directory %s', args.transform_dir)
      shutil.rmtree(args.transform_dir)

    stats = tfdv.generate_statistics_from_csv(data_location=args.all_data)
    schema = tfdv.infer_schema(statistics=stats, infer_feature_shape=False)
    if not file_io.file_exists(args.transform_dir):
      file_io.recursive_create_dir(args.transform_dir)
    with file_io.FileIO(schema_file, 'w') as f:
      f.write(text_format.MessageToString(schema))
    logger.info('Generated %s', schema_file)
    logger.info('Running pipeline on %s environment', args.runner)

  with beam.Pipeline(args.runner, options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=os.path.join(args.output_dir, 'tmp')):
      # `mode=predict` should be used during inference time
      if args.mode == 'predict':
        logger.info('Transforming only prediction data.')
        transform_predict(
            pipeline=pipeline,
            predict_data=args.predict_data,
            data_source=args.data_source,
            output_dir=args.output_dir,
            schema=schema)
      else:
        logger.info('Transforming training, evaluation and predict data.')
        transform_predict(
            pipeline=pipeline,
            predict_data=args.predict_data,
            data_source=args.data_source,
            output_dir=args.output_dir,
            schema=schema)
        transform_train_and_eval(
            pipeline=pipeline,
            train_data=args.train_data,
            eval_data=args.eval_data,
            data_source=args.data_source,
            transform_dir=args.transform_dir,
            output_dir=args.output_dir,
            schema=schema)