Example #1
def Load_TFDV(df):
    # For each column, infer a TFDV schema and label the column:
    # 0 = default, 1 = categorical, 2 = time domain, 3 = natural language.
    y_tfdv = [0] * len(df.columns)

    for i, col in enumerate(df.columns):
        df_col = df[[col]]
        st_option = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        stats = tfdv.generate_statistics_from_dataframe(
            df_col, stats_options=st_option)
        schema = tfdv.infer_schema(statistics=stats)

        # get_categorical_features is a helper defined elsewhere in the source.
        if get_categorical_features(schema):
            y_tfdv[i] = 1

        for feature in schema.feature:
            if 'natural_language_domain' in str(feature):
                y_tfdv[i] = 3
            if 'time_domain' in str(feature):
                y_tfdv[i] = 2
        print(y_tfdv[i])

    return y_tfdv
Example #2
def stats_list():
    local_workspace = '/home/jarekk/workspace/analysis'
    local_tfrecords_file = '{}/log_records.tfrecords'.format(local_workspace)

    slice_fn = tfdv.get_feature_value_slicer(features={'time_window': None})
    stats_options = tfdv.StatsOptions(slice_functions=[slice_fn])

    stats_list = tfdv.generate_statistics_from_tfrecord(
        data_location=local_tfrecords_file, stats_options=stats_options)
    return stats_list
Example #3
  def _ComputeTFDVStats(pcollection: beam.pvalue.PCollection,
                        schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
    """Cmoputes Statistics with TFDV.

    Args:
      pcollection: pcollection of examples.
      schema: schema.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """
    feature_specs_from_schema = schema_utils.schema_as_feature_spec(
        schema).feature_spec

    def EncodeTFDV(element, feature_specs):
      """Encodes element in an in-memory format that TFDV expects."""
      if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
        raise ValueError(
            'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
            'input but not found.'.format(_TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

      # TODO(b/123549935): Obviate the numpy array conversions by
      # allowing TFDV to accept primitives in general, and TFT's
      # input/output format in particular.
      result = {}
      for feature_name, feature_spec in six.iteritems(feature_specs):
        feature_value = element.get(feature_name)
        if feature_value is None:
          result[feature_name] = None
        elif isinstance(feature_value, (np.ndarray, list)):
          result[feature_name] = np.asarray(
              feature_value, feature_spec.dtype.as_numpy_dtype)
        else:
          result[feature_name] = np.asarray(
              [feature_value], dtype=feature_spec.dtype.as_numpy_dtype)

      return result

    result = (pcollection
              # TODO(kestert): Remove encoding and batching steps once TFT
              # supports Arrow tables.
              | 'EncodeTFDV' >> beam.Map(
                  EncodeTFDV, feature_specs=feature_specs_from_schema))

    # TODO(pachristopher): Remove this once TFDV 0.14 is released.
    (major, minor, _) = tfdv.__version__.split('.')
    if int(major) > 0 or int(minor) >= 14:
      result |= ('BatchExamplesToArrowTables' >>
                 batch_util.BatchExamplesToArrowTables())

    return (result
            | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
                tfdv.StatsOptions(schema=schema)))
Example #4
def update_schema(csv_file, schema):
    # Get column names from the passed-in schema
    cols = sorted(f.name for f in schema.feature)

    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    new_batch_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                        column_names=cols,
                                                        stats_options=options)

    # Update the schema to also cover the new batch of data
    updated_schema = tfdv.update_schema(schema, new_batch_stats)
    # tfdv.display_schema(schema=updated_schema)

    return updated_schema
Example #5
 def testConstructWithSchemaAndStatsOptions(self):
     examples = standard_artifacts.Examples()
     examples.split_names = artifact_utils.encode_split_names(
         ['train', 'eval'])
     schema = standard_artifacts.Schema()
     stats_options = tfdv.StatsOptions(weight_feature='weight')
     statistics_gen = component.StatisticsGen(
         examples=channel_utils.as_channel([examples]),
         schema=channel_utils.as_channel([schema]),
         stats_options=stats_options)
     self.assertEqual(
         standard_artifacts.ExampleStatistics.TYPE_NAME,
         statistics_gen.outputs[
             standard_component_specs.STATISTICS_KEY].type_name)
Example #6
def get_num_anomalies(csv_file, schema, environment='TRAINING'):
    # Get column names from the passed-in schema
    cols = sorted(f.name for f in schema.feature)

    options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)
    data_stats = tfdv.generate_statistics_from_csv(data_location=csv_file,
                                                   column_names=cols,
                                                   stats_options=options)

    # Check the data for errors by validating its stats against the schema
    anomalies = tfdv.validate_statistics(statistics=data_stats,
                                         schema=schema,
                                         environment=environment)

    #tfdv.display_anomalies(anomalies)

    return len(anomalies.anomaly_info)
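
A minimal sketch of how the two helpers above (update_schema from Example #4 and get_num_anomalies from this example) might be combined, assuming the CSV files are in the format the helpers expect. The file paths are hypothetical, and the initial schema is inferred from training data with the standard tfdv.generate_statistics_from_csv and tfdv.infer_schema calls:

import tensorflow_data_validation as tfdv

# Hypothetical paths; replace with real data locations.
train_csv = 'data/train.csv'
eval_csv = 'data/eval.csv'

# Infer an initial schema from the training data.
train_stats = tfdv.generate_statistics_from_csv(data_location=train_csv)
schema = tfdv.infer_schema(statistics=train_stats)

# Count anomalies in the eval data against the schema, then relax the
# schema so that it also covers the new batch.
num_anomalies = get_num_anomalies(eval_csv, schema)
print('Anomalies found:', num_anomalies)
if num_anomalies:
    schema = update_schema(eval_csv, schema)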
Example #7
    def testDoWithSchemaAndStatsOptions(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        tf.io.gfile.makedirs(output_data_dir)

        # Create input dict.
        examples = standard_artifacts.Examples()
        examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
        examples.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        schema = standard_artifacts.Schema()
        schema.uri = os.path.join(source_data_dir, 'schema_gen')

        input_dict = {
            executor.EXAMPLES_KEY: [examples],
            executor.SCHEMA_KEY: [schema]
        }

        exec_properties = {
            executor.STATS_OPTIONS_JSON_KEY:
            tfdv.StatsOptions(label_feature='company').to_json(),
        }

        # Create output dict.
        stats = standard_artifacts.ExampleStatistics()
        stats.uri = output_data_dir
        stats.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])
        output_dict = {
            executor.STATISTICS_KEY: [stats],
        }

        # Run executor.
        stats_gen_executor = executor.Executor()
        stats_gen_executor.Do(input_dict,
                              output_dict,
                              exec_properties=exec_properties)

        # Check statistics_gen outputs.
        self._validate_stats_output(
            os.path.join(stats.uri, 'train', 'stats_tfrecord'))
        self._validate_stats_output(
            os.path.join(stats.uri, 'eval', 'stats_tfrecord'))
Example #8
File: executor.py  Project: zorrock/tfx
    def _ComputeTFDVStats(
            pcollection: beam.pvalue.PCollection,
            schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
        """Cmoputes Statistics with TFDV.

    Args:
      pcollection: pcollection of examples.
      schema: schema.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """
        feature_specs_from_schema = schema_utils.schema_as_feature_spec(
            schema).feature_spec

        def EncodeTFDV(element, feature_specs):
            """Encodes element in an in-memory format that TFDV expects."""
            if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
                raise ValueError(
                    'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
                    'input but not found.'.format(
                        _TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

            # TODO(b/123549935): Obviate the numpy array conversions by
            # allowing TFDV to accept primitives in general, and TFT's
            # input/output format in particular.
            result = {}
            for feature_name, feature_spec in six.iteritems(feature_specs):
                feature_value = element.get(feature_name)
                if feature_value is None:
                    result[feature_name] = None
                elif isinstance(feature_value, (np.ndarray, list)):
                    result[feature_name] = np.asarray(
                        feature_value, feature_spec.dtype.as_numpy_dtype)
                else:
                    result[feature_name] = np.asarray(
                        [feature_value],
                        dtype=feature_spec.dtype.as_numpy_dtype)

            return result

        return (pcollection
                | 'EncodeTFDV' >> beam.Map(
                    EncodeTFDV, feature_specs=feature_specs_from_schema)
                | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
                    tfdv.StatsOptions(schema=schema)))
Example #9
 def testConstructWithSchemaAndStatsOptions(self):
     examples = standard_artifacts.Examples()
     examples.split_names = artifact_utils.encode_split_names(
         ['train', 'eval'])
     schema = standard_artifacts.Schema()
     stats_options = tfdv.StatsOptions(
         weight_feature='weight',
         generators=[  # generators should be dropped
             tfdv.LiftStatsGenerator(
                 schema=None,
                 y_path=tfdv.FeaturePath(['label']),
                 x_paths=[tfdv.FeaturePath(['feature'])])
         ])
     statistics_gen = component.StatisticsGen(
         examples=channel_utils.as_channel([examples]),
         schema=channel_utils.as_channel([schema]),
         stats_options=stats_options)
     self.assertEqual(standard_artifacts.ExampleStatistics.TYPE_NAME,
                      statistics_gen.outputs['statistics'].type_name)
Example #10
    def testDoWithTwoSchemas(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        tf.io.gfile.makedirs(output_data_dir)

        # Create input dict.
        examples = standard_artifacts.Examples()
        examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
        examples.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        schema = standard_artifacts.Schema()
        schema.uri = os.path.join(source_data_dir, 'schema_gen')

        input_dict = {
            executor.EXAMPLES_KEY: [examples],
            executor.SCHEMA_KEY: [schema]
        }

        exec_properties = {
            executor.STATS_OPTIONS_JSON_KEY:
            tfdv.StatsOptions(label_feature='company',
                              schema=schema_pb2.Schema()).to_json(),
            executor.EXCLUDE_SPLITS_KEY:
            json_utils.dumps([])
        }

        # Create output dict.
        stats = standard_artifacts.ExampleStatistics()
        stats.uri = output_data_dir
        stats.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])
        output_dict = {
            executor.STATISTICS_KEY: [stats],
        }

        # Run executor.
        stats_gen_executor = executor.Executor()
        with self.assertRaises(ValueError):
            stats_gen_executor.Do(input_dict, output_dict, exec_properties)
Example #11
    def _ComputeTFDVStats(pcollection, schema):
        """Cmoputes Statistics with TFDV.

    Args:
      pcollection: pcollection of examples.
      schema: schema.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """
        def EncodeTFDV(element):
            """Encodes element in an in-memory format that TFDV expects."""
            assert _TRANSFORM_INTERNAL_FEATURE_FOR_KEY in element

            # TODO(b/123549935): Obviate the numpy array conversions by
            # allowing TFDV to accept primitives in general, and TFT's
            # input/output format in particular.
            # TODO(kestert): Iterate through schema instead of element.items and
            # encode missing elements of `element` as None.
            result = {}
            for k, v in element.items():
                if k == _TRANSFORM_INTERNAL_FEATURE_FOR_KEY:
                    continue  # Make sure the synthetic key feature doesn't get encoded.
                elif isinstance(v, np.ndarray) or v is None:
                    result[k] = v
                elif isinstance(v, list):
                    if v:
                        result[k] = np.asarray(v)
                    else:
                        # An empty list.
                        # TODO(kestert): Use Metadata to determine the dtype.
                        continue  # Instead want: result[k] = np.asarray([], dtype=...)
                else:
                    result[k] = np.asarray([v])
            return result

        return (pcollection
                | 'EncodeTFDV' >> beam.Map(EncodeTFDV)
                | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
                    tfdv.StatsOptions(schema=schema)))
Example #12
def analyze_log_records(
    request_response_log_table: str,
    model: str,
    version: str,
    start_time: datetime,
    end_time: datetime,
    output_path: str,
    schema: schema_pb2.Schema,
    baseline_stats: Optional[
        statistics_pb2.DatasetFeatureStatisticsList] = None,
    time_window: Optional[timedelta] = None,
    pipeline_options: Optional[PipelineOptions] = None,
):
    """
    Computes statistics and detects anomalies for a time series of records 
    in an AI Platform Prediction request-response log.

    The function starts an Apache Beam job that calculates statistics and detects data anomalies
    in a time series of records retrieved from an AI Platform Prediction request-response log.
    Optionally, the function can also calculate stastics for a set of time slices within
    the time series. The output of the job is a statistics_pb2.DatasetFeatureStatisticsList
    protobuf with descriptive statistis and an anomalies_pb2.Anomalies protobuf
    with anomaly reports. The protobufs are stored to a GCS location. 

    Args:
      request_response_log_table: A full name of a BigQuery table
        with the request_response_log
      start_time: The start of the time series. The value will be rounded to minutes.
      end_time: The end of the time series. The value will be rounded to minutes. 
      output_path: The GCS location to output the statistics and anomaly
        proto buffers to. The file names will be `stats.pb` and `anomalies.pbtxt`. 
      schema: A Schema protobuf describing the expected schema.
      baseline_stats: If provided, the baseline statistics will be used to detect
        distribution anomalies.        
      time_window: If provided the  time series of records will be divided into 
        a set of consecutive time slices of the time_window width and the stats 
        will be calculated for each slice. 
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline runner
        (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.
    """

    # Generate a BigQuery query
    end_time = end_time.replace(second=0, microsecond=0)
    start_time = start_time.replace(second=0, microsecond=0)
    query = _generate_query(
        table_name=request_response_log_table,
        model=model,
        version=version,
        start_time=start_time.strftime('%Y-%m-%dT%H:%M:%S'),
        end_time=end_time.strftime('%Y-%m-%dT%H:%M:%S'))

    # Configure slicing for statistics calculations
    stats_options = tfdv.StatsOptions(schema=schema)
    slicing_column = None
    if time_window:
        time_window = timedelta(days=time_window.days,
                                seconds=(time_window.seconds // 60) * 60)

        if end_time - start_time > time_window:
            slice_fn = tfdv.get_feature_value_slicer(
                features={_SLICING_COLUMN_NAME: None})
            stats_options.slice_functions = [slice_fn]
            slicing_column = _SLICING_COLUMN_NAME
            slicing_feature = schema.feature.add()
            slicing_feature.name = _SLICING_COLUMN_NAME
            slicing_feature.type = _SLICING_COLUMN_TYPE

    # Configure output paths
    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    # Define and start the pipeline
    with beam.Pipeline(options=pipeline_options) as p:
        raw_examples = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

        examples = (
            raw_examples
            | 'InstancesToBeamExamples' >> beam.ParDo(
                InstanceCoder(schema, end_time, time_window, slicing_column)))

        stats = (examples
                 | 'BeamExamplesToArrow' >>
                 tfdv.utils.batch_util.BatchExamplesToArrowRecordBatches()
                 | 'GenerateStatistics' >>
                 tfdv.GenerateStatistics(options=stats_options))

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))

        anomalies = (stats
                     | 'ValidateStatistics' >> beam.Map(
                         tfdv.validate_statistics,
                         schema=schema,
                         previous_statistics=baseline_stats))

        _ = (anomalies
             | 'AlertIfAnomalies' >> beam.Map(_alert_if_anomalies,
                                              anomalies_output_path)
             | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                 file_path_prefix=anomalies_output_path,
                 shard_name_template='',
                 append_trailing_newlines=False))
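
A minimal invocation sketch for analyze_log_records above. The table name, model, version, paths, and runner options are hypothetical placeholders, and the schema is assumed to be stored as a pbtxt file (loaded here with tfdv.load_schema_text):

from datetime import datetime, timedelta

import tensorflow_data_validation as tfdv
from apache_beam.options.pipeline_options import PipelineOptions

# Hypothetical inputs; replace with real values.
schema = tfdv.load_schema_text('gs://my-bucket/monitoring/schema.pbtxt')
pipeline_options = PipelineOptions(['--runner=DirectRunner'])

analyze_log_records(
    request_response_log_table='my-project.logging.request_response_log',
    model='my_model',
    version='v1',
    start_time=datetime(2021, 1, 1, 0, 0),
    end_time=datetime(2021, 1, 2, 0, 0),
    output_path='gs://my-bucket/monitoring/2021-01-01',
    schema=schema,
    time_window=timedelta(hours=6),
    pipeline_options=pipeline_options)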
Example #13
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

import tensorflow_data_validation as tfdv

tfrecord_path = '/home/segmind/Desktop/test/tfdv/bikes_persons_dataset/dataset.tfrecord'

semantic_stats_options = tfdv.StatsOptions(enable_semantic_domain_stats=True)

#exit()

stats = tfdv.generate_statistics_from_tfrecord(
    data_location=tfrecord_path, stats_options=semantic_stats_options)

#print(stats)
tfdv.visualize_statistics(stats)
Example #14
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime

from tensorflow_data_validation.utils import slicing_util

data_location = '/home/jarekk/workspace/test.csv'
output_location = '/home/jarekk/workspace/stats.pb'

slice_fn = slicing_util.get_feature_value_slicer(
    features={'time_window': None})

stats_options = tfdv.StatsOptions(slice_functions=[slice_fn])

stats = tfdv.generate_statistics_from_csv(data_location,
                                          stats_options=stats_options,
                                          output_path=output_location)
Example #15
PATH_TO_WHL_FILE = './tfdv.whl'

COLUMNS = "FL_DATE, MKT_UNIQUE_CARRIER, ORIGIN_AIRPORT_ID, DEST_AIRPORT_ID, DEP_TIME, DEP_DELAY, ARR_DELAY, DISTANCE, dep_lat, dep_lng, arr_lat, arr_lng".split(
    ', ')
# Create and set your PipelineOptions.
options = PipelineOptions()

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = GCS_STAGING_LOCATION
google_cloud_options.temp_location = GCS_TMP_LOCATION
options.view_as(StandardOptions).runner = 'DataflowRunner'

setup_options = options.view_as(SetupOptions)
# PATH_TO_WHL_FILE should point to the downloaded tfdv wheel file.
# setup_options.extra_packages = [PATH_TO_WHL_FILE]
setup_options.setup_file = os.path.join(os.getcwd(), 'tfdv_setup.py')
stat_options = tfdv.StatsOptions(sample_rate=0.01)

tfdv.generate_statistics_from_csv(
    GCS_DATA_LOCATION,
    column_names=COLUMNS,
    output_path=GCS_STATS_OUTPUT_PATH,
    stats_options=stat_options,
    pipeline_options=options,
)
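
For a quick local test of the same statistics job, a DirectRunner variant might look like the sketch below; the local paths are hypothetical, and the Google Cloud options are not needed:

import tensorflow_data_validation as tfdv
from apache_beam.options.pipeline_options import PipelineOptions

# Hypothetical local paths; replace with real file locations.
local_options = PipelineOptions(['--runner=DirectRunner'])
stat_options = tfdv.StatsOptions(sample_rate=0.01)

tfdv.generate_statistics_from_csv(
    'data/flights.csv',
    column_names=COLUMNS,  # the column list defined above (CLOUMNS in the original)
    output_path='stats/flights_stats.tfrecord',
    stats_options=stat_options,
    pipeline_options=local_options,
)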
Example #16
def get_post_transform_stats_options() -> tfdv.StatsOptions:
  return (tfdv.StatsOptions() if _POST_TRANSFORM_STATS_OPTIONS is None
          else _POST_TRANSFORM_STATS_OPTIONS)

# Do not inline the goldens in _TEST_CASES. This way indentation is easier to
# manage. The rule is to have no first level indent for goldens.
_TEST_CASES = [
    dict(
        testcase_name='basic',
        stats_options=tfdv.StatsOptions(num_rank_histogram_buckets=3,
                                        num_values_histogram_buckets=3,
                                        num_histogram_buckets=3,
                                        num_quantiles_histogram_buckets=3,
                                        enable_semantic_domain_stats=True),
        expected_stats_pbtxt=_BASIC_GOLDEN_STATS,
        expected_inferred_schema_pbtxt=_BASIC_GOLDEN_INFERRED_SCHEMA,
        schema_for_validation_pbtxt=_BASIC_SCHEMA_FOR_VALIDATION,
        expected_anomalies_pbtxt=_BASIC_GOLDEN_ANOMALIES,
        expected_updated_schema_pbtxt=_BASIC_SCHEMA_FROM_UPDATE,
    ),
    dict(
        testcase_name='weight_and_label',
        stats_options=tfdv.StatsOptions(label_feature=_LABEL,
                                        weight_feature=_EXAMPLE_WEIGHT,
                                        num_rank_histogram_buckets=3,
                                        num_values_histogram_buckets=3,
                                        num_histogram_buckets=3,
Example #18
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor
from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor
from tfx.extensions.google_cloud_big_query.example_gen import component as big_query_example_gen_component  # pylint: disable=unused-import
from tfx.orchestration import pipeline
from tfx.proto import example_gen_pb2, pusher_pb2
from tfx.proto import trainer_pb2
from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
from tfx.utils.dsl_utils import external_input
import tensorflow_data_validation as tfdv

CURATED_SCHEMA_GEN = "gs://raw_data_layer/schema/schema.pbtxt"
STATS_OPTIONS = tfdv.StatsOptions(feature_whitelist=[
    'image/height', 'image/width', 'image/filename', 'image/source_id',
    'image/format'
])


def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,