예제 #1
0
  def display_stats_for_examples(self, examples_id, split='train'):
    """Render TFDV statistics for the given examples artifact.

    Args:
      examples_id: An `int` id of a `TFXArtifactTypes.EXAMPLES` artifact.
      split: Name of the split whose statistics are shown; defaults to 'train'.
    """
    artifact = self.get_dest_artifact_of_type(
        examples_id, TFXArtifactTypes.EXAMPLE_STATS)
    if not artifact:
      # No downstream statistics artifact found; nothing to display.
      return
    stats_path = os.path.join(artifact.uri, split, 'stats_tfrecord')
    stats = tfdv.load_statistics(stats_path)
    tfdv.visualize_statistics(stats)
예제 #2
0
    def display_stats_for_examples(self, examples_id):
        """Displays stats and an inferred schema for `examples_id`.

        Args:
          examples_id: An `int` id of a `TFXArtifactTypes.EXAMPLES`
            artifact.
        """
        stats_artifact = self.get_dest_artifact_of_type(
            examples_id, TFXArtifactTypes.EXAMPLE_STATS)
        if stats_artifact:
            # Load the statistics once and reuse them for both the
            # visualization and the schema inference (the original loaded
            # the same file twice).
            stats = tfdv.load_statistics(
                os.path.join(stats_artifact.uri, 'stats_tfrecord'))
            tfdv.visualize_statistics(stats)
            # Fixed typo in the original message ("shema" -> "schema").
            print("display schema")
            tfdv.display_schema(tfdv.infer_schema(statistics=stats))
예제 #3
0
# %%
# Alternative path: compute statistics straight from the CSV file on disk.
#train_stats = tfdv.generate_statistics_from_csv(data_location=full_path_to_train)

# %%
# Compute descriptive statistics over the `train` DataFrame.
# NOTE(review): assumes `train` is a pandas DataFrame defined in an earlier
# cell — confirm.
train_stats = tfdv.generate_statistics_from_dataframe(train)

# %%


# %%
'''
## visualize statistics of train data
'''

# %%
# Render the computed statistics in the Facets Overview widget.
tfdv.visualize_statistics(train_stats)

# %%


# %%
# Infer a schema (feature types, domains, presence) from the statistics.
schema = tfdv.infer_schema(statistics=train_stats)

# %%


# %%
# Display the inferred schema as a table.
tfdv.display_schema(schema=schema)

# %%
'''
예제 #4
0
 def display(self, artifact: types.Artifact):
   """Load and render the TFDV statistics stored under `artifact.uri`."""
   stats = tfdv.load_statistics(os.path.join(artifact.uri, 'stats_tfrecord'))
   tfdv.visualize_statistics(stats)
예제 #5
0
#
# The following cell may take 2–3 minutes to run. Please ignore the deprecation warnings.
#
# **NOTE:**  Please re-run the below cell if you are not getting the TensorFlow Data Validation widget in the output.

# In[4]:

# TODO 2

# Compute statistics with TFDV over the TFRecord training data.
# The returned value is a DatasetFeatureStatisticsList protocol buffer.
stats = tfdv.generate_statistics_from_tfrecord(data_location=train_tf_file)

# TODO 2a

# Render the statistics with the Facets Overview widget.
tfdv.visualize_statistics(stats)

# ### TODO 3:  Use the TensorFlow Data Validation widget above to answer the following questions.

# #### **1. How many total examples are in the training dataset?**

# #### Solution
#
# See below solution.
#

# **There are 1.08 million total examples in the training dataset.**
#
#   The count column tells us how many examples there are for a given feature.  Each feature (`sexual_orientation`, `comment_text`, `gender`, etc.) has 1.08 million examples. The missing column tells us what percentage of examples are missing that feature.
#
# ![Screenshot of first row of Categorical Features table in the TFDV widget, with 1.08 million count of examples and 0% missing examples highlighted](https://developers.google.com/machine-learning/practica/fairness-indicators/colab-images/tfdv_screenshot_exercise1.png)
예제 #6
0
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow_data_validation as tfdv

# The following variables are provided through dependency injection. These
# variables come from the specified input path and arguments provided by the
# API post request.
#
# source

# Compute dataset statistics from the injected CSV location.
# `source` is injected at runtime (see note above) — hence the noqa marker.
train_stats = tfdv.generate_statistics_from_csv(
    data_location=source)  # noqa: F821

# Render the statistics with the Facets Overview widget.
tfdv.visualize_statistics(train_stats)
예제 #7
0
import tensorflow_data_validation as tfdv

# Report the installed TFDV version for reproducibility.
print(f'TFDV version: {tfdv.version.__version__}')


# NOTE(review): assumes `Path` (pathlib) is imported earlier in the file —
# confirm.
data_folder = Path("../dataset")
# below paths should be relative to data_folder
users_file_glob = "AllUsers.csv" 
ads_file_glob = "AllAds.csv"
users_ads_ratings = "users-ads-without-gcp-ratings.csv"


# Compute statistics over all CSV files matching the users glob pattern.
users_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{users_file_glob}").as_posix())


# Render the users statistics in the Facets Overview widget.
tfdv.visualize_statistics(users_stats)


# Infer and display a schema (feature types, domains) for the users data.
user_schema = tfdv.infer_schema(statistics=users_stats)
tfdv.display_schema(schema=user_schema)


# Repeat the same pipeline for the ads data.
ads_stats = tfdv.generate_statistics_from_csv((data_folder/f"*{ads_file_glob}").as_posix())


tfdv.visualize_statistics(ads_stats)


ads_schema = tfdv.infer_schema(statistics=ads_stats)
tfdv.display_schema(schema=ads_schema)
예제 #8
0
# Report the installed TFDV version for reproducibility.
print('TFDV version: {}'.format(tfdv.version.__version__))

"""### Compute and visualize statistics

TFDV can compute descriptive statistics providing a quick overview of the data in terms of the features, shapes and distribution of the values.

First use tfdv.generate_statistics_from_csv to compute statistics for train data, TFDV can compute descriptive statistics, generate_statistics_from_csv works specifying a file pattern. The pattern can contain glob characters  (*, ?, [...] and sets).
"""

# Compute descriptive statistics for the employees training CSV.
train_stats = tfdv.generate_statistics_from_csv(
    data_location=os.path.join(DATA_DIR, 'data_employees.csv'))

# Render the statistics in the Facets Overview widget.
tfdv.visualize_statistics(train_stats)

"""it is possible to check the distribution of each variable, categorical and numeric features are show separated,
<br>
The first thing it is possible to note the TERMINATED samples are very few compared with the ACTIVE, the class is very unbalanced
<br>
No missing data was found

### Evaluation Data

After check the training data, it will be compared with the test data,  if the data is different between the test and train data it will have problems when using the model for prediction.
"""

# Compute descriptive statistics for the evaluation CSV, to be compared
# against the training statistics above.
test_stats = tfdv.generate_statistics_from_csv(
    data_location=os.path.join(DATA_DIR, 'eval_set.csv'))
예제 #9
0
                                                )
    tfdv.display_anomalies(drift_anomalies)
    return drift_anomalies


# %%
if __name__ == '__main__':
    # Split the raw data file into train/validation sets.
    train_val_split(file_path = config.DATA_FILE_PATH)

# %%
    # Generate stats + schema for each split, then compare them side by side
    # in one widget (validation on the left, training on the right).
    # data_stats, data_schema = csv_statistics_generator(file_path = config.DATA_FILE_PATH)
    train_stats, train_schema = csv_statistics_generator(file_path = config.TRAIN_FILE_PATH)
    val_stats, val_schema = csv_statistics_generator(file_path = config.VAL_FILE_PATH)
    tfdv.visualize_statistics(lhs_statistics = val_stats, rhs_statistics=train_stats,
                            lhs_name = 'VAL_DATASET', rhs_name = 'TRAIN_DATASET')    

# %%
    # Validate each split's statistics against its own inferred schema.
    train_anomalies = csv_statistics_validator(stats = train_stats, schema = train_schema)
    val_anomalies = csv_statistics_validator(stats = val_stats, schema = val_schema)


# %%
    # Check distribution skew for the 'company' feature between train and
    # validation (the validation stats stand in for serving stats here).
    skew_anomalies = tfdv_skew_validator(feature_name = 'company',
                                        train_stats = train_stats,
                                        serve_stats = val_stats,
                                        schema = train_schema,
                                        threshold = 0.01)

# %%