Example #1
def test_analyze_log_records():

    request_response_log_table = 'data_validation.covertype_classifier_logs_tf'
    project_id = 'mlops-dev-env'
    model = 'covertype_tf'
    version = 'v3'

    baseline_stats = None
    output_path = 'gs://mlops-dev-workspace/drift_monitor/output/covertype_tf/test'
    start_time = datetime.datetime.fromisoformat('2020-05-25T16:01:10')
    end_time = datetime.datetime.fromisoformat('2020-05-25T22:50:30')

    time_window = datetime.timedelta(hours=1)

    schema_path = 'gs://mlops-dev-workspace/drift_monitor/schema/schema.pbtxt'
    schema = load_schema_text(schema_path)

    pipeline_options = PipelineOptions()
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = project_id

    logging.getLogger().setLevel(logging.INFO)

    analyze_log_records(request_response_log_table=request_response_log_table,
                        model=model,
                        version=version,
                        start_time=start_time,
                        end_time=end_time,
                        time_window=time_window,
                        output_path=output_path,
                        schema=schema,
                        baseline_stats=baseline_stats,
                        pipeline_options=pipeline_options)
Example #2
def load_embeddings(embedding_files_pattern, schema_file_path):

    embeddings = list()
    vocabulary = list()

    logging.info('Loading schema...')
    schema = tfdv.load_schema_text(schema_file_path)
    feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
    logging.info('Schema is loaded.')

    def _gzip_reader_fn(filenames):
        return tf.data.TFRecordDataset(filenames, compression_type='GZIP')

    dataset = tf.data.experimental.make_batched_features_dataset(
        embedding_files_pattern,
        batch_size=1,
        num_epochs=1,
        features=feature_spec,
        reader=_gzip_reader_fn,
        shuffle=False)

    # Read embeddings from tfrecord files.
    logging.info('Loading embeddings from files...')
    for tfrecord_batch in dataset:
        vocabulary.append(tfrecord_batch["item_Id"].numpy()[0][0].decode())
        embedding = tfrecord_batch["embedding"].numpy()[0]
        normalized_embedding = embedding / np.linalg.norm(embedding)
        embeddings.append(normalized_embedding)
    logging.info('Embeddings loaded.')
    embeddings = np.array(embeddings)

    return vocabulary, embeddings
Example #3
def export_serving_model(classifier, serving_model_dir, raw_schema_location,
                         tft_output_dir):

    raw_schema = tfdv.load_schema_text(raw_schema_location)
    raw_feature_spec = schema_utils.schema_as_feature_spec(
        raw_schema).feature_spec

    tft_output = tft.TFTransformOutput(tft_output_dir)

    features_input_signature = {
        feature_name: tf.TensorSpec(shape=(None, 1),
                                    dtype=spec.dtype,
                                    name=feature_name)
        for feature_name, spec in raw_feature_spec.items()
        if feature_name in features.FEATURE_NAMES
    }

    signatures = {
        "serving_default":
        _get_serve_features_fn(
            classifier,
            tft_output).get_concrete_function(features_input_signature),
        "serving_tf_example":
        _get_serve_tf_examples_fn(classifier, tft_output,
                                  raw_feature_spec).get_concrete_function(
                                      tf.TensorSpec(shape=[None],
                                                    dtype=tf.string,
                                                    name="examples")),
    }

    logging.info("Model export started...")
    classifier.save(serving_model_dir, signatures=signatures)
    logging.info("Model export completed.")
Example #4
    def __init__(self, embedding_files_prefix, schema_file_path, **kwargs):
        super(EmbeddingLookup, self).__init__(**kwargs)

        vocabulary = list()
        embeddings = list()

        logging.info('Loading schema...')
        schema = tfdv.load_schema_text(schema_file_path)
        feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
        logging.info('Schema is loaded.')

        def _gzip_reader_fn(filenames):
            return tf.data.TFRecordDataset(filenames, compression_type='GZIP')

        dataset = tf.data.experimental.make_batched_features_dataset(
            embedding_files_prefix,
            batch_size=1,
            num_epochs=1,
            features=feature_spec,
            reader=_gzip_reader_fn,
            shuffle=False)

        # Read embeddings from tfrecord files.
        logging.info('Loading embeddings from files ...')
        for tfrecord_batch in dataset:
            vocabulary.append(tfrecord_batch["item_Id"].numpy()[0][0].decode())
            embeddings.append(tfrecord_batch["embedding"].numpy()[0])
        logging.info('Embeddings loaded.')

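        # Append an all-zeros out-of-vocabulary (OOV) row; the lookup table
        # below maps unseen tokens to this last index.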
        embedding_size = len(embeddings[0])
        oov_embedding = np.zeros((1, embedding_size))
        self.embeddings = np.append(np.array(embeddings),
                                    oov_embedding,
                                    axis=0)
        logging.info(f'Embeddings: {self.embeddings.shape}')

        # Write vocabulary file.
        logging.info('Writing vocabulary to file ...')
        with open(VOCABULARY_FILE_NAME, 'w') as f:
            for item in vocabulary:
                f.write(f'{item}\n')
        logging.info(
            'Vocabulary file written and will be added as a model asset.')

        self.vocabulary_file = tf.saved_model.Asset(VOCABULARY_FILE_NAME)
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=vocabulary, values=list(range(len(vocabulary))))
        self.token_to_id = tf.lookup.StaticHashTable(
            initializer, default_value=len(vocabulary))
Example #5
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
    print('Validating schema against the computed statistics.')
    schema = tfdv.load_schema_text(schema_path)
    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)
    print('Detected following anomalies:')
    print(text_format.MessageToString(anomalies))

    print('Writing anomalies to anomalies path.')
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))
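
# A minimal usage sketch (not part of the original snippet); the GCS paths
# below are hypothetical placeholders.
validate_stats(
    stats_path='gs://my-bucket/stats/eval_stats.pb',
    schema_path='gs://my-bucket/schema/schema.pbtxt',
    anomalies_path='gs://my-bucket/anomalies/anomalies.txt')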
Example #6
  def display(self, artifact: types.Artifact):
    schema_path = os.path.join(artifact.uri, 'schema.pbtxt')
    schema = tfdv.load_schema_text(schema_path)
    tfdv.display_schema(schema)
Example #7
# In[ ]:

get_ipython().system('pip3 install tfx tensorflow-data-validation')

# In[ ]:

#tag::importTFDV[]
import tensorflow_data_validation as tfdv
#end::importTFDV[]

# You can download your schema by looking at the inputs/outputs in your pipeline run for the schema gen stage

# In[ ]:

#tag::displaySchema[]
schema = tfdv.load_schema_text("schema_info_2")
tfdv.display_schema(schema)
#end::displaySchema[]

# In[ ]:

#tag::loadTFT[]
tfx_transform = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/Transform/component.yaml")
#end::loadTFT[]

# In[ ]:

module_file = "gcs://"

# In[ ]:
Example #8
                        dest='schema_file',
                        help='A path to a schema file',
                        default='schema.pbtxt')

    known_args, pipeline_args = parser.parse_known_args()

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        GoogleCloudOptions
    ).staging_location = '%s/staging' % known_args.dataflow_gcs_location
    pipeline_options.view_as(
        GoogleCloudOptions
    ).temp_location = '%s/temp' % known_args.dataflow_gcs_location

    stats_options = tfdv.StatsOptions()
    schema = tfdv.load_schema_text(known_args.schema_file)

    anomalies_output_path = os.path.join(known_args.output_path, 'test.txt')
    stats_output_path = os.path.join(known_args.output_path, 'stats.pb')

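    # Fall back to TFDV's default input batch size when the configured value
    # is missing or not positive.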
    desired_batch_size = (stats_options.desired_batch_size
                          if stats_options.desired_batch_size
                          and stats_options.desired_batch_size > 0 else
                          tfdv.constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)

    instances = [{
        'f1': [1],
        'f2': [0.1],
        'f3': ['aaa']
    }, {
        'f1': [2],
Example #9
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFT Preprocessing."""

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_data_validation as tfdv

TARGET_FEATURE_NAME = 'income_bracket'
WEIGHT_FEATURE_NAME = 'fnlwgt'
RAW_SCHEMA_LOCATION = 'raw_schema/schema.pbtxt'

raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION)


def _prep(feature):
    #return tf.squeeze(feature, axis=1)
    return feature


def preprocessing_fn(input_features):

    processed_features = {}

    for feature in raw_schema.feature:

        # Pass the target feature as is.
        if feature.name in [TARGET_FEATURE_NAME, WEIGHT_FEATURE_NAME]:
Example #10
        raise ValueError("The end_time cannot be earlier than the start_time")

    time_window = None
    if known_args.time_window:
        if not re.fullmatch('[0-9]+[hm]', known_args.time_window):
            raise ValueError("Incorrect format for time_window")
        if known_args.time_window[-1] == 'h':
            time_window = datetime.timedelta(hours=int(known_args.time_window[0:-1]))
        else:
            time_window = datetime.timedelta(minutes=int(known_args.time_window[0:-1]))

    baseline_stats = None
    if known_args.baseline_stats_file:
        baseline_stats = load_statistics(known_args.baseline_stats_file)

    schema = load_schema_text(known_args.schema_file)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).setup_file = _SETUP_FILE

    logging.log(logging.INFO, "Starting the request-response log analysis pipeline...")
    analyze_log_records(
        request_response_log_table=known_args.request_response_log_table,
        model=known_args.model,
        version=known_args.version,
        start_time=start_time,
        end_time=end_time,
        output_path=known_args.output_path,
        schema=schema,
        baseline_stats=baseline_stats,
        time_window=time_window,
Example #11
def Balancer(examples: InputArtifact[Examples], schema: InputArtifact[Schema],
             statistics: InputArtifact[ExampleStatistics],
             balanced_examples: OutputArtifact[Examples],
             column: Parameter[str]) -> None:
    splits_list = artifact_utils.decode_split_names(
        split_names=examples.split_names)
    balanced_examples.split_names = artifact_utils.encode_split_names(
        splits=splits_list)
    for split in splits_list:
        raw_schema = tfdv.load_schema_text(
            f'{schema.uri}/schema.pbtxt')  # Avoid hardcoding these
        parsed_schema = tft.tf_metadata.schema_utils.schema_as_feature_spec(
            raw_schema).feature_spec
        uri = tfx.types.artifact_utils.get_split_uri([statistics], split)
        stats = tfdv.load_statistics(f'{uri}/stats_tfrecord')  # Same as above
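        # Scan the standard histogram for the target column to collect the
        # per-class sample counts and the total number of values in this split.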
        for dataset in stats.datasets:
            for feature in dataset.features:
                if feature.path.step == [column]:
                    for histogram in feature.num_stats.histograms:
                        if histogram.type == histogram.HistogramType.STANDARD:
                            print(histogram)
                            sample_counts = [
                                bucket.sample_count
                                for bucket in histogram.buckets
                            ]
                            original_size = feature.num_stats.common_stats.tot_num_values
        max_count = max(sample_counts)
        max_category = np.argmax(sample_counts)
        min_count = min(sample_counts)
        n_categories = len(sample_counts)
        print(
            f'Biggest category count: {max_count}, smallest category count: {min_count}'
        )
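        # Oversampling grows every class to the size of the largest one;
        # undersampling shrinks every class to the size of the smallest.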
        new_oversampled_size = int(max_count * n_categories)
        new_undersampled_size = int(min_count * n_categories)
        oversampled_size_increase = new_oversampled_size / original_size
        undersampled_size_decrease = new_undersampled_size / original_size

        def decode(record_bytes):
            return tf.io.parse_single_example(record_bytes, parsed_schema)

        uri = tfx.types.artifact_utils.get_split_uri([examples], split)
        dataset = tf.data.TFRecordDataset(
            tf.data.Dataset.list_files(f'{uri}/*'),  # Make smarter
            compression_type='GZIP').map(decode)
        targets_only = dataset.map(lambda x: tf.squeeze(x[column]))
        uniques = targets_only.apply(tf.data.experimental.unique())
        datasets = []
        for u in uniques:
            print(f'Filtering class {u}')
            datasets.append(
                dataset.filter(lambda x: tf.squeeze(x[column]) == u).repeat())
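        # Draw uniformly from the per-class datasets so every class is equally
        # represented in the resampled stream.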
        weights = np.ones(n_categories) / n_categories
        sampled = tf.data.experimental.sample_from_datasets(datasets, weights)
        if 'train' in split:  # In anticipation of name changes - TFX 0.30.0 uses 'Split-train'
            print(
                f'{split}: size increase from {original_size} to {new_oversampled_size} '
                f'({oversampled_size_increase:.1f} times)')
            sampled = sampled.take(new_oversampled_size)
        else:
            print(
                f'{split}: size decrease from {original_size} to {new_undersampled_size} '
                f'({100*undersampled_size_decrease:.1f}%)')
            sampled = sampled.take(new_undersampled_size)

        def _float_feature(value):
            return tf.train.Feature(float_list=tf.train.FloatList(
                value=[value]))

        def _int64_feature(value):
            return tf.train.Feature(int64_list=tf.train.Int64List(
                value=[value]))

        func_mapper = {tf.int64: _int64_feature, tf.float32: _float_feature}
        # Pin down the key ordering explicitly, since each element is presented
        # as a freshly built dict.
        keys = parsed_schema.keys()

        def serialize(*args):
            feature = {
                key: func_mapper[tensor.dtype](tensor.numpy())
                for key, tensor in zip(keys, args)
            }
            example_proto = tf.train.Example(features=tf.train.Features(
                feature=feature))
            return example_proto.SerializeToString()

        def tf_serialize(x):
            tensors = [x[key] for key in keys]
            return tf.py_function(serialize, tensors, tf.string)

        sampled = sampled.map(tf_serialize)
        # Shard
        uri = tfx.types.artifact_utils.get_split_uri([balanced_examples],
                                                     split)
        path = f'{uri}/balanced.tfrecord'
        writer = tf.data.experimental.TFRecordWriter(path)
        writer.write(sampled)
        print(f'Balanced files for split {split} written to {path}')