def test_analyze_log_records():
    request_response_log_table = 'data_validation.covertype_classifier_logs_tf'
    project_id = 'mlops-dev-env'
    model = 'covertype_tf'
    version = 'v3'
    baseline_stats = None
    output_path = 'gs://mlops-dev-workspace/drift_monitor/output/covertype_tf/test'
    start_time = datetime.datetime.fromisoformat('2020-05-25T16:01:10')
    end_time = datetime.datetime.fromisoformat('2020-05-25T22:50:30')
    time_window = datetime.timedelta(hours=1)
    schema_path = 'gs://mlops-dev-workspace/drift_monitor/schema/schema.pbtxt'
    schema = load_schema_text(schema_path)

    pipeline_options = PipelineOptions()
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = project_id

    logging.getLogger().setLevel(logging.INFO)

    analyze_log_records(
        request_response_log_table=request_response_log_table,
        model=model,
        version=version,
        start_time=start_time,
        end_time=end_time,
        time_window=time_window,
        output_path=output_path,
        schema=schema,
        baseline_stats=baseline_stats,
        pipeline_options=pipeline_options)

def load_embeddings(embedding_files_pattern, schema_file_path):
    embeddings = list()
    vocabulary = list()

    logging.info('Loading schema...')
    schema = tfdv.load_schema_text(schema_file_path)
    feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
    logging.info('Schema is loaded.')

    def _gzip_reader_fn(filenames):
        return tf.data.TFRecordDataset(filenames, compression_type='GZIP')

    dataset = tf.data.experimental.make_batched_features_dataset(
        embedding_files_pattern,
        batch_size=1,
        num_epochs=1,
        features=feature_spec,
        reader=_gzip_reader_fn,
        shuffle=False)

    # Read embeddings from tfrecord files.
    logging.info('Loading embeddings from files...')
    for tfrecord_batch in dataset:
        vocabulary.append(tfrecord_batch["item_Id"].numpy()[0][0].decode())
        embedding = tfrecord_batch["embedding"].numpy()[0]
        normalized_embedding = embedding / np.linalg.norm(embedding)
        embeddings.append(normalized_embedding)
    logging.info('Embeddings loaded.')

    embeddings = np.array(embeddings)
    return vocabulary, embeddings

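# A minimal usage sketch for load_embeddings (not part of the original module);
# the GCS paths below are hypothetical placeholders, and the TFRecord files are
# assumed to contain the "item_Id" and "embedding" features described by the schema.
vocabulary, embeddings = load_embeddings(
    embedding_files_pattern='gs://my-bucket/embeddings/embeddings-*.tfrecord.gz',
    schema_file_path='gs://my-bucket/embeddings/schema.pbtxt')
# Each row is L2-normalized, so a dot product between rows is a cosine similarity.
print(f'Loaded {len(vocabulary)} embeddings of dimension {embeddings.shape[1]}')
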
def export_serving_model(classifier, serving_model_dir, raw_schema_location,
                         tft_output_dir):
    raw_schema = tfdv.load_schema_text(raw_schema_location)
    raw_feature_spec = schema_utils.schema_as_feature_spec(
        raw_schema).feature_spec

    tft_output = tft.TFTransformOutput(tft_output_dir)

    features_input_signature = {
        feature_name: tf.TensorSpec(
            shape=(None, 1), dtype=spec.dtype, name=feature_name)
        for feature_name, spec in raw_feature_spec.items()
        if feature_name in features.FEATURE_NAMES
    }

    signatures = {
        "serving_default": _get_serve_features_fn(
            classifier, tft_output).get_concrete_function(
                features_input_signature),
        "serving_tf_example": _get_serve_tf_examples_fn(
            classifier, tft_output, raw_feature_spec).get_concrete_function(
                tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")),
    }

    logging.info("Model export started...")
    classifier.save(serving_model_dir, signatures=signatures)
    logging.info("Model export completed.")

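# A hedged usage sketch (not from the original source): `classifier` is assumed to
# be a trained tf.keras.Model, and the GCS paths are hypothetical placeholders.
export_serving_model(
    classifier=classifier,
    serving_model_dir='gs://my-bucket/serving_model',
    raw_schema_location='gs://my-bucket/raw_schema/schema.pbtxt',
    tft_output_dir='gs://my-bucket/transform_output')
# The exported SavedModel carries two signatures: 'serving_default' for parsed
# feature tensors and 'serving_tf_example' for serialized tf.Example strings.
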
def __init__(self, embedding_files_prefix, schema_file_path, **kwargs):
    super(EmbeddingLookup, self).__init__(**kwargs)

    vocabulary = list()
    embeddings = list()

    logging.info('Loading schema...')
    schema = tfdv.load_schema_text(schema_file_path)
    feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
    logging.info('Schema is loaded.')

    def _gzip_reader_fn(filenames):
        return tf.data.TFRecordDataset(filenames, compression_type='GZIP')

    dataset = tf.data.experimental.make_batched_features_dataset(
        embedding_files_prefix,
        batch_size=1,
        num_epochs=1,
        features=feature_spec,
        reader=_gzip_reader_fn,
        shuffle=False)

    # Read embeddings from tfrecord files.
    logging.info('Loading embeddings from files...')
    for tfrecord_batch in dataset:
        vocabulary.append(tfrecord_batch["item_Id"].numpy()[0][0].decode())
        embeddings.append(tfrecord_batch["embedding"].numpy()[0])
    logging.info('Embeddings loaded.')

    embedding_size = len(embeddings[0])
    oov_embedding = np.zeros((1, embedding_size))
    self.embeddings = np.append(np.array(embeddings), oov_embedding, axis=0)
    logging.info(f'Embeddings: {self.embeddings.shape}')

    # Write vocabulary file.
    logging.info('Writing vocabulary to file...')
    with open(VOCABULARY_FILE_NAME, 'w') as f:
        for item in vocabulary:
            f.write(f'{item}\n')
    logging.info('Vocabulary file written and will be added as a model asset.')

    self.vocabulary_file = tf.saved_model.Asset(VOCABULARY_FILE_NAME)
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=vocabulary, values=list(range(len(vocabulary))))
    self.token_to_id = tf.lookup.StaticHashTable(
        initializer, default_value=len(vocabulary))

def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

    Args:
      stats_path: Location of the statistics to validate.
      schema_path: Location of the schema to be used for validation.
      anomalies_path: Location where the detected anomalies are materialized.
    """
    print('Validating schema against the computed statistics.')
    schema = tfdv.load_schema_text(schema_path)
    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)
    print('Detected the following anomalies:')
    print(text_format.MessageToString(anomalies))

    print('Writing anomalies to anomalies path.')
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))

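# A usage sketch with hypothetical placeholder paths (not from the original source):
# validate previously computed statistics against a curated schema and write the
# Anomalies proto as text to the given output location.
validate_stats(
    stats_path='gs://my-bucket/eval/stats.pb',
    schema_path='gs://my-bucket/schema/schema.pbtxt',
    anomalies_path='gs://my-bucket/validation/anomalies.pbtxt')
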
def display(self, artifact: types.Artifact):
    schema_path = os.path.join(artifact.uri, 'schema.pbtxt')
    schema = tfdv.load_schema_text(schema_path)
    tfdv.display_schema(schema)

# In[ ]:

get_ipython().system('pip3 install tfx tensorflow-data-validation')

# In[ ]:

#tag::importTFDV[]
import tensorflow_data_validation as tfdv
#end::importTFDV[]

# You can download your schema by looking at the inputs/outputs in your
# pipeline run for the schema gen stage.

# In[ ]:

#tag::displaySchema[]
schema = tfdv.load_schema_text("schema_info_2")
tfdv.display_schema(schema)
#end::displaySchema[]

# In[ ]:

#tag::loadTFT[]
tfx_transform = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/Transform/component.yaml")
#end::loadTFT[]

# In[ ]:

module_file = "gcs://"

# In[ ]:

    dest='schema_file',
    help='A path to a schema file',
    default='schema.pbtxt')

known_args, pipeline_args = parser.parse_known_args()

pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(
    GoogleCloudOptions
).staging_location = '%s/staging' % known_args.dataflow_gcs_location
pipeline_options.view_as(
    GoogleCloudOptions
).temp_location = '%s/temp' % known_args.dataflow_gcs_location

stats_options = stats_options.StatsOptions()
schema = tfdv.load_schema_text(known_args.schema_file)

anomalies_output_path = os.path.join(known_args.output_path, 'test.txt')
stats_output_path = os.path.join(known_args.output_path, 'stats.pb')

desired_batch_size = (
    stats_options.desired_batch_size
    if stats_options.desired_batch_size and stats_options.desired_batch_size > 0
    else tfdv.constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)

instances = [{
    'f1': [1],
    'f2': [0.1],
    'f3': ['aaa']
}, {
    'f1': [2],
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFT Preprocessing."""

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_data_validation as tfdv

TARGET_FEATURE_NAME = 'income_bracket'
WEIGHT_FEATURE_NAME = 'fnlwgt'
RAW_SCHEMA_LOCATION = 'raw_schema/schema.pbtxt'

raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION)


def _prep(feature):
    # return tf.squeeze(feature, axis=1)
    return feature


def preprocessing_fn(input_features):
    processed_features = {}
    for feature in raw_schema.feature:
        # Pass the target and weight features through as is.
        if feature.name in [TARGET_FEATURE_NAME, WEIGHT_FEATURE_NAME]:
raise ValueError("The end_time cannot be earlier than the start_time") time_window=None if known_args.time_window: if not re.fullmatch('[0-9]+[hm]', known_args.time_window): raise ValueError("Incorrect format for time_window") if known_args.time_window[-1]=='h': time_window = datetime.timedelta(hours=int(known_args.time_window[0:-1])) else: time_window = datetime.timedelta(minutes=int(known_args.time_window[0:-1])) baseline_stats = None if known_args.baseline_stats_file: baseline_stats = load_statistics(known_args.baseline_stats_file) schema = load_schema_text(known_args.schema_file) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).setup_file = _SETUP_FILE logging.log(logging.INFO, "Starting the request-response log analysis pipeline...") analyze_log_records( request_response_log_table=known_args.request_response_log_table, model=known_args.model, version=known_args.version, start_time=start_time, end_time=end_time, output_path=known_args.output_path, schema=schema, baseline_stats=baseline_stats, time_window=time_window,
def Balancer(examples: InputArtifact[Examples],
             schema: InputArtifact[Schema],
             statistics: InputArtifact[ExampleStatistics],
             balanced_examples: OutputArtifact[Examples],
             column: Parameter[str]) -> None:

    splits_list = artifact_utils.decode_split_names(
        split_names=examples.split_names)
    balanced_examples.split_names = artifact_utils.encode_split_names(
        splits=splits_list)

    for split in splits_list:
        raw_schema = tfdv.load_schema_text(
            f'{schema.uri}/schema.pbtxt')  # Avoid hardcoding these
        parsed_schema = tft.tf_metadata.schema_utils.schema_as_feature_spec(
            raw_schema).feature_spec

        uri = tfx.types.artifact_utils.get_split_uri([statistics], split)
        stats = tfdv.load_statistics(f'{uri}/stats_tfrecord')  # Same as above

        for dataset in stats.datasets:
            for feature in dataset.features:
                if feature.path.step == [column]:
                    for histogram in feature.num_stats.histograms:
                        if histogram.type == histogram.HistogramType.STANDARD:
                            print(histogram)
                            sample_counts = [
                                bucket.sample_count
                                for bucket in histogram.buckets
                            ]
                            original_size = feature.num_stats.common_stats.tot_num_values
                            max_count = max(sample_counts)
                            max_category = np.argmax(sample_counts)
                            min_count = min(sample_counts)
                            n_categories = len(sample_counts)
                            print(
                                f'Biggest category count: {max_count}, smallest category count: {min_count}'
                            )
                            new_oversampled_size = int(max_count * n_categories)
                            new_undersampled_size = int(min_count * n_categories)
                            oversampled_size_increase = new_oversampled_size / original_size
                            undersampled_size_decrease = new_undersampled_size / original_size

        def decode(record_bytes):
            return tf.io.parse_single_example(record_bytes, parsed_schema)

        uri = tfx.types.artifact_utils.get_split_uri([examples], split)
        dataset = tf.data.TFRecordDataset(
            tf.data.Dataset.list_files(f'{uri}/*'),  # Make smarter
            compression_type='GZIP').map(decode)

        targets_only = dataset.map(lambda x: tf.squeeze(x[column]))
        uniques = targets_only.apply(tf.data.experimental.unique())

        datasets = []
        for u in uniques:
            print(f'Filtering class {u}')
            datasets.append(
                dataset.filter(lambda x: tf.squeeze(x[column]) == u).repeat())

        weights = np.ones(n_categories) / n_categories
        sampled = tf.data.experimental.sample_from_datasets(datasets, weights)

        if 'train' in split:  # In anticipation of name changes - TFX 0.30.0 uses 'Split-train'
            print(
                f'{split}: size increase from {original_size} to {new_oversampled_size} '
                f'({oversampled_size_increase:.1f} times)')
            sampled = sampled.take(new_oversampled_size)
        else:
            print(
                f'{split}: size decrease from {original_size} to {new_undersampled_size} '
                f'({100*undersampled_size_decrease:.1f}%)')
            sampled = sampled.take(new_undersampled_size)

        def _float_feature(value):
            return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

        def _int64_feature(value):
            return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

        func_mapper = {tf.int64: _int64_feature, tf.float32: _float_feature}

        # To make absolutely sure of the ordering, since new dicts are presented
        # all the time.
        keys = parsed_schema.keys()

        def serialize(*args):
            feature = {
                key: func_mapper[tensor.dtype](tensor.numpy())
                for key, tensor in zip(keys, args)
            }
            example_proto = tf.train.Example(features=tf.train.Features(
                feature=feature))
            return example_proto.SerializeToString()

        def tf_serialize(x):
            tensors = [x[key] for key in keys]
            return tf.py_function(serialize, tensors, tf.string)

        sampled = sampled.map(tf_serialize)

        # Shard
        uri = tfx.types.artifact_utils.get_split_uri([balanced_examples], split)
        path = f'{uri}/balanced.tfrecord'
        writer = tf.data.experimental.TFRecordWriter(path)
        writer.write(sampled)
        print(f'Balanced files for split {split} written to {path}')

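# A hedged sketch of how this component might be wired into a pipeline, assuming
# Balancer is wrapped with TFX's @component decorator for Python-function components;
# the upstream component names (example_gen, schema_gen, statistics_gen) and the
# column value are hypothetical.
balancer = Balancer(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    statistics=statistics_gen.outputs['statistics'],
    column='label')
# balancer.outputs['balanced_examples'] can then feed a downstream Trainer.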