def _example_serving_receiver_fn(
    tf_transform_output: tft.TFTransformOutput,
    schema: schema_pb2.Schema,
    label_key: Text) -> tf.estimator.export.ServingInputReceiver:
  """Build the serving inputs.

  Parameters
  ----------
  tf_transform_output: A TFTransformOutput.
  schema: the schema of the input data.
  label_key: name of the label feature, which is dropped from the inputs.

  Returns
  -------
  A ServingInputReceiver that applies tf-transform to the raw input
  placeholders.
  """
  # Pull out the feature spec and throw out the label.
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(label_key)

  # Define the raw inputs taken from the user.
  receiver_tensors = {}
  for key in raw_feature_spec:
    absl.logging.info("KEY {}".format(key))
    dtype = raw_feature_spec[key].dtype
    receiver_tensors[key] = tf.compat.v1.placeholder(
        dtype=dtype, shape=[None], name='input_' + key)

  # Define the inputs into the graph as rank-2 SparseTensors, which is the
  # shape the transform graph expects.
  features = {}
  for key in receiver_tensors:
    batch_size = tf.shape(receiver_tensors[key])[0]
    indices = tf.cast(tf.expand_dims(tf.range(batch_size), -1), tf.int64)
    zeros = tf.zeros_like(indices)
    indices = tf.concat([indices, zeros], axis=1)
    features[key] = tf.SparseTensor(
        indices=indices,
        values=receiver_tensors[key],
        dense_shape=[batch_size, 1])

  # Transform the features.
  transformed_features = tf_transform_output.transform_raw_features(features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, receiver_tensors)
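# A minimal usage sketch, not part of the original source: with an
# Estimator-based trainer, the receiver function above would typically be
# bound with functools.partial and handed to export_saved_model. The
# `estimator` and `serving_model_dir` names here are assumptions standing in
# for whatever the surrounding trainer code defines.
import functools


def _export_serving_model(estimator, serving_model_dir, tf_transform_output,
                          schema, label_key):
  # Export a SavedModel whose serving graph applies tf-transform to raw
  # placeholder inputs before running the model.
  return estimator.export_saved_model(
      serving_model_dir,
      serving_input_receiver_fn=functools.partial(
          _example_serving_receiver_fn, tf_transform_output, schema,
          label_key))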
def _input_fn(
    file_pattern: Text,
    tf_transform_output: tft.TFTransformOutput,
    batch_size: int = 2) -> tf.data.Dataset:
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      # Shuffle must be False: zipping two independently shuffled copies of
      # the dataset would misalign features and labels.
      shuffle=False,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn)

  train_dataset = dataset.map(create_training_data)
  label_dataset = dataset.map(create_label_data)
  dataset = tf.data.Dataset.zip((train_dataset, label_dataset))
  return dataset
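# A sketch under assumptions (not from the original source): how the zipped
# (features, labels) dataset above might be inspected in eager mode.
# `create_training_data` and `create_label_data` are assumed to be defined
# alongside _input_fn.
def _peek_first_batch(file_pattern, tf_transform_output):
  dataset = _input_fn(file_pattern, tf_transform_output, batch_size=2)
  # Pull a single (features, labels) batch from the pipeline.
  features, labels = next(iter(dataset))
  return features, labels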
def make_serving_signatures(model,
                            tf_transform_features: tft.TFTransformOutput,
                            serving_batch_size: Optional[int] = None):
  """Returns the serving signatures.

  Args:
    model: the model function to apply to the transformed features.
    tf_transform_features: The transformation to apply to the serialized
      tf.Example.
    serving_batch_size: an optional specification for a concrete serving
      batch size.

  Returns:
    The signatures to use for saving the model. The 'serving_default'
    signature will be a concrete function that takes a serialized tf.Example,
    parses it, transforms the features and then applies the model.
  """
  model.tft_layer = tf_transform_features.transform_features_layer()

  @tf.function
  def serve_tf_examples_fn(serialized_tf_examples):
    """Returns the output to be used in the serving signature."""
    feature_spec = tf_transform_features.raw_feature_spec()
    feature_spec.pop(_LABEL_KEY)
    parsed_features = tf.io.parse_example(serialized_tf_examples,
                                          feature_spec)
    transformed_features = model.tft_layer(parsed_features)
    return model(transformed_features)

  return {
      'serving_default':
          serve_tf_examples_fn.get_concrete_function(
              tf.TensorSpec(
                  shape=[serving_batch_size], dtype=tf.string,
                  name='examples'))
  }
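# A hedged usage sketch: in a Keras trainer the signatures returned above are
# typically passed to model.save() so that the exported SavedModel embeds the
# tf-transform preprocessing. `serving_model_dir` is an assumed path.
def _save_model_with_signatures(model, tf_transform_output,
                                serving_model_dir):
  model.save(
      serving_model_dir,
      save_format='tf',
      signatures=make_serving_signatures(model, tf_transform_output))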
def input_fn(self, file_pattern: List[Text],
             tf_transform_output: tft.TFTransformOutput):
  xf_feature_spec = tf_transform_output.transformed_feature_spec()
  # Keep only the transformed features (suffixed with '_xf').
  xf_feature_spec = {
      x: xf_feature_spec[x] for x in xf_feature_spec if x.endswith('_xf')
  }

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=self.batch_size,
      features=xf_feature_spec,
      reader=self._gzip_reader_fn,
      num_epochs=self.epochs)

  def split_inputs_labels(x):
    inputs = {}
    labels = {}
    for e in x:
      if not e.startswith('label'):
        inputs[e] = x[e]
      else:
        labels[e] = x[e]
    # Strip the 'label_' prefix and '_xf' suffix from the label names.
    labels = {
        label[len('label_'):-len('_xf')]: labels[label]
        for label in labels.keys()
    }
    return inputs, labels

  dataset = dataset.map(split_inputs_labels)
  # Drop the labels, keeping only the inputs.
  dataset = dataset.map(lambda x, y: x)
  return dataset
def input_fn(self, file_pattern: List[Text],
             tf_transform_output: tft.TFTransformOutput):
  """Feedforward input_fn for loading data from TFRecords saved to a
  location on disk.

  Args:
    file_pattern: File pattern matching saved TFRecords on disk.
    tf_transform_output: Output of the preceding Transform /
      Preprocessing component.

  Returns:
    dataset: tf.data.Dataset created out of the input files.
  """
  xf_feature_spec = tf_transform_output.transformed_feature_spec()

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=self.batch_size,
      features=xf_feature_spec,
      reader=self._gzip_reader_fn,
      num_epochs=1,
      drop_final_batch=True)

  def split_columns(x):
    inputs = {}
    labels = {}
    for e in x:
      if not naming_utils.check_if_transformed_label(e):
        inputs[e] = x[e]
      else:
        labels[e] = x[e]
    return inputs, labels

  dataset = dataset.map(split_columns)
  return dataset
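# A sketch under assumptions: the (inputs, labels) dataset above can be fed
# directly to tf.keras.Model.fit, since Keras accepts a tf.data.Dataset of
# (features_dict, labels_dict) pairs. `self.model` is a hypothetical
# attribute of the surrounding trainer class, not from the original source.
def train(self, file_pattern: List[Text],
          tf_transform_output: tft.TFTransformOutput):
  dataset = self.input_fn(file_pattern, tf_transform_output)
  return self.model.fit(dataset)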
def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              label_key: str,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    label_key: label key.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
    dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(label_key))

  # If the input dataset is file-based but the number of files is less than
  # the number of workers, an error will be raised. Turn off the auto shard
  # policy here so that the dataset is sharded by data instead of by file.
  options = tf.data.Options()
  options.experimental_distribute.auto_shard_policy = (
      tf.data.experimental.AutoShardPolicy.DATA)
  dataset = dataset.with_options(options)

  return dataset
def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
    dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(_LABEL_KEY))

  return dataset
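# Minimal sketch (assumed file lists and batch size, not from the original
# source): building separate train and eval datasets with the helper above,
# as a TFX trainer's run_fn typically does.
def _make_datasets(train_files, eval_files, tf_transform_output):
  train_dataset = _input_fn(train_files, tf_transform_output, batch_size=40)
  eval_dataset = _input_fn(eval_files, tf_transform_output, batch_size=40)
  return train_dataset, eval_dataset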
if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('--tfrecord-file', dest='tfrecord_file', required=True)
  parser.add_argument('--tft-artifacts-dir', dest='tft_artifacts_dir',
                      required=True)
  args = parser.parse_args()

  tfrecords_list = utils.list_tfrecords(args.tfrecord_file)
  tft_metadata_dir = os.path.join(args.tft_artifacts_dir,
                                  transform_fn_io.TRANSFORM_FN_DIR)
  tft_metadata = TFTransformOutput(args.tft_artifacts_dir)

  input_fn_op = input_fn(tfrecords_list, tft_metadata, 1)
  input_fn_next = input_fn_op.make_one_shot_iterator().get_next()

  data_dict = {'community_area_code': [], 'target': []}
  stop = False
  with tf.Session() as sess:
    while not stop:
      try:
        batch_X, batch_Y = sess.run(input_fn_next)
        # Log any feature batch that contains NaN values.
        for k in batch_X.keys():
          if np.any(np.isnan(batch_X[k])):
            logger.info("{} {}".format(k, batch_X[k]))
      except tf.errors.OutOfRangeError:
        # The one-shot iterator is exhausted.
        stop = True
def get_deep_and_wide_columns(tft_transform_dir, embedding_size=8):
  """Creates deep and wide feature_column lists.

  Args:
    tft_transform_dir: (str), directory in which the tf-transform model was
      written during the preprocessing step.
    embedding_size: (int), the number of dimensions used to represent
      categorical features when providing them as inputs to the DNN.

  Returns:
    [tf.feature_column], [tf.feature_column]: deep and wide feature_column
    lists.
  """
  tft_output = TFTransformOutput(tft_transform_dir)
  transformed_feature_spec = tft_output.transformed_feature_spec()
  transformed_feature_spec.pop(
      my_metadata.transformed_name(my_metadata.LABEL_KEY))

  deep_columns = {}
  wide_columns = {}
  for transformed_key, tensor in transformed_feature_spec.items():
    # Separate features into deep and wide columns.
    if transformed_key in my_metadata.transformed_names(
        my_metadata.VOCAB_FEATURE_KEYS):
      if transformed_key not in my_metadata.transformed_names(
          my_metadata.CATEGORICAL_FEATURE_KEYS_TO_BE_REMOVED):
        wide_columns[transformed_key] = (
            tf.feature_column.categorical_column_with_identity(
                key=transformed_key,
                num_buckets=tft_output.vocabulary_size_by_name(
                    transformed_key) + my_metadata.OOV_SIZE))
    elif transformed_key in my_metadata.transformed_names(
        my_metadata.HASH_STRING_FEATURE_KEYS):
      if transformed_key not in my_metadata.transformed_names(
          my_metadata.CATEGORICAL_FEATURE_KEYS_TO_BE_REMOVED):
        wide_columns[transformed_key] = (
            tf.feature_column.categorical_column_with_identity(
                key=transformed_key,
                num_buckets=my_metadata.HASH_STRING_FEATURE_KEYS[
                    my_metadata.original_name(transformed_key)]))
    elif transformed_key in my_metadata.transformed_names(
        my_metadata.NUMERIC_FEATURE_KEYS):
      if transformed_key not in my_metadata.transformed_names(
          my_metadata.NUMERIC_FEATURE_KEYS_TO_BE_REMOVED):
        deep_columns[transformed_key] = tf.feature_column.numeric_column(
            transformed_key)
    elif (transformed_key.endswith(
        my_metadata.transformed_name('_bucketized')) and
          transformed_key.replace(
              my_metadata.transformed_name('_bucketized'), '')
          in my_metadata.TO_BE_BUCKETIZED_FEATURE):
      wide_columns[transformed_key] = (
          tf.feature_column.categorical_column_with_identity(
              key=transformed_key,
              num_buckets=tft_output.num_buckets_for_transformed_feature(
                  transformed_key)))
    else:
      raise LookupError(
          'The pair (%s, %s) is not consistent with utils.my_metadata' %
          (transformed_key, tensor))

  # Create new crossed categorical features.
  wide_columns.update({
      'pickup_latitude_bucketized_xf_x_pickup_longitude_bucketized_xf':
          tf.feature_column.crossed_column(
              ['pickup_latitude_bucketized_xf',
               'pickup_longitude_bucketized_xf'],
              hash_bucket_size=int(1e3)),
  })

  # Create new dense features from categorical features.
  deep_columns.update({
      # Use indicator columns for low dimensional vocabularies.
      'trip_start_day_xf_indicator':
          tf.feature_column.indicator_column(
              wide_columns['trip_start_day_xf']),
      # Use embedding columns for high dimensional vocabularies.
      'company_xf_embedding':
          tf.feature_column.embedding_column(
              wide_columns['company_xf'], dimension=embedding_size),
  })

  return deep_columns.values(), wide_columns.values()
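# A hedged sketch: the two column lists above line up with
# tf.estimator.DNNLinearCombinedClassifier, which takes wide (linear) and
# deep (DNN) feature columns separately. The hidden-unit sizes here are
# illustrative assumptions, not values from the original source.
def _build_estimator(tft_transform_dir, config=None):
  deep_columns, wide_columns = get_deep_and_wide_columns(tft_transform_dir)
  return tf.estimator.DNNLinearCombinedClassifier(
      config=config,
      linear_feature_columns=list(wide_columns),
      dnn_feature_columns=list(deep_columns),
      dnn_hidden_units=[64, 32])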