def preprocessing_fn(inputs):
    """User defined preprocessing function for reddit comment columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`. The keys
        read here are 'score', 'toplevel', 'subreddit', 'author',
        'comment_body' and 'comment_parent_body'.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns: 'score', 'toplevel', 'subreddit_id' and one
      '<name>_bow' entry per text column.
    """

    def _append_unit_axis(column):
        # TODO(b/35318962): Obviate the need for this workaround on Dense
        # features. FeatureColumns expect shape (batch_size, 1), not just
        # (batch_size).
        return tft.map(lambda tensor: tf.expand_dims(tensor, -1), column)

    # TODO(b/35001605) Make this "passthrough" more DRY.
    dense_columns = {
        'score': inputs['score'],
        'toplevel': inputs['toplevel'],
    }
    dense_columns['subreddit_id'] = tft.string_to_int(
        inputs['subreddit'], frequency_threshold=frequency_threshold)

    # Only the columns collected so far get the extra axis; the text
    # features added below are left as produced by the split.
    result = {}
    for key, column in dense_columns.items():
        result[key] = _append_unit_axis(column)

    for name in ('author', 'comment_body', 'comment_parent_body'):
        tokens = tft.map(tf.string_split, inputs[name])
        # TODO(b/33467613) Translate these to bag-of-words style sparse
        # features.
        result[name + '_bow'] = tft.string_to_int(
            tokens, frequency_threshold=frequency_threshold)
    return result
def preprocess_fn(dictrow):
    """Integerize the customer and SKU columns and pass 'action' through.

    Args:
      dictrow: mapping with the keys 'customer_id', 'sku' and 'action'.

    Returns:
      A dict with the same three keys. 'customer_id' and 'sku' hold the
      results of `tft.string_to_int`, called with the vocabulary file names
      'customers_mapping' and 'skus_mapping' respectively.
    """
    customer_ids = tft.string_to_int(
        dictrow['customer_id'], vocab_filename='customers_mapping')
    sku_ids = tft.string_to_int(
        dictrow['sku'], vocab_filename='skus_mapping')
    return {
        'customer_id': customer_ids,
        'sku': sku_ids,
        'action': dictrow['action'],
    }
def preprocessing_fn(inputs):
    """Integerize the split tokens of column 'a' twice, limited by top_k.

    Args:
      inputs: mapping that contains the column 'a'.

    Returns:
      A dict with 'index1' (top_k given as an int) and 'index2' (top_k given
      as a string, with a different default_value).
    """
    outputs = {}
    outputs['index1'] = tft.string_to_int(
        tft.map(tf.string_split, inputs['a']),
        default_value=-99,
        top_k=2)
    # As above but using a string for top_k (and changing the default_value
    # to showcase things).
    outputs['index2'] = tft.string_to_int(
        tft.map(tf.string_split, inputs['a']),
        default_value=-9,
        top_k='2')
    return outputs
def preprocessing_fn(inputs):
    """Integerize the split tokens of column 'a' twice, with a frequency cut.

    Args:
      inputs: mapping that contains the column 'a'.

    Returns:
      A dict with 'index1' (frequency_threshold given as an int) and
      'index2' (frequency_threshold given as a string, with a different
      default_value).
    """
    outputs = {}
    outputs['index1'] = tft.string_to_int(
        tft.map(tf.string_split, inputs['a']),
        default_value=-99,
        frequency_threshold=2)
    # As above but using a string for frequency_threshold (and changing the
    # default_value to showcase things).
    outputs['index2'] = tft.string_to_int(
        tft.map(tf.string_split, inputs['a']),
        default_value=-9,
        frequency_threshold='2')
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping from column name to input column; must contain every
        name in NUMERIC_COLUMNS and CATEGORICAL_COLUMNS plus LABEL_COLUMN.

    Returns:
      A dict with one transformed column per numeric/categorical column and
      the looked-up label under LABEL_COLUMN.
    """

    def _as_column_vector(column):
        # Convert from shape (batch,), i.e. a batch of scalars, to shape
        # (batch, 1), i.e. a batch of vectors of length 1. This is needed so
        # the output can be easily wrapped in `FeatureColumn`s.
        return tft.map(lambda tensor: tf.expand_dims(tensor, -1), column)

    def _label_to_index(label):
        # The mapping from label string to index is supplied explicitly.
        label_table = lookup.string_to_index_table_from_tensor(
            ['>50K', '<=50K'])
        return label_table.lookup(label)

    outputs = {}

    # Numeric columns are scaled to the range [0, 1].
    for name in NUMERIC_COLUMNS:
        outputs[name] = tft.scale_to_0_1(inputs[name])

    # Categorical columns (the label excluded) go through
    # tft.string_to_int, which computes the set of unique values and uses
    # it to convert the strings to indices.
    for name in CATEGORICAL_COLUMNS:
        outputs[name] = tft.string_to_int(inputs[name])

    for name in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
        outputs[name] = _as_column_vector(outputs[name])

    outputs[LABEL_COLUMN] = tft.map(_label_to_index, inputs[LABEL_COLUMN])
    return outputs
def preprocess(input_features):
    """Build the transformed feature dict described by `metadata`.

    Args:
      input_features: mapping from feature name to raw input feature.

    Returns:
      A dict containing the untouched target feature, for every numeric
      feature a z-scored copy plus a '<name>_bucketized' copy, every
      categorical feature passed through unchanged, and for every vocab
      feature a '<name>_integerized' copy.
    """
    target_name = metadata.TARGET_FEATURE_NAME
    output_features = {target_name: input_features[target_name]}

    for name in metadata.NUMERIC_FEATURE_NAMES:
        raw = input_features[name]
        output_features[name] = tft.scale_to_z_score(raw)
        # Bucket boundaries come from approximate quantiles of the raw
        # values.
        boundaries = tft.quantiles(raw, num_buckets=NUM_BUCKETS, epsilon=0.01)
        output_features[name + "_bucketized"] = tft.apply_buckets(
            raw, bucket_boundaries=boundaries)

    for name in metadata.CATEGORICAL_FEATURE_NAMES:
        raw = input_features[name]
        # The return value is deliberately unused; the call is made with a
        # vocab_filename named after the feature, while the feature itself
        # is emitted unchanged.
        tft.uniques(raw, vocab_filename=name)
        output_features[name] = raw

    for name in metadata.VOCAB_FEATURE_NAMES:
        output_features[name + "_integerized"] = tft.string_to_int(
            input_features[name],
            top_k=metadata.VOCAB_SIZE,
            num_oov_buckets=metadata.OOV_SIZE,
            vocab_filename=name)

    return output_features
def preprocessing_fn(input_features):
    """Turn 'clean_title' into embeddings, bag-of-words and tf-idf features.

    Args:
      input_features: mapping with the keys 'clean_title', 'topic' and
        'raw_title'.

    Returns:
      A dict with the keys 'topic', 'title', 'bow', 'tf_idf' and
      'embeddings'.
    """
    clean_title = input_features['clean_title']

    # Embeddings are produced by `get_embeddings` (tf.hub based, per the
    # original note).
    title_embeddings = tft.apply_function(get_embeddings, clean_title)

    # Tokenize, then map tokens to vocabulary indices.
    tokens = tf.string_split(clean_title, parameters.DELIMITERS)
    token_ids = tft.string_to_int(tokens, top_k=parameters.VOCAB_SIZE)

    # The vocabulary size handed to tfidf is one larger than top_k.
    bow_ids, tfidf_weights = tft.tfidf(token_ids, parameters.VOCAB_SIZE + 1)

    return {
        'topic': input_features['topic'],
        'title': input_features['raw_title'],
        'bow': bow_ids,
        'tf_idf': tfidf_weights,
        'embeddings': title_embeddings,
    }
def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`. Must hold
        'clicked' and every name in INTEGER_COLUMN_NAMES and
        CATEGORICAL_COLUMN_NAMES.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns; categorical columns appear under '<name>_id'.
    """

    def _append_unit_axis(column):
        # TODO(b/35318962): Obviate the need for this workaround on Dense
        # features. FeatureColumns expect shape (batch_size, 1), not just
        # (batch_size).
        return tft.map(lambda tensor: tf.expand_dims(tensor, -1), column)

    # TODO(b/35001605) Make this "passthrough" more DRY.
    collected = {'clicked': inputs['clicked']}
    for name in INTEGER_COLUMN_NAMES:
        collected[name] = inputs[name]
    for name in CATEGORICAL_COLUMN_NAMES:
        collected[name + '_id'] = tft.string_to_int(
            inputs[name], frequency_threshold=frequency_threshold)

    result = {}
    for key, column in collected.items():
        result[key] = _append_unit_axis(column)
    return result
def wrapped_preprocessing_fn(inputs):
    """Run `preprocessing.preprocess` and integerize its boolean outputs.

    Args:
      inputs: passed straight to `preprocessing.preprocess`.

    Returns:
      The mapping returned by `preprocessing.preprocess`, in which every
      value whose dtype is `tf.bool` has been replaced by
      `tft.string_to_int` of its string form, using the vocabulary file
      name 'vocab_<key>'.
    """
    outputs = preprocessing.preprocess(inputs)
    # Only existing keys are reassigned, so the set of keys does not change
    # while iterating.
    for key in outputs:
        column = outputs[key]
        if column.dtype == tf.bool:
            as_text = tf.as_string(column)
            outputs[key] = tft.string_to_int(
                as_text, vocab_filename='vocab_' + key)
    return outputs
def preprocess_tft(inputs):
    """Add three engineered columns on top of a copy of the inputs.

    Args:
      inputs: mapping of raw input columns; must contain 'mother_age',
        'gestation_weeks' and 'mother_race'.

    Returns:
      A shallow copy of `inputs` extended with 'mother_age_tft',
      'gestation_weeks_centered' and 'mother_race_tft'.
    """
    # Function-local import, as in the original. The previously unused
    # `import numpy as np` was dropped so calling this function no longer
    # requires numpy.
    import copy

    def center(x):
        # Subtract the dataset-wide mean computed by tft.
        return x - tft.mean(x)

    # Shallow copy: the new keys are added to the copy, not to `inputs`.
    result = copy.copy(inputs)
    result['mother_age_tft'] = center(inputs['mother_age'])
    # NOTE(review): despite the "_centered" suffix this column is min-max
    # scaled to [0, 1]; the key is kept as-is because downstream code may
    # depend on it.
    result['gestation_weeks_centered'] = tft.scale_to_0_1(
        inputs['gestation_weeks'])
    result['mother_race_tft'] = tft.string_to_int(inputs['mother_race'])
    return result
def preprocessing_fn(inputs):
    """TFT preprocessing function driven by the column `schema`.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns. TEXT columns produce two entries,
      '<name>_indices' and '<name>_weights'; every other type produces one
      entry under the column name.

    Raises:
      ValueError: if a schema entry has a type other than NUMBER, KEY,
        CATEGORY, TEXT or IMAGE_URL.
    """
    features_dict = {}
    for col_schema in schema:
        col_name = col_schema['name']
        col_type = col_schema['type']

        if col_type in ('NUMBER', 'KEY'):
            # Both kinds are passed through untouched.
            features_dict[col_name] = inputs[col_name]
        elif col_type == 'CATEGORY':
            features_dict[col_name] = tft.string_to_int(
                inputs[col_name], vocab_filename='vocab_' + col_name)
        elif col_type == 'TEXT':
            tokens = tf.string_split(inputs[col_name], DELIMITERS)
            # TODO: default_value = 0 is wrong. It means OOV gets 0 for
            # their index. It is a workaround so the trainer can use the
            # true vocab size; otherwise the trainer has to use VOCAB_SIZE
            # defined in this file, which is too large. If there is no
            # workaround, the user has to provide a vocab_size.
            token_ids = tft.string_to_int(
                tokens,
                vocab_filename='vocab_' + col_name,
                default_value=0)
            # Add one for the oov bucket created by string_to_int.
            bow_indices, bow_weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)
            features_dict[col_name + '_indices'] = bow_indices
            features_dict[col_name + '_weights'] = bow_weights
        elif col_type == 'IMAGE_URL':
            features_dict[col_name] = tft.apply_function_with_checkpoint(
                _image_to_vec,
                [inputs[col_name]],
                INCEPTION_V3_CHECKPOINT,
                exclude=INCEPTION_EXCLUDED_VARIABLES)
        else:
            raise ValueError('Invalid schema. Unknown type ' + col_type)
    return features_dict
def preprocessing_fn(inputs):
    """User defined preprocessing function for reddit columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`. The keys
        read here are 'score', 'toplevel', 'subreddit', 'author',
        'comment_body' and 'comment_parent_body'.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns.
    """
    # TODO(b/35001605) Make this "passthrough" more DRY.
    result = {
        'score': inputs['score'],
        'toplevel': inputs['toplevel'],
    }
    result['subreddit_id'] = tft.string_to_int(
        inputs['subreddit'], frequency_threshold=frequency_threshold)

    text_columns = ('author', 'comment_body', 'comment_parent_body')
    for name in text_columns:
        tokens = tf.string_split(inputs[name])
        # TODO(b/33467613) Translate these to bag-of-words style sparse
        # features.
        result[name + '_bow'] = tft.string_to_int(
            tokens, frequency_threshold=frequency_threshold)
    return result
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping that contains REVIEW_COLUMN and LABEL_COLUMN.

    Returns:
      A dict with the tf-idf bag-of-words indices under REVIEW_COLUMN, the
      matching weights under REVIEW_WEIGHT, and the label passed through
      under LABEL_COLUMN.
    """
    tokens = tf.string_split(inputs[REVIEW_COLUMN], DELIMITERS)
    token_ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)
    # Add one for the oov bucket created by string_to_int.
    bow_indices, bow_weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)

    outputs = {}
    outputs[REVIEW_COLUMN] = bow_indices
    outputs[REVIEW_WEIGHT] = bow_weights
    outputs[LABEL_COLUMN] = inputs[LABEL_COLUMN]
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping that contains const.REVIEW_COLUMN and
        const.LABEL_COLUMN.

    Returns:
      A dict with the integerized review tokens under const.REVIEW_COLUMN,
      their tf-idf weights under const.REVIEW_WEIGHT, and the label passed
      through under const.LABEL_COLUMN.
    """

    def _tokenize(text):
        return tf.string_split(text, delimiters)

    tokens = tft.map(_tokenize, inputs[const.REVIEW_COLUMN])
    token_ids = tft.string_to_int(tokens, top_k=vocab_size)
    # Add one for the oov bucket created by string_to_int.
    weights = tft.tfidf_weights(token_ids, vocab_size + 1)

    return {
        const.REVIEW_COLUMN: token_ids,
        const.REVIEW_WEIGHT: weights,
        const.LABEL_COLUMN: inputs[const.LABEL_COLUMN],
    }
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping with the keys 'x', 'y' and 's'.

    Returns:
      A dict with 'x_centered' (x minus its mean), 'y_normalized' (y scaled
      to [0, 1]), their product 'x_centered_times_y_normalized', and
      's_integerized' (s mapped through `tft.string_to_int`).
    """
    outputs = {}
    outputs['x_centered'] = inputs['x'] - tft.mean(inputs['x'])
    outputs['y_normalized'] = tft.scale_to_0_1(inputs['y'])
    s_ids = tft.string_to_int(inputs['s'])
    outputs['x_centered_times_y_normalized'] = (
        outputs['x_centered'] * outputs['y_normalized'])
    outputs['s_integerized'] = s_ids
    return outputs
def preprocessing_fn(inputs):
    """User defined preprocessing function for reddit columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`. The keys
        read here are 'score', 'example_id' and 'subreddit'.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns: 'score' and 'example_id' unchanged, plus
      'subreddit_id'.
    """
    subreddit_ids = tft.string_to_int(
        inputs['subreddit'], frequency_threshold=frequency_threshold)
    # TODO(b/35001605) Make this "passthrough" more DRY.
    return {
        'score': inputs['score'],
        'example_id': inputs['example_id'],
        'subreddit_id': subreddit_ids,
    }
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping with the keys 'x', 'y' and 's'.

    Returns:
      A dict with 'x_centered', 'y_normalized',
      'x_centered_times_y_normalized' and 's_integerized'.
    """

    def _subtract(values, mean):
        return values - mean

    def _multiply(left, right):
        return left * right

    # Element-wise arithmetic on columns is routed through tft.map.
    centered = tft.map(_subtract, inputs['x'], tft.mean(inputs['x']))
    normalized = tft.scale_to_0_1(inputs['y'])
    integerized = tft.string_to_int(inputs['s'])
    product = tft.map(_multiply, centered, normalized)

    return {
        'x_centered': centered,
        'y_normalized': normalized,
        'x_centered_times_y_normalized': product,
        's_integerized': integerized,
    }
def pre_processing_fun(inputs):
    """Scale numeric features, integerize categorical ones, index the label.

    Args:
      inputs: mapping that contains every key in NUMERIC_FEATURE_KEYS and
        CATEGORICAL_FEATURE_KEYS plus LABEL_KEY.

    Returns:
      A dict with one transformed entry per feature key and the looked-up
      label under LABEL_KEY.
    """

    def _label_to_index(label):
        # Look the label string up in a table built from the two classes.
        label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
        return label_table.lookup(label)

    outputs = {}
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = tft.scale_to_0_1(inputs[key])
    for key in CATEGORICAL_FEATURE_KEYS:
        outputs[key] = tft.string_to_int(inputs[key])
    outputs[LABEL_KEY] = tft.apply_function(_label_to_index, inputs[LABEL_KEY])
    return outputs
def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`. Must hold
        'clicked' and every name in INTEGER_COLUMN_NAMES and
        CATEGORICAL_COLUMN_NAMES.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns; categorical columns appear under '<name>_id'.
    """
    # TODO(b/35001605) Make this "passthrough" more DRY.
    result = {'clicked': inputs['clicked']}

    # Integer columns are forwarded unchanged.
    for column_name in INTEGER_COLUMN_NAMES:
        result[column_name] = inputs[column_name]

    # Categorical columns are integerized with the shared frequency cut-off.
    for column_name in CATEGORICAL_COLUMN_NAMES:
        column_ids = tft.string_to_int(
            inputs[column_name], frequency_threshold=frequency_threshold)
        result[column_name + '_id'] = column_ids

    return result
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping that contains every key in NUMERIC_FEATURE_KEYS and
        CATEGORICAL_FEATURE_KEYS plus LABEL_KEY.

    Returns:
      A dict with one transformed entry per feature key and the label under
      LABEL_KEY.
    """
    outputs = {}

    # Numeric columns are scaled to the range [0, 1].
    for feature_key in NUMERIC_FEATURE_KEYS:
        outputs[feature_key] = tft.scale_to_0_1(inputs[feature_key])

    # Categorical columns (the label excluded) go through
    # tft.string_to_int, which computes the set of unique values and uses
    # it to convert the strings to indices.
    for feature_key in CATEGORICAL_FEATURE_KEYS:
        outputs[feature_key] = tft.string_to_int(inputs[feature_key])

    # The label column is forwarded as-is; no lookup is applied here.
    outputs[LABEL_KEY] = inputs[LABEL_KEY]
    return outputs
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """

    def convert_label(label):
        """Return 1 where the tip exceeds 20% of the fare, otherwise 0."""
        fare = inputs[taxi.FARE_KEY]
        fare_is_nan = tf.is_nan(fare)
        # Rows with a NaN fare always get label 0.
        zero_labels = tf.cast(tf.zeros_like(fare), tf.int64)
        # Test if the tip was > 20% of the fare.
        big_tip = tf.cast(
            tf.greater(label, tf.multiply(fare, tf.constant(0.2))),
            tf.int64)
        return tf.where(fare_is_nan, zero_labels, big_tip)

    outputs = {}

    # Dense float features are standardized to z-scores.
    for feature_key in taxi.DENSE_FLOAT_FEATURE_KEYS:
        outputs[feature_key] = transform.scale_to_z_score(inputs[feature_key])

    # Build a vocabulary for each vocab feature.
    for feature_key in taxi.VOCAB_FEATURE_KEYS:
        outputs[feature_key] = transform.string_to_int(
            inputs[feature_key],
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

    for feature_key in taxi.BUCKET_FEATURE_KEYS:
        outputs[feature_key] = transform.bucketize(
            inputs[feature_key], taxi.FEATURE_BUCKET_COUNT)

    # Categorical features are forwarded unchanged.
    for feature_key in taxi.CATEGORICAL_FEATURE_KEYS:
        outputs[feature_key] = inputs[feature_key]

    # Was this passenger a big tipper?
    outputs[taxi.LABEL_KEY] = transform.apply_function(
        convert_label, inputs[taxi.LABEL_KEY])
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping that contains REVIEW_COLUMN and LABEL_COLUMN.

    Returns:
      A dict with the integerized, punctuation-free review tokens under
      REVIEW_COLUMN and the label passed through under LABEL_COLUMN.
    """

    def _strip_char(strings, char):
        """Remove every occurrence of `char` from each string.

        Args:
          strings: string tensor accepted by `tf.string_split`
            (NOTE(review): presumably rank 1 - confirm against the caller).
          char: the delimiter to remove.

        Returns:
          The strings with `char` removed (i.e. replaced by '').
        """
        # Hacky implementation: split on the character, densify the pieces
        # with '' as filler, then join them back along axis 1.
        pieces = tf.string_split(strings, char)
        dense_pieces = tf.sparse_to_dense(
            pieces.indices, pieces.dense_shape, pieces.values, '')
        return tf.reduce_join(dense_pieces, 1)

    def _strip_punctuation(strings):
        """Remove every character in PUNCTUATION_CHARACTERS, one at a time."""
        for char in PUNCTUATION_CHARACTERS:
            strings = _strip_char(strings, char)
        return strings

    cleaned = tft.map(_strip_punctuation, inputs[REVIEW_COLUMN])
    tokens = tft.map(tf.string_split, cleaned)
    token_ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)

    outputs = {}
    outputs[REVIEW_COLUMN] = token_ids
    outputs[LABEL_COLUMN] = inputs[LABEL_COLUMN]
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping that contains every key in NUMERIC_FEATURE_KEYS and
        CATEGORICAL_FEATURE_KEYS plus LABEL_KEY.

    Returns:
      A dict with one transformed entry per feature key and the looked-up
      label under LABEL_KEY.
    """

    def _label_to_index(label):
        # For the label column the mapping from string to index is
        # provided explicitly.
        label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
        return label_table.lookup(label)

    outputs = {}

    # Numeric columns are scaled to the range [0, 1].
    for feature_key in NUMERIC_FEATURE_KEYS:
        outputs[feature_key] = tft.scale_to_0_1(inputs[feature_key])

    # Categorical columns (the label excluded) go through
    # tft.string_to_int, which computes the set of unique values and uses
    # it to convert the strings to indices.
    for feature_key in CATEGORICAL_FEATURE_KEYS:
        outputs[feature_key] = tft.string_to_int(inputs[feature_key])

    outputs[LABEL_KEY] = tft.apply_function(_label_to_index, inputs[LABEL_KEY])
    return outputs
def preprocess_tft(inputs):
    """Build the transformed taxi-fare features.

    Args:
      inputs: mapping of raw input columns; must contain 'fare_amount',
        'dayofweek', 'hourofday', 'pickuplon', 'pickuplat', 'dropofflon',
        'dropofflat' and 'passengers'.

    Returns:
      A dict with the transformed columns: passthroughs, an integerized
      'dayofweek', the four coordinates scaled to [0, 1], 'passengers' as
      float32, a string 'key', and the engineered 'latdiff', 'londiff' and
      'euclidean' features.
    """
    # Debug output of the raw inputs. The call form parses under both
    # Python 2 and Python 3; the former `print inputs` statement is a
    # SyntaxError on Python 3.
    print(inputs)

    result = {}
    result['fare_amount'] = tf.identity(inputs['fare_amount'])
    result['dayofweek'] = tft.string_to_int(inputs['dayofweek'])  # builds a vocabulary
    result['hourofday'] = tf.identity(inputs['hourofday'])  # pass through

    # Scale the numeric coordinates to [0, 1].
    result['pickuplon'] = tft.scale_to_0_1(inputs['pickuplon'])
    result['pickuplat'] = tft.scale_to_0_1(inputs['pickuplat'])
    result['dropofflon'] = tft.scale_to_0_1(inputs['dropofflon'])
    result['dropofflat'] = tft.scale_to_0_1(inputs['dropofflat'])

    result['passengers'] = tf.cast(inputs['passengers'], tf.float32)  # a cast
    # Arbitrary TF function: a string column shaped like 'passengers'.
    result['key'] = tf.as_string(tf.ones_like(inputs['passengers']))

    # Engineered features: coordinate deltas and their Euclidean length.
    latdiff = inputs['pickuplat'] - inputs['dropofflat']
    londiff = inputs['pickuplon'] - inputs['dropofflon']
    result['latdiff'] = tft.scale_to_0_1(latdiff)
    result['londiff'] = tft.scale_to_0_1(londiff)
    # The distance is computed from the raw (unscaled) deltas.
    result['euclidean'] = tf.sqrt(latdiff * latdiff + londiff * londiff)
    return result
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping that contains every key named in
        NUMERIC_FEATURE_KEYS, TO_BE_BUCKETIZED_FEATURE,
        STRING_TO_INT_FEATURE_KEYS and HASH_STRING_FEATURE_KEYS, plus
        LABEL_KEY.

    Returns:
      A dict with one transformed entry per feature (bucketized features
      appear under '<key>_bucketized') and the looked-up label under
      LABEL_KEY.
    """

    def convert_label(label):
        """Parses a string tensor into the label tensor.

        Args:
          label: the label column; looked up in a table built from
            ['<=50K', '>50K'].

        Returns:
          The result of the table lookup for `label`.
        """
        label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
        return label_table.lookup(label)

    outputs = {}

    # Numeric columns are scaled to the range [0, 1].
    for feature_key in NUMERIC_FEATURE_KEYS:
        outputs[feature_key] = tft.scale_to_0_1(inputs[feature_key])

    # Bucketize numeric columns; the mapping's value for each key is passed
    # as the second argument of tft.bucketize.
    for feature_key in TO_BE_BUCKETIZED_FEATURE:
        bucket_arg = TO_BE_BUCKETIZED_FEATURE[feature_key]
        outputs[feature_key + '_bucketized'] = tft.bucketize(
            inputs[feature_key], bucket_arg)

    # Categorical columns with a small vocabulary; the vocabulary file is
    # named after the feature key.
    for feature_key in STRING_TO_INT_FEATURE_KEYS:
        outputs[feature_key] = tft.string_to_int(
            inputs[feature_key], vocab_filename=feature_key)

    # Remaining string columns are hashed; the mapping's value for each key
    # is passed as the second argument of tft.hash_strings.
    for feature_key in HASH_STRING_FEATURE_KEYS:
        hash_arg = HASH_STRING_FEATURE_KEYS[feature_key]
        outputs[feature_key] = tft.hash_strings(inputs[feature_key], hash_arg)

    outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: mapping that contains every key named in
        NUMERIC_FEATURE_KEYS, TO_BE_BUCKETIZED_FEATURE,
        STRING_TO_INT_FEATURE_KEYS and HASH_STRING_FEATURE_KEYS, plus
        LABEL_KEY.

    Returns:
      A dict of transformed columns. Bucketized features are stored under
      '<key>_bucketized'; the label lookup result is stored under
      LABEL_KEY.
    """

    def convert_label(label):
        """Parses a string tensor into the label tensor.

        Args:
          label: the label column; looked up in a table built from
            ['<=50K', '>50K'].

        Returns:
          The result of the table lookup for `label`.
        """
        class_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
        return class_table.lookup(label)

    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for name in NUMERIC_FEATURE_KEYS:
        scaled = tft.scale_to_0_1(inputs[name])
        outputs[name] = scaled

    # Bucketize numeric columns.
    for name in TO_BE_BUCKETIZED_FEATURE:
        bucketized = tft.bucketize(inputs[name], TO_BE_BUCKETIZED_FEATURE[name])
        outputs[name + '_bucketized'] = bucketized

    # For categorical columns with a small vocabulary.
    for name in STRING_TO_INT_FEATURE_KEYS:
        integerized = tft.string_to_int(inputs[name], vocab_filename=name)
        outputs[name] = integerized

    # String columns that are hashed instead of getting a vocabulary.
    for name in HASH_STRING_FEATURE_KEYS:
        hashed = tft.hash_strings(inputs[name], HASH_STRING_FEATURE_KEYS[name])
        outputs[name] = hashed

    outputs[LABEL_KEY] = tft.apply_function(convert_label, inputs[LABEL_KEY])
    return outputs
def preprocess(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations. The
      entry under LABEL_KEY is a boolean tensor.
    """
    outputs = {}

    # Dense float features are standardized to z-scores.
    for feature_key in DENSE_FLOAT_FEATURE_KEYS:
        outputs[feature_key] = transform.scale_to_z_score(inputs[feature_key])

    # Build a vocabulary for each vocab feature; non-string inputs are
    # converted to strings first.
    for feature_key in VOCAB_FEATURE_KEYS:
        raw = inputs[feature_key]
        vocab_tensor = raw if raw.dtype == tf.string else tf.as_string(raw)
        outputs[feature_key] = transform.string_to_int(
            vocab_tensor,
            vocab_filename='vocab_' + feature_key,
            top_k=VOCAB_SIZE,
            num_oov_buckets=OOV_SIZE)

    for feature_key in BUCKET_FEATURE_KEYS:
        outputs[feature_key] = transform.bucketize(
            inputs[feature_key], FEATURE_BUCKET_COUNT)

    # Categorical features are only cast to int64.
    for feature_key in CATEGORICAL_FEATURE_KEYS:
        outputs[feature_key] = tf.to_int64(inputs[feature_key])

    fare = inputs[FARE_KEY]
    tip = inputs[LABEL_KEY]
    # Test if the tip was > 20% of the fare.
    threshold = tf.multiply(fare, tf.constant(0.2))
    # A NaN fare can never produce a positive label.
    fare_is_number = tf.logical_not(tf.is_nan(fare))
    tip_over_threshold = tf.greater(tip, threshold)
    outputs[LABEL_KEY] = tf.logical_and(fare_is_number, tip_over_threshold)
    return outputs
def preprocessing_fn(inputs):
    """Split column 'a' into tokens and integerize them.

    Args:
      inputs: mapping that contains the column 'a'.

    Returns:
      A dict with the single key 'index'.
    """
    tokens = tft.map(tf.string_split, inputs['a'])
    token_ids = tft.string_to_int(tokens)
    return {'index': token_ids}
def preprocessing_fn(inputs):
    """Multiply columns 'a' and 'b' and integerize column 'c'.

    Args:
      inputs: mapping with the keys 'a', 'b' and 'c'.

    Returns:
      A dict with 'ab' (`tf.multiply` mapped over 'a' and 'b') and 'i'
      ('c' mapped through `tft.string_to_int`).
    """
    product = tft.map(tf.multiply, inputs['a'], inputs['b'])
    c_ids = tft.string_to_int(inputs['c'])
    return {'ab': product, 'i': c_ids}
def preprocessing_fn(inputs):
    """Integerize column 'a'.

    Args:
      inputs: mapping that contains the column 'a'.

    Returns:
      A dict with the single key 'index' holding the result of
      `tft.string_to_int` on 'a'.
    """
    index = tft.string_to_int(inputs['a'])
    return {'index': index}