def def_preprocessing_fn(inputs):
    """Build the tf.transform preprocessing graph for the taxi features.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature name (``taxi.transformed_name(key)``) to
      the transformed feature operation.
    """
    outputs = {}

    # Dense floats: fill in missing values, then standardize to a z-score.
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
        scaled = transform.scale_to_z_score(_fill_in_missing(inputs[key]))
        outputs[taxi.transformed_name(key)] = scaled

    # Vocabulary features: learn a top-k vocabulary and map values to ids.
    for key in taxi.VOCAB_FEATURE_KEYS:
        vocab_ids = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)
        outputs[taxi.transformed_name(key)] = vocab_ids

    # Bucket features: assign each value to one of FEATURE_BUCKET_COUNT buckets.
    for key in taxi.BUCKET_FEATURE_KEYS:
        bucket_ids = transform.bucketize(
            _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)
        outputs[taxi.transformed_name(key)] = bucket_ids

    # Categorical features only have their missing values filled in.
    for key in taxi.CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[key])
        outputs[taxi.transformed_name(key)] = filled

    # Label: was this passenger a big tipper (tip > 20% of the fare)?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    fare_is_nan = tf.is_nan(taxi_fare)
    # Rows whose fare is NaN get label 0.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.where(fare_is_nan, zero_label, big_tipper)
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = label

    return outputs
def preprocessing_fn(inputs):
    """tf.transform callback that turns raw features into model inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from ``_transformed_name(key)`` to the transformed feature
      operation, including the binary "big tipper" label.
    """
    outputs = {}

    # Fill in missing values and apply a z-score to every dense float feature.
    for key in _DENSE_FLOAT_FEATURE_KEYS:
        z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[key]))
        outputs[_transformed_name(key)] = z_scored

    # Learn a vocabulary per feature and replace values with vocabulary ids.
    for key in _VOCAB_FEATURE_KEYS:
        vocab_ids = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=_VOCAB_SIZE,
            num_oov_buckets=_OOV_SIZE)
        outputs[_transformed_name(key)] = vocab_ids

    # Bucketize into _FEATURE_BUCKET_COUNT buckets.
    for key in _BUCKET_FEATURE_KEYS:
        bucket_ids = tft.bucketize(
            _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)
        outputs[_transformed_name(key)] = bucket_ids

    # Categorical features are forwarded after filling in missing values.
    for key in _CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[key])
        outputs[_transformed_name(key)] = filled

    # Label: 1 when the tip exceeds 20% of the fare, otherwise 0.
    taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
    tips = _fill_in_missing(inputs[_LABEL_KEY])
    fare_is_nan = tf.math.is_nan(taxi_fare)
    # A NaN fare always maps to label 0.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.compat.v1.where(fare_is_nan, zero_label, big_tipper)
    outputs[_transformed_name(_LABEL_KEY)] = label

    return outputs
def preprocessing_fn(inputs):
    """tf.transform callback producing transformed taxi features in place.

    Unlike variants that rename outputs, every transformed feature is stored
    under the same key it had in ``inputs``.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}

    # Dense floats are standardized to a z-score.
    for key in ts.DENSE_FLOAT_FEATURE_KEYS:
        z_scored = transform.scale_to_z_score(inputs[key])
        outputs[key] = z_scored

    # String features are mapped to integer ids through a learned vocabulary.
    for key in ts.VOCAB_FEATURE_KEYS:
        vocab_ids = transform.string_to_int(
            inputs[key], top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)
        outputs[key] = vocab_ids

    # Numeric features that should be bucketized.
    for key in ts.BUCKET_FEATURE_KEYS:
        bucket_ids = transform.bucketize(inputs[key], FEATURE_BUCKET_COUNT)
        outputs[key] = bucket_ids

    # Categorical features are passed through untouched.
    for key in ts.CATEGORICAL_FEATURE_KEYS:
        outputs[key] = inputs[key]

    def convert_label(label):
        """Return 1 where the tip exceeds 5% of the fare, else 0.

        Rows whose fare is NaN are labelled 0.
        """
        taxi_fare = inputs[ts.FARE_KEY]
        fare_is_nan = tf.is_nan(taxi_fare)
        zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.05))
        big_tipper = tf.cast(tf.greater(label, tip_threshold), tf.int64)
        return tf.where(fare_is_nan, zero_label, big_tipper)

    # Was this passenger a big tipper?
    label_tensor = transform.apply_function(convert_label,
                                            inputs[ts.LABEL_KEY])
    outputs[ts.LABEL_KEY] = label_tensor

    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from output column name to transformed feature operation.
    """
    outputs = {}

    # Numeric columns are rescaled to the range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
        rescaled = tft.scale_to_0_1(inputs[key])
        outputs[key] = rescaled

    # Bucketized copies of numeric columns are stored under
    # '<key>_bucketized'; the mapped value is the number of buckets.
    for key in TO_BE_BUCKETIZED_FEATURE:
        num_buckets = TO_BE_BUCKETIZED_FEATURE[key]
        bucket_ids = tft.bucketize(inputs[key], num_buckets)
        outputs[key + '_bucketized'] = bucket_ids

    # Small-vocabulary categorical columns: the vocabulary file is named
    # after the column.
    for key in STRING_TO_INT_FEATURE_KEYS:
        vocab_ids = tft.string_to_int(inputs[key], vocab_filename=key)
        outputs[key] = vocab_ids

    # Remaining string columns are hashed; the mapped value is forwarded as
    # the second argument of tft.hash_strings.
    for key in HASH_STRING_FEATURE_KEYS:
        hash_buckets = HASH_STRING_FEATURE_KEYS[key]
        hashed = tft.hash_strings(inputs[key], hash_buckets)
        outputs[key] = hashed

    def convert_label(label):
        """Convert the string label tensor into its class index.

        Args:
          label: string label tensor taken from ``inputs[LABEL_KEY]``.

        Returns:
          The result of looking ``label`` up in an index table built from
          ``['<=50K', '>50K']``.
        """
        label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
        return label_table.lookup(label)

    label_index = tft.apply_function(convert_label, inputs[LABEL_KEY])
    outputs[LABEL_KEY] = label_index

    return outputs
def preprocessing_fn(inputs):
    """Preprocess the raw feature columns with tf.transform.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from ``features.transformed_name(key)`` to the transformed feature
      operation.
    """
    outputs = {}

    # Numeric features: impute with the NaN-ignoring mean, then z-score.
    for key in features.NUMERIC_FEATURE_KEYS:
        mean_value = compute_mean_ignore_nan(inputs[key].values)
        absl.logging.info(f'TFT preprocessing. Mean value for {key} = {mean_value}')
        imputed = _fill_in_missing_with_impute(inputs[key], mean_value)
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(imputed)

    # Vocabulary features: per-key vocabulary size when one is configured,
    # otherwise the default features.VOCAB_SIZE.
    for key in features.VOCAB_FEATURE_KEYS:
        vocab_ids = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=features.VOCAB_SIZE_MAP.get(key, features.VOCAB_SIZE),
            num_oov_buckets=features.OOV_SIZE)
        outputs[features.transformed_name(key)] = vocab_ids

    # Bucket features: use explicit boundaries when configured for the key,
    # otherwise fall back to bucketizing with a per-key or default count.
    for key in features.BUCKET_FEATURE_KEYS:
        if key not in features.FEATURE_BUCKET_BOUNDARIES:
            bucket_ids = tft.bucketize(
                _fill_in_missing(inputs[key]),
                features.FEATURE_BUCKET_COUNT_MAP.get(
                    key, features.FEATURE_BUCKET_COUNT))
            outputs[features.transformed_name(key)] = bucket_ids
            continue
        bucket_boundaries = tf.constant(features.FEATURE_BUCKET_BOUNDARIES[key])
        bucket_ids = tft.apply_buckets(
            _fill_in_missing(inputs[key]), bucket_boundaries)
        outputs[features.transformed_name(key)] = bucket_ids

    # Categorical features: build a vocabulary stored in a file named after
    # the feature, with a single out-of-vocabulary bucket.
    for key in features.CATEGORICAL_FEATURE_KEYS:
        category_ids = tft.compute_and_apply_vocabulary(
            x=_fill_in_missing(inputs[key]),
            num_oov_buckets=1,
            vocab_filename=key)
        outputs[features.transformed_name(key)] = category_ids

    # The label only has its missing values filled in.
    label = _fill_in_missing(inputs[features.LABEL_KEY])
    outputs[features.transformed_name(features.LABEL_KEY)] = label

    return outputs
def preprocessing_fn(inputs: tf.Tensor) -> tf.Tensor:
    """tf.transform's callback function for preprocessing inputs.

    NOTE(review): the annotations say ``tf.Tensor``, but ``inputs`` is indexed
    by feature key and a dict is returned -- confirm and adjust the hints.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}

    # One-hot features: vocabulary lookup followed by one-hot encoding with
    # ``dim + 1`` labels.
    for key, dim in ONE_HOT_FEATURES.items():
        vocab_size = dim + 1
        int_value = tft.compute_and_apply_vocabulary(
            fill_in_missing(inputs[key]), top_k=vocab_size)
        one_hot = convert_num_to_one_hot(int_value, num_labels=vocab_size)
        outputs[transformed_name(key)] = one_hot

    # Bucket features: bucketize, then one-hot encode with
    # ``bucket_count + 1`` labels.
    for key, bucket_count in BUCKET_FEATURES.items():
        filled = fill_in_missing(inputs[key])
        is_string_zip = key == "zip_code" and filled.dtype == tf.string
        # String zip codes go through convert_zip_code; everything else is
        # cast to float32 before bucketizing.
        numeric = (convert_zip_code(filled) if is_string_zip
                   else tf.cast(filled, tf.float32))
        bucket_ids = tft.bucketize(
            numeric, bucket_count, always_return_num_quantiles=False)
        one_hot = convert_num_to_one_hot(
            bucket_ids, num_labels=bucket_count + 1)
        outputs[transformed_name(key)] = one_hot

    # Text features and the label only have missing values filled in.
    for key in TEXT_FEATURES:
        text = fill_in_missing(inputs[key])
        outputs[transformed_name(key)] = text

    label = fill_in_missing(inputs[LABEL_KEY])
    outputs[transformed_name(LABEL_KEY)] = label

    return outputs
def preprocessing_fn(inputs):
    """tf.transform callback that prepares taxi features for classification.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from ``_transformed_name(key)`` to transformed feature operations.
    """
    outputs = {}

    # Dense floats: fill in missing values and standardize.
    for key in _DENSE_FLOAT_FEATURE_KEYS:
        z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[key]))
        outputs[_transformed_name(key)] = z_scored

    # Vocabulary features: learn a top-k vocabulary and emit integer ids.
    for key in _VOCAB_FEATURE_KEYS:
        vocab_ids = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=_VOCAB_SIZE,
            num_oov_buckets=_OOV_SIZE)
        outputs[_transformed_name(key)] = vocab_ids

    # Bucket features, with always_return_num_quantiles explicitly disabled.
    for key in _BUCKET_FEATURE_KEYS:
        bucket_ids = tft.bucketize(
            _fill_in_missing(inputs[key]),
            num_buckets=_FEATURE_BUCKET_COUNT,
            always_return_num_quantiles=False)
        outputs[_transformed_name(key)] = bucket_ids

    # Categorical features are forwarded after filling in missing values.
    for key in _CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[key])
        outputs[_transformed_name(key)] = filled

    # Classification goal: was this passenger a big tipper (> 20% of fare)?
    taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
    tips = _fill_in_missing(inputs[_LABEL_KEY])
    fare_is_nan = tf.math.is_nan(taxi_fare)
    # Rows with a NaN fare are labelled 0.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    tip_threshold = taxi_fare * tf.constant(0.2)
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.where(fare_is_nan, zero_label, big_tipper)
    outputs[_transformed_name(_LABEL_KEY)] = label

    return outputs
def wide_preprocessing_fn(inputs):
    """TFT preprocessing function that emits a configurable number of outputs.

    The source key lists are cycled so that the requested number of
    bucketized, scaled and vocabulary outputs is produced even when it
    exceeds the number of distinct source keys.

    Args:
      inputs: Map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """

    def _numbered(keys, count):
        # Yield (index, key) pairs for the first `count` keys of an endless
        # cycle over `keys`.
        return enumerate(itertools.islice(itertools.cycle(keys), count))

    outputs = {}

    for idx, key in _numbered(self._BUCKETIZE_KEYS, self._num_bucketize):
        bucket_ids = tft.bucketize(
            taxi_utils._fill_in_missing(inputs[key]),
            taxi_utils._FEATURE_BUCKET_COUNT)
        outputs["bucketized" + str(idx)] = bucket_ids

    for idx, key in _numbered(self._SCALE_KEYS, self._num_scale):
        # Fill in missing values, then apply a z-score.
        z_scored = tft.scale_to_z_score(
            taxi_utils._fill_in_missing(inputs[key]))
        outputs["scaled" + str(idx)] = z_scored

    for idx, key in _numbered(self._VOCABULARY_KEYS, self._num_vocabs):
        vocab_ids = tft.compute_and_apply_vocabulary(
            taxi_utils._fill_in_missing(inputs[key]),
            top_k=taxi_utils._VOCAB_SIZE,
            num_oov_buckets=taxi_utils._OOV_SIZE)
        outputs["vocab" + str(idx)] = vocab_ids

    # Pass-through features: the categorical keys plus the label, unchanged.
    passthrough_keys = (
        taxi_utils._CATEGORICAL_FEATURE_KEYS + [taxi_utils._LABEL_KEY])
    for key in passthrough_keys:
        outputs[key] = inputs[key]

    return outputs
def preprocessing_fn(inputs):
    """tf.transform callback with per-feature bucket counts.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from ``features.transformed_name(key)`` to transformed feature
      operations. The label is forwarded without transformation.
    """
    outputs = {}

    # Dense floats: fill in missing values and standardize to a z-score.
    for key in features.DENSE_FLOAT_FEATURE_KEYS:
        z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[key]))
        outputs[features.transformed_name(key)] = z_scored

    # Vocabulary features: learn a top-k vocabulary and emit integer ids.
    for key in features.VOCAB_FEATURE_KEYS:
        vocab_ids = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=features.VOCAB_SIZE,
            num_oov_buckets=features.OOV_SIZE)
        outputs[features.transformed_name(key)] = vocab_ids

    # Bucket features: keys are paired positionally with their bucket counts.
    keyed_bucket_counts = zip(features.BUCKET_FEATURE_KEYS,
                              features.BUCKET_FEATURE_BUCKET_COUNT)
    for key, num_buckets in keyed_bucket_counts:
        bucket_ids = tft.bucketize(_fill_in_missing(inputs[key]), num_buckets)
        outputs[features.transformed_name(key)] = bucket_ids

    # Categorical features only have their missing values filled in.
    for key in features.CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[key])
        outputs[features.transformed_name(key)] = filled

    # TODO(b/157064428): Support label transformation for Keras.
    # Do not apply label transformation as it will result in wrong evaluation.
    raw_label = inputs[features.LABEL_KEY]
    outputs[features.transformed_name(features.LABEL_KEY)] = raw_label

    return outputs
def preprocess(inputs):
    """tf.transform callback that transforms taxi features under their own keys.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations. The
      label is a boolean tensor: True where the fare is not NaN and the tip
      exceeds 20% of the fare.
    """
    outputs = {}

    # Dense floats are standardized to a z-score.
    for key in DENSE_FLOAT_FEATURE_KEYS:
        z_scored = transform.scale_to_z_score(inputs[key])
        outputs[key] = z_scored

    # Vocabulary features: non-string tensors are rendered as strings first,
    # and the vocabulary is written to 'vocab_<key>'.
    for key in VOCAB_FEATURE_KEYS:
        raw = inputs[key]
        vocab_tensor = raw if raw.dtype == tf.string else tf.as_string(raw)
        vocab_ids = transform.string_to_int(
            vocab_tensor,
            vocab_filename='vocab_' + key,
            top_k=VOCAB_SIZE,
            num_oov_buckets=OOV_SIZE)
        outputs[key] = vocab_ids

    for key in BUCKET_FEATURE_KEYS:
        bucket_ids = transform.bucketize(inputs[key], FEATURE_BUCKET_COUNT)
        outputs[key] = bucket_ids

    # Categorical features are converted to int64.
    for key in CATEGORICAL_FEATURE_KEYS:
        as_int64 = tf.to_int64(inputs[key])
        outputs[key] = as_int64

    taxi_fare = inputs[FARE_KEY]
    taxi_tip = inputs[LABEL_KEY]
    # Test if the tip was > 20% of the fare.
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    fare_is_valid = tf.logical_not(tf.is_nan(taxi_fare))
    tip_is_big = tf.greater(taxi_tip, tip_threshold)
    outputs[LABEL_KEY] = tf.logical_and(fare_is_valid, tip_is_big)

    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from output column name to transformed feature operation.
    """
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
        rescaled = tft.scale_to_0_1(inputs[key])
        outputs[key] = rescaled

    # Bucketize numeric columns into '<key>_bucketized'; the mapped value is
    # the number of buckets for that column.
    for key in TO_BE_BUCKETIZED_FEATURE:
        num_buckets = TO_BE_BUCKETIZED_FEATURE[key]
        bucket_ids = tft.bucketize(inputs[key], num_buckets)
        outputs[key + '_bucketized'] = bucket_ids

    # Categorical columns with a small vocabulary; the vocabulary file takes
    # the column's name.
    for key in STRING_TO_INT_FEATURE_KEYS:
        vocab_ids = tft.string_to_int(inputs[key], vocab_filename=key)
        outputs[key] = vocab_ids

    # Hashed string columns; the mapped value is passed as the second
    # argument of tft.hash_strings.
    for key in HASH_STRING_FEATURE_KEYS:
        hash_buckets = HASH_STRING_FEATURE_KEYS[key]
        hashed = tft.hash_strings(inputs[key], hash_buckets)
        outputs[key] = hashed

    def convert_label(label):
        """Look the string label up in the ['<=50K', '>50K'] index table.

        Args:
          label: string label tensor taken from ``inputs[LABEL_KEY]``.

        Returns:
          The tensor produced by the index-table lookup of ``label``.
        """
        label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
        return label_table.lookup(label)

    label_index = tft.apply_function(convert_label, inputs[LABEL_KEY])
    outputs[LABEL_KEY] = label_index

    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from output column name to transformed feature operation.
    """
    outputs = {}

    # Scale both groups of numeric columns to the range [0, 1], handling
    # NUMERIC_FEATURE_KEYS first and NUMERIC_FEATURE_KEYS_INT second.
    for numeric_keys in (NUMERIC_FEATURE_KEYS, NUMERIC_FEATURE_KEYS_INT):
        for key in numeric_keys:
            rescaled = tft.scale_to_0_1(inputs[key])
            outputs[key] = rescaled

    # Bucketize numeric columns into '<key>_b'; the mapped value is the
    # number of buckets for that column.
    for key in TO_BE_BUCKETIZED_FEATURE:
        num_buckets = TO_BE_BUCKETIZED_FEATURE[key]
        bucket_ids = tft.bucketize(inputs[key], num_buckets)
        outputs[f'{key}_b'] = bucket_ids

    # Hash string columns; the mapped value is passed as the second argument
    # of tft.hash_strings.
    for key in HASH_STRING_FEATURE_KEYS:
        hash_buckets = HASH_STRING_FEATURE_KEYS[key]
        hashed = tft.hash_strings(inputs[key], hash_buckets)
        outputs[key] = hashed

    # The label column is copied through unchanged.
    outputs[LABEL_KEY] = inputs[LABEL_KEY]

    return outputs
def preprocessing_fn(inputs):
    """Bucketize feature 'x' into three buckets.

    Args:
      inputs: map from feature keys to raw features; must contain 'x'.

    Returns:
      Dict with the single key 'x_bucketized'.
    """
    x_bucketized = tft.bucketize(inputs['x'], num_buckets=3, epsilon=0.00001)
    return {'x_bucketized': x_bucketized}
def transform(self, name, values):
    """Transform the feature stored under ``name`` in ``values``.

    Missing values are filled in first. The feature is bucketized when
    ``self.buckets`` is set and z-score scaled when it is ``None``.

    Args:
      name: key of the feature inside ``values``.
      values: map from feature keys to raw features.

    Returns:
      The bucketized or z-score-scaled feature.
    """
    filled = Feature.fill_in_missing(values[name])
    if self.buckets is not None:
        return tft.bucketize(filled, self.buckets)
    return tft.scale_to_z_score(filled)
def preprocessing_fn(inputs):
    """tf.transform callback for the taxi features, including n-gram features.

    https://cloud.google.com/solutions/machine-learning/data-preprocessing-for-ml-with-tf-transform-pt2

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from ``taxi.transformed_name(key)`` to transformed feature
      operations.
    """
    outputs = {}

    # Dense floats: fill in missing values, then standardize to a z-score.
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
        print('processing key', key)
        print('input:', inputs[key])
        z_scored = transform.scale_to_z_score(_fill_in_missing(inputs[key]))
        outputs[taxi.transformed_name(key)] = z_scored

    # Vocabulary features: learn a top-k vocabulary and emit integer ids.
    for key in taxi.VOCAB_FEATURE_KEYS:
        vocab_ids = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)
        outputs[taxi.transformed_name(key)] = vocab_ids

    # N-gram features: extract n-grams with the transform_ngrams helper and
    # build a vocabulary over them. (An earlier variant called
    # transform.ngrams on tf.string_split output with top_k=512.)
    for key in taxi.FEATURE_NGRAM:
        ngrams = transform_ngrams(
            _fill_in_missing(inputs[key]), taxi.NGRAM_RANGE)
        ngram_ids = transform.compute_and_apply_vocabulary(
            ngrams,
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)
        outputs[taxi.transformed_name(key)] = ngram_ids

    for key in taxi.BUCKET_FEATURE_KEYS:
        bucket_ids = transform.bucketize(
            _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)
        outputs[taxi.transformed_name(key)] = bucket_ids

    # Categorical features only have their missing values filled in.
    for key in taxi.CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[key])
        outputs[taxi.transformed_name(key)] = filled

    # Was this passenger a big tipper (tip > 20% of the fare)?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    fare_is_nan = tf.is_nan(taxi_fare)
    # Rows whose fare is NaN get label 0.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.where(fare_is_nan, zero_label, big_tipper)
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = label

    return outputs