def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph for the raw features.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (as produced by `_transformed_name`)
      to the transformed feature operation.
    """

    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies the feature and substitutes a default for
    # missing values -- confirm against its definition.
    def _filled(name):
        return _fill_in_missing(inputs[name])

    transformed = {}

    # Dense floats are standardised to z-scores.
    for name in _DENSE_FLOAT_FEATURE_KEYS:
        transformed[_transformed_name(name)] = tft.scale_to_z_score(
            _filled(name))

    # Vocabulary features are integerised through a computed vocabulary.
    for name in _VOCAB_FEATURE_KEYS:
        transformed[_transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                _filled(name),
                top_k=_VOCAB_SIZE,
                num_oov_buckets=_OOV_SIZE))

    # Bucket features are discretised into `_FEATURE_BUCKET_COUNT` buckets.
    for name in _BUCKET_FEATURE_KEYS:
        transformed[_transformed_name(name)] = tft.bucketize(
            _filled(name), _FEATURE_BUCKET_COUNT)

    # Categorical features only get their missing values filled in.
    for name in _CATEGORICAL_FEATURE_KEYS:
        transformed[_transformed_name(name)] = _filled(name)

    # TODO(b/157064428): Support label transformation for Keras.
    # The label is forwarded untouched: transforming it would result in
    # wrong evaluation.
    transformed[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
    return transformed
def preprocessing_fn(inputs):
    """Callback function for transforming inputs.

    The target and ID entries are removed from `inputs` with `pop`, so the
    mapping passed in by the caller is mutated.

    Args:
      inputs: A dict of feature keys mapped to `Tensor` or `SparseTensor` of
        raw features.

    Returns:
      Map from string feature keys to transformed features. The target is
      run through `utils.preprocess_sparsetensor`; the ID is forwarded as is.
    """
    cfg = features_config

    target = utils.preprocess_sparsetensor(inputs.pop(cfg.TARGET_FEATURE))
    transformed = {
        cfg.TARGET_FEATURE: target,
        cfg.ID_FEATURE: inputs.pop(cfg.ID_FEATURE),
    }

    # Numeric features: z-score standardisation.
    for name in cfg.NUMERIC_FEATURES:
        prepared = utils.preprocess_sparsetensor(inputs[name])
        transformed[utils.make_transformed_key(name)] = (
            transform.scale_to_z_score(prepared))

    # Categorical features: integerise through a computed vocabulary.
    for name in cfg.CATEGORICAL_FEATURES:
        prepared = utils.preprocess_sparsetensor(inputs[name])
        transformed[utils.make_transformed_key(name)] = (
            transform.compute_and_apply_vocabulary(
                prepared,
                top_k=cfg.VOCAB_SIZE,
                num_oov_buckets=cfg.OOV_SIZE))

    return transformed
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph for the raw features.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `_transformed_name`) to the
      transformed feature operation.
    """
    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies the feature and fills a default -- confirm.
    transformed = {}

    # Dense floats become z-scores.
    transformed.update({
        _transformed_name(name): tft.scale_to_z_score(
            _fill_in_missing(inputs[name]))
        for name in _DENSE_FLOAT_FEATURE_KEYS
    })

    # Vocabulary features are mapped to integer ids.
    transformed.update({
        _transformed_name(name): tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[name]),
            top_k=_VOCAB_SIZE,
            num_oov_buckets=_OOV_SIZE)
        for name in _VOCAB_FEATURE_KEYS
    })

    # Categorical features and the label only have missing values filled.
    for name in list(_CATEGORICAL_FEATURE_KEYS) + [_LABEL_KEY]:
        transformed[_transformed_name(name)] = _fill_in_missing(inputs[name])

    return transformed
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `features.transformed_name`) to
      the transformed feature operation. The label is an int64 flag that is 1
      when the tip exceeds 20% of the fare and 0 otherwise (including when
      the fare is NaN).
    """
    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies the feature and fills a default -- confirm.
    transformed = {}

    for name in features.DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[features.transformed_name(name)] = (
            tft.scale_to_z_score(filled))

    for name in features.VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[features.transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                filled,
                top_k=features.VOCAB_SIZE,
                num_oov_buckets=features.OOV_SIZE))

    # Each bucket feature is paired positionally with its own bucket count.
    bucket_specs = zip(features.BUCKET_FEATURE_KEYS,
                       features.BUCKET_FEATURE_BUCKET_COUNT)
    for name, bucket_count in bucket_specs:
        filled = _fill_in_missing(inputs[name])
        transformed[features.transformed_name(name)] = tft.bucketize(
            filled, bucket_count, always_return_num_quantiles=False)

    for name in features.CATEGORICAL_FEATURE_KEYS:
        transformed[features.transformed_name(name)] = (
            _fill_in_missing(inputs[name]))

    # Label: was the tip greater than 20% of the fare?
    fare = _fill_in_missing(inputs['fare'])
    tip = _fill_in_missing(inputs[features.LABEL_KEY])
    fare_is_nan = tf.math.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tip_label = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[features.transformed_name(features.LABEL_KEY)] = (
        tf.compat.v1.where(fare_is_nan, zero_label, big_tip_label))

    return transformed
def preprocess(input_features):
    """Build the tf.Transform graph for target, numeric and string features.

    Args:
      input_features: map from feature name to raw feature.

    Returns:
      Map from output feature name to transformed feature. Numeric features
      are emitted twice: z-scored under their own name and bucketized under
      `<name>_bucketized`. Categorical features are forwarded unchanged.
      Vocabulary features are emitted as `<name>_integerized`.
    """
    transformed = {
        metadata.TARGET_FEATURE_NAME:
            input_features[metadata.TARGET_FEATURE_NAME],
    }

    for name in metadata.NUMERIC_FEATURE_NAMES:
        raw = input_features[name]
        # The scaled value replaces the raw one under the original name
        # (no "_scaled" suffix).
        transformed[name] = tft.scale_to_z_score(raw)
        boundaries = tft.quantiles(
            raw, num_buckets=NUM_BUCKETS, epsilon=0.01)
        transformed[name + "_bucketized"] = tft.apply_buckets(
            raw, bucket_boundaries=boundaries)

    for name in metadata.CATEGORICAL_FEATURE_NAMES:
        raw = input_features[name]
        # The return value is discarded: the call is made only so that a
        # vocabulary named after the feature gets computed.
        tft.uniques(raw, vocab_filename=name)
        # The raw value is passed through; no "_integerized" column is
        # produced for categorical features.
        transformed[name] = raw

    for name in metadata.VOCAB_FEATURE_NAMES:
        transformed[name + "_integerized"] = tft.string_to_int(
            input_features[name],
            top_k=metadata.VOCAB_SIZE,
            num_oov_buckets=metadata.OOV_SIZE,
            vocab_filename=name)

    return transformed
def preprocessing_fn(
        inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
    """Build the tf.Transform preprocessing graph for the raw features.

    Parameters
    ----------
    inputs: map from feature keys to raw not-yet-transformed features.

    Returns
    -------
    Map from transformed feature key (via `_transformed_name`) to the
    transformed feature operation.
    """
    transformed = {}

    # Categorical features and the label are integerised through a
    # vocabulary stored under the feature's own key.
    for name in categorical_feature_keys + [label_key]:
        filled = _fill_in_missing(inputs[name])
        vocab_file = tft.vocabulary(filled, vocab_filename=name)
        transformed[_transformed_name(name)] = tft.apply_vocabulary(
            filled, vocab_file)

    # NOTE: This won't be correct in the incremental case since it's only
    # using the new examples to get the mean and variance.
    for name in numerical_feature_keys:
        z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[name]))
        # A new axis is inserted at position 1 of the scaled feature.
        transformed[_transformed_name(name)] = tf.expand_dims(
            z_scored, axis=1)

    return transformed
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph for the raw features.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from output feature key to transformed feature operation, each
      with its last axis squeezed away. Features that are neither numerical
      nor categorical are omitted; the target is forwarded under its
      original name.
    """
    transformed = {}

    for name in features.FEATURE_NAMES:
        if name in features.NUMERICAL_FEATURE_NAMES:
            transformed[features.transformed_name(name)] = (
                tft.scale_to_z_score(inputs[name]))
            continue
        if name in features.categorical_feature_names():
            transformed[features.transformed_name(name)] = (
                tft.compute_and_apply_vocabulary(
                    inputs[name],
                    num_oov_buckets=1,
                    vocab_filename=name,
                ))

    transformed[features.TARGET_FEATURE_NAME] = (
        inputs[features.TARGET_FEATURE_NAME])

    # Drop the trailing axis from every output, the target included.
    return {
        name: tf.squeeze(tensor, -1)
        for name, tensor in transformed.items()
    }
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `taxi.transformed_name`) to the
      transformed feature operation. The label is an int64 flag that is 1
      when the tip exceeds 20% of the fare and 0 otherwise (including when
      the fare is NaN).
    """

    # NOTE(review): `taxi.fill_in_missing` is defined elsewhere; presumably
    # it densifies the feature and fills a default -- confirm.
    def _filled(name):
        return taxi.fill_in_missing(inputs[name])

    transformed = {}

    # Dense floats become z-scores.
    transformed.update({
        taxi.transformed_name(name): tft.scale_to_z_score(_filled(name))
        for name in taxi.DENSE_FLOAT_FEATURE_KEYS
    })

    # Vocabulary features are mapped to integer ids.
    transformed.update({
        taxi.transformed_name(name): tft.compute_and_apply_vocabulary(
            _filled(name), top_k=10, num_oov_buckets=10)
        for name in taxi.VOCAB_FEATURE_KEYS
    })

    # Bucket features are discretised into 10 buckets.
    transformed.update({
        taxi.transformed_name(name): tft.bucketize(
            _filled(name), num_buckets=10)
        for name in taxi.BUCKET_FEATURE_KEYS
    })

    # Categorical features only get their missing values filled in.
    transformed.update({
        taxi.transformed_name(name): _filled(name)
        for name in taxi.CATEGORICAL_FEATURE_KEYS
    })

    # Was this passenger a big tipper?
    fare = _filled(taxi.FARE_KEY)
    tip = _filled(taxi.LABEL_KEY)
    fare_is_nan = tf.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # A "big" tip is one above 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tip_label = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tip_label)

    return transformed
def preprocessing_fn(inputs):
    """Project all non-label features onto two principal components.

    Args:
      inputs: map from feature keys to raw features. Every key other than
        `LABEL_KEY` is treated as a feature for the PCA.

    Returns:
      Map with three entries: `LABEL_KEY` (label with missing values filled
      in), 'Principal Component 1' and 'Principal Component 2'.
    """
    transformed = {LABEL_KEY: _fill_in_missing(inputs[LABEL_KEY])}

    # Fill missing values, then standardise each feature: PCA expects
    # standard-scaled inputs.
    scaled_columns = [
        tft.scale_to_z_score(_fill_in_missing(tensor))
        for name, tensor in inputs.items()
        if name != LABEL_KEY
    ]

    # Concatenate the columns into one feature matrix for PCA to run over.
    feature_matrix = tf.concat(scaled_columns, axis=1)

    # Orthonormal projection vectors for the two leading components.
    projection = tft.pca(feature_matrix, output_dim=2, dtype=tf.float32)

    # Project the examples, then split the result along axis 1 into
    # one tensor per component.
    components = tf.unstack(
        tf.linalg.matmul(feature_matrix, projection), axis=1)
    transformed['Principal Component 1'] = components[0]
    transformed['Principal Component 2'] = components[1]

    return transformed
def preprocessing_fn(input_features):
    """Build the tf.Transform graph from the features listed in `raw_schema`.

    Args:
      input_features: map from feature name to raw feature.

    Returns:
      Map from output feature name to transformed feature. Target and weight
      keep their names; other features get an "_integerized" or "_scaled"
      suffix, and an extra "age_bucketized" feature is added. Every value is
      passed through `_prep`.
    """
    passthrough_names = [TARGET_FEATURE_NAME, WEIGHT_FEATURE_NAME]
    processed = {}

    for feature in raw_schema.feature:
        raw = input_features[feature.name]
        if feature.name in passthrough_names:
            # Target and weight are not transformed beyond `_prep`.
            processed[feature.name] = _prep(raw)
        elif feature.type == 1:
            # NOTE(review): type 1 is presumably the schema's string/bytes
            # type -- confirm against the schema proto.
            # Extract a vocabulary and integerize the feature.
            processed[feature.name + "_integerized"] = _prep(
                tft.compute_and_apply_vocabulary(
                    raw, vocab_filename=feature.name))
        else:
            # Everything else is treated as numeric and normalized.
            processed[feature.name + "_scaled"] = _prep(
                tft.scale_to_z_score(raw))

    # Additionally bucketize age into 5 quantile buckets.
    age_boundaries = tft.quantiles(
        input_features["age"], num_buckets=5, epsilon=0.01)
    processed["age_bucketized"] = _prep(
        tft.apply_buckets(
            input_features["age"], bucket_boundaries=age_boundaries))

    return processed
def preprocessing_fn(inputs):
    """Preprocesses Covertype Dataset.

    Numerical features are scaled to z-scores, categorical features are
    mapped to integer ids through a generated vocabulary, and the label is
    shifted down by one.

    Args:
      inputs: A map from feature keys to raw not-yet-transformed features

    Returns:
      A map from transformed feature keys to transformation operations
    """
    transformed = {}

    # Scale numerical features.
    for name in NUMERIC_FEATURES_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = tft.scale_to_z_score(filled)

    # Generate vocabularies (one file per feature key) and apply them.
    for name in CATEGORICAL_FEATURES_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                x=filled, num_oov_buckets=1, vocab_filename=name))

    # Convert Cover_Type from 1-7 to 0-6.
    label = _fill_in_missing(inputs[LABEL_KEY])
    transformed[_transformed_name(LABEL_KEY)] = label - 1

    return transformed
def preprocessing_fn(inputs):
    """Scale every feature in `_FEATURE_KEYS` to a z-score.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `_transformed_name`) to the
      transformed feature operation. The label is forwarded unchanged.
    """
    transformed = {
        _transformed_name(name): tft.scale_to_z_score(inputs[name])
        for name in _FEATURE_KEYS
    }
    transformed[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
    return transformed
def preprocessing_fn(inputs: Dict[str, Tensor],
                     custom_config: Dict[str, Any]) -> Dict[str, Tensor]:
    """tf.transform's callback function for preprocessing inputs.

    Float32 features are z-score normalised, except a regression label,
    which is passed through untouched. Every other feature is integerised
    through a vocabulary. Feature keys are rewritten with
    `_sanitize_feature_name`; label keys are kept verbatim.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
      custom_config: Custom configuration dictionary for passing the task's
        ProblemStatement as a text proto, since custom_config must be
        JSON-serializable. Read under
        `BasicPreprocessor.PROBLEM_STATEMENT_KEY`.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    problem_statement = ps_pb2.ProblemStatement()
    text_format.Parse(
        text=custom_config[BasicPreprocessor.PROBLEM_STATEMENT_KEY],
        message=problem_statement)

    # Only the first task is consulted. It does not depend on the feature
    # being processed, so the label names are resolved once up front.
    task_type = problem_statement.tasks[0].type

    regression_label = None
    if task_type.HasField('one_dimensional_regression'):
        regression_label = task_type.one_dimensional_regression.label

    classification_labels = []
    if task_type.HasField('multi_class_classification'):
        classification_labels.append(
            task_type.multi_class_classification.label)
    if task_type.HasField('binary_classification'):
        classification_labels.append(task_type.binary_classification.label)

    outputs = {}

    # First pass: float32 features.
    # TODO(weill): Handle case when an int field can actually represents
    # numeric rather than categorical values.
    for key, tensor in inputs.items():
        if tensor.dtype != tf.float32:
            continue
        if key == regression_label:
            # Skip normalizing the regression label; keep its key as is.
            outputs[key] = tensor
            continue
        outputs[_sanitize_feature_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(tensor))

    # Second pass: everything that is not float32 gets a vocabulary.
    # TODO(weill): Risk here to blow up computation needlessly.
    for key, tensor in inputs.items():
        if tensor.dtype == tf.float32:
            continue
        integerized = tft.compute_and_apply_vocabulary(
            _fill_in_missing(tensor), top_k=None, num_oov_buckets=1)
        # Label key names are not sanitized; feature key names are.
        if key in classification_labels:
            outputs[key] = integerized
        else:
            outputs[_sanitize_feature_name(key)] = integerized

    return outputs
def preprocess(inputs):
    """Standardise the median age and integerise the ocean proximity.

    Args:
      inputs: a batch of input features; must contain "housing_median_age"
        and "ocean_proximity".

    Returns:
      Dict with "standardized_median_age" and "ocean_proximity_id".
    """
    age = inputs["housing_median_age"]
    proximity = inputs["ocean_proximity"]

    # NOTE(review): the mean is subtracted before z-scoring; this looks
    # redundant given that z-scoring centres the data itself -- confirm
    # before removing.
    centered_age = age - tft.mean(age)

    return {
        "standardized_median_age": tft.scale_to_z_score(centered_age),
        "ocean_proximity_id": tft.compute_and_apply_vocabulary(proximity),
    }
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs (dict): dict of input columns

    Returns:
        output dict of transformed columns
    """
    normalized_columns = [
        'ButterTemperature',
        'SugarHumidity',
        'FlourHumidity',
        'HeatingTime',
        'MixingTime',
        'Density',
        'Temperature',
        'Humidity',
    ]

    transformed = {}

    # Categorical column: integerise through a vocabulary.
    transformed['MixingSpeed'] = tft.compute_and_apply_vocabulary(
        inputs['MixingSpeed'])
    transformed['ButterMass'] = inputs['ButterMass']

    # Derived features: total mass, then each ingredient's share of it,
    # z-score normalised.
    total_mass = (
        inputs['ButterMass'] + inputs['SugarMass'] + inputs['FlourMass'])
    transformed['TotalMass'] = total_mass
    for ingredient in ['Butter', 'Sugar', 'Flour']:
        share = inputs['{}Mass'.format(ingredient)] / total_mass
        transformed['Norm{}perc'.format(ingredient)] = (
            tft.scale_to_z_score(share))

    # Absolute numeric columns are kept untouched.
    for column in ['TotalVolume', 'Energy']:
        transformed[column] = inputs[column]

    # Remaining numeric columns are z-score normalised in place.
    for column in normalized_columns:
        transformed[column] = tft.scale_to_z_score(inputs[column])

    # Flag rows whose problem description mentions "chunk": such strings
    # are rewritten to exactly 'chunk' and then compared for equality.
    chunk_marker = tf.regex_replace(
        input=inputs['Problems'],
        pattern='.*chunk.*',
        rewrite='chunk',
        name='DetectChunk')
    transformed['Chunks'] = tf.cast(
        tf.equal(chunk_marker, 'chunk'), tf.float32)

    return transformed
def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
    """Z-score the dense features and forward binary features and label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map holding each dense feature under `<name>_xf` (z-scored), and each
      binary feature and the label under their original names, unchanged.
    """
    transformed = {
        f'{feat}_xf': tft.scale_to_z_score(inputs[feat])
        for feat in DENSE_FEATURES
    }
    transformed.update({feat: inputs[feat] for feat in BINARY_FEATURES})
    transformed[LABEL_KEY] = inputs[LABEL_KEY]
    return transformed
def transform_to_tfrecord(self, inputs):
    """Preprocess raw input columns into transformed columns.

    Args:
      inputs: map from column key to raw column. It is shallow-copied, so
        the caller's mapping is not modified.

    Returns:
      Copy of `inputs` in which every key listed in
      `self.data_formatter.number_features` is replaced by its z-score.
    """
    formatter = self.data_formatter
    transformed = inputs.copy()

    transformed.update({
        name: tft.scale_to_z_score(transformed[name])
        for name in formatter.number_features
    })

    # The return value is discarded: the call is made only so that a
    # vocabulary named after each key gets computed. The columns themselves
    # stay untouched.
    for name in formatter.vocabulary_features:
        tft.vocabulary(inputs[name], vocab_filename=name)

    return transformed
def preprocessing_fn(inputs):
    """Scale every feature in `_FEATURE_KEYS` to a z-score.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `_transformed_name`) to the
      transformed feature operation. The label is forwarded unchanged.
    """
    scaled = {
        _transformed_name(name): tft.scale_to_z_score(inputs[name])
        for name in _FEATURE_KEYS
    }
    scaled[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
    return scaled
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `features.transformed_name`) to
      the transformed feature operation. The label is an int64 flag that is 1
      when the tip exceeds 20% of the fare and 0 otherwise (including when
      the fare is NaN).
    """

    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies the feature and fills a default -- confirm.
    def _filled(name):
        return _fill_in_missing(inputs[name])

    xf = features.transformed_name
    transformed = {}

    # Dense floats become z-scores.
    transformed.update({
        xf(name): tft.scale_to_z_score(_filled(name))
        for name in features.DENSE_FLOAT_FEATURE_KEYS
    })

    # Vocabulary features are mapped to integer ids.
    transformed.update({
        xf(name): tft.compute_and_apply_vocabulary(
            _filled(name),
            top_k=features.VOCAB_SIZE,
            num_oov_buckets=features.OOV_SIZE)
        for name in features.VOCAB_FEATURE_KEYS
    })

    # Each bucket feature is paired positionally with its own bucket count.
    transformed.update({
        xf(name): tft.bucketize(
            _filled(name), bucket_count, always_return_num_quantiles=False)
        for name, bucket_count in zip(features.BUCKET_FEATURE_KEYS,
                                      features.BUCKET_FEATURE_BUCKET_COUNT)
    })

    # Categorical features only get their missing values filled in.
    transformed.update({
        xf(name): _filled(name)
        for name in features.CATEGORICAL_FEATURE_KEYS
    })

    # Was this passenger a big tipper?
    fare = _filled('fare')
    tip = _filled(features.LABEL_KEY)
    fare_is_nan = tf.math.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # A "big" tip is one above 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tip_label = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[xf(features.LABEL_KEY)] = tf.compat.v1.where(
        fare_is_nan, zero_label, big_tip_label)

    return transformed
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `my_metadata.transformed_name`)
      to the transformed feature operation. The label is an int64 flag that
      is 1 when the tip exceeds 20% of the fare and 0 otherwise (including
      when the fare is NaN).
    """
    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies the feature and fills a default -- confirm.
    meta = my_metadata
    transformed = {}

    # Numeric features become z-scores.
    for name in meta.NUMERIC_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[meta.transformed_name(name)] = (
            transform.scale_to_z_score(filled))

    # Vocabulary features: the vocabulary file is named after the
    # transformed key.
    for name in meta.VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[meta.transformed_name(name)] = (
            transform.compute_and_apply_vocabulary(
                filled,
                vocab_filename=meta.transformed_name(name),
                num_oov_buckets=meta.OOV_SIZE,
                top_k=meta.VOCAB_SIZE))

    # String features hashed into a per-feature number of buckets.
    for name, bucket_count in meta.HASH_STRING_FEATURE_KEYS.items():
        filled = _fill_in_missing(inputs[name])
        transformed[meta.transformed_name(name)] = transform.hash_strings(
            filled, hash_buckets=bucket_count)

    # Bucketized features are emitted under "<key>_bucketized".
    for name, bucket_count in meta.TO_BE_BUCKETIZED_FEATURE.items():
        filled = _fill_in_missing(inputs[name])
        transformed[meta.transformed_name(name + '_bucketized')] = (
            transform.bucketize(filled, bucket_count))

    # Was this passenger a big tipper?
    fare = _fill_in_missing(inputs[meta.FARE_KEY])
    tip = _fill_in_missing(inputs[meta.LABEL_KEY])
    fare_is_nan = tf.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # A "big" tip is one above 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tip_label = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[meta.transformed_name(meta.LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tip_label)

    return transformed
def preprocess_fn(inputs):
    """Build the tf.Transform preprocessing graph for the raw features.

    Outputs keep the original feature keys. Any boolean output (the label
    in particular) is finally integerised through a vocabulary.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    transformed = {}

    # Dense floats become z-scores.
    for name in DENSE_FLOAT_FEATURE_KEYS:
        transformed[name] = tft.scale_to_z_score(to_dense(inputs[name]))

    # Vocabulary features: non-string values are stringified first.
    for name in VOCAB_FEATURE_KEYS:
        dense = to_dense(inputs[name])
        is_string = inputs[name].dtype == tf.string
        vocab_input = dense if is_string else tf.as_string(dense)
        transformed[name] = tft.compute_and_apply_vocabulary(
            vocab_input,
            vocab_filename='vocab_' + name,
            top_k=VOCAB_SIZE,
            num_oov_buckets=OOV_SIZE)

    for name in BUCKET_FEATURE_KEYS:
        transformed[name] = tft.bucketize(
            to_dense(inputs[name]), FEATURE_BUCKET_COUNT)

    for name in CATEGORICAL_FEATURE_KEYS:
        transformed[name] = tf.cast(to_dense(inputs[name]), tf.int64)

    # Label: the fare is not NaN and the tip is above 20% of the fare.
    fare = to_dense(inputs[FARE_KEY])
    tip = to_dense(inputs[LABEL_KEY])
    threshold = tf.multiply(fare, tf.constant(0.2))
    fare_is_valid = tf.logical_not(tf.math.is_nan(fare))
    transformed[LABEL_KEY] = tf.logical_and(
        fare_is_valid, tf.greater(tip, threshold))

    # Replace every boolean output by vocabulary ids of its string form.
    for name, tensor in list(transformed.items()):
        if tensor.dtype == tf.bool:
            transformed[name] = tft.compute_and_apply_vocabulary(
                tf.as_string(tensor), vocab_filename='vocab_' + name)

    return transformed
def preprocessing_fn(inputs):
    """Scale every feature in `_FEATURE_KEYS` to a z-score.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `_transformed_name`) to the
      transformed feature operation.
    """
    transformed = {
        _transformed_name(name): tft.scale_to_z_score(inputs[name])
        for name in _FEATURE_KEYS
    }
    # TODO(b/157064428): Support label transformation for Keras.
    # The label is forwarded untouched: transforming it would result in
    # wrong evaluation.
    transformed[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
    return transformed
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Outputs keep the original feature keys.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations. The
      label is an int64 flag that is 1 when the tip exceeds 20% of the fare
      and 0 otherwise (including when the fare is NaN).
    """
    transformed = {}

    # Dense floats become z-scores.
    transformed.update({
        name: transform.scale_to_z_score(inputs[name])
        for name in taxi.DENSE_FLOAT_FEATURE_KEYS
    })

    # Vocabulary features are mapped to integer ids.
    transformed.update({
        name: transform.string_to_int(
            inputs[name],
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)
        for name in taxi.VOCAB_FEATURE_KEYS
    })

    # Bucket features are discretised.
    transformed.update({
        name: transform.bucketize(inputs[name], taxi.FEATURE_BUCKET_COUNT)
        for name in taxi.BUCKET_FEATURE_KEYS
    })

    # Categorical features are forwarded unchanged.
    transformed.update({
        name: inputs[name] for name in taxi.CATEGORICAL_FEATURE_KEYS
    })

    # Was this passenger a big tipper?
    def convert_label(label):
        fare = inputs[taxi.FARE_KEY]
        fare_is_nan = tf.is_nan(fare)
        zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
        # A "big" tip is one above 20% of the fare.
        tip_threshold = tf.multiply(fare, tf.constant(0.2))
        big_tip_label = tf.cast(tf.greater(label, tip_threshold), tf.int64)
        return tf.where(fare_is_nan, zero_label, big_tip_label)

    transformed[taxi.LABEL_KEY] = transform.apply_function(
        convert_label, inputs[taxi.LABEL_KEY])

    return transformed
def wide_preprocessing_fn(inputs):
    """TFT preprocessing function producing a configurable number of outputs.

    The source key lists are cycled so that exactly `self._num_bucketize`,
    `self._num_scale` and `self._num_vocabs` outputs of each kind are
    produced. `self` is taken from the enclosing scope.

    Args:
      inputs: Map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    # pylint: disable=protected-access

    def _numbered_keys(keys, count):
        # Yields (index, key) for `count` keys, wrapping around `keys`.
        return enumerate(itertools.islice(itertools.cycle(keys), count))

    fill = taxi_utils._fill_in_missing
    transformed = {}

    for idx, key in _numbered_keys(taxi_utils._BUCKET_FEATURE_KEYS,
                                   self._num_bucketize):
        transformed["bucketized" + str(idx)] = tft.bucketize(
            fill(inputs[key]), taxi_utils._FEATURE_BUCKET_COUNT)

    for idx, key in _numbered_keys(taxi_utils._DENSE_FLOAT_FEATURE_KEYS,
                                   self._num_scale):
        transformed["scaled" + str(idx)] = tft.scale_to_z_score(
            fill(inputs[key]))

    for idx, key in _numbered_keys(taxi_utils._VOCAB_FEATURE_KEYS,
                                   self._num_vocabs):
        transformed["vocab" + str(idx)] = tft.compute_and_apply_vocabulary(
            fill(inputs[key]),
            top_k=taxi_utils._VOCAB_SIZE,
            num_oov_buckets=taxi_utils._OOV_SIZE)

    # Categorical features and the label are passed through as they are.
    passthrough_keys = taxi_utils._CATEGORICAL_FEATURE_KEYS + [
        taxi_utils._LABEL_KEY
    ]
    transformed.update({key: inputs[key] for key in passthrough_keys})

    return transformed
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Every input feature except the label is filled in with
    `_fill_in_missing` and scaled to a z-score; the label is forwarded
    unchanged.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    # Iterate the *input* keys. The previous loop walked the keys of the
    # still-empty `outputs` dict, so it never executed and no feature was
    # emitted at all.
    # NOTE(review): assumes every non-label input is numeric -- confirm
    # against the feature definitions.
    for key in inputs:
        if key == features.LABEL_KEY:
            continue
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(inputs[key]))

    outputs[features.transformed_name(
        features.LABEL_KEY)] = inputs[features.LABEL_KEY]

    return outputs
def def_preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `taxi.transformed_name`) to the
      transformed feature operation. The label is an int64 flag that is 1
      when the tip exceeds 20% of the fare and 0 otherwise (including when
      the fare is NaN).
    """
    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies the feature and fills a default -- confirm.
    transformed = {}

    # Dense floats become z-scores.
    for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[taxi.transformed_name(name)] = (
            transform.scale_to_z_score(filled))

    # Vocabulary features are mapped to integer ids.
    for name in taxi.VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[taxi.transformed_name(name)] = (
            transform.compute_and_apply_vocabulary(
                filled,
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE))

    # Bucket features are discretised.
    for name in taxi.BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[taxi.transformed_name(name)] = transform.bucketize(
            filled, taxi.FEATURE_BUCKET_COUNT)

    # Categorical features only get their missing values filled in.
    for name in taxi.CATEGORICAL_FEATURE_KEYS:
        transformed[taxi.transformed_name(name)] = (
            _fill_in_missing(inputs[name]))

    # Was this passenger a big tipper?
    fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tip = _fill_in_missing(inputs[taxi.LABEL_KEY])
    fare_is_nan = tf.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # A "big" tip is one above 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tip_label = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tip_label)

    return transformed
def preprocessing_fn(inputs):
    """Preprocesses Covertype Dataset.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `features.transformed_name`) to
      the transformed feature operation.
    """

    def _filled(name):
        return _fill_in_missing(inputs[name])

    xf = features.transformed_name

    # Scale numerical features.
    transformed = {
        xf(name): tft.scale_to_z_score(_filled(name))
        for name in features.NUMERIC_FEATURE_KEYS
    }

    # Generate vocabularies (one file per feature key) and apply them to
    # the categorical features.
    transformed.update({
        xf(name): tft.compute_and_apply_vocabulary(
            x=_filled(name), num_oov_buckets=1, vocab_filename=name)
        for name in features.CATEGORICAL_FEATURE_KEYS
    })

    # The label only has its missing values filled in.
    transformed[xf(features.LABEL_KEY)] = _filled(features.LABEL_KEY)

    return transformed
def preprocessing_fn(inputs, custom_config):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
      custom_config: additional properties for pre-processing. The optional
        'VOCAB_SIZE' and 'OOV_SIZE' entries override the module defaults
        `_VOCAB_SIZE` and `_OOV_SIZE`.

    Returns:
      Map from string feature key to transformed features. The label is an
      int64 flag that is 1 when the tip exceeds 20% of the fare and 0
      otherwise (including when the fare is NaN).
    """
    # NOTE(review): `_fill_in_missing` and `_identity` are defined outside
    # this block -- confirm their behaviour there.
    transformed = {}

    # Dense floats become z-scores.
    for name in _DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(_identity(inputs[name]))
        transformed[_transformed_name(name)] = tft.scale_to_z_score(filled)

    # Vocabulary features are mapped to integer ids.
    for name in _VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                filled,
                top_k=custom_config.get('VOCAB_SIZE', _VOCAB_SIZE),
                num_oov_buckets=custom_config.get('OOV_SIZE', _OOV_SIZE)))

    # Bucket features are discretised.
    for name in _BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = tft.bucketize(
            filled, _FEATURE_BUCKET_COUNT)

    # Categorical features only get their missing values filled in.
    for name in _CATEGORICAL_FEATURE_KEYS:
        transformed[_transformed_name(name)] = _fill_in_missing(inputs[name])

    # Was this passenger a big tipper?
    fare = _fill_in_missing(inputs[_FARE_KEY])
    tip = _fill_in_missing(inputs[_LABEL_KEY])
    fare_is_nan = tf.math.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # A "big" tip is one above 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tip_label = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
        fare_is_nan, zero_label, big_tip_label)

    return transformed
def transform_to_tfrecord(self, inputs):
    """Preprocess raw input columns into transformed columns.

    Args:
      inputs: map from column key to raw column. It is shallow-copied, so
        the caller's mapping is not modified.

    Returns:
      Copy of `inputs` in which every key listed in
      `enabled_number_features` is replaced by its z-score.
    """
    transformed = inputs.copy()

    transformed.update({
        name: tft.scale_to_z_score(transformed[name])
        for name in enabled_number_features
    })

    # NOTE: optional (sparse) numeric features are not handled here; an
    # earlier, disabled variant densified them with a default of 0. and
    # applied tft.scale_to_0_1.

    # The return value is discarded: the call is made only so that a
    # vocabulary named after each key gets computed. The columns themselves
    # stay untouched.
    for name in enabled_vocabulary_features:
        tft.vocabulary(inputs[name], vocab_filename=name)

    return transformed
def preprocessing_fn(inputs):
    """Z-score the dense float features and cast the label to int64.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `_transformed_name`) to the
      transformed feature operation.
    """
    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies the feature and fills a default -- confirm.
    transformed = {
        _transformed_name(name): tft.scale_to_z_score(
            _fill_in_missing(inputs[name]))
        for name in _DENSE_FLOAT_FEATURE_KEYS
    }

    # The label (flower variety) is filled in and cast to int64.
    label = _fill_in_missing(inputs[_LABEL_KEY])
    transformed[_transformed_name(_LABEL_KEY)] = tf.cast(label, tf.int64)

    return transformed
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph, including a binary label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key (via `_transformed_name`) to the
      transformed feature operation. The label is an int64 flag that is 1
      when the tip exceeds 20% of the fare and 0 otherwise (including when
      the fare is NaN).
    """

    # NOTE(review): `_fill_in_missing` is defined outside this block;
    # presumably it densifies sparse features and replaces missing values
    # with 0 or '' -- confirm against its definition.
    def _filled(name):
        return _fill_in_missing(inputs[name])

    transformed = {}

    # Dense floats become z-scores.
    transformed.update({
        _transformed_name(name): tft.scale_to_z_score(_filled(name))
        for name in _DENSE_FLOAT_FEATURE_KEYS
    })

    # Vocabulary features are mapped to integer ids.
    transformed.update({
        _transformed_name(name): tft.compute_and_apply_vocabulary(
            _filled(name), top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE)
        for name in _VOCAB_FEATURE_KEYS
    })

    # Bucket features are discretised.
    transformed.update({
        _transformed_name(name): tft.bucketize(
            _filled(name), _FEATURE_BUCKET_COUNT)
        for name in _BUCKET_FEATURE_KEYS
    })

    # Categorical features only get their missing values filled in.
    transformed.update({
        _transformed_name(name): _filled(name)
        for name in _CATEGORICAL_FEATURE_KEYS
    })

    # Was this passenger a big tipper?
    fare = _filled(_FARE_KEY)
    tip = _filled(_LABEL_KEY)
    fare_is_nan = tf.math.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    # A "big" tip is one above 20% of the fare.
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tip_label = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[_transformed_name(_LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tip_label)

    return transformed