def preprocessing_fn(inputs):
    """Build the tf.transform operations for every configured feature group.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key to the transformed feature operation.
    """

    def _filled(feature_key):
        # Every raw feature goes through the module's missing-value helper
        # before any further transformation.
        return _fill_in_missing(inputs[feature_key])

    outputs = {}

    # Dense float features are standardized to z-scores.
    for name in _DENSE_FLOAT_FEATURE_KEYS:
        scaled = tft.scale_to_z_score(_filled(name))
        outputs[_transformed_name(name)] = scaled

    # Vocabulary features are replaced by integer ids from a computed
    # vocabulary limited to the top _VOCAB_SIZE terms plus OOV buckets.
    for name in _VOCAB_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            _filled(name), top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE)
        outputs[_transformed_name(name)] = ids

    # Bucket features are quantized into _FEATURE_BUCKET_COUNT buckets.
    for name in _BUCKET_FEATURE_KEYS:
        bucketed = tft.bucketize(_filled(name), _FEATURE_BUCKET_COUNT)
        outputs[_transformed_name(name)] = bucketed

    # Categorical features only get their missing values filled in.
    for name in _CATEGORICAL_FEATURE_KEYS:
        outputs[_transformed_name(name)] = _filled(name)

    # TODO(b/157064428): Support label transformation for Keras.
    # The label is forwarded untouched; transforming it would result in
    # wrong evaluation.
    outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
    return outputs
def _tokenize_review(review):
    """Split reviews into words and map them to fixed-width vocabulary ids.

    A vocabulary is computed over the split words and each word is replaced
    by its index in that vocabulary.

    Args:
      review: tensors containing the reviews. (batch_size/None, 1)

    Returns:
      Tokenized and padded review tensors. (batch_size/None, _MAX_LEN)
    """
    flat_reviews = tf.reshape(review, [-1])
    words = tf.strings.split(flat_reviews).to_sparse()

    # tft.apply_vocabulary doesn't reserve 0 for oov words. To follow the
    # usual convention (and allow mask_zero in a keras embedding layer), oov
    # words get _VOCAB_SIZE and padding gets -1 here; everything is shifted
    # up by one at the end so padding becomes 0.
    word_ids = tft.compute_and_apply_vocabulary(
        words, default_value=_VOCAB_SIZE, top_k=_VOCAB_SIZE)
    id_matrix = tf.sparse.to_dense(word_ids, default_value=-1)

    # TFX transform expects the transform result to be FixedLenFeature:
    # append _MAX_LEN columns of padding on the right, then keep exactly the
    # first _MAX_LEN columns of every row.
    id_matrix = tf.pad(
        id_matrix,
        paddings=[[0, 0], [0, _MAX_LEN]],
        mode='CONSTANT',
        constant_values=-1)
    fixed_width = tf.slice(id_matrix, begin=[0, 0], size=[-1, _MAX_LEN])
    return fixed_width + 1
def preprocessing_fn(inputs):
    """Preprocesses Covertype Dataset.

    Numerical features are scaled to z-scores, categorical features are
    mapped to vocabulary ids, and the label is shifted down by one.

    Args:
      inputs: A map from feature keys to raw not-yet-transformed features

    Returns:
      A map from transformed feature keys to transformation operations
    """

    def _filled(feature_key):
        # All features pass through the module's missing-value helper first.
        return _fill_in_missing(inputs[feature_key])

    outputs = {}

    # Numerical features: z-score scaling.
    for name in NUMERIC_FEATURE_KEYS:
        scaled = tft.scale_to_z_score(_filled(name))
        outputs[_transformed_name(name)] = scaled

    # Categorical features: one vocabulary file per feature (named after the
    # feature key) with a single out-of-vocabulary bucket.
    for name in CATEGORICAL_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            x=_filled(name), num_oov_buckets=1, vocab_filename=name)
        outputs[_transformed_name(name)] = ids

    # Convert Cover_Type from 1-7 to 0-6.
    zero_based_label = _filled(LABEL_KEY) - 1
    outputs[_transformed_name(LABEL_KEY)] = zero_based_label
    return outputs
def compute_vocab_fn(inputs):
    """Preprocessing fn that assigns vocabulary ids to the sparse features.

    The label and the numeric features are passed through unchanged. Each
    categorical feature is run through tft.compute_and_apply_vocabulary with
    an explicit vocab_filename; the function relies on the implicit behavior
    that the vocab file is written under that name.

    Pre-condition: Sparse features have been converted to integer and mod'ed
    with MAX_IND_RANGE.

    Args:
      inputs: Input features to transform.

    Returns:
      Output dict with transformed features.
    """
    outputs = {LABEL_KEY: inputs[LABEL_KEY]}
    outputs.update((name, inputs[name]) for name in NUMERIC_FEATURE_KEYS)

    for position, name in enumerate(CATEGORICAL_FEATURE_KEYS):
        # The vocabulary file name is derived from the feature's position in
        # CATEGORICAL_FEATURE_KEYS, not from its key.
        vocab_name = "feature_{}_vocab".format(position)
        outputs[name] = tft.compute_and_apply_vocabulary(
            x=inputs[name], vocab_filename=vocab_name)
    return outputs
def _preprocess_tft(raw_data, user_freq, item_freq):
    """Creates vocabularies for users and items and maps their ids to ints.

    Args:
      raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
      user_freq: minimum frequency of a user to include it in the user vocab.
      item_freq: minimum frequency of an item to include it in the item vocab.

    Returns:
      A dict containing int ids corresponding to a user_id and item_id and
      other features: {$user_key: $user_id, $item_key: $item_id, ...}.
    """
    default_id = constants.TFT_DEFAULT_ID
    features = {name: raw_data[name] for name in constants.BQ_FEATURES}

    # The item vocabulary is computed once and shared by the item column and
    # the top-10 column, so both are mapped with the same ids.
    item_vocab = tft.vocabulary(
        raw_data[constants.ITEM_KEY],
        vocab_filename=constants.ITEM_VOCAB_NAME,
        frequency_threshold=item_freq)

    def _item_ids(tensor):
        return tft.apply_vocabulary(
            tensor, item_vocab, default_value=default_id)

    features[constants.TFT_USER_KEY] = tft.compute_and_apply_vocabulary(
        raw_data[constants.USER_KEY],
        vocab_filename=constants.USER_VOCAB_NAME,
        frequency_threshold=user_freq,
        default_value=default_id)
    features[constants.TFT_ITEM_KEY] = _item_ids(
        raw_data[constants.ITEM_KEY])
    features[constants.TFT_ARTIST_KEY] = tft.compute_and_apply_vocabulary(
        raw_data[constants.ARTIST_KEY],
        vocab_filename=constants.ARTIST_VOCAB_NAME,
        default_value=default_id)
    features[constants.TFT_TAGS_KEY] = tft.compute_and_apply_vocabulary(
        raw_data[constants.TAGS_KEY],
        vocab_filename=constants.TAG_VOCAB_NAME,
        default_value=default_id)
    features[constants.TFT_TOP_10_KEY] = _item_ids(
        raw_data[constants.TOP_10_KEY])
    return features
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Numeric columns are scaled to [0, 1], categorical columns become one-hot
    vectors over a computed vocabulary, and the label is mapped to a one-hot
    vector over the two known label strings.
    """
    # Some features are modified and others are left unchanged, so start from
    # a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for name in NUMERIC_FEATURE_KEYS:
        outputs[name] = tft.scale_to_0_1(inputs[name])

    for name in OPTIONAL_NUMERIC_FEATURE_KEYS:
        # Optional columns arrive as SparseTensors; rebuild them with a dense
        # shape of [batch, 1] and fill missing entries with 0.
        optional = inputs[name]
        as_column = tf.sparse.SparseTensor(
            optional.indices, optional.values, [optional.dense_shape[0], 1])
        filled = tf.sparse.to_dense(sp_input=as_column, default_value=0.)
        # Reshape from a batch of vectors of size 1 to a batch of scalars.
        outputs[name] = tft.scale_to_0_1(tf.squeeze(filled, axis=1))

    # Every categorical column except the label gets a vocabulary and is
    # converted from a string to a one-hot encoding.
    for name in CATEGORICAL_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            tf.strings.strip(inputs[name]),
            num_oov_buckets=NUM_OOV_BUCKETS,
            vocab_filename=name)
        depth = (
            tft.experimental.get_vocabulary_size_by_name(name)
            + NUM_OOV_BUCKETS)
        one_hot = tf.one_hot(
            ids, depth=tf.cast(depth, tf.int32), on_value=1.0, off_value=0.0)
        # The output is one-hot encoded; saving transformed data to disk in
        # this form can incur significant memory cost.
        outputs[name] = tf.reshape(one_hot, [-1, depth])

    # The label column uses a fixed mapping from string to index; strings not
    # in the table map to -1.
    label_classes = ['>50K', '<=50K']
    num_classes = len(label_classes)
    with tf.init_scope():
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=label_classes,
            values=tf.cast(tf.range(num_classes), tf.int64),
            key_dtype=tf.string,
            value_dtype=tf.int64)
        table = tf.lookup.StaticHashTable(initializer, default_value=-1)

    # Remove periods (present as trailing periods in the test data when it is
    # read with tf.data) and surrounding whitespace before the lookup.
    cleaned_label = tf.strings.strip(
        tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', ''))
    label_ids = table.lookup(cleaned_label)
    label_one_hot = tf.one_hot(
        indices=label_ids, depth=num_classes, on_value=1.0, off_value=0.0)
    outputs[LABEL_KEY] = tf.reshape(label_one_hot, [-1, num_classes])
    return outputs
def preprocessing_fn(inputs):
#end::entry_point[]
#tag::logic[]
    """Map the "body" feature to ids from a vocabulary of its top 1000 terms.

    Args:
      inputs: map from feature keys to raw features; must contain "body".

    Returns:
      Dict with the single key "body_stuff" holding the vocabulary ids.
    """
    # TFT business logic goes here
    body_ids = tft.compute_and_apply_vocabulary(inputs["body"], top_k=1000)
    return {"body_stuff": body_ids}
def preprocess_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}

    # Dense float features are densified and standardized to z-scores.
    for name in DENSE_FLOAT_FEATURE_KEYS:
        outputs[name] = tft.scale_to_z_score(to_dense(inputs[name]))

    # Vocabulary features: non-string inputs are converted to strings first
    # so that every vocabulary is built over string values.
    for name in VOCAB_FEATURE_KEYS:
        dense = to_dense(inputs[name])
        vocab_input = (
            dense if inputs[name].dtype == tf.string else tf.as_string(dense))
        outputs[name] = tft.compute_and_apply_vocabulary(
            vocab_input,
            vocab_filename='vocab_' + name,
            top_k=VOCAB_SIZE,
            num_oov_buckets=OOV_SIZE)

    for name in BUCKET_FEATURE_KEYS:
        outputs[name] = tft.bucketize(
            to_dense(inputs[name]), FEATURE_BUCKET_COUNT)

    for name in CATEGORICAL_FEATURE_KEYS:
        outputs[name] = tf.cast(to_dense(inputs[name]), tf.int64)

    # Label: true when the fare is not NaN and the tip was > 20% of the fare.
    taxi_fare = to_dense(inputs[FARE_KEY])
    taxi_tip = to_dense(inputs[LABEL_KEY])
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    fare_present = tf.logical_not(tf.math.is_nan(taxi_fare))
    outputs[LABEL_KEY] = tf.logical_and(
        fare_present, tf.greater(taxi_tip, tip_threshold))

    # Any boolean output (such as the label above) is stringified and turned
    # into vocabulary ids.
    bool_keys = [
        name for name, tensor in outputs.items() if tensor.dtype == tf.bool
    ]
    for name in bool_keys:
        outputs[name] = tft.compute_and_apply_vocabulary(
            tf.as_string(outputs[name]), vocab_filename='vocab_' + name)
    return outputs
def preprocess(inputs):
    """Standardize the median age and integerize the ocean proximity.

    Args:
      inputs: a batch of input features; must contain the keys
        "housing_median_age" and "ocean_proximity".

    Returns:
      Dict with "standardized_median_age" (z-score of the median age) and
      "ocean_proximity_id" (vocabulary id of the ocean proximity value).
    """
    median_age = inputs["housing_median_age"]
    ocean_proximity = inputs["ocean_proximity"]
    # scale_to_z_score already subtracts the mean before dividing by the
    # standard deviation, so centering the input beforehand with tft.mean
    # only adds an extra analyzer without changing the standardized result.
    standardized_age = tft.scale_to_z_score(median_age)
    ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)
    return {
        "standardized_median_age": standardized_age,
        "ocean_proximity_id": ocean_proximity_id,
    }
def preprocessing_fn(inputs: Dict[str, Tensor],
                     custom_config: Dict[str, Any] = None
                     ) -> Dict[str, Tensor]:
    """tf.transform's callback function for preprocessing inputs.

    Float32 features are standardized to z-scores (except a regression label,
    which is passed through), and every other feature is mapped to vocabulary
    ids. Feature names are sanitized, label names are not.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
      custom_config: Custom configuration dictionary for passing the task's
        ProblemStatement as a text proto, since custom_config must be
        JSON-serializable. Required in practice; it keeps a default only so
        the signature stays compatible with the previous one.

    Returns:
      Map from string feature key to transformed feature operations.

    Raises:
      TypeError: if custom_config is not provided.
    """
    # The previous signature used `custom_config=Dict[str, Any]`, i.e. the
    # typing object was the default *value*. Omitting the argument then failed
    # with an obscure TypeError when subscripting it; fail clearly instead.
    if custom_config is None:
        raise TypeError(
            'preprocessing_fn requires a custom_config dict containing the '
            'serialized ProblemStatement.')

    problem_statement = ps_pb2.ProblemStatement()
    text_format.Parse(
        text=custom_config[BasicPreprocessor.PROBLEM_STATEMENT_KEY],
        message=problem_statement)

    outputs = {}

    # Pass 1: float32 features.
    for key, tensor in inputs.items():
        if tensor.dtype != tf.float32:
            continue
        # TODO(weill): Handle case when an int field can actually represents
        # numeric rather than categorical values.
        task_type = problem_statement.tasks[0].type
        if task_type.HasField('one_dimensional_regression') and (
                key == task_type.one_dimensional_regression.label):
            # Skip normalizing regression labels and keep the raw key name.
            outputs[key] = tensor
            continue
        outputs[_sanitize_feature_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(tensor))

    # Pass 2: everything that is not float32 gets a vocabulary.
    for key, tensor in inputs.items():
        if tensor.dtype == tf.float32:
            continue
        # TODO(weill): Risk here to blow up computation needlessly.
        output = tft.compute_and_apply_vocabulary(
            _fill_in_missing(tensor), top_k=None, num_oov_buckets=1)

        # Don't sanitize the label key name.
        task_type = problem_statement.tasks[0].type
        if task_type.HasField('multi_class_classification') and (
                key == task_type.multi_class_classification.label):
            outputs[key] = output
            continue
        if task_type.HasField('binary_classification') and (
                key == task_type.binary_classification.label):
            outputs[key] = output
            continue

        # Do sanitize feature key names.
        outputs[_sanitize_feature_name(key)] = output

    return outputs
def _preprocessing_fn(inputs, integer_label: bool = False):
    """TensorFlow Transform preprocessing function.

    Args:
      inputs: dict of raw features.
      integer_label: when True the label is left as is; otherwise the label
        under constants.LABEL_KEY is replaced by its vocabulary id.

    Returns:
      A copy of `inputs`, with the label integerized unless `integer_label`
      is set.
    """
    outputs = inputs.copy()
    if integer_label:
        # Labels are already integers; nothing to transform.
        return outputs

    # Integerize string labels.
    raw_label = outputs[constants.LABEL_KEY]
    outputs[constants.LABEL_KEY] = tft.compute_and_apply_vocabulary(raw_label)
    return outputs
def _preprocessing_fn(inputs: Dict[str, Any],
                      schema_map: Dict[str, collections.namedtuple]):
    """TensorFlow Transform preprocessing function.

    Args:
      inputs: dict of raw features keyed by feature name.
      schema_map: dict from feature name to a record exposing `type_name`.

    Returns:
      Dict with one entry per name in `schema_map`: features whose type_name
      is 'string_label' are replaced by vocabulary ids, all others are passed
      through unchanged.
    """

    def _transform(name, supported_type):
        if supported_type.type_name != 'string_label':
            return inputs[name]
        return tft.compute_and_apply_vocabulary(inputs[name])

    return {
        name: _transform(name, supported_type)
        for name, supported_type in schema_map.items()
    }
def preprocessing_fn(inputs):
    """Preprocessing function used by TensorFlow Transform.

    Splits the 'text' feature into words and adds their vocabulary ids as a
    new "word_representation" feature next to the original features.

    Parameters:
      inputs -- dict of the parsed input tensors, as defined by the input
        metadata; must contain the key 'text'.

    Returns:
      A new dict with every entry of `inputs` plus "word_representation".
      The added word representation is a sparse tensor.
    """
    # Work on a copy so the caller's dict is not mutated as a side effect.
    outputs = inputs.copy()
    words = tf.string_split(inputs['text'], DELIMITERS_WORDS)
    # NOTE(review): default_value=0 gives out-of-vocabulary words the same id
    # as the first vocabulary entry -- confirm this is intended.
    outputs["word_representation"] = tft.compute_and_apply_vocabulary(
        words, default_value=0, top_k=10000)
    return outputs
def preprocessing_fn(inputs):
    """Preprocesses Titanic Dataset.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key to the transformed feature operation.
    """
    outputs = {}

    # Numerical features: impute missing values with the mean computed while
    # ignoring NaNs, then scale to z-scores.
    for key in features.NUMERIC_FEATURE_KEYS:
        mean_value = compute_mean_ignore_nan(inputs[key].values)
        absl.logging.info(f'TFT preprocessing. Mean value for {key} = {mean_value}')
        imputed = _fill_in_missing_with_impute(inputs[key], mean_value)
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(imputed)

    # Vocabulary features: the vocabulary size can be overridden per feature
    # through VOCAB_SIZE_MAP.
    for key in features.VOCAB_FEATURE_KEYS:
        vocab_size = features.VOCAB_SIZE_MAP.get(key, features.VOCAB_SIZE)
        ids = tft.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=vocab_size,
            num_oov_buckets=features.OOV_SIZE)
        outputs[features.transformed_name(key)] = ids

    # Bucket features: use explicit boundaries when configured, otherwise
    # quantile bucketing with a (possibly per-feature) bucket count.
    for key in features.BUCKET_FEATURE_KEYS:
        if key not in features.FEATURE_BUCKET_BOUNDARIES:
            bucket_count = features.FEATURE_BUCKET_COUNT_MAP.get(
                key, features.FEATURE_BUCKET_COUNT)
            bucketed = tft.bucketize(
                _fill_in_missing(inputs[key]), bucket_count)
        else:
            boundaries = tf.constant(
                features.FEATURE_BUCKET_BOUNDARIES.get(key))
            bucketed = tft.apply_buckets(
                _fill_in_missing(inputs[key]), boundaries)
        outputs[features.transformed_name(key)] = bucketed

    # Categorical features: one vocabulary file per feature with a single
    # out-of-vocabulary bucket.
    for key in features.CATEGORICAL_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            x=_fill_in_missing(inputs[key]),
            num_oov_buckets=1,
            vocab_filename=key)
        outputs[features.transformed_name(key)] = ids

    # The label only has its missing values filled in.
    label = _fill_in_missing(inputs[features.LABEL_KEY])
    outputs[features.transformed_name(features.LABEL_KEY)] = label
    return outputs
def preprocessing_fn(inputs):
    """Center 'x', scale 'y' to [0, 1], integerize 's' and combine x with y.

    Args:
      inputs: dict of raw features; must contain the keys 'x', 'y' and 's'.

    Returns:
      Dict with 'x_centered', 'y_normalized',
      'x_centered_times_y_normalized' and 's_integerized'.
    """
    outputs = {}
    # x minus its mean over the whole dataset.
    outputs['x_centered'] = inputs['x'] - tft.mean(inputs['x'])
    outputs['y_normalized'] = tft.scale_to_0_1(inputs['y'])
    s_integerized = tft.compute_and_apply_vocabulary(inputs['s'])
    outputs['x_centered_times_y_normalized'] = (
        outputs['x_centered'] * outputs['y_normalized'])
    outputs['s_integerized'] = s_integerized
    return outputs
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features;
        must contain "id", "text" and "label".

    Returns:
      Map from string feature key to transformed feature operations: the
      untouched "id", the tokenized text as vocabulary ids under "text_xf"
      and the filled-in label under "label_xf".
    """
    # Missing text is filled with '' before tokenizing.
    review_tokens = tokenize_reviews(_fill_in_missing(inputs["text"], ''))
    text_ids = tft.compute_and_apply_vocabulary(
        review_tokens, top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)
    # Missing labels are filled with -1.
    label = _fill_in_missing(inputs["label"], -1)
    return {
        "id": inputs["id"],
        "text_xf": text_ids,
        "label_xf": label,
    }
def preprocessing_fn(inputs):
    """Keep the original features and add TF-IDF features for text fields.

    For every field listed in `text_fields` the text is split on spaces,
    expanded into 1- and 2-grams, mapped to vocabulary ids and turned into
    TF-IDF weights. `text_fields` is currently empty, so the function returns
    a plain copy of `inputs`.

    Args:
      inputs: dict of raw features keyed by feature name.

    Returns:
      A copy of `inputs` with "<field>_bow_indices" and "<field>_weight"
      added for every configured text field.
    """
    text_fields = []
    # tft.tfidf needs the vocabulary size up front, so the vocabulary is
    # capped. NOTE(review): 10000 is a reviewer-chosen cap -- tune as needed.
    vocab_top_k = 10000

    # Keep the original data and add more to it.
    result = inputs.copy()

    # Figure out the vocabulary for our text fields.
    for field_name in text_fields:
        field = inputs[field_name]
        # NOTE(review): tft.bag_of_words is documented for SparseTensor
        # input; confirm the split result type for the TF version in use.
        tokens = tf.strings.split(field, " ")
        # (1, 2) is the inclusive n-gram size range: unigrams and bigrams.
        bag_of_words = tft.bag_of_words(tokens, (1, 2), separator=" ")
        indices = tft.compute_and_apply_vocabulary(
            bag_of_words, top_k=vocab_top_k)
        # One extra slot accounts for out-of-vocabulary terms.
        bow_indices, weights = tft.tfidf(indices, vocab_top_k + 1)
        result[f"{field_name}_bow_indices"] = bow_indices
        result[f"{field_name}_weight"] = weights
    return result
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    The review text is split on DELIMITERS, mapped to ids from a vocabulary
    of the top VOCAB_SIZE tokens and converted to TF-IDF weights. The label
    is passed through unchanged.
    """
    tokens = tf.string_split(inputs[REVIEW_KEY], DELIMITERS)
    token_ids = tft.compute_and_apply_vocabulary(tokens, top_k=VOCAB_SIZE)
    # Add one for the oov bucket created by compute_and_apply_vocabulary.
    tfidf_vocab_size = VOCAB_SIZE + 1
    bow_indices, weights = tft.tfidf(token_ids, tfidf_vocab_size)

    outputs = {}
    outputs[REVIEW_KEY] = bow_indices
    outputs[REVIEW_WEIGHT_KEY] = weights
    outputs[LABEL_KEY] = inputs[LABEL_KEY]
    return outputs
def _preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
      inputs: dict of raw features; must contain the keys 'x', 'y' and 's'.

    Returns:
      Dict with the mean-centered 'x', the [0, 1]-scaled 'y', their product
      and the vocabulary ids of 's'.
    """
    raw_x, raw_y, raw_s = inputs['x'], inputs['y'], inputs['s']

    centered = raw_x - tft.mean(raw_x)
    normalized = tft.scale_to_0_1(raw_y)
    integerized = tft.compute_and_apply_vocabulary(raw_s)
    product = centered * normalized

    return dict([
        ('x_centered', centered),
        ('y_normalized', normalized),
        ('x_centered_times_y_normalized', product),
        ('s_integerized', integerized),
    ])
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """

    def _filled(feature_key):
        # Every raw feature goes through the module's missing-value helper.
        return _fill_in_missing(inputs[feature_key])

    outputs = {}

    # Dense float features are standardized to z-scores.
    for key in features.DENSE_FLOAT_FEATURE_KEYS:
        scaled = tft.scale_to_z_score(_filled(key))
        outputs[features.transformed_name(key)] = scaled

    # Vocabulary features become integer ids.
    for key in features.VOCAB_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            _filled(key),
            top_k=features.VOCAB_SIZE,
            num_oov_buckets=features.OOV_SIZE)
        outputs[features.transformed_name(key)] = ids

    # Bucket features are paired positionally with their bucket counts.
    bucket_specs = zip(features.BUCKET_FEATURE_KEYS,
                       features.BUCKET_FEATURE_BUCKET_COUNT)
    for key, num_buckets in bucket_specs:
        bucketed = tft.bucketize(
            _filled(key), num_buckets, always_return_num_quantiles=False)
        outputs[features.transformed_name(key)] = bucketed

    # Categorical features only get their missing values filled in.
    for key in features.CATEGORICAL_FEATURE_KEYS:
        outputs[features.transformed_name(key)] = _filled(key)

    # Was this passenger a big tipper?
    fare_key = 'fare'
    taxi_fare = _filled(fare_key)
    tips = _filled(features.LABEL_KEY)
    fare_is_nan = tf.math.is_nan(taxi_fare)
    # Label is 0 wherever the fare is NaN.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    # Test if the tip was > 20% of the fare.
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.compat.v1.where(fare_is_nan, zero_label, big_tipper)
    outputs[features.transformed_name(features.LABEL_KEY)] = label
    return outputs
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    xf_name = my_metadata.transformed_name

    def _filled(feature_key):
        # Every raw feature goes through the module's missing-value helper.
        return _fill_in_missing(inputs[feature_key])

    outputs = {}

    # Numeric features are standardized to z-scores.
    for key in my_metadata.NUMERIC_FEATURE_KEYS:
        scaled = transform.scale_to_z_score(_filled(key))
        outputs[xf_name(key)] = scaled

    # Vocabulary features become integer ids; the vocabulary file is named
    # after the transformed feature name.
    for key in my_metadata.VOCAB_FEATURE_KEYS:
        ids = transform.compute_and_apply_vocabulary(
            _filled(key),
            vocab_filename=xf_name(key),
            num_oov_buckets=my_metadata.OOV_SIZE,
            top_k=my_metadata.VOCAB_SIZE)
        outputs[xf_name(key)] = ids

    # String features hashed into a per-feature number of buckets.
    for key, hash_buckets in my_metadata.HASH_STRING_FEATURE_KEYS.items():
        hashed = transform.hash_strings(
            _filled(key), hash_buckets=hash_buckets)
        outputs[xf_name(key)] = hashed

    # Bucketized features are stored under "<key>_bucketized".
    for key, nb_buckets in my_metadata.TO_BE_BUCKETIZED_FEATURE.items():
        bucketed = transform.bucketize(_filled(key), nb_buckets)
        outputs[xf_name(key + '_bucketized')] = bucketed

    # Was this passenger a big tipper?
    taxi_fare = _filled(my_metadata.FARE_KEY)
    tips = _filled(my_metadata.LABEL_KEY)
    fare_is_nan = tf.is_nan(taxi_fare)
    # Label is 0 wherever the fare is NaN.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    # Test if the tip was > 20% of the fare.
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.where(fare_is_nan, zero_label, big_tipper)
    outputs[xf_name(my_metadata.LABEL_KEY)] = label
    return outputs
def preprocessing_fn(inputs):
    """Perform feature reduction via `compute_and_apply_vocabulary`.

    An `indices` tensor should come in with values in (0,5e7) and should be
    transformed to (0,40000). The vocabulary keeps the top `MAX_IDX - 5`
    values and sends everything else to one of 5 out-of-vocabulary buckets.

    Args:
      inputs: dict of raw features; must contain the key 'indices'.

    Returns:
      A new dict with every entry of `inputs`, where 'indices' is replaced
      by the vocabulary ids.
    """
    # Copy instead of aliasing so the caller's dict is not mutated.
    outputs = inputs.copy()
    outputs['indices'] = tft.compute_and_apply_vocabulary(
        x=inputs['indices'],
        top_k=(MAX_IDX - 5),
        num_oov_buckets=5,
        vocab_filename='my_vocab')
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Numeric columns are scaled to [0, 1], categorical columns are replaced by
    vocabulary ids, and the label is mapped to a one-hot vector over the two
    known label strings.
    """
    # Some features are modified and others are left unchanged, so start from
    # a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for name in NUMERIC_FEATURE_KEYS:
        outputs[name] = tft.scale_to_0_1(outputs[name])

    for name in OPTIONAL_NUMERIC_FEATURE_KEYS:
        # Optional columns arrive as SparseTensors; rebuild them with a dense
        # shape of [batch, 1] and fill missing entries with 0.
        optional = inputs[name]
        as_column = tf.sparse.SparseTensor(
            optional.indices, optional.values, [optional.dense_shape[0], 1])
        filled = tf.sparse.to_dense(sp_input=as_column, default_value=0.)
        # Reshape from a batch of vectors of size 1 to a batch of scalars.
        outputs[name] = tft.scale_to_0_1(tf.squeeze(filled, axis=1))

    # Every categorical column except the label gets a vocabulary (written
    # under the feature's own name) and is replaced by its integer ids.
    for name in CATEGORICAL_FEATURE_KEYS:
        outputs[name] = tft.compute_and_apply_vocabulary(
            inputs[name], vocab_filename=name)

    # The label column uses a fixed mapping from string to index; strings not
    # in the table map to -1.
    label_classes = ['>50K', '<=50K']
    num_classes = len(label_classes)
    class_ids = tf.cast(tf.range(num_classes), tf.int64)
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=label_classes,
        values=class_ids,
        key_dtype=tf.string,
        value_dtype=tf.int64)
    table = tf.lookup.StaticHashTable(initializer, default_value=-1)
    label_ids = table.lookup(inputs[LABEL_KEY])
    outputs[LABEL_KEY] = tf.one_hot(
        indices=label_ids, depth=num_classes, on_value=1.0, off_value=0.0)
    return outputs
def preprocessing_fn(inputs):
    """Integerize 's' and broadcast dataset-wide min/mean of 'x' and 'y'.

    Args:
      inputs: dict of raw features; must contain the keys 's', 'x' and 'y'.

    Returns:
      Dict with 'integerized_s' plus 'x_min', 'x_mean', 'y_min' and 'y_mean',
      each statistic added to a zeros tensor shaped like its input.
    """
    raw_x = inputs['x']
    raw_y = inputs['y']

    integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
    # The bucketize result is discarded on purpose in the original code.
    # NOTE(review): presumably kept only to register the analyzer -- confirm.
    _ = tft.bucketize(raw_x, 2, name='bucketize')

    # The analyzers are created in the same order as before (x min, x mean,
    # y min, y mean) and keep their explicit names.
    x_min = tft.min(raw_x, name='x') + tf.zeros_like(raw_x)
    x_mean = tft.mean(raw_x, name='x') + tf.zeros_like(raw_x)
    y_min = tft.min(raw_y, name='y') + tf.zeros_like(raw_y)
    y_mean = tft.mean(raw_y, name='y') + tf.zeros_like(raw_y)

    outputs = {'integerized_s': integerized_s}
    outputs['x_min'] = x_min
    outputs['x_mean'] = x_mean
    outputs['y_min'] = y_min
    outputs['y_mean'] = y_mean
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs (dict): dict of input columns

    Returns:
        output dict of transformed columns
    """
    outputs = {}

    # Encode categorical column.
    mixing_speed_ids = tft.compute_and_apply_vocabulary(inputs['MixingSpeed'])
    outputs['MixingSpeed'] = mixing_speed_ids
    outputs['ButterMass'] = inputs['ButterMass']

    # Derived features: total mass and each ingredient's z-scored share of it.
    total_mass = (
        inputs['ButterMass'] + inputs['SugarMass'] + inputs['FlourMass'])
    outputs['TotalMass'] = total_mass
    for ingredient in ('Butter', 'Sugar', 'Flour'):
        mass_key = '{}Mass'.format(ingredient)
        share_key = 'Norm{}perc'.format(ingredient)
        share = inputs[mass_key] / outputs['TotalMass']
        outputs[share_key] = tft.scale_to_z_score(share)

    # Keep absolute numeric columns.
    for key in ('TotalVolume', 'Energy'):
        outputs[key] = inputs[key]

    # Normalize other numeric columns.
    z_scored_columns = (
        'ButterTemperature',
        'SugarHumidity',
        'FlourHumidity',
        'HeatingTime',
        'MixingTime',
        'Density',
        'Temperature',
        'Humidity',
    )
    for key in z_scored_columns:
        outputs[key] = tft.scale_to_z_score(inputs[key])

    # Extract specific problems: any 'Problems' value matching the pattern is
    # rewritten to exactly 'chunk', then compared to produce a 1.0/0.0 flag.
    chunk_marker = tf.regex_replace(
        input=inputs['Problems'],
        pattern='.*chunk.*',
        rewrite='chunk',
        name='DetectChunk')
    has_chunks = tf.equal(chunk_marker, 'chunk')
    outputs['Chunks'] = tf.cast(has_chunks, tf.float32)
    return outputs
def wide_preprocessing_fn(inputs):
    """TFT preprocessing function.

    Produces a configurable number of bucketized, scaled and vocabulary
    outputs by cycling through the taxi feature key lists. The counts are
    read from `self` in the enclosing scope.

    Args:
      inputs: Map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    # pylint: disable=protected-access

    def _cycled(keys, count):
        # Yields (index, key) for exactly `count` items, wrapping around
        # `keys` as often as needed.
        return enumerate(itertools.islice(itertools.cycle(keys), count))

    def _filled(feature_key):
        return taxi_utils._fill_in_missing(inputs[feature_key])

    outputs = {}

    for idx, key in _cycled(taxi_utils._BUCKET_FEATURE_KEYS,
                            self._num_bucketize):
        outputs["bucketized" + str(idx)] = tft.bucketize(
            _filled(key), taxi_utils._FEATURE_BUCKET_COUNT)

    # Dense float features are standardized to z-scores.
    for idx, key in _cycled(taxi_utils._DENSE_FLOAT_FEATURE_KEYS,
                            self._num_scale):
        outputs["scaled" + str(idx)] = tft.scale_to_z_score(_filled(key))

    for idx, key in _cycled(taxi_utils._VOCAB_FEATURE_KEYS,
                            self._num_vocabs):
        outputs["vocab" + str(idx)] = tft.compute_and_apply_vocabulary(
            _filled(key),
            top_k=taxi_utils._VOCAB_SIZE,
            num_oov_buckets=taxi_utils._OOV_SIZE)

    # Pass-through features.
    passthrough_keys = taxi_utils._CATEGORICAL_FEATURE_KEYS + [
        taxi_utils._LABEL_KEY
    ]
    for key in passthrough_keys:
        outputs[key] = inputs[key]

    return outputs
def def_preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """

    def _filled(feature_key):
        # Every raw feature goes through the module's missing-value helper.
        return _fill_in_missing(inputs[feature_key])

    outputs = {}

    # Dense float features are standardized to z-scores.
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
        scaled = transform.scale_to_z_score(_filled(key))
        outputs[taxi.transformed_name(key)] = scaled

    # Vocabulary features become integer ids.
    for key in taxi.VOCAB_FEATURE_KEYS:
        ids = transform.compute_and_apply_vocabulary(
            _filled(key),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)
        outputs[taxi.transformed_name(key)] = ids

    for key in taxi.BUCKET_FEATURE_KEYS:
        bucketed = transform.bucketize(
            _filled(key), taxi.FEATURE_BUCKET_COUNT)
        outputs[taxi.transformed_name(key)] = bucketed

    # Categorical features only get their missing values filled in.
    for key in taxi.CATEGORICAL_FEATURE_KEYS:
        outputs[taxi.transformed_name(key)] = _filled(key)

    # Was this passenger a big tipper?
    taxi_fare = _filled(taxi.FARE_KEY)
    tips = _filled(taxi.LABEL_KEY)
    fare_is_nan = tf.is_nan(taxi_fare)
    # Label is 0 wherever the fare is NaN.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    # Test if the tip was > 20% of the fare.
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.where(fare_is_nan, zero_label, big_tipper)
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = label
    return outputs
def preprocessing_fn(inputs):
    """Preprocesses Covertype Dataset.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key to the transformed feature operation.
    """
    xf_name = features.transformed_name

    def _filled(feature_key):
        # Every raw feature goes through the module's missing-value helper.
        return _fill_in_missing(inputs[feature_key])

    outputs = {}

    # Numerical features: z-score scaling.
    for key in features.NUMERIC_FEATURE_KEYS:
        scaled = tft.scale_to_z_score(_filled(key))
        outputs[xf_name(key)] = scaled

    # Categorical features: one vocabulary file per feature (named after the
    # feature key) with a single out-of-vocabulary bucket.
    for key in features.CATEGORICAL_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            x=_filled(key), num_oov_buckets=1, vocab_filename=key)
        outputs[xf_name(key)] = ids

    # The label (Cover_Type) only has its missing values filled in.
    label = _filled(features.LABEL_KEY)
    outputs[xf_name(features.LABEL_KEY)] = label
    return outputs
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}

    # Dense float features are standardized to z-scores.
    for name in _DENSE_FLOAT_FEATURE_KEYS:
        scaled = tft.scale_to_z_score(inputs[name])
        outputs[_transformed_name(name)] = scaled

    # Vocabulary features become integer ids from a vocabulary limited to the
    # top _VOCAB_SIZE terms plus _OOV_SIZE out-of-vocabulary buckets.
    for name in _VOCAB_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            inputs[name], top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE)
        outputs[_transformed_name(name)] = ids

    # The label is forwarded untouched under its transformed name.
    raw_label = inputs[_LABEL_KEY]
    outputs[_transformed_name(_LABEL_KEY)] = raw_label
    return outputs
def preprocessing_fn(inputs, custom_config):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
      custom_config: additional properties for pre-processing. 'VOCAB_SIZE'
        and 'OOV_SIZE' override the module defaults when present.

    Returns:
      Map from string feature key to transformed features.
    """

    def _filled(feature_key):
        # Every raw feature goes through the module's missing-value helper.
        return _fill_in_missing(inputs[feature_key])

    outputs = {}

    # Dense float features pass through `_identity` before the missing-value
    # helper and are then standardized to z-scores.
    for key in _DENSE_FLOAT_FEATURE_KEYS:
        scaled = tft.scale_to_z_score(
            _fill_in_missing(_identity(inputs[key])))
        outputs[_transformed_name(key)] = scaled

    # Vocabulary features become integer ids; sizes come from custom_config
    # with the module constants as fallback.
    for key in _VOCAB_FEATURE_KEYS:
        ids = tft.compute_and_apply_vocabulary(
            _filled(key),
            top_k=custom_config.get('VOCAB_SIZE', _VOCAB_SIZE),
            num_oov_buckets=custom_config.get('OOV_SIZE', _OOV_SIZE))
        outputs[_transformed_name(key)] = ids

    for key in _BUCKET_FEATURE_KEYS:
        bucketed = tft.bucketize(_filled(key), _FEATURE_BUCKET_COUNT)
        outputs[_transformed_name(key)] = bucketed

    # Categorical features only get their missing values filled in.
    for key in _CATEGORICAL_FEATURE_KEYS:
        outputs[_transformed_name(key)] = _filled(key)

    # Was this passenger a big tipper?
    taxi_fare = _filled(_FARE_KEY)
    tips = _filled(_LABEL_KEY)
    fare_is_nan = tf.math.is_nan(taxi_fare)
    # Label is 0 wherever the fare is NaN.
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    # Test if the tip was > 20% of the fare.
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.compat.v1.where(fare_is_nan, zero_label, big_tipper)
    outputs[_transformed_name(_LABEL_KEY)] = label
    return outputs
def preprocessing_fn(inputs: dict) -> dict:
    """tf.transform's callback function for preprocessing inputs.

    The parameter and return value are mappings keyed by feature name (the
    function indexes `inputs` by key and returns a dict), so they are
    annotated as `dict` rather than as a single tensor.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}

    # One-hot features: map to vocabulary ids, then one-hot encode with
    # `dim + 1` labels.
    for key, dim in ONE_HOT_FEATURES.items():
        int_value = tft.compute_and_apply_vocabulary(
            fill_in_missing(inputs[key]), top_k=dim + 1)
        outputs[transformed_name(key)] = convert_num_to_one_hot(
            int_value, num_labels=dim + 1)

    # Bucket features: bucketize a float version of the feature, then one-hot
    # encode with `bucket_count + 1` labels.
    for key, bucket_count in BUCKET_FEATURES.items():
        dense_feature = fill_in_missing(inputs[key])
        if key == 'zip_code' and dense_feature.dtype == tf.string:
            # String zip codes need the dedicated conversion helper.
            dense_feature = convert_zip_code(dense_feature)
        else:
            dense_feature = tf.cast(dense_feature, tf.float32)
        bucketed = tft.bucketize(
            dense_feature, bucket_count, always_return_num_quantiles=False)
        outputs[transformed_name(key)] = convert_num_to_one_hot(
            bucketed, num_labels=bucket_count + 1)

    # Text features and the label only get their missing values filled in.
    for key in TEXT_FEATURES:
        outputs[transformed_name(key)] = fill_in_missing(inputs[key])

    outputs[transformed_name(LABEL_KEY)] = fill_in_missing(inputs[LABEL_KEY])
    return outputs