def preprocessing_fn(inputs):
    """User defined preprocessing function for reddit comment columns.

    Args:
        inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
        A dictionary of `tensorflow_transform.Column` representing the
        transformed columns.
    """
    def _append_unit_axis(tensor):
        # Adds a trailing axis of size 1 to the tensor.
        return tf.expand_dims(tensor, -1)

    # TODO(b/35001605) Make this "passthrough" more DRY.
    dense_columns = {
        'score': inputs['score'],
        'toplevel': inputs['toplevel'],
        'subreddit_id': tft.string_to_int(
            inputs['subreddit'], frequency_threshold=frequency_threshold),
    }

    # TODO(b/35318962): Obviate the need for this workaround on Dense features.
    # FeatureColumns expect shape (batch_size, 1), not just (batch_size).
    # Only the dense columns collected above need this workaround; the
    # features added afterwards are sparse.
    result = {}
    for key, column in dense_columns.items():
        result[key] = tft.map(_append_unit_axis, column)

    for name in ('author', 'comment_body', 'comment_parent_body'):
        tokens = tft.map(tf.string_split, inputs[name])
        # TODO(b/33467613) Translate these to bag-of-words style sparse
        # features.
        result[name + '_bow'] = tft.string_to_int(
            tokens, frequency_threshold=frequency_threshold)

    return result
def preprocessing_fn(inputs):
    """Derive a row-sum column and two copy columns from sparse inputs.

    Args:
        inputs: dictionary of input columns; the keys 'sparse' and 'varlen'
            are read.

    Returns:
        A dictionary with the keys 'fixed', 'sparse', 'varlen', 'sparse_copy'
        and 'varlen_copy'.
    """
    def _row_sums(sparse):
        # Reduce along axis 1 of the sparse tensor.
        return tf.sparse_reduce_sum(sparse, axis=1)

    def _rebuild(sparse):
        # Construct a new SparseTensor from the components of the given one.
        return tf.SparseTensor(
            sparse.indices, sparse.values, sparse.dense_shape)

    sparse_sum = tft.map(_row_sums, inputs['sparse'])
    sparse_copy = tft.map(_rebuild, inputs['sparse'])
    varlen_copy = tft.map(_rebuild, inputs['varlen'])

    # sparse_copy is the only column whose schema is assigned explicitly in
    # this function; the other outputs have no schema set here.
    logical_schema = sch.LogicalColumnSchema(
        sch.dtype_to_domain(tf.float32),
        sch.LogicalShape([sch.Axis(10)]))
    representation = sch.SparseColumnRepresentation(
        'val_copy', [sch.SparseIndexField('idx_copy', False)])
    sparse_copy.schema = sch.ColumnSchema(logical_schema, representation)

    outputs = {}
    outputs['fixed'] = sparse_sum
    outputs['sparse'] = inputs['sparse']    # Input column passed through.
    outputs['varlen'] = inputs['varlen']    # Input column passed through.
    outputs['sparse_copy'] = sparse_copy    # Schema attached manually above.
    outputs['varlen_copy'] = varlen_copy
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs: dictionary of input columns keyed by column name.

    Returns:
        A dictionary holding one transformed column for every key in
        NUMERIC_COLUMNS and CATEGORICAL_COLUMNS, plus LABEL_COLUMN.
    """
    def _to_column_vector(tensor):
        # Shape (batch,) -> shape (batch, 1).
        return tf.expand_dims(tensor, -1)

    def _label_to_index(label):
        # The mapping from label string to index is given explicitly rather
        # than being computed from the data.
        table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
        return table.lookup(label)

    outputs = {}

    # Numeric columns are scaled to have range [0, 1].
    outputs.update(
        {key: tft.scale_to_0_1(inputs[key]) for key in NUMERIC_COLUMNS})

    # Categorical columns (the label column is handled separately below) go
    # through tft.string_to_int, which computes the set of unique values and
    # uses it to convert the strings to indices.
    outputs.update(
        {key: tft.string_to_int(inputs[key]) for key in CATEGORICAL_COLUMNS})

    # Both kinds are converted from a batch of scalars to a batch of vectors
    # of length 1 so the output can be easily wrapped in `FeatureColumn`s.
    for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
        outputs[key] = tft.map(_to_column_vector, outputs[key])

    outputs[LABEL_COLUMN] = tft.map(_label_to_index, inputs[LABEL_COLUMN])
    return outputs
def preprocessing_fn(inputs):
    """Transform the 'x', 'y' and 'z' input columns.

    Args:
        inputs: dictionary of input columns; the keys 'x', 'y' and 'z' are
            read.

    Returns:
        A dictionary with 'x_scaled' (`tft.scale_to_0_1` of 'x'), 'y_sum'
        (sparse reduce-sum of 'y' along axis 1) and 'z_copy' (a SparseTensor
        rebuilt from the components of 'z').
    """
    def _sum_rows(sparse):
        return tf.sparse_reduce_sum(sparse, axis=1)

    def _clone_sparse(sparse):
        return tf.SparseTensor(
            sparse.indices, sparse.values, sparse.dense_shape)

    outputs = {}
    outputs['x_scaled'] = tft.scale_to_0_1(inputs['x'])
    outputs['y_sum'] = tft.map(_sum_rows, inputs['y'])
    outputs['z_copy'] = tft.map(_clone_sparse, inputs['z'])
    return outputs
def tfidf(x, reduced_term_freq, vocab_size, corpus_size):
    """Maps the terms in x to their (1/doc_length) * inverse document frequency.

    Args:
        x: A `Column` representing int64 values (most likely the result of
            calling string_to_int on a tokenized string).
        reduced_term_freq: A dense tensor of shape (vocab_size,) holding, for
            each term, the number of documents containing it. Vocab token i
            (an int) occurs in reduced_term_freq[i] examples in the corpus,
            so this must also carry a count for out-of-vocab tokens.
        vocab_size: An int - the size of the vocab used to turn the strings
            into int64s, including any out-of-vocab ids.
        corpus_size: A scalar count of the number of documents in the corpus.

    Returns:
        A `Column` where each int value is mapped to a float equal to
        (1 / the number of terms in that row) * the log of
        (corpus_size / (1 + the number of documents in which the term
        appears)).

        NOTE: This is intended to be used with the feature_column 'sum'
        combiner to arrive at the true term frequencies.
    """
    def _wrap_ids_into_vocab(sparse_ids):
        """Takes every id in the SparseTensor modulo vocab_size."""
        wrapped_values = tf.mod(sparse_ids.values, vocab_size)
        return tf.SparseTensor(
            indices=sparse_ids.indices,
            values=wrapped_values,
            dense_shape=sparse_ids.dense_shape)

    def _weights_for(sparse_ids):
        """Computes idf / document size for every term occurrence.

        Args:
            sparse_ids: a SparseTensor of int64 representing string indices
                in vocab.

        Returns:
            A SparseTensor with the same indices and dense shape whose
            values are the tf*idf weights.
        """
        # Add one to the reduced term frequencies to avoid dividing by zero.
        smoothed_doc_freq = 1.0 + tf.to_double(reduced_term_freq)
        idf = tf.log(tf.to_double(corpus_size) / smoothed_doc_freq)

        # Summing a tensor of ones along axis 1 counts the terms per row.
        ones = tf.SparseTensor(
            indices=sparse_ids.indices,
            values=tf.ones_like(sparse_ids.values),
            dense_shape=sparse_ids.dense_shape)
        dense_doc_sizes = tf.to_double(tf.sparse_reduce_sum(ones, 1))

        # For every term, divide its idf by the size of the document it
        # belongs to. Both gathers produce one entry per term occurrence.
        row_of_each_term = sparse_ids.indices[:, 0]
        idf_per_term = tf.gather(idf, sparse_ids.values)
        doc_size_per_term = tf.gather(dense_doc_sizes, row_of_each_term)

        return tf.SparseTensor(
            indices=sparse_ids.indices,
            values=idf_per_term / doc_size_per_term,
            dense_shape=sparse_ids.dense_shape)

    cleaned_input = tft.map(_wrap_ids_into_vocab, x)
    double_weights = tft.map(_weights_for, cleaned_input)
    return tft.map(tf.to_float, double_weights)
def preprocessing_fn(inputs):
    """Emit one column per analyzer result computed over inputs['a'].

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the keys 'min', 'max', 'sum', 'size' and 'mean'.
        Each value repeats the corresponding analyzer output once per element
        along the first axis of the batch.
    """
    def _tile_over_batch(in_tensor, value):
        # A vector of ones, as long as the first dimension of in_tensor,
        # multiplied by the analyzer value.
        batch_size = tf.shape(in_tensor)[0]
        return tf.ones([batch_size], value.dtype) * value

    analyzers = (
        ('min', tft.min),
        ('max', tft.max),
        ('sum', tft.sum),
        ('size', tft.size),
        ('mean', tft.mean),
    )
    outputs = {}
    for key, analyzer in analyzers:
        outputs[key] = tft.map(
            _tile_over_batch, inputs['a'], analyzer(inputs['a']))
    return outputs
def preprocessing_fn(inputs):
    """Integerize the tokens of inputs['a'] twice using a top_k limit.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the keys 'index1' and 'index2'.
    """
    outputs = {}

    tokens = tft.map(tf.string_split, inputs['a'])
    outputs['index1'] = tft.string_to_int(
        tokens, default_value=-99, top_k=2)

    # As above but using a string for top_k (and changing the default_value
    # to showcase things).
    tokens_again = tft.map(tf.string_split, inputs['a'])
    outputs['index2'] = tft.string_to_int(
        tokens_again, default_value=-9, top_k='2')

    return outputs
def preprocessing_fn(inputs):
    """Integerize the tokens of inputs['a'] twice using a frequency threshold.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the keys 'index1' and 'index2'.
    """
    outputs = {}

    tokens = tft.map(tf.string_split, inputs['a'])
    outputs['index1'] = tft.string_to_int(
        tokens, default_value=-99, frequency_threshold=2)

    # As above but using a string for frequency_threshold (and changing the
    # default_value to showcase things).
    tokens_again = tft.map(tf.string_split, inputs['a'])
    outputs['index2'] = tft.string_to_int(
        tokens_again, default_value=-9, frequency_threshold='2')

    return outputs
def scale(x, min_x_value, max_x_value, output_min, output_max):
    """Scale a column to [output_min, output_max].

    Assumes the column's range is [min_x_value, max_x_value]. If this is not
    true at training or prediction time, the output value of this scale could
    be outside the range [output_min, output_max].

    Args:
        x: The column to scale.
        min_x_value: Assumed minimum of the column. It is passed to the
            builtin `round`, so it is presumably a plain Python number.
        max_x_value: Assumed maximum of the column (same remark as above).
        output_min: Lower bound of the target range.
        output_max: Upper bound of the target range.

    Returns:
        The result of `tft.map` applying the linear rescaling to `x`; values
        are converted to float.

    Raises:
        ValueError: if min_x_value = max_x_value, as the column is constant.
    """
    # There is something wrong with the data if both bounds coincide, and the
    # division below would be by zero.
    # Why round to 7 places? It's the same as unittest's assertAlmostEqual.
    if round(min_x_value - max_x_value, 7) == 0:
        raise ValueError('In scale, min_x_value == max_x_value')

    def _scale(x):
        min_x_valuef = tf.to_float(min_x_value)
        max_x_valuef = tf.to_float(max_x_value)
        output_minf = tf.to_float(output_min)
        output_maxf = tf.to_float(output_max)

        input_span = max_x_valuef - min_x_valuef
        output_span = output_maxf - output_minf
        return (
            ((tf.to_float(x) - min_x_valuef) * output_span) / input_span
        ) + output_minf

    return tft.map(_scale, x)
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs: dictionary of input columns; the keys 'x', 'y' and 's' are
            read.

    Returns:
        A dictionary with 'x_centered' (x minus its mean), 'y_normalized'
        (`tft.scale_to_0_1` of y), 'x_centered_times_y_normalized' (their
        product) and 's_integerized' (`tft.string_to_int` of s).
    """
    def _subtract(value, mean):
        return value - mean

    def _multiply(left, right):
        return left * right

    x_column = inputs['x']
    y_column = inputs['y']
    s_column = inputs['s']

    outputs = {}
    outputs['x_centered'] = tft.map(_subtract, x_column, tft.mean(x_column))
    outputs['y_normalized'] = tft.scale_to_0_1(y_column)
    s_integerized = tft.string_to_int(s_column)
    outputs['x_centered_times_y_normalized'] = tft.map(
        _multiply, outputs['x_centered'], outputs['y_normalized'])
    outputs['s_integerized'] = s_integerized
    return outputs
def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    Args:
        inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
        A dictionary of `tensorflow_transform.Column` representing the
        transformed columns.
    """
    def _append_unit_axis(tensor):
        return tf.expand_dims(tensor, -1)

    # TODO(b/35001605) Make this "passthrough" more DRY.
    collected = {'clicked': inputs['clicked']}
    collected.update(
        (name, inputs[name]) for name in INTEGER_COLUMN_NAMES)
    collected.update(
        (name + '_id',
         tft.string_to_int(
             inputs[name], frequency_threshold=frequency_threshold))
        for name in CATEGORICAL_COLUMN_NAMES)

    # TODO(b/35318962): Obviate the need for this workaround on Dense features.
    # FeatureColumns expect shape (batch_size, 1), not just (batch_size)
    result = {}
    for key, column in collected.items():
        result[key] = tft.map(_append_unit_axis, column)
    return result
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    The review text has every character in PUNCTUATION_CHARACTERS removed, is
    tokenized with `tf.string_split`, and is integerized with a vocabulary
    limited through `top_k=VOCAB_SIZE`. The label column is passed through.

    Args:
        inputs: dictionary of input columns; REVIEW_COLUMN and LABEL_COLUMN
            are read.

    Returns:
        A dictionary with the keys REVIEW_COLUMN and LABEL_COLUMN.
    """
    def _strip_char(strings, char):
        """Remove a character from each string.

        Args:
            strings: the string tensor handed to `tf.string_split`
                (presumably rank 1 - confirm against the caller).
            char: A string of length 1, used as the split delimiter.

        Returns:
            The strings with the given character removed (i.e. replaced
            by '').
        """
        # Hacky implementation: split on the character, densify the pieces
        # with '' as filler, then join the pieces of each row again.
        pieces = tf.string_split(strings, char)
        dense_pieces = tf.sparse_to_dense(
            pieces.indices, pieces.dense_shape, pieces.values, '')
        return tf.reduce_join(dense_pieces, 1)

    def _strip_punctuation(strings):
        """Remove every character in PUNCTUATION_CHARACTERS, one at a time."""
        stripped = strings
        for char in PUNCTUATION_CHARACTERS:
            stripped = _strip_char(stripped, char)
        return stripped

    cleaned_review = tft.map(_strip_punctuation, inputs[REVIEW_COLUMN])
    review_tokens = tft.map(tf.string_split, cleaned_review)

    outputs = {}
    outputs[REVIEW_COLUMN] = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
    outputs[LABEL_COLUMN] = inputs[LABEL_COLUMN]
    return outputs
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs: dictionary of input columns; const.REVIEW_COLUMN and
            const.LABEL_COLUMN are read.

    Returns:
        A dictionary with the integerized review tokens, their tf-idf weights
        and the unchanged label column.
    """
    def _tokenize(text):
        return tf.string_split(text, delimiters)

    tokens = tft.map(_tokenize, inputs[const.REVIEW_COLUMN])
    token_ids = tft.string_to_int(tokens, top_k=vocab_size)

    # Add one for the oov bucket created by string_to_int.
    token_weights = tft.tfidf_weights(token_ids, vocab_size + 1)

    output = {}
    output[const.REVIEW_COLUMN] = token_ids
    output[const.REVIEW_WEIGHT] = token_weights
    output[const.LABEL_COLUMN] = inputs[const.LABEL_COLUMN]
    return output
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs: dictionary of input columns; REVIEW_COLUMN and LABEL_COLUMN
            are read.

    Returns:
        A dictionary with the integerized review tokens (REVIEW_COLUMN),
        their tf-idf weights (REVIEW_WEIGHT) and the unchanged label column
        (LABEL_COLUMN).
    """
    def _tokenize(text):
        return tf.string_split(text, DELIMITERS)

    tokens = tft.map(_tokenize, inputs[REVIEW_COLUMN])
    token_ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)

    # Add one for the oov bucket created by string_to_int.
    token_weights = tft.tfidf_weights(token_ids, VOCAB_SIZE + 1)

    outputs = {}
    outputs[REVIEW_COLUMN] = token_ids
    outputs[REVIEW_WEIGHT] = token_weights
    outputs[LABEL_COLUMN] = inputs[LABEL_COLUMN]
    return outputs
def bag_of_words(x):
    """Computes bag of words weights.

    Note the return type is a float sparse tensor, not an int sparse tensor.
    This is so that the output type matches tfidf, and any downstream
    transformation in tf layers during training can be applied to both.

    Args:
        x: The column to compute weights for; it is mapped as a SparseTensor.

    Returns:
        The result of `tft.map`: a sparse column with the same indices and
        dense shape as `x` whose values are all 1.0.
    """
    def _unit_weights(sparse_tokens):
        """Compute BOW weights.

        As tf layer's sum combiner is used, the weights can be just ones.
        Tokens are not summed together here.
        """
        weights = tf.to_float(tf.ones_like(sparse_tokens.values))
        return tf.SparseTensor(
            indices=sparse_tokens.indices,
            values=weights,
            dense_shape=sparse_tokens.dense_shape)

    return tft.map(_unit_weights, x)
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs: dictionary of input columns keyed by column name.

    Returns:
        A dictionary holding one transformed column for every key in
        NUMERIC_COLUMNS and CATEGORICAL_COLUMNS, plus LABEL_COLUMN.
    """
    def _label_to_index(label):
        # The mapping from label string to index is given explicitly rather
        # than being computed from the data.
        table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
        return table.lookup(label)

    outputs = {}

    # Numeric columns are scaled to have range [0, 1].
    outputs.update(
        {key: tft.scale_to_0_1(inputs[key]) for key in NUMERIC_COLUMNS})

    # Categorical columns (the label column is handled separately below) go
    # through tft.string_to_int, which computes the set of unique values and
    # uses it to convert the strings to indices.
    outputs.update(
        {key: tft.string_to_int(inputs[key]) for key in CATEGORICAL_COLUMNS})

    outputs[LABEL_COLUMN] = tft.map(_label_to_index, inputs[LABEL_COLUMN])
    return outputs
def preprocessing_fn(inputs):
    """User defined preprocessing function for reddit columns.

    Args:
        inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
        A dictionary of `tensorflow_transform.Column` representing the
        transformed columns.
    """
    # TODO(b/35001605) Make this "passthrough" more DRY.
    result = {}
    for passthrough in ('score', 'toplevel'):
        result[passthrough] = inputs[passthrough]

    result['subreddit_id'] = tft.string_to_int(
        inputs['subreddit'], frequency_threshold=frequency_threshold)

    for name in ('author', 'comment_body', 'comment_parent_body'):
        tokens = tft.map(tf.string_split, inputs[name])
        # TODO(b/33467613) Translate these to bag-of-words style sparse
        # features.
        result[name + '_bow'] = tft.string_to_int(
            tokens, frequency_threshold=frequency_threshold)

    return result
def string_to_int(x, vocab):
    """Given a vocabulary and a string tensor `x`, maps `x` into an int tensor.

    Args:
        x: A `Column` representing a string value.
        vocab: list of strings.

    Returns:
        A `Column` where each string value is mapped to an integer
        representing its index in the vocab. Out of vocab values are mapped
        to len(vocab).
    """
    oov_index = len(vocab)

    def _lookup_indices(strings):
        """Maps string tensor into indexes using vocab.

        Args:
            strings: a Tensor/SparseTensor of string.

        Returns:
            a Tensor/SparseTensor of indexes (int) of the same shape as the
            input.
        """
        vocab_table = lookup.string_to_index_table_from_tensor(
            vocab, default_value=oov_index)
        return vocab_table.lookup(strings)

    return tft.map(_lookup_indices, x)
def preprocessing_fn(inputs):
    """Join the 'a' and 'b' string columns with a single space.

    Args:
        inputs: dictionary of input columns; the keys 'a' and 'b' are read.

    Returns:
        A dictionary with the single key 'a b'.
    """
    def _join_with_space(*tensors):
        # tf.string_join takes the tensors as one sequence argument, so the
        # varargs are forwarded as a tuple.
        return tf.string_join(tensors, separator=' ')

    joined = tft.map(_join_with_space, inputs['a'], inputs['b'])
    return {'a b': joined}
def preprocessing_fn(inputs):
    """Multiply the 'a' and 'b' columns element-wise with `tf.multiply`.

    Args:
        inputs: dictionary of input columns; the keys 'a' and 'b' are read.

    Returns:
        A dictionary with the single key 'ab'.
    """
    product = tft.map(tf.multiply, inputs['a'], inputs['b'])
    return {'ab': product}
def preprocessing_fn(inputs):
    """User defined preprocessing function.

    Args:
        inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
        A dictionary of `tensorflow_transform.Column` representing the
        transformed columns.
    """
    def _as_column(tensor):
        # Adds a trailing axis of size 1.
        return tf.expand_dims(tensor, -1)

    def _as_int64_column(tensor):
        return _as_column(tf.to_int64(tensor))

    def _encode_display_id_and_leak(display_id, is_leak):
        # display_id * 10 plus relu(is_leak); relu maps negative is_leak
        # values to 0.
        return _as_column(
            (tf.to_int64(display_id) * 10) + tf.nn.relu(is_leak))

    def _concat_to_int64(*columns):
        # Concatenates the sparse columns along axis 1 and casts to int64.
        return tf.to_int64(
            tf.sparse_concat(axis=1, sp_inputs=list(columns)))

    result = {
        LABEL_COLUMN: tft.map(_as_column, inputs[LABEL_COLUMN]),
        DISPLAY_ID_COLUMN: tft.map(
            _as_int64_column, inputs[DISPLAY_ID_COLUMN]),
        IS_LEAK_COLUMN: tft.map(_as_column, inputs[IS_LEAK_COLUMN]),
        DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN: tft.map(
            _encode_display_id_and_leak,
            inputs[DISPLAY_ID_COLUMN],
            inputs[IS_LEAK_COLUMN]),
    }

    for name in FLOAT_COLUMNS:
        result[name] = tft.map(_as_column, inputs[name])

    # For well-distributed percentages, creating 10 bins.
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        result[name + '_binned'] = tft.map(
            lambda x: _as_int64_column(x * 10), inputs[name])

    # For log-distributed percentages, creating bins on log.
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        result[name + '_log_binned'] = tft.map(
            lambda x: _as_int64_column(tf_log2_1p(x * 1000)), inputs[name])
        log_column = tft.map(
            lambda x: _as_column(tf_log2_1p(x * 1000)), inputs[name])
        result[name + '_log_01scaled'] = tft.scale_to_0_1(log_column)

    # Apply the log to smooth high counts (outliers) and scale from 0 to 1.
    for name in INT_COLUMNS:
        result[name + '_log_int'] = tft.map(
            lambda x: _as_int64_column(tf_log2_1p(x)), inputs[name])
        log_column = tft.map(
            lambda x: _as_column(tf_log2_1p(x)), inputs[name])
        result[name + '_log_01scaled'] = tft.scale_to_0_1(log_column)

    # Boolean and categorical columns are only cast to int64; no trailing
    # axis is added for them.
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        result[name] = tft.map(tf.to_int64, inputs[name])

    # Multi-valued categoricals: only groups of exactly 3 or 6 member columns
    # are concatenated into one output; groups of any other size produce no
    # output.
    for multicategory in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        members = DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multicategory]
        if len(members) not in (3, 6):
            continue
        member_columns = [inputs[category] for category in members]
        result[multicategory] = tft.map(_concat_to_int64, *member_columns)

    return result
def preprocessing_fn(inputs):
    """Tokenize inputs['a'] and integerize the tokens.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the single key 'index'.
    """
    tokens = tft.map(tf.string_split, inputs['a'])
    token_ids = tft.string_to_int(tokens)
    return {'index': token_ids}
def mean_fn(inputs):
    """Map `repeat` over inputs['a'] together with its `tft.mean`.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the single key 'mean'.
    """
    column = inputs['a']
    column_mean = tft.mean(inputs['a'])
    return {'mean': tft.map(repeat, column, column_mean)}
def size_fn(inputs):
    """Map `repeat` over inputs['a'] together with its `tft.size`.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the single key 'size'.
    """
    column = inputs['a']
    column_size = tft.size(inputs['a'])
    return {'size': tft.map(repeat, column, column_size)}
def sum_fn(inputs):
    """Map `repeat` over inputs['a'] together with its `tft.sum`.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the single key 'sum'.
    """
    column = inputs['a']
    column_sum = tft.sum(inputs['a'])
    return {'sum': tft.map(repeat, column, column_sum)}
def min_fn(inputs):
    """Map `repeat` over inputs['a'] together with its `tft.min`.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the single key 'min'.
    """
    column = inputs['a']
    column_min = tft.min(inputs['a'])
    return {'min': tft.map(repeat, column, column_min)}
def preprocessing_fn(inputs):
    """Base64-decode 'img_col' and add one to 'num_col'.

    Args:
        inputs: dictionary of input columns; the keys 'img_col' and 'num_col'
            are read.

    Returns:
        A dictionary with the keys 'img_col' and 'num_col'.
    """
    def _plus_one(value):
        return tf.add(value, 1)

    outputs = {}
    outputs['img_col'] = tft.map(tf.decode_base64, inputs['img_col'])
    outputs['num_col'] = tft.map(_plus_one, inputs['num_col'])
    return outputs
def preprocessing_fn(inputs):
    """Rescale inputs['x'] by hand using `tft.min` and `tft.max`.

    The minimum is subtracted first, then the shifted column is divided by
    its own maximum.

    Args:
        inputs: dictionary of input columns; the key 'x' is read.

    Returns:
        A dictionary with the single key 'x_scaled'.
    """
    def _subtract(value, offset):
        return value - offset

    def _divide(value, divisor):
        return value / divisor

    shifted = tft.map(_subtract, inputs['x'], tft.min(inputs['x']))
    rescaled = tft.map(_divide, shifted, tft.max(shifted))
    return {'x_scaled': rescaled}
def max_fn(inputs):
    """Map `repeat` over inputs['a'] together with its `tft.max`.

    Args:
        inputs: dictionary of input columns; the key 'a' is read.

    Returns:
        A dictionary with the single key 'max'.
    """
    column = inputs['a']
    column_max = tft.max(inputs['a'])
    return {'max': tft.map(repeat, column, column_max)}
def preprocessing_fn(inputs):
    """Compute a * (b + c) from the 'a', 'b' and 'c' columns.

    Args:
        inputs: dictionary of input columns; the keys 'a', 'b' and 'c' are
            read.

    Returns:
        A dictionary with the single key 'a(b+c)'.
    """
    multiplicand = inputs['a']
    b_plus_c = tft.map(tf.add, inputs['b'], inputs['c'])
    return {'a(b+c)': tft.map(tf.multiply, multiplicand, b_plus_c)}