Example #1
    def preprocessing_fn(inputs):
        """User defined preprocessing function for reddit columns.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.
        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
              transformed columns.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        result = {'score': inputs['score'], 'toplevel': inputs['toplevel']}

        result['subreddit_id'] = tft.string_to_int(
            inputs['subreddit'], frequency_threshold=frequency_threshold)

        # TODO(b/35318962): Obviate the need for this workaround on Dense features.
        # FeatureColumns expect shape (batch_size, 1), not just (batch_size)
        # All features added to results up to this point are dense and require this
        # workaround. All following features will be sparse.
        result = {
            k: tft.map(lambda x: tf.expand_dims(x, -1), v)
            for k, v in result.items()
        }

        for name in ('author', 'comment_body', 'comment_parent_body'):
            words = tft.map(tf.string_split, inputs[name])
            # TODO(b/33467613) Translate these to bag-of-words style sparse features.
            result[name + '_bow'] = tft.string_to_int(
                words, frequency_threshold=frequency_threshold)

        return result
Example #2
        def preprocessing_fn(inputs):
            sparse_sum = tft.map(lambda x: tf.sparse_reduce_sum(x, axis=1),
                                 inputs['sparse'])
            sparse_copy = tft.map(
                lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
                inputs['sparse'])
            varlen_copy = tft.map(
                lambda y: tf.SparseTensor(y.indices, y.values, y.dense_shape),
                inputs['varlen'])

            sparse_copy.schema = sch.ColumnSchema(
                sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                        sch.LogicalShape([sch.Axis(10)])),
                sch.SparseColumnRepresentation(
                    'val_copy', [sch.SparseIndexField('idx_copy', False)]))

            return {
                'fixed': sparse_sum,  # Schema should be inferred.
                'sparse': inputs['sparse'],  # Schema should propagate from input.
                'varlen': inputs['varlen'],  # Schema should be inferred.
                'sparse_copy': sparse_copy,  # Schema manually attached above.
                'varlen_copy': varlen_copy  # Schema should propagate from input.
            }
Example #3
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    outputs = {}

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_COLUMNS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    # For all categorical columns except the label column, we use
    # tft.string_to_int which computes the set of unique values and uses this
    # to convert the strings to indices.
    for key in CATEGORICAL_COLUMNS:
      outputs[key] = tft.string_to_int(inputs[key])

    # Update outputs of both kinds to convert from shape (batch,), i.e. a batch
    # of scalars, to shape (batch, 1), i.e. a batch of vectors of length 1.
    # This is needed so the output can be easily wrapped in `FeatureColumn`s.
    for key in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
      outputs[key] = tft.map(lambda x: tf.expand_dims(x, -1), outputs[key])

    # For the label column we provide the mapping from string to index.
    def convert_label(label):
      table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return table.lookup(label)
    outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

    return outputs
Example #4
 def preprocessing_fn(inputs):
     x_scaled = tft.scale_to_0_1(inputs['x'])
     y_sum = tft.map(lambda y: tf.sparse_reduce_sum(y, axis=1),
                     inputs['y'])
     z_copy = tft.map(
         lambda z: tf.SparseTensor(z.indices, z.values, z.dense_shape),
         inputs['z'])
     return {'x_scaled': x_scaled, 'y_sum': y_sum, 'z_copy': z_copy}
Example #5
def tfidf(x, reduced_term_freq, vocab_size, corpus_size):
    """Maps the terms in x to their (1/doc_length) * inverse document frequency.
  Args:
    x: A `Column` representing int64 values (most likely that are the result
        of calling string_to_int on a tokenized string).
    reduced_term_freq: A dense tensor of shape (vocab_size,) that represents
        the count of the number of documents with each term. So vocab token i (
        which is an int) occures in reduced_term_freq[i] examples in the corpus.
        This means reduced_term_freq should have a count for out-of-vocab tokens
    vocab_size: An int - the count of vocab used to turn the string into int64s
        including any out-of-vocab ids
    corpus_size: A scalar count of the number of documents in the corpus
  Returns:
    A `Column` where each int value is mapped to a double equal to
    (1 if that term appears in that row, 0 otherwise / the number of terms in
    that row) * the log of (the number of rows in `x` / (1 + the number of
    rows in `x` where the term appears at least once))
  NOTE:
    This is intented to be used with the feature_column 'sum' combiner to arrive
    at the true term frequncies.
  """
    def _map_to_vocab_range(x):
        """Enforces that the vocab_ids in x are positive."""
        return tf.SparseTensor(indices=x.indices,
                               values=tf.mod(x.values, vocab_size),
                               dense_shape=x.dense_shape)

    def _map_to_tfidf(x):
        """Calculates the inverse document frequency of terms in the corpus.

        Args:
          x: a SparseTensor of int64 representing string indices in vocab.

        Returns:
          The tf*idf values.
        """
        # Add one to the reduced term frequencies to avoid dividing by zero.
        idf = tf.log(
            tf.to_double(corpus_size) /
            (1.0 + tf.to_double(reduced_term_freq)))

        dense_doc_sizes = tf.to_double(
            tf.sparse_reduce_sum(
                tf.SparseTensor(indices=x.indices,
                                values=tf.ones_like(x.values),
                                dense_shape=x.dense_shape), 1))

        # For every term in x, divide the idf by the doc size.
        # Both gathers produce one value per term occurrence in x.
        idf_over_doc_size = (tf.gather(idf, x.values) /
                             tf.gather(dense_doc_sizes, x.indices[:, 0]))

        return tf.SparseTensor(indices=x.indices,
                               values=idf_over_doc_size,
                               dense_shape=x.dense_shape)

    cleaned_input = tft.map(_map_to_vocab_range, x)

    weights = tft.map(_map_to_tfidf, cleaned_input)
    return tft.map(tf.to_float, weights)
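
A hedged sketch of how this helper might be wired into a preprocessing_fn, following the tokenize-and-integerize pattern used throughout these examples. The corpus statistics (reduced_term_freq, vocab_size, corpus_size) and frequency_threshold are assumed to be computed or defined elsewhere; the column name 'text' is illustrative:

    def preprocessing_fn(inputs):
        # Tokenize the raw strings, then map tokens to int64 ids.
        tokens = tft.map(tf.string_split, inputs['text'])
        ids = tft.string_to_int(tokens,
                                frequency_threshold=frequency_threshold)
        # Weight each token occurrence by (1 / doc_length) * idf.
        weights = tfidf(ids, reduced_term_freq, vocab_size, corpus_size)
        return {'text_ids': ids, 'text_weights': weights}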
Example #6
        def preprocessing_fn(inputs):
            def repeat(in_tensor, value):
                batch_size = tf.shape(in_tensor)[0]
                return tf.ones([batch_size], value.dtype) * value

            return {
                'min': tft.map(repeat, inputs['a'], tft.min(inputs['a'])),
                'max': tft.map(repeat, inputs['a'], tft.max(inputs['a'])),
                'sum': tft.map(repeat, inputs['a'], tft.sum(inputs['a'])),
                'size': tft.map(repeat, inputs['a'], tft.size(inputs['a'])),
                'mean': tft.map(repeat, inputs['a'], tft.mean(inputs['a']))
            }
Example #7
        def preprocessing_fn(inputs):
            return {
                'index1':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-99,
                                  top_k=2),

                # As above but using a string for top_k (and changing the
                # default_value to showcase things).
                'index2':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-9,
                                  top_k='2')
            }
Example #8
        def preprocessing_fn(inputs):
            return {
                'index1':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-99,
                                  frequency_threshold=2),

                # As above but using a string for frequency_threshold (and changing
                # the default_value to showcase things).
                'index2':
                tft.string_to_int(tft.map(tf.string_split, inputs['a']),
                                  default_value=-9,
                                  frequency_threshold='2')
            }
Example #9
def scale(x, min_x_value, max_x_value, output_min, output_max):
    """Scale a column to [output_min, output_max].

  Assumes the columns's range is [min_x_value, max_x_value]. If this is not
  true at training or prediction time, the output value of this scale could be
  outside the range [output_min, output_max].

  Raises:
    ValueError: if min_x_value = max_x_value, as the column is constant.
  """

    if round(min_x_value - max_x_value, 7) == 0:
        # There is something wrong with the data.
        # Why round to 7 places? It's the same as unittest's assertAlmostEqual.
        raise ValueError('In make_scale_tito, min_x_value == max_x_value')

    def _scale(x):
        min_x_valuef = tf.to_float(min_x_value)
        max_x_valuef = tf.to_float(max_x_value)
        output_minf = tf.to_float(output_min)
        output_maxf = tf.to_float(output_max)
        return ((((tf.to_float(x) - min_x_valuef) *
                  (output_maxf - output_minf)) /
                 (max_x_valuef - min_x_valuef)) + output_minf)

    return tft.map(_scale, x)
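
A minimal usage sketch, assuming a feature 'age' whose range [0, 100] is known ahead of time (the column name and bounds are illustrative):

    def preprocessing_fn(inputs):
        # Map a feature with a known range onto [-1, 1].
        return {'age_scaled': scale(inputs['age'], 0, 100, -1, 1)}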
Example #10
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = tft.map(lambda x, mean: x - mean, x, tft.mean(x))
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.string_to_int(s)
    x_centered_times_y_normalized = tft.map(lambda x, y: x * y, x_centered,
                                            y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }
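
For context, a sketch of how a preprocessing_fn like the one above is handed to the Apache Beam implementation of tf.Transform. Module paths and exact signatures varied across early releases, so treat the wiring below (beam_impl.Context, AnalyzeAndTransformDataset, and a raw_data_metadata schema defined elsewhere) as an assumption rather than the API these examples were written against:

    import apache_beam as beam
    import tensorflow_transform.beam.impl as beam_impl

    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir='/tmp/tft_tmp'):  # assumed temp dir
            raw_data = pipeline | beam.Create([
                {'x': 1.0, 'y': 2.0, 's': 'hello'},
                {'x': 2.0, 'y': 4.0, 's': 'world'},
            ])
            # Analyze computes tft.mean/min/max etc. over the dataset, then
            # Transform applies preprocessing_fn with those values baked in.
            transformed_dataset, transform_fn = (
                (raw_data, raw_data_metadata)
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset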
Example #11
  def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.
    Returns:
      A dictionary of `tensorflow_transform.Column` representing the transformed
          columns.
    """
    # TODO(b/35001605) Make this "passthrough" more DRY.
    result = {'clicked': inputs['clicked']}
    for name in INTEGER_COLUMN_NAMES:
      result[name] = inputs[name]
    for name in CATEGORICAL_COLUMN_NAMES:
      result[name + '_id'] = tft.string_to_int(
          inputs[name], frequency_threshold=frequency_threshold)

    # TODO(b/35318962): Obviate the need for this workaround on Dense features.
    # FeatureColumns expect shape (batch_size, 1), not just (batch_size)
    result = {
        k: tft.map(lambda x: tf.expand_dims(x, -1), v)
        for k, v in result.items()
    }

    return result
Example #12
      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        def remove_character(s, char):
          """Remove a character from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string
            char: A string of length 1

          Returns:
            The string `s` with the given character removed (i.e. replaced by
            '')
          """
          # Hacky implementation where we split and rejoin.
          split = tf.string_split(s, char)
          rejoined = tf.reduce_join(
              tf.sparse_to_dense(
                  split.indices, split.dense_shape, split.values, ''),
              1)
          return rejoined

        def remove_punctuation(s):
          """Remove puncuation from a string.

          Args:
            s: A SparseTensor of rank 1 of type tf.string

          Returns:
            The string `s` with punctuation removed.
          """
          for char in PUNCTUATION_CHARACTERS:
            s = remove_character(s, char)
          return s

        cleaned_review = tft.map(remove_punctuation, review)
        review_tokens = tft.map(tf.string_split, cleaned_review)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        return {
            REVIEW_COLUMN: review_indices,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }
Example #13
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[const.REVIEW_COLUMN]
        review_tokens = tft.map(lambda x: tf.string_split(x, delimiters), review)
        review_indices = tft.string_to_int(review_tokens, top_k=vocab_size)
        # Add one for the oov bucket created by string_to_int.
        review_weight = tft.tfidf_weights(review_indices, vocab_size + 1)

        output = {
            const.REVIEW_COLUMN: review_indices,
            const.REVIEW_WEIGHT: review_weight,
            const.LABEL_COLUMN: inputs[const.LABEL_COLUMN]
        }
        return output
Example #14
            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_COLUMN]

                review_tokens = tft.map(
                    lambda x: tf.string_split(x, DELIMITERS), review)
                review_indices = tft.string_to_int(review_tokens,
                                                   top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by string_to_int.
                review_weight = tft.tfidf_weights(review_indices,
                                                  VOCAB_SIZE + 1)
                return {
                    REVIEW_COLUMN: review_indices,
                    REVIEW_WEIGHT: review_weight,
                    LABEL_COLUMN: inputs[LABEL_COLUMN]
                }
Example #15
def bag_of_words(x):
    """Computes bag of words weights

  Note the return type is a float sparse tensor, not a int sparse tensor. This
  is so that the output types batch tfidf, and any downstream transformation
  in tf layers during training can be applied to both.
  """
    def _bow(x):
        """Comptue BOW weights.

    As tf layer's sum combiner is used, the weights can be just ones. Tokens are
    not summed together here.
    """
        return tf.SparseTensor(indices=x.indices,
                               values=tf.to_float(tf.ones_like(x.values)),
                               dense_shape=x.dense_shape)

    return tft.map(_bow, x)
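
A minimal usage sketch pairing bag_of_words with tokenization, mirroring the other examples in this list (the column name 'text' and VOCAB_SIZE are illustrative):

    def preprocessing_fn(inputs):
        tokens = tft.map(tf.string_split, inputs['text'])
        ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)
        # Every token occurrence gets weight 1.0; a downstream 'sum' combiner
        # turns these into per-document counts.
        return {'text_ids': ids, 'text_weights': bag_of_words(ids)}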
Example #16
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs
Example #17
    def preprocessing_fn(inputs):
        """User defined preprocessing function for reddit columns.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.
        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
              transformed columns.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        result = {'score': inputs['score'], 'toplevel': inputs['toplevel']}

        result['subreddit_id'] = tft.string_to_int(
            inputs['subreddit'], frequency_threshold=frequency_threshold)

        for name in ('author', 'comment_body', 'comment_parent_body'):
            words = tft.map(tf.string_split, inputs[name])
            # TODO(b/33467613) Translate these to bag-of-words style sparse features.
            result[name + '_bow'] = tft.string_to_int(
                words, frequency_threshold=frequency_threshold)

        return result
Example #18
def string_to_int(x, vocab):
    """Given a vocabulary and a string tensor `x`, maps `x` into an int tensor.
  Args:
    x: A `Column` representing a string value.
    vocab: list of strings.

  Returns:
    A `Column` where each string value is mapped to an integer representing
    its index in the vocab. Out of vocab values are mapped to len(vocab).
  """
    def _map_to_int(x):
        """Maps string tensor into indexes using vocab.

    Args:
      x : a Tensor/SparseTensor of string.
    Returns:
      a Tensor/SparseTensor of indexes (int) of the same shape as x.
    """
        table = lookup.string_to_index_table_from_tensor(
            vocab, default_value=len(vocab))
        return table.lookup(x)

    return tft.map(_map_to_int, x)
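
A short usage sketch with a hypothetical three-word vocab: in-vocab strings map to their index, and anything else maps to len(vocab), i.e. 3:

    def preprocessing_fn(inputs):
        return {
            'color_id': string_to_int(inputs['color'],
                                      ['red', 'green', 'blue'])
        }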
Example #19
        def preprocessing_fn(inputs):
            def tito_string_join(*tensors):
                return tf.string_join(tensors, separator=' ')

            return {'a b': tft.map(tito_string_join, inputs['a'], inputs['b'])}
Example #20
 def preprocessing_fn(inputs):
     return {'ab': tft.map(tf.multiply, inputs['a'], inputs['b'])}
Example #21
  def preprocessing_fn(inputs):
    """User defined preprocessing function.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.
    Returns:
      A dictionary of `tensorflow_transform.Column` representing the transformed
          columns.
    """

    result = {
        LABEL_COLUMN:
            tft.map(lambda x: tf.expand_dims(x, -1), inputs[LABEL_COLUMN]),
        DISPLAY_ID_COLUMN:
            tft.map(lambda x: tf.expand_dims(tf.to_int64(x), -1),
                    inputs[DISPLAY_ID_COLUMN]),
        IS_LEAK_COLUMN:
            tft.map(lambda x: tf.expand_dims(x, -1), inputs[IS_LEAK_COLUMN]),
        DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN:
            tft.map(
                lambda display_id, is_leak: tf.expand_dims(
                    (tf.to_int64(display_id) * 10) + tf.nn.relu(is_leak), -1),
                inputs[DISPLAY_ID_COLUMN], inputs[IS_LEAK_COLUMN])
    }


    for name in FLOAT_COLUMNS:
      result[name] = tft.map(lambda x: tf.expand_dims(x, -1), inputs[name])

    # For well-distributed percentages, create 10 bins.
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
      result[name + '_binned'] = tft.map(
          lambda x: tf.expand_dims(tf.to_int64(x * 10), -1), inputs[name])

    # For log-distributed percentages, create bins on the log.
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
      result[name + '_log_binned'] = tft.map(
          lambda x: tf.expand_dims(tf.to_int64(tf_log2_1p(x * 1000)), -1),
          inputs[name])
      result[name + '_log_01scaled'] = tft.scale_to_0_1(tft.map(
          lambda x: tf.expand_dims(tf_log2_1p(x * 1000), -1), inputs[name]))

    # Apply the log to smooth high counts (outliers) and scale to [0, 1].
    for name in INT_COLUMNS:
      result[name + '_log_int'] = tft.map(
          lambda x: tf.expand_dims(tf.to_int64(tf_log2_1p(x)), -1),
          inputs[name])
      result[name + '_log_01scaled'] = tft.scale_to_0_1(tft.map(
          lambda x: tf.expand_dims(tf_log2_1p(x), -1), inputs[name]))
      #result[name] = tft.map(lambda x: tf.expand_dims(tf.to_int64(x), -1), inputs[name])
    
    #for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS + \
    #            [category for multicategory in DOC_CATEGORICAL_MULTIVALUED_COLUMNS for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multicategory]]:
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
      result[name] = tft.map(lambda x: tf.to_int64(x), inputs[name])

    #result['display_ad_id_key'] = tft.map(lambda display_id, ad_id: tf.multiply(tf.sparse_tensor_to_dense(tf.to_int64(display_id)), int(1e8)) + tf.sparse_tensor_to_dense(tf.to_int64(ad_id)), inputs['display_id'], inputs['ad_id'])


    for multicategory in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
      categories = DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multicategory]
      if len(categories) == 3:
        result[multicategory] = tft.map(
            lambda col1, col2, col3: tf.to_int64(
                tf.sparse_concat(axis=1, sp_inputs=[col1, col2, col3])),
            *[inputs[category] for category in categories])
      elif len(categories) == 6:
        result[multicategory] = tft.map(
            lambda col1, col2, col3, col4, col5, col6: tf.to_int64(
                tf.sparse_concat(
                    axis=1,
                    sp_inputs=[col1, col2, col3, col4, col5, col6])),
            *[inputs[category] for category in categories])

    return result
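
The function above relies on a tf_log2_1p helper that is not shown. A minimal sketch of what it presumably computes, log2(1 + x), built from standard TF 1.x ops via a change of base; this is an assumption, not the original helper:

    def tf_log2_1p(x):
        # Assumed definition: log base 2 of (1 + x).
        return tf.log1p(tf.to_float(x)) / tf.log(tf.constant(2.0))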
Example #22
 def preprocessing_fn(inputs):
     return {
         'index':
         tft.string_to_int(tft.map(tf.string_split, inputs['a']))
     }
Example #23
 def mean_fn(inputs):
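     # `repeat` is the broadcasting helper defined in Example #6.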
     return {
         'mean': tft.map(repeat, inputs['a'], tft.mean(inputs['a']))
     }
Example #24
 def size_fn(inputs):
     return {
         'size': tft.map(repeat, inputs['a'], tft.size(inputs['a']))
     }
Example #25
 def sum_fn(inputs):
     return {
         'sum': tft.map(repeat, inputs['a'], tft.sum(inputs['a']))
     }
Example #26
 def min_fn(inputs):
     return {
         'min': tft.map(repeat, inputs['a'], tft.min(inputs['a']))
     }
Example #27
 def preprocessing_fn(inputs):
   return {'img_col': tft.map(tf.decode_base64, inputs['img_col']),
           'num_col': tft.map(lambda x: tf.add(x, 1), inputs['num_col'])}
Example #28
 def preprocessing_fn(inputs):
     scaled_to_0 = tft.map(lambda x, y: x - y, inputs['x'],
                           tft.min(inputs['x']))
     scaled_to_0_1 = tft.map(lambda x, y: x / y, scaled_to_0,
                             tft.max(scaled_to_0))
     return {'x_scaled': scaled_to_0_1}
Example #29
 def max_fn(inputs):
     return {
         'max': tft.map(repeat, inputs['a'], tft.max(inputs['a']))
     }
Example #30
 def preprocessing_fn(inputs):
     return {
         'a(b+c)':
         tft.map(tf.multiply, inputs['a'],
                 tft.map(tf.add, inputs['b'], inputs['c']))
     }