예제 #1
0
    def preprocessing_fn(inputs):
        """Transform the raw comment columns into model-ready columns.

        'score', 'toplevel' and the integerized 'subreddit' are treated as
        dense features and get a trailing dimension appended; the three text
        columns are split with `tf.string_split` and integerized under
        '<name>_bow'.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.
        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
          transformed columns.
        """
        def append_unit_dim(tensor):
            return tf.expand_dims(tensor, -1)

        # TODO(b/35001605) Make this "passthrough" more DRY.
        dense_columns = {
            'score': inputs['score'],
            'toplevel': inputs['toplevel'],
            'subreddit_id': tft.string_to_int(
                inputs['subreddit'], frequency_threshold=frequency_threshold),
        }

        # TODO(b/35318962): Obviate the need for this workaround on Dense features.
        # FeatureColumns expect shape (batch_size, 1), not just (batch_size).
        # Only the dense columns collected so far are reshaped; the columns
        # added below are sparse.
        transformed = {}
        for key, column in dense_columns.items():
            transformed[key] = tft.map(append_unit_dim, column)

        for text_key in ('author', 'comment_body', 'comment_parent_body'):
            # TODO(b/33467613) Translate these to bag-of-words style sparse features.
            tokens = tft.map(tf.string_split, inputs[text_key])
            transformed[text_key + '_bow'] = tft.string_to_int(
                tokens, frequency_threshold=frequency_threshold)

        return transformed
예제 #2
0
def preprocess_fn(dictrow):
    """Integerize the customer and SKU columns; pass 'action' through.

    Args:
        dictrow: mapping that provides the 'customer_id', 'sku' and 'action'
            columns.
    Returns:
        A dict with the same three keys. 'customer_id' and 'sku' hold the
        results of `tft.string_to_int`, called with the vocabulary file names
        'customers_mapping' and 'skus_mapping' respectively.
    """
    customer_ids = tft.string_to_int(
        dictrow['customer_id'], vocab_filename='customers_mapping')
    sku_ids = tft.string_to_int(dictrow['sku'], vocab_filename='skus_mapping')
    return {
        'customer_id': customer_ids,
        'sku': sku_ids,
        'action': dictrow['action'],
    }
예제 #3
0
        def preprocessing_fn(inputs):
            """Integerize the tokens of column 'a' twice with top_k set to 2.

            'index1' passes top_k as an int. 'index2' passes it as the string
            '2' and uses a different default_value, to showcase both forms.
            """
            index_int_top_k = tft.string_to_int(
                tft.map(tf.string_split, inputs['a']),
                default_value=-99,
                top_k=2)
            index_str_top_k = tft.string_to_int(
                tft.map(tf.string_split, inputs['a']),
                default_value=-9,
                top_k='2')
            return {'index1': index_int_top_k, 'index2': index_str_top_k}
예제 #4
0
        def preprocessing_fn(inputs):
            """Integerize the tokens of column 'a' twice with frequency_threshold 2.

            'index1' passes frequency_threshold as an int. 'index2' passes it
            as the string '2' and uses a different default_value, to showcase
            both forms.
            """
            outputs = {}
            outputs['index1'] = tft.string_to_int(
                tft.map(tf.string_split, inputs['a']),
                default_value=-99,
                frequency_threshold=2)
            outputs['index2'] = tft.string_to_int(
                tft.map(tf.string_split, inputs['a']),
                default_value=-9,
                frequency_threshold='2')
            return outputs
예제 #5
0
  def preprocessing_fn(inputs):
    """Scale numeric columns, integerize categorical ones and index the label.

    Numeric columns are scaled to [0, 1] and categorical columns go through
    `tft.string_to_int`. Both kinds then get a trailing dimension so a batch of
    scalars (batch,) becomes a batch of length-1 vectors (batch, 1), which is
    the shape `FeatureColumn`s expect. The label is looked up in a fixed
    ['>50K', '<=50K'] table.
    """
    def to_column_vector(tensor):
      return tf.expand_dims(tensor, -1)

    def label_to_index(label):
      # The mapping from label string to index is given explicitly.
      index_table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
      return index_table.lookup(label)

    outputs = {name: tft.scale_to_0_1(inputs[name]) for name in NUMERIC_COLUMNS}
    outputs.update(
        (name, tft.string_to_int(inputs[name])) for name in CATEGORICAL_COLUMNS)

    for name in NUMERIC_COLUMNS + CATEGORICAL_COLUMNS:
      outputs[name] = tft.map(to_column_vector, outputs[name])

    outputs[LABEL_COLUMN] = tft.map(label_to_index, inputs[LABEL_COLUMN])
    return outputs
예제 #6
0
def preprocess(input_features):
    """Build the transformed feature dict from the raw features.

    - The target feature is passed through.
    - Each numeric feature is replaced by its z-score and also emitted as
      '<name>_bucketized', using quantile boundaries computed with
      NUM_BUCKETS buckets.
    - Each categorical feature is passed through. `tft.uniques` is called on
      it only for its vocabulary file, named after the feature.
    - Each vocab feature is emitted as '<name>_integerized'.

    Args:
        input_features: mapping from feature name to raw feature.
    Returns:
        dict mapping output feature name to transformed feature.
    """
    target_name = metadata.TARGET_FEATURE_NAME
    transformed = {target_name: input_features[target_name]}

    for name in metadata.NUMERIC_FEATURE_NAMES:
        raw = input_features[name]
        transformed[name] = tft.scale_to_z_score(raw)
        boundaries = tft.quantiles(raw, num_buckets=NUM_BUCKETS, epsilon=0.01)
        transformed[name + "_bucketized"] = tft.apply_buckets(
            raw, bucket_boundaries=boundaries)

    for name in metadata.CATEGORICAL_FEATURE_NAMES:
        raw = input_features[name]
        # The return value is intentionally unused; only the vocab file matters.
        tft.uniques(raw, vocab_filename=name)
        transformed[name] = raw

    for name in metadata.VOCAB_FEATURE_NAMES:
        transformed[name + "_integerized"] = tft.string_to_int(
            input_features[name],
            top_k=metadata.VOCAB_SIZE,
            num_oov_buckets=metadata.OOV_SIZE,
            vocab_filename=name)

    return transformed
예제 #7
0
def preprocessing_fn(input_features):
    """Derive embedding, bag-of-words and tf-idf features from 'clean_title'.

    Args:
        input_features: mapping providing 'clean_title', 'raw_title' and
            'topic'.
    Returns:
        dict with keys 'topic', 'title' (the raw title), 'bow', 'tf_idf' and
        'embeddings'.
    """
    clean_title = input_features['clean_title']

    # Embeddings come from `get_embeddings`, applied to the cleaned title.
    title_embeddings = tft.apply_function(get_embeddings, clean_title)

    # Tokenize, then map tokens to vocabulary indices (top VOCAB_SIZE terms).
    tokens = tf.string_split(clean_title, parameters.DELIMITERS)
    token_ids = tft.string_to_int(tokens, top_k=parameters.VOCAB_SIZE)

    # tf-idf over the token ids; vocab size is passed as VOCAB_SIZE + 1.
    bow_ids, tfidf_weights = tft.tfidf(token_ids, parameters.VOCAB_SIZE + 1)

    return {
        'topic': input_features['topic'],
        'title': input_features['raw_title'],
        'bow': bow_ids,
        'tf_idf': tfidf_weights,
        'embeddings': title_embeddings,
    }
예제 #8
0
  def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    'clicked' and the integer columns pass through, categorical columns are
    integerized under '<name>_id', and every resulting column then gets a
    trailing dimension appended.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.
    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns.
    """
    def append_unit_dim(tensor):
      return tf.expand_dims(tensor, -1)

    # TODO(b/35001605) Make this "passthrough" more DRY.
    dense = {'clicked': inputs['clicked']}
    dense.update((name, inputs[name]) for name in INTEGER_COLUMN_NAMES)
    for name in CATEGORICAL_COLUMN_NAMES:
      dense[name + '_id'] = tft.string_to_int(
          inputs[name], frequency_threshold=frequency_threshold)

    # TODO(b/35318962): Obviate the need for this workaround on Dense features.
    # FeatureColumns expect shape (batch_size, 1), not just (batch_size)
    reshaped = {}
    for key, column in dense.items():
      reshaped[key] = tft.map(append_unit_dim, column)
    return reshaped
예제 #9
0
 def wrapped_preprocessing_fn(inputs):
     """Run `preprocessing.preprocess` and integerize its boolean outputs.

     Every output whose dtype is tf.bool is converted to a string and passed
     through `tft.string_to_int`, with the vocabulary file 'vocab_<key>'.
     All other outputs are returned unchanged.
     """
     outputs = preprocessing.preprocess(inputs)
     bool_keys = [key for key in outputs if outputs[key].dtype == tf.bool]
     for key in bool_keys:
         as_text = tf.as_string(outputs[key])
         outputs[key] = tft.string_to_int(
             as_text, vocab_filename='vocab_' + key)
     return outputs
def preprocess_tft(inputs):
    """Add three engineered features on top of a shallow copy of `inputs`.

    Args:
        inputs: mapping providing at least 'mother_age', 'gestation_weeks'
            and 'mother_race'.
    Returns:
        A shallow copy of `inputs` with these additional keys:
          - 'mother_age_tft': 'mother_age' minus its mean.
          - 'gestation_weeks_centered': 'gestation_weeks' passed through
            `tft.scale_to_0_1`.
          - 'mother_race_tft': 'mother_race' passed through
            `tft.string_to_int`.
    """
    # Kept as a function-scope import, as in the original.
    import copy

    def center(x):
        return x - tft.mean(x)

    result = copy.copy(inputs)  # shallow copy; the caller's dict is not mutated
    result['mother_age_tft'] = center(inputs['mother_age'])
    # NOTE: despite the key name, this feature is scaled with scale_to_0_1, not
    # mean-centered. The key is kept because downstream code may rely on it.
    result['gestation_weeks_centered'] = tft.scale_to_0_1(
        inputs['gestation_weeks'])
    result['mother_race_tft'] = tft.string_to_int(inputs['mother_race'])
    return result
예제 #11
0
    def preprocessing_fn(inputs):
        """Transform each schema column according to its declared type.

        NUMBER and KEY columns pass through unchanged, CATEGORY columns are
        integerized, TEXT columns become '<name>_indices' / '<name>_weights'
        tf-idf pairs, and IMAGE_URL columns are run through `_image_to_vec`
        with the Inception V3 checkpoint.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.
        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
          transformed columns.
        Raises:
          ValueError: if a schema entry has an unrecognized type.
        """
        passthrough_types = ('NUMBER', 'KEY')
        transformed = {}

        for column in schema:
            name = column['name']
            kind = column['type']

            if kind in passthrough_types:
                transformed[name] = inputs[name]
            elif kind == 'CATEGORY':
                transformed[name] = tft.string_to_int(
                    inputs[name], vocab_filename='vocab_' + name)
            elif kind == 'TEXT':
                tokens = tf.string_split(inputs[name], DELIMITERS)
                # TODO: default_value = 0 is wrong. It means OOV gets 0 for
                # their index. It works around the trainer needing the true
                # vocab size; otherwise the trainer has to use VOCAB_SIZE
                # defined in this file, which is too large. If there is no
                # workaround, the user has to provide a vocab_size.
                token_ids = tft.string_to_int(
                    tokens,
                    vocab_filename='vocab_' + name,
                    default_value=0)
                # Add one for the oov bucket created by string_to_int.
                bow_ids, bow_weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)
                transformed[name + '_indices'] = bow_ids
                transformed[name + '_weights'] = bow_weights
            elif kind == 'IMAGE_URL':
                transformed[name] = tft.apply_function_with_checkpoint(
                    _image_to_vec, [inputs[name]],
                    INCEPTION_V3_CHECKPOINT,
                    exclude=INCEPTION_EXCLUDED_VARIABLES)
            else:
                raise ValueError('Invalid schema. Unknown type ' + kind)

        return transformed
예제 #12
0
    def preprocessing_fn(inputs):
        """User defined preprocessing function for reddit columns.

        'score' and 'toplevel' pass through, 'subreddit' is integerized as
        'subreddit_id', and each text column is split and integerized as
        '<name>_bow'.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.
        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
          transformed columns.
        """
        def integerize(column):
            return tft.string_to_int(
                column, frequency_threshold=frequency_threshold)

        # TODO(b/35001605) Make this "passthrough" more DRY.
        transformed = {
            'score': inputs['score'],
            'toplevel': inputs['toplevel'],
            'subreddit_id': integerize(inputs['subreddit']),
        }

        for text_key in ('author', 'comment_body', 'comment_parent_body'):
            # TODO(b/33467613) Translate these to bag-of-words style sparse features.
            transformed[text_key + '_bow'] = integerize(
                tf.string_split(inputs[text_key]))

        return transformed
예제 #13
0
  def preprocessing_fn(inputs):
    """User defined preprocessing function for reddit columns.

    Passes 'score' and 'toplevel' through, integerizes 'subreddit' into
    'subreddit_id', and emits an integerized '<name>_bow' column for each of
    the three text columns.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.
    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns.
    """
    text_columns = ('author', 'comment_body', 'comment_parent_body')

    # TODO(b/35001605) Make this "passthrough" more DRY.
    outputs = {}
    outputs['score'] = inputs['score']
    outputs['toplevel'] = inputs['toplevel']
    outputs['subreddit_id'] = tft.string_to_int(
        inputs['subreddit'], frequency_threshold=frequency_threshold)

    # TODO(b/33467613) Translate these to bag-of-words style sparse features.
    for column_name in text_columns:
      tokens = tf.string_split(inputs[column_name])
      outputs[column_name + '_bow'] = tft.string_to_int(
          tokens, frequency_threshold=frequency_threshold)

    return outputs
예제 #14
0
      def preprocessing_fn(inputs):
        """Turn the review text into tf-idf indices and weights.

        The review is split on DELIMITERS, tokens are integerized with
        top_k=VOCAB_SIZE, and `tft.tfidf` yields the indices and weights
        returned under REVIEW_COLUMN and REVIEW_WEIGHT. The label column is
        passed through.
        """
        tokens = tf.string_split(inputs[REVIEW_COLUMN], DELIMITERS)
        token_ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        bow_ids, bow_weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)

        outputs = {}
        outputs[REVIEW_COLUMN] = bow_ids
        outputs[REVIEW_WEIGHT] = bow_weights
        outputs[LABEL_COLUMN] = inputs[LABEL_COLUMN]
        return outputs
예제 #15
0
    def preprocessing_fn(inputs):
        """Integerize the review tokens and compute their tf-idf weights.

        Returns a dict holding the integerized review under
        const.REVIEW_COLUMN, its tf-idf weights under const.REVIEW_WEIGHT and
        the untouched label under const.LABEL_COLUMN.
        """
        def tokenize(text):
            return tf.string_split(text, delimiters)

        tokens = tft.map(tokenize, inputs[const.REVIEW_COLUMN])
        token_ids = tft.string_to_int(tokens, top_k=vocab_size)
        # Add one for the oov bucket created by string_to_int.
        weights = tft.tfidf_weights(token_ids, vocab_size + 1)

        return {
            const.REVIEW_COLUMN: token_ids,
            const.REVIEW_WEIGHT: weights,
            const.LABEL_COLUMN: inputs[const.LABEL_COLUMN],
        }
예제 #16
0
 def preprocessing_fn(inputs):
    """Center 'x', scale 'y' to [0, 1], integerize 's' and multiply x by y.

    Returns a dict with 'x_centered', 'y_normalized',
    'x_centered_times_y_normalized' and 's_integerized'.
    """
    raw_x, raw_y, raw_s = inputs['x'], inputs['y'], inputs['s']

    centered = raw_x - tft.mean(raw_x)
    normalized = tft.scale_to_0_1(raw_y)
    integerized = tft.string_to_int(raw_s)
    product = centered * normalized

    return {
        'x_centered': centered,
        'y_normalized': normalized,
        'x_centered_times_y_normalized': product,
        's_integerized': integerized,
    }
예제 #17
0
    def preprocessing_fn(inputs):
        """User defined preprocessing function for reddit columns.

        Passes 'score' and 'example_id' through and integerizes 'subreddit'
        into 'subreddit_id'.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.
        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
          transformed columns.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        passthrough_keys = ('score', 'example_id')
        transformed = {key: inputs[key] for key in passthrough_keys}

        subreddit_ids = tft.string_to_int(
            inputs['subreddit'], frequency_threshold=frequency_threshold)
        transformed['subreddit_id'] = subreddit_ids

        return transformed
예제 #18
0
def preprocessing_fn(inputs):
    """Center 'x', scale 'y' to [0, 1], integerize 's' and multiply x by y.

    The element-wise arithmetic is routed through `tft.map`.

    Returns:
        dict with 'x_centered', 'y_normalized',
        'x_centered_times_y_normalized' and 's_integerized'.
    """
    def subtract_mean(values, mean):
        return values - mean

    def multiply(left, right):
        return left * right

    raw_x, raw_y, raw_s = inputs['x'], inputs['y'], inputs['s']

    centered = tft.map(subtract_mean, raw_x, tft.mean(raw_x))
    normalized = tft.scale_to_0_1(raw_y)
    integerized = tft.string_to_int(raw_s)
    product = tft.map(multiply, centered, normalized)

    return {
        'x_centered': centered,
        'y_normalized': normalized,
        'x_centered_times_y_normalized': product,
        's_integerized': integerized,
    }
예제 #19
0
    def pre_processing_fun(inputs):
        """Scale numeric features, integerize categorical ones, index the label.

        The label is looked up in a table built from ['>50K', '<=50K'].
        """
        def label_to_index(label):
            index_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
            return index_table.lookup(label)

        outputs = {
            name: tft.scale_to_0_1(inputs[name])
            for name in NUMERIC_FEATURE_KEYS
        }
        outputs.update(
            (name, tft.string_to_int(inputs[name]))
            for name in CATEGORICAL_FEATURE_KEYS)
        outputs[LABEL_KEY] = tft.apply_function(label_to_index,
                                                inputs[LABEL_KEY])
        return outputs
예제 #20
0
    def preprocessing_fn(inputs):
        """User defined preprocessing function for criteo columns.

        'clicked' and the integer columns pass through unchanged; each
        categorical column is integerized and stored under '<name>_id'.

        Args:
          inputs: dictionary of input `tensorflow_transform.Column`.
        Returns:
          A dictionary of `tensorflow_transform.Column` representing the
          transformed columns.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        transformed = {'clicked': inputs['clicked']}
        transformed.update(
            (name, inputs[name]) for name in INTEGER_COLUMN_NAMES)

        for name in CATEGORICAL_COLUMN_NAMES:
            column_ids = tft.string_to_int(
                inputs[name], frequency_threshold=frequency_threshold)
            transformed[name + '_id'] = column_ids

        return transformed
예제 #21
0
  def preprocessing_fn(inputs):
    """User defined preprocessing function for criteo columns.

    Copies 'clicked' and every integer column as-is, and adds an integerized
    '<name>_id' column for every categorical column.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.
    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns.
    """
    def integerize(column):
      return tft.string_to_int(
          column, frequency_threshold=frequency_threshold)

    # TODO(b/35001605) Make this "passthrough" more DRY.
    outputs = {'clicked': inputs['clicked']}
    for int_name in INTEGER_COLUMN_NAMES:
      outputs[int_name] = inputs[int_name]
    for cat_name in CATEGORICAL_COLUMN_NAMES:
      outputs[cat_name + '_id'] = integerize(inputs[cat_name])
    return outputs
예제 #22
0
    def preprocessing_fn(inputs):
        """Scale numeric columns, integerize categorical ones, keep the label.

        Numeric columns are scaled to [0, 1]. Categorical columns use
        `tft.string_to_int`, which computes the set of unique values and uses
        it to convert the strings to indices. The label column is passed
        through unchanged.
        """
        outputs = {
            name: tft.scale_to_0_1(inputs[name])
            for name in NUMERIC_FEATURE_KEYS
        }
        outputs.update(
            (name, tft.string_to_int(inputs[name]))
            for name in CATEGORICAL_FEATURE_KEYS)
        outputs[LABEL_KEY] = inputs[LABEL_KEY]
        return outputs
예제 #23
0
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Dense float features are z-scored, vocab features are integerized,
        bucket features are bucketized, categorical features pass through, and
        the label becomes an int64 "big tipper" flag.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        def is_big_tipper(tip):
            # 1 where the tip was > 20% of the fare, else 0.
            # Rows with a NaN fare always get 0.
            fare = inputs[taxi.FARE_KEY]
            fare_is_nan = tf.is_nan(fare)
            zeros = tf.cast(tf.zeros_like(fare), tf.int64)
            over_threshold = tf.cast(
                tf.greater(tip, tf.multiply(fare, tf.constant(0.2))),
                tf.int64)
            return tf.where(fare_is_nan, zeros, over_threshold)

        outputs = {}

        for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
            outputs[name] = transform.scale_to_z_score(inputs[name])

        # Build a vocabulary for each of these features.
        for name in taxi.VOCAB_FEATURE_KEYS:
            outputs[name] = transform.string_to_int(
                inputs[name],
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)

        for name in taxi.BUCKET_FEATURE_KEYS:
            outputs[name] = transform.bucketize(
                inputs[name], taxi.FEATURE_BUCKET_COUNT)

        outputs.update(
            (name, inputs[name]) for name in taxi.CATEGORICAL_FEATURE_KEYS)

        outputs[taxi.LABEL_KEY] = transform.apply_function(
            is_big_tipper, inputs[taxi.LABEL_KEY])

        return outputs
예제 #24
0
      def preprocessing_fn(inputs):
        """Strip punctuation from the review, tokenize it and integerize it.

        Returns a dict with the integerized review under REVIEW_COLUMN and the
        untouched label under LABEL_COLUMN.
        """
        def strip_char(text, char):
          """Remove every occurrence of `char` from the strings in `text`.

          Hacky implementation: split on `char`, densify the pieces with ''
          as the fill value, then join them back along axis 1.

          Args:
            text: string tensor accepted by `tf.string_split`.
            char: A string of length 1.

          Returns:
            `text` with the given character removed (i.e. replaced by '').
          """
          pieces = tf.string_split(text, char)
          dense_pieces = tf.sparse_to_dense(
              pieces.indices, pieces.dense_shape, pieces.values, '')
          return tf.reduce_join(dense_pieces, 1)

        def strip_punctuation(text):
          """Remove each character in PUNCTUATION_CHARACTERS from `text`."""
          stripped = text
          for char in PUNCTUATION_CHARACTERS:
            stripped = strip_char(stripped, char)
          return stripped

        without_punctuation = tft.map(strip_punctuation, inputs[REVIEW_COLUMN])
        tokens = tft.map(tf.string_split, without_punctuation)
        token_ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)

        outputs = {}
        outputs[REVIEW_COLUMN] = token_ids
        outputs[LABEL_COLUMN] = inputs[LABEL_COLUMN]
        return outputs
예제 #25
0
  def preprocessing_fn(inputs):
    """Scale numeric columns, integerize categorical ones and index the label.

    Numeric columns are scaled to [0, 1]. Categorical columns (all except the
    label) use `tft.string_to_int`, which computes the set of unique values
    and uses it to convert the strings to indices. The label is mapped through
    an explicit ['>50K', '<=50K'] lookup table.
    """
    def label_to_index(label):
      index_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
      return index_table.lookup(label)

    transformed = {}
    for name in NUMERIC_FEATURE_KEYS:
      transformed[name] = tft.scale_to_0_1(inputs[name])
    for name in CATEGORICAL_FEATURE_KEYS:
      transformed[name] = tft.string_to_int(inputs[name])

    label_index = tft.apply_function(label_to_index, inputs[LABEL_KEY])
    transformed[LABEL_KEY] = label_index
    return transformed
def preprocess_tft(inputs):
    """Build the transformed taxi-fare features from the raw inputs.

    Args:
        inputs: mapping providing 'fare_amount', 'dayofweek', 'hourofday',
            'pickuplon', 'pickuplat', 'dropofflon', 'dropofflat' and
            'passengers'.
    Returns:
        dict with the pass-through, scaled and engineered features
        ('latdiff', 'londiff', 'euclidean'), plus a string 'key' column.
    """
    # Parenthesized form behaves the same on Python 2 and is valid on
    # Python 3. The original `print inputs` statement is a SyntaxError there.
    print(inputs)

    result = {}
    result['fare_amount'] = tf.identity(inputs['fare_amount'])
    result['dayofweek'] = tft.string_to_int(inputs['dayofweek'])  # builds a vocabulary
    result['hourofday'] = tf.identity(inputs['hourofday'])  # pass through
    # Scale the coordinates to [0, 1].
    result['pickuplon'] = tft.scale_to_0_1(inputs['pickuplon'])
    result['pickuplat'] = tft.scale_to_0_1(inputs['pickuplat'])
    result['dropofflon'] = tft.scale_to_0_1(inputs['dropofflon'])
    result['dropofflat'] = tft.scale_to_0_1(inputs['dropofflat'])
    result['passengers'] = tf.cast(inputs['passengers'], tf.float32)  # a cast
    # Arbitrary TF function: a string column of ones shaped like 'passengers'.
    result['key'] = tf.as_string(tf.ones_like(inputs['passengers']))

    # Engineered features.
    latdiff = inputs['pickuplat'] - inputs['dropofflat']
    londiff = inputs['pickuplon'] - inputs['dropofflon']
    result['latdiff'] = tft.scale_to_0_1(latdiff)
    result['londiff'] = tft.scale_to_0_1(londiff)
    # The distance is computed from the unscaled differences.
    result['euclidean'] = tf.sqrt(latdiff * latdiff + londiff * londiff)
    return result
예제 #27
0
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Numeric columns are scaled to [0, 1]. Columns listed in
    TO_BE_BUCKETIZED_FEATURE are bucketized into '<name>_bucketized'.
    Small-vocabulary categorical columns are integerized with a vocab file
    named after the column, and columns in HASH_STRING_FEATURE_KEYS are
    hashed. The label is mapped to its index in ['<=50K', '>50K'].
    """
    def label_to_index(label):
      """Look up the label string tensor in the ['<=50K', '>50K'] table.

      Args:
        label: label tensor passed to the table lookup.
      Returns:
        The result of looking `label` up in the index table.
      """
      index_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
      return index_table.lookup(label)

    transformed = {}

    for name in NUMERIC_FEATURE_KEYS:
      transformed[name] = tft.scale_to_0_1(inputs[name])

    for name in TO_BE_BUCKETIZED_FEATURE:
      # The value stored per feature is forwarded to tft.bucketize as-is.
      bucket_arg = TO_BE_BUCKETIZED_FEATURE[name]
      transformed[name + '_bucketized'] = tft.bucketize(inputs[name], bucket_arg)

    for name in STRING_TO_INT_FEATURE_KEYS:
      transformed[name] = tft.string_to_int(inputs[name], vocab_filename=name)

    for name in HASH_STRING_FEATURE_KEYS:
      hash_arg = HASH_STRING_FEATURE_KEYS[name]
      transformed[name] = tft.hash_strings(inputs[name], hash_arg)

    transformed[LABEL_KEY] = tft.apply_function(
        label_to_index, inputs[LABEL_KEY])
    return transformed
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns.

        Applies, in order: [0, 1] scaling of the numeric columns,
        bucketization into '<name>_bucketized', integerization of the
        small-vocabulary categorical columns, string hashing, and a table
        lookup for the label.
        """
        def label_to_index(label):
            """Map the label tensor to its index in ['<=50K', '>50K'].

            Args:
              label: label tensor passed to the table lookup.
            Returns:
              The result of looking `label` up in the index table.
            """
            return lookup.index_table_from_tensor(
                ['<=50K', '>50K']).lookup(label)

        outputs = {
            name: tft.scale_to_0_1(inputs[name])
            for name in NUMERIC_FEATURE_KEYS
        }

        for name in TO_BE_BUCKETIZED_FEATURE:
            bucketized = tft.bucketize(
                inputs[name], TO_BE_BUCKETIZED_FEATURE[name])
            outputs[name + '_bucketized'] = bucketized

        # Categorical columns with a small vocabulary.
        for name in STRING_TO_INT_FEATURE_KEYS:
            outputs[name] = tft.string_to_int(
                inputs[name], vocab_filename=name)

        for name in HASH_STRING_FEATURE_KEYS:
            outputs[name] = tft.hash_strings(
                inputs[name], HASH_STRING_FEATURE_KEYS[name])

        outputs[LABEL_KEY] = tft.apply_function(label_to_index,
                                                inputs[LABEL_KEY])
        return outputs
예제 #29
0
def preprocess(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Dense float features are z-scored, vocab features are integerized (after
    a string conversion when they are not already strings), bucket features
    are bucketized and categorical features are cast to int64. The label is a
    boolean that is true when the fare is not NaN and the tip exceeds 20% of
    the fare.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
    Returns:
      Map from string feature key to transformed feature operations.
    """
    transformed = {}

    for name in DENSE_FLOAT_FEATURE_KEYS:
        transformed[name] = transform.scale_to_z_score(inputs[name])

    # Build a vocabulary for each of these features.
    for name in VOCAB_FEATURE_KEYS:
        raw = inputs[name]
        as_text = raw if raw.dtype == tf.string else tf.as_string(raw)
        transformed[name] = transform.string_to_int(
            as_text,
            vocab_filename='vocab_' + name,
            top_k=VOCAB_SIZE,
            num_oov_buckets=OOV_SIZE)

    for name in BUCKET_FEATURE_KEYS:
        transformed[name] = transform.bucketize(
            inputs[name], FEATURE_BUCKET_COUNT)

    for name in CATEGORICAL_FEATURE_KEYS:
        transformed[name] = tf.to_int64(inputs[name])

    fare = inputs[FARE_KEY]
    tip = inputs[LABEL_KEY]
    # Test if the tip was > 20% of the fare.
    threshold = tf.multiply(fare, tf.constant(0.2))
    fare_is_present = tf.logical_not(tf.is_nan(fare))
    tip_over_threshold = tf.greater(tip, threshold)
    transformed[LABEL_KEY] = tf.logical_and(fare_is_present,
                                            tip_over_threshold)

    return transformed
예제 #30
0
 def preprocessing_fn(inputs):
     """Split column 'a' with tf.string_split and integerize it as 'index'."""
     tokens = tft.map(tf.string_split, inputs['a'])
     token_ids = tft.string_to_int(tokens)
     return {'index': token_ids}
예제 #31
0
 def preprocessing_fn(inputs):
     """Return 'ab' (a multiplied by b via tft.map) and 'i' (integerized c)."""
     outputs = {}
     outputs['ab'] = tft.map(tf.multiply, inputs['a'], inputs['b'])
     outputs['i'] = tft.string_to_int(inputs['c'])
     return outputs
예제 #32
0
 def preprocessing_fn(inputs):
     """Integerize column 'a' and return it under the key 'index'."""
     index_column = tft.string_to_int(inputs['a'])
     return {'index': index_column}