def preprocess_fn(input_features):
  """Pass the raw text features through and attach an embedding for each.

  Args:
    input_features: mapping from feature name to tensor. The keys 'topics',
      'title' and 'content' are read.

  Returns:
    A dict holding 'topics', 'title' and 'content' unchanged, plus
    'title_embed' (produced by `get_embed_title` from the title column) and
    'content_embed' (produced by `get_embed_content` from the content column).
  """
  import tensorflow_transform as tft

  # Keep helper, source column and output key aligned: the title embedding
  # must come from the title column and the content embedding from the
  # content column. Crossing them silently swaps the two output features.
  title_embed = tft.apply_function(get_embed_title, input_features['title'])
  content_embed = tft.apply_function(
      get_embed_content, input_features['content'])

  output_features = {
      'topics': input_features['topics'],
      'title': input_features['title'],
      'content': input_features['content'],
      'title_embed': title_embed,
      'content_embed': content_embed,
  }
  return output_features
示例#2
0
    def preprocessing_fn(inputs):
        """Scale numeric columns, emit vocabularies and index the label.

        Args:
            inputs: dict mapping feature keys to raw feature tensors.

        Returns:
            A copy of `inputs` in which every NUMERIC_FEATURE_KEYS column has
            been passed through `tft.scale_to_0_1` and the LABEL_KEY column
            has been replaced by its index in ['>50K', '<=50K']. All other
            columns are returned as they came in.
        """
        def label_to_index(raw_label):
            """Look each label string up in the fixed two-entry table."""
            label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
            return label_table.lookup(raw_label)

        # Only some columns are rewritten, so begin from a copy of `inputs`
        # and overwrite the entries that change.
        transformed = inputs.copy()

        # Numeric columns are rescaled to the range [0, 1].
        for numeric_key in NUMERIC_FEATURE_KEYS:
            transformed[numeric_key] = tft.scale_to_0_1(
                transformed[numeric_key])

        # Categorical columns are left as they are; only a vocabulary file
        # named after the column is generated, so the return value of
        # `tft.uniques` is intentionally dropped. The trainer is expected to
        # use that vocabulary, through a feature column, to turn the strings
        # into integer ids.
        for categorical_key in CATEGORICAL_FEATURE_KEYS:
            tft.uniques(inputs[categorical_key],
                        vocab_filename=categorical_key)

        # The label is the one string column converted to an index here.
        transformed[LABEL_KEY] = tft.apply_function(
            label_to_index, transformed[LABEL_KEY])

        return transformed
def preprocessing_fn(input_features):
    """Derive embeddings, bag-of-words indices and tf-idf from the clean title.

    Args:
        input_features: mapping from feature name to tensor. The keys
            'clean_title', 'raw_title' and 'topic' are read.

    Returns:
        A dict with the keys 'topic', 'title' (taken from 'raw_title'),
        'bow', 'tf_idf' and 'embeddings'.
    """
    clean_title = input_features['clean_title']

    # Embeddings come from the project helper `get_embeddings`.
    title_embeddings = tft.apply_function(get_embeddings, clean_title)

    # Split the title into tokens, then map the tokens to integer ids using
    # a vocabulary limited to the top VOCAB_SIZE entries.
    tokens = tf.string_split(clean_title, parameters.DELIMITERS)
    token_ids = tft.string_to_int(tokens, top_k=parameters.VOCAB_SIZE)

    # The vocabulary size handed to tf-idf is one larger than VOCAB_SIZE.
    # NOTE(review): presumably the extra slot covers out-of-vocabulary
    # tokens -- confirm.
    bow_indices, tfidf_weights = tft.tfidf(
        token_ids, parameters.VOCAB_SIZE + 1)

    return {
        'topic': input_features['topic'],
        'title': input_features['raw_title'],
        'bow': bow_indices,
        'tf_idf': tfidf_weights,
        'embeddings': title_embeddings,
    }
示例#4
0
    def pre_processing_fun(inputs):
        """Build the transformed feature dict from the raw input columns.

        Args:
            inputs: dict mapping feature keys to raw feature tensors.

        Returns:
            A new dict holding the NUMERIC_FEATURE_KEYS columns after
            `tft.scale_to_0_1`, the CATEGORICAL_FEATURE_KEYS columns after
            `tft.string_to_int`, and the LABEL_KEY column replaced by its
            index in ['>50K', '<=50K'].
        """
        def label_to_index(raw_label):
            """Look each label string up in the fixed two-entry table."""
            label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
            return label_table.lookup(raw_label)

        transformed = {
            name: tft.scale_to_0_1(inputs[name])
            for name in NUMERIC_FEATURE_KEYS
        }
        transformed.update(
            (name, tft.string_to_int(inputs[name]))
            for name in CATEGORICAL_FEATURE_KEYS)
        transformed[LABEL_KEY] = tft.apply_function(
            label_to_index, inputs[LABEL_KEY])

        return transformed
示例#5
0
    def preprocessing_fn(inputs):
        """tf.transform callback that preprocesses the raw taxi features.

        Args:
            inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
            Map from string feature key to transformed feature operations.
        """
        def is_big_tipper(tips):
            """Return 1 where the tip exceeds 20% of the fare, otherwise 0.

            Rows whose fare is NaN always get 0. The fare is read from the
            enclosing `inputs`, not passed in.
            """
            fare = inputs[taxi.FARE_KEY]
            fare_is_nan = tf.is_nan(fare)
            zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
            # Threshold: 20% of the fare.
            tip_threshold = tf.multiply(fare, tf.constant(0.2))
            big_tip_label = tf.cast(
                tf.greater(tips, tip_threshold), tf.int64)
            return tf.where(fare_is_nan, zero_label, big_tip_label)

        transformed = {}

        # Dense float features are standardized to z-scores.
        # NOTE(review): no explicit NaN handling happens here -- confirm
        # whether NaNs are expected in these columns.
        for float_key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            transformed[float_key] = transform.scale_to_z_score(
                inputs[float_key])

        # String features get a vocabulary and are replaced by integer ids.
        for vocab_key in taxi.VOCAB_FEATURE_KEYS:
            transformed[vocab_key] = transform.string_to_int(
                inputs[vocab_key],
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)

        for bucket_key in taxi.BUCKET_FEATURE_KEYS:
            transformed[bucket_key] = transform.bucketize(
                inputs[bucket_key], taxi.FEATURE_BUCKET_COUNT)

        # Categorical features are forwarded untouched.
        for categorical_key in taxi.CATEGORICAL_FEATURE_KEYS:
            transformed[categorical_key] = inputs[categorical_key]

        transformed[taxi.LABEL_KEY] = transform.apply_function(
            is_big_tipper, inputs[taxi.LABEL_KEY])

        return transformed
示例#6
0
  def preprocessing_fn(inputs):
    """Turn the raw input columns into transformed columns.

    Args:
      inputs: dict mapping feature keys to raw feature tensors.

    Returns:
      A new dict with the numeric columns scaled to [0, 1], the categorical
      columns converted from strings to indices, and the label column
      replaced by its index in ['>50K', '<=50K'].
    """
    def label_to_index(raw_label):
      """Look each label string up in the fixed two-entry table."""
      label_table = lookup.index_table_from_tensor(['>50K', '<=50K'])
      return label_table.lookup(raw_label)

    # Numeric columns: rescale to the range [0, 1].
    transformed = {
        name: tft.scale_to_0_1(inputs[name]) for name in NUMERIC_FEATURE_KEYS
    }

    # Categorical columns (label excluded): tft.string_to_int computes the
    # set of unique values and uses it to convert the strings to indices.
    transformed.update(
        (name, tft.string_to_int(inputs[name]))
        for name in CATEGORICAL_FEATURE_KEYS)

    # Label column: the string-to-index mapping is supplied explicitly.
    transformed[LABEL_KEY] = tft.apply_function(
        label_to_index, inputs[LABEL_KEY])

    return transformed
示例#7
0
  def preprocessing_fn(inputs):
    """Turn the raw input columns into transformed columns.

    Args:
      inputs: dict mapping feature keys to raw feature tensors.

    Returns:
      A new dict containing the scaled numeric columns, a '<key>_bucketized'
      column for every key of TO_BE_BUCKETIZED_FEATURE, the integer-encoded
      and hashed string columns, and the label as an index.
    """
    def label_to_index(raw_label):
      """Map a string label tensor to its index in ['<=50K', '>50K'].

      Args:
        raw_label: Tensor of dtype string holding the label values.

      Returns:
        The result of looking `raw_label` up in the index table; expected to
        be an int64 Tensor of the same shape for classification tasks.
      """
      label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
      return label_table.lookup(raw_label)

    transformed = {}

    # Numeric columns: rescale to the range [0, 1].
    for numeric_key in NUMERIC_FEATURE_KEYS:
      transformed[numeric_key] = tft.scale_to_0_1(inputs[numeric_key])

    # Bucketized copies of numeric columns, stored under a suffixed key.
    # NOTE(review): the dict value is presumably the bucket count -- confirm.
    for source_key in TO_BE_BUCKETIZED_FEATURE:
      bucket_spec = TO_BE_BUCKETIZED_FEATURE[source_key]
      transformed[source_key + '_bucketized'] = tft.bucketize(
          inputs[source_key], bucket_spec)

    # Categorical columns with a small vocabulary; the vocabulary file is
    # named after the column.
    for vocab_key in STRING_TO_INT_FEATURE_KEYS:
      transformed[vocab_key] = tft.string_to_int(
          inputs[vocab_key], vocab_filename=vocab_key)

    # Remaining string columns are hashed.
    # NOTE(review): the dict value is presumably the hash bucket count --
    # confirm.
    for hashed_key in HASH_STRING_FEATURE_KEYS:
      hash_spec = HASH_STRING_FEATURE_KEYS[hashed_key]
      transformed[hashed_key] = tft.hash_strings(inputs[hashed_key], hash_spec)

    # Label column: replaced by its index in the fixed label list.
    transformed[LABEL_KEY] = tft.apply_function(
        label_to_index, inputs[LABEL_KEY])
    return transformed
    def preprocessing_fn(inputs):
        """Turn the raw input columns into transformed columns.

        Args:
            inputs: dict mapping feature keys to raw feature tensors.

        Returns:
            A new dict containing the scaled numeric columns, a
            '<key>_bucketized' column for every key of
            TO_BE_BUCKETIZED_FEATURE, the integer-encoded and hashed string
            columns, and the label as an index.
        """
        def label_to_index(raw_label):
            """Map a string label tensor to its index in ['<=50K', '>50K'].

            Args:
                raw_label: Tensor of dtype string holding the label values.

            Returns:
                The result of looking `raw_label` up in the index table;
                expected to be an int64 Tensor of the same shape for
                classification tasks.
            """
            label_table = lookup.index_table_from_tensor(['<=50K', '>50K'])
            return label_table.lookup(raw_label)

        transformed = {}

        # Numeric columns: rescale to the range [0, 1].
        for numeric_key in NUMERIC_FEATURE_KEYS:
            transformed[numeric_key] = tft.scale_to_0_1(inputs[numeric_key])

        # Bucketized copies of numeric columns, stored under a suffixed key.
        # NOTE(review): the dict value is presumably the bucket count --
        # confirm.
        for source_key in TO_BE_BUCKETIZED_FEATURE:
            bucket_spec = TO_BE_BUCKETIZED_FEATURE[source_key]
            transformed[source_key + '_bucketized'] = tft.bucketize(
                inputs[source_key], bucket_spec)

        # Categorical columns with a small vocabulary; the vocabulary file
        # is named after the column.
        for vocab_key in STRING_TO_INT_FEATURE_KEYS:
            transformed[vocab_key] = tft.string_to_int(
                inputs[vocab_key], vocab_filename=vocab_key)

        # Remaining string columns are hashed.
        # NOTE(review): the dict value is presumably the hash bucket count
        # -- confirm.
        for hashed_key in HASH_STRING_FEATURE_KEYS:
            hash_spec = HASH_STRING_FEATURE_KEYS[hashed_key]
            transformed[hashed_key] = tft.hash_strings(
                inputs[hashed_key], hash_spec)

        # Label column: replaced by its index in the fixed label list.
        transformed[LABEL_KEY] = tft.apply_function(
            label_to_index, inputs[LABEL_KEY])
        return transformed
示例#9
0
  def preprocessing_fn(inputs):
    """User defined preprocessing function for movielens columns.

    Args:
      inputs: a `dict` that maps EXAMPLE_COLUMNS to the corresponding
        Tensor/SparseTensor.

    Returns:
      A `dict` that maps EXAMPLE_COLUMNS to the transformed Tensor/SparseTensor.
      Rating scores are min-max normalized and the genre/movie id columns are
      replaced by vocabulary indexes.
    """
    def rescale_sparse(sparse, low, high):
      """Apply 0-1 normalization to the values of a SparseTensor.

      Args:
        sparse: the sparse tensor to normalize.
        low: minimum of sparse.values.
        high: maximum of sparse.values.

      Returns:
        A sparse tensor with the indices and dense shape of `sparse` whose
        values are (values - low) / (high - low).
      """
      # NOTE(review): there is no guard for high == low.
      shifted = sparse.values - low
      span = high - low
      return tf.SparseTensor(indices=sparse.indices,
                             values=shifted / span,
                             dense_shape=sparse.dense_shape)

    def lookup_ids(x, vocabulary_or_file):
      """Map a string tensor to indexes using a vocabulary.

      Args:
        x: a Tensor/SparseTensor of string.
        vocabulary_or_file: a Tensor/SparseTensor containing unique string
          values within x or a single value for the file where the vocabulary
          is stored.

      Returns:
        A Tensor/SparseTensor of indexes (int) of the same shape as x.
      """
      # TODO(b/62489180): Remove this workaround once TFT 0.2.0 is released.
      file_backed = (
          hasattr(impl, '_asset_files_supported') and
          impl._asset_files_supported())  # pylint: disable=protected-access
      if not file_backed:
        id_table = tf.contrib.lookup.string_to_index_table_from_tensor(
            mapping=vocabulary_or_file, num_oov_buckets=1)
      else:
        id_table = tf.contrib.lookup.string_to_index_table_from_file(
            vocabulary_file=vocabulary_or_file, num_oov_buckets=1)
      return id_table.lookup(x)

    # Begin with every example column passed through unchanged.
    result = {}
    for column_name in EXAMPLE_COLUMNS:
      result[column_name] = inputs[column_name]

    # Normalize the rating scores using the max/min computed by tft.
    scores = inputs[QUERY_RATED_MOVIE_SCORES]
    rating_max = tft.max(scores.values)
    rating_min = tft.min(scores.values)
    result[QUERY_RATED_MOVIE_SCORES] = rescale_sparse(
        inputs[QUERY_RATED_MOVIE_SCORES], rating_min, rating_max)

    # One vocabulary spans all genre id columns, another all movie id columns.
    genre_values = [
        inputs[QUERY_RATED_GENRE_IDS].values,
        inputs[CANDIDATE_GENRE_IDS].values,
    ]
    genre_vocab = tft.uniques(tf.concat(genre_values, 0))

    movie_values = [
        inputs[QUERY_RATED_MOVIE_IDS].values,
        inputs[CANDIDATE_MOVIE_ID].values,
        inputs[RANKING_CANDIDATE_MOVIE_IDS].values,
    ]
    movie_vocab = tft.uniques(tf.concat(movie_values, 0))

    # Replace each id column by its indexes in the matching vocabulary.
    columns_and_vocabs = (
        (QUERY_RATED_GENRE_IDS, genre_vocab),
        (CANDIDATE_GENRE_IDS, genre_vocab),
        (QUERY_RATED_MOVIE_IDS, movie_vocab),
        (CANDIDATE_MOVIE_ID, movie_vocab),
        (RANKING_CANDIDATE_MOVIE_IDS, movie_vocab),
    )
    for column_name, vocab in columns_and_vocabs:
      result[column_name] = tft.apply_function(
          lookup_ids, inputs[column_name], vocab)

    return result
示例#10
0
def preprocess_fn(input_features):
    """Attach a text embedding to each example and keep its id.

    Args:
        input_features: mapping from feature name to tensor. The keys 'id'
            and 'text' are read.

    Returns:
        A dict with 'id' passed through unchanged and 'embedding' holding
        the result of applying `embed_text` to the 'text' feature.
    """
    import tensorflow_transform as tft

    text_embedding = tft.apply_function(embed_text, input_features['text'])
    return {
        'id': input_features['id'],
        'embedding': text_embedding,
    }
示例#11
0
    def preprocessing_fn(inputs):
        """User defined preprocessing function for movielens columns.

        Args:
            inputs: a `dict` that maps EXAMPLE_COLUMNS to the corresponding
                Tensor/SparseTensor.

        Returns:
            A `dict` that maps EXAMPLE_COLUMNS to the transformed
            Tensor/SparseTensor. Rating scores are min-max normalized and
            the genre/movie id columns are replaced by vocabulary indexes.
        """
        def rescale_sparse(sparse, low, high):
            """Apply 0-1 normalization to the values of a SparseTensor.

            Args:
                sparse: the sparse tensor to normalize.
                low: minimum of sparse.values.
                high: maximum of sparse.values.

            Returns:
                A sparse tensor with the indices and dense shape of `sparse`
                whose values are (values - low) / (high - low).
            """
            # NOTE(review): there is no guard for high == low.
            shifted = sparse.values - low
            span = high - low
            return tf.SparseTensor(indices=sparse.indices,
                                   values=shifted / span,
                                   dense_shape=sparse.dense_shape)

        def lookup_ids(x, vocabulary_or_file):
            """Map a string tensor to indexes using a vocabulary.

            Args:
                x: a Tensor/SparseTensor of string.
                vocabulary_or_file: a Tensor/SparseTensor containing unique
                    string values within x or a single value for the file
                    where the vocabulary is stored.

            Returns:
                A Tensor/SparseTensor of indexes (int) of the same shape
                as x.
            """
            # TODO(b/62489180): Remove this workaround once TFT 0.2.0 is released.
            file_backed = (
                hasattr(impl, '_asset_files_supported') and
                impl._asset_files_supported())  # pylint: disable=protected-access
            if not file_backed:
                id_table = tf.contrib.lookup.string_to_index_table_from_tensor(
                    mapping=vocabulary_or_file, num_oov_buckets=1)
            else:
                id_table = tf.contrib.lookup.string_to_index_table_from_file(
                    vocabulary_file=vocabulary_or_file, num_oov_buckets=1)
            return id_table.lookup(x)

        # Begin with every example column passed through unchanged.
        result = {}
        for column_name in EXAMPLE_COLUMNS:
            result[column_name] = inputs[column_name]

        # Normalize the rating scores using the max/min computed by tft.
        scores = inputs[QUERY_RATED_MOVIE_SCORES]
        rating_max = tft.max(scores.values)
        rating_min = tft.min(scores.values)
        result[QUERY_RATED_MOVIE_SCORES] = rescale_sparse(
            inputs[QUERY_RATED_MOVIE_SCORES], rating_min, rating_max)

        # One vocabulary spans all genre id columns, another all movie id
        # columns.
        genre_values = [
            inputs[QUERY_RATED_GENRE_IDS].values,
            inputs[CANDIDATE_GENRE_IDS].values,
        ]
        genre_vocab = tft.uniques(tf.concat(genre_values, 0))

        movie_values = [
            inputs[QUERY_RATED_MOVIE_IDS].values,
            inputs[CANDIDATE_MOVIE_ID].values,
            inputs[RANKING_CANDIDATE_MOVIE_IDS].values,
        ]
        movie_vocab = tft.uniques(tf.concat(movie_values, 0))

        # Replace each id column by its indexes in the matching vocabulary.
        columns_and_vocabs = (
            (QUERY_RATED_GENRE_IDS, genre_vocab),
            (CANDIDATE_GENRE_IDS, genre_vocab),
            (QUERY_RATED_MOVIE_IDS, movie_vocab),
            (CANDIDATE_MOVIE_ID, movie_vocab),
            (RANKING_CANDIDATE_MOVIE_IDS, movie_vocab),
        )
        for column_name, vocab in columns_and_vocabs:
            result[column_name] = tft.apply_function(
                lookup_ids, inputs[column_name], vocab)

        return result