Example #1
def preprocessing_fn(input_features):

    # get the text of clean_title
    text = input_features['clean_title']

    # extract embeddings using tf.hub
    embeddings = tft.apply_function(get_embeddings, text)

    # tokenize text
    text_tokens = tf.string_split(text, parameters.DELIMITERS)

    # bag of words (bow) indices
    text_tokens_indices = tft.string_to_int(text_tokens, top_k=parameters.VOCAB_SIZE)

    # tf.idf weights; VOCAB_SIZE + 1 accounts for the OOV bucket created by tft.string_to_int
    bag_of_words_indices, tf_idf = tft.tfidf(text_tokens_indices, parameters.VOCAB_SIZE + 1)

    output_features = dict()
    output_features['topic'] = input_features['topic']
    output_features['title'] = input_features['raw_title']
    output_features['bow'] = bag_of_words_indices
    output_features['tf_idf'] = tf_idf
    output_features['embeddings'] = embeddings

    return output_features
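
Example #1 calls a get_embeddings helper that the snippet does not define. A minimal sketch of what it might look like, assuming a TF1-style TF-Hub text embedding module (the helper body and the module URL are assumptions, not part of the original snippet):

    import tensorflow_hub as hub

    def get_embeddings(text):
        # Hypothetical helper: map a batch of title strings to fixed-size
        # embedding vectors with a TF-Hub module (TF1 hub.Module API, to
        # match the tft.apply_function call above).
        module = hub.Module('https://tfhub.dev/google/nnlm-en-dim128/1')
        return module(text)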
Example #2
def preprocessing_fn(inputs):
    text_fields = []  # names of the string features to transform
    # Keep the original data and add more to it.
    result = inputs.copy()
    # Figure out the vocabulary for our text fields.
    for field_name in text_fields:
        field = inputs[field_name]
        tokens = tf.strings.split(field, " ")
        # Unigrams and bigrams, joined with a space.
        bag_of_words = tft.bag_of_words(tokens, (1, 2), separator=" ")
        indices = tft.compute_and_apply_vocabulary(bag_of_words, top_k=VOCAB_SIZE)
        # Add one for the OOV bucket created by compute_and_apply_vocabulary.
        bow_indices, weights = tft.tfidf(indices, VOCAB_SIZE + 1)
        result[f"{field_name}_bow_indices"] = bow_indices
        result[f"{field_name}_weight"] = weights
    return result
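
None of these snippets show how a preprocessing_fn is actually executed. As a rough sketch following the tf.Transform getting-started pattern (raw_data and raw_data_metadata are placeholders for the dataset and its tft DatasetMetadata):

    import tempfile

    import tensorflow_transform.beam as tft_beam

    # Analyze the data once to compute vocabularies and other statistics,
    # then apply the transform to produce the training features.
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        transformed_dataset, transform_fn = (
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
    transformed_data, transformed_metadata = transformed_dataset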
Example #3
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    review = inputs[REVIEW_COLUMN]

    review_tokens = tf.string_split(review, DELIMITERS)
    review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
    # Add one for the oov bucket created by string_to_int.
    review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                  VOCAB_SIZE + 1)
    return {
        REVIEW_COLUMN: review_bow_indices,
        REVIEW_WEIGHT: review_weight,
        LABEL_COLUMN: inputs[LABEL_COLUMN]
    }
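
Example #3 depends on module-level constants that the listing does not include. Plausible definitions, modeled on the tf.Transform sentiment example this snippet appears to come from (the exact values are assumptions):

    REVIEW_COLUMN = 'review'
    REVIEW_WEIGHT = 'review_weight'
    LABEL_COLUMN = 'label'
    DELIMITERS = '.,!?() '   # split on punctuation and whitespace
    VOCAB_SIZE = 20000       # keep only the 20k most frequent tokens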
Example #4
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    review = inputs[REVIEW_KEY]

    review_tokens = tf.string_split(review, DELIMITERS)
    review_indices = tft.compute_and_apply_vocabulary(
        review_tokens, top_k=VOCAB_SIZE)
    # Add one for the oov bucket created by compute_and_apply_vocabulary.
    review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                  VOCAB_SIZE + 1)
    return {
        REVIEW_KEY: review_bow_indices,
        REVIEW_WEIGHT_KEY: review_weight,
        LABEL_KEY: inputs[LABEL_KEY]
    }
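
Example #4 is the same transform as Example #3, ported to a newer tf.Transform release: tft.string_to_int was renamed tft.compute_and_apply_vocabulary (the old name survived for a while as a deprecated alias), and only the feature-key constant names differ.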
Example #5
def preprocessing_fn(inputs):
    """TFT preprocessing function.

    Args:
      inputs: dictionary of input `tensorflow_transform.Column`.

    Returns:
      A dictionary of `tensorflow_transform.Column` representing the
      transformed columns.
    """
    features_dict = {}
    for col_schema in schema:
        col_name = col_schema['name']
        if col_schema['type'] == 'NUMBER':
            features_dict[col_name] = inputs[col_name]
        elif col_schema['type'] == 'CATEGORY':
            features_dict[col_name] = tft.string_to_int(
                inputs[col_name], vocab_filename='vocab_' + col_name)
        elif col_schema['type'] == 'TEXT':
            tokens = tf.string_split(inputs[col_name], DELIMITERS)
            # TODO: default_value=0 is wrong: OOV tokens get index 0. It is a
            # workaround so the trainer can use the true vocab size; otherwise
            # the trainer has to use the VOCAB_SIZE defined in this file, which
            # is too large. I am talking to the TFT folks about this. If there
            # is no workaround, the user has to provide a vocab_size.
            indices = tft.string_to_int(tokens,
                                        vocab_filename='vocab_' + col_name,
                                        default_value=0)
            # Add one for the oov bucket created by string_to_int.
            bow_indices, bow_weights = tft.tfidf(indices, VOCAB_SIZE + 1)
            features_dict[col_name + '_indices'] = bow_indices
            features_dict[col_name + '_weights'] = bow_weights
        elif col_schema['type'] == 'IMAGE_URL':
            features_dict[col_name] = tft.apply_function_with_checkpoint(
                _image_to_vec, [inputs[col_name]],
                INCEPTION_V3_CHECKPOINT,
                exclude=INCEPTION_EXCLUDED_VARIABLES)
        elif col_schema['type'] == 'KEY':
            features_dict[col_name] = inputs[col_name]
        else:
            raise ValueError('Invalid schema. Unknown type ' +
                             col_schema['type'])
    return features_dict
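
Example #5 iterates over a schema list defined elsewhere in its pipeline. A hypothetical schema with one column of each type the loop handles (all names invented for illustration):

    schema = [
        {'name': 'row_key', 'type': 'KEY'},          # passed through unchanged
        {'name': 'price', 'type': 'NUMBER'},         # passed through unchanged
        {'name': 'country', 'type': 'CATEGORY'},     # integerized with a per-column vocab
        {'name': 'description', 'type': 'TEXT'},     # tokenized, vocab + tf-idf
        {'name': 'image_url', 'type': 'IMAGE_URL'},  # embedded via Inception V3
    ]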