示例#1
0
def preprocessing_fn(inputs):
    """Build the tf.Transform preprocessing graph for the raw features.

    Args:
      inputs: Mapping from feature key to the raw, not-yet-transformed feature.

    Returns:
      Mapping from transformed feature key to the transformed feature.
    """
    transformed = {}

    # Dense float features: fill missing values, then standardize.
    for name in _DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = tft.scale_to_z_score(filled)

    # Vocabulary features: fill missing values, then map to vocabulary ids.
    for name in _VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                filled, top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE))

    # Bucket features: fill missing values, then bucketize.
    for name in _BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = tft.bucketize(
            filled, _FEATURE_BUCKET_COUNT)

    # Categorical features only have their missing values filled in.
    for name in _CATEGORICAL_FEATURE_KEYS:
        transformed[_transformed_name(name)] = _fill_in_missing(inputs[name])

    # TODO(b/157064428): Support label transformation for Keras.
    # The label is forwarded untouched; transforming it would result in
    # wrong evaluation.
    transformed[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

    return transformed
示例#2
0
def preprocessing_fn(inputs):
  """Callback function for transforming inputs.

  The target and ID features are read by key rather than popped, so the
  caller's `inputs` dictionary is left unmodified.

  Args:
    inputs: A dict of feature keys mapped to `Tensor` or `SparseTensor` of raw
      features.

  Returns:
    Map from string feature keys to `Tensor` of transformed features.

  Raises:
    KeyError: If the target, the ID, or any configured feature key is missing
      from `inputs`.
  """
  outputs = {
      features_config.TARGET_FEATURE:
          utils.preprocess_sparsetensor(
              inputs[features_config.TARGET_FEATURE]),
      # The ID feature is forwarded as-is.
      features_config.ID_FEATURE: inputs[features_config.ID_FEATURE],
  }

  # Numeric features are preprocessed and then standardized.
  for key in features_config.NUMERIC_FEATURES:
    outputs[utils.make_transformed_key(key)] = transform.scale_to_z_score(
        utils.preprocess_sparsetensor(inputs[key]))

  # Categorical features are preprocessed and then mapped to vocabulary ids.
  for key in features_config.CATEGORICAL_FEATURES:
    outputs[utils.make_transformed_key(key)] = (
        transform.compute_and_apply_vocabulary(
            utils.preprocess_sparsetensor(inputs[key]),
            top_k=features_config.VOCAB_SIZE,
            num_oov_buckets=features_config.OOV_SIZE))

  return outputs
示例#3
0
def preprocessing_fn(inputs):
    """Transform raw features for tf.Transform.

    Args:
      inputs: Mapping from feature key to raw, not-yet-transformed feature.

    Returns:
      Mapping from transformed feature key to transformed feature.
    """

    def _filled(feature_key):
        # Every feature passes through _fill_in_missing before anything else.
        return _fill_in_missing(inputs[feature_key])

    result = {}

    # Standardize the dense float features.
    for feature_key in _DENSE_FLOAT_FEATURE_KEYS:
        result[_transformed_name(feature_key)] = tft.scale_to_z_score(
            _filled(feature_key))

    # Map vocabulary features to integer ids.
    for feature_key in _VOCAB_FEATURE_KEYS:
        result[_transformed_name(feature_key)] = (
            tft.compute_and_apply_vocabulary(
                _filled(feature_key),
                top_k=_VOCAB_SIZE,
                num_oov_buckets=_OOV_SIZE))

    # Categorical features and the label only get missing values filled in.
    for feature_key in _CATEGORICAL_FEATURE_KEYS:
        result[_transformed_name(feature_key)] = _filled(feature_key)

    result[_transformed_name(_LABEL_KEY)] = _filled(_LABEL_KEY)

    return result
示例#4
0
def preprocessing_fn(inputs):
  """Transform raw features and derive the binary tip label.

  Args:
    inputs: Mapping from feature key to raw, not-yet-transformed feature.

  Returns:
    Mapping from transformed feature key to transformed feature.
  """
  transformed = {}

  # Dense float features are standardized.
  for name in features.DENSE_FLOAT_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[name])
    transformed[features.transformed_name(name)] = tft.scale_to_z_score(filled)

  # Vocabulary features are mapped to integer ids.
  for name in features.VOCAB_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[name])
    transformed[features.transformed_name(name)] = (
        tft.compute_and_apply_vocabulary(
            filled,
            top_k=features.VOCAB_SIZE,
            num_oov_buckets=features.OOV_SIZE))

  # Each bucket feature is paired with its own bucket count.
  bucket_specs = zip(features.BUCKET_FEATURE_KEYS,
                     features.BUCKET_FEATURE_BUCKET_COUNT)
  for name, bucket_count in bucket_specs:
    filled = _fill_in_missing(inputs[name])
    transformed[features.transformed_name(name)] = tft.bucketize(
        filled, bucket_count, always_return_num_quantiles=False)

  # Categorical features only have missing values filled in.
  for name in features.CATEGORICAL_FEATURE_KEYS:
    transformed[features.transformed_name(name)] = _fill_in_missing(
        inputs[name])

  # Label: 1 when the tip exceeds 20% of the fare, 0 otherwise (including
  # rows where the fare is NaN).
  fare_key = 'fare'
  taxi_fare = _fill_in_missing(inputs[fare_key])
  tips = _fill_in_missing(inputs[features.LABEL_KEY])
  label_name = features.transformed_name(features.LABEL_KEY)
  transformed[label_name] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return transformed
示例#5
0
def preprocess(input_features):
    """Build transformed features from the raw input features.

    The target is passed through unchanged. Each numeric feature is replaced
    by its z-score and additionally gets a quantile-bucketized
    ``<name>_bucketized`` companion. Categorical features are passed through
    while ``tft.uniques`` is invoked with the feature name as vocabulary
    filename. Vocabulary features are emitted as ``<name>_integerized``.

    Args:
      input_features: Mapping from feature name to raw feature.

    Returns:
      Mapping from output feature name to transformed feature.
    """
    transformed = {
        metadata.TARGET_FEATURE_NAME:
            input_features[metadata.TARGET_FEATURE_NAME],
    }

    for name in metadata.NUMERIC_FEATURE_NAMES:
        transformed[name] = tft.scale_to_z_score(input_features[name])

        boundaries = tft.quantiles(
            input_features[name], num_buckets=NUM_BUCKETS, epsilon=0.01)
        transformed[name + "_bucketized"] = tft.apply_buckets(
            input_features[name], bucket_boundaries=boundaries)

    for name in metadata.CATEGORICAL_FEATURE_NAMES:
        # The return value is deliberately unused; the call is made only for
        # the vocabulary it registers under `vocab_filename`.
        tft.uniques(input_features[name], vocab_filename=name)
        transformed[name] = input_features[name]

    for name in metadata.VOCAB_FEATURE_NAMES:
        transformed[name + "_integerized"] = tft.string_to_int(
            input_features[name],
            top_k=metadata.VOCAB_SIZE,
            num_oov_buckets=metadata.OOV_SIZE,
            vocab_filename=name)

    return transformed
示例#6
0
    def preprocessing_fn(
            inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """Transform raw features for tf.Transform.

        Parameters
        ----------
        inputs
            Map from feature keys to raw not-yet-transformed features.

        Returns
        -------
        Map from transformed feature key to transformed feature.
        """
        transformed = {}

        # Categorical features and the label: fill missing values, compute a
        # vocabulary stored under the raw key name, then apply it.
        for name in categorical_feature_keys + [label_key]:
            filled = _fill_in_missing(inputs[name])
            vocab = tft.vocabulary(filled, vocab_filename=name)
            transformed[_transformed_name(name)] = tft.apply_vocabulary(
                filled, vocab)

        # NOTE: This won't be correct in the incremental case since only the
        # new examples contribute to the mean and variance.
        for name in numerical_feature_keys:
            z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[name]))
            transformed[_transformed_name(name)] = tf.expand_dims(
                z_scored, axis=1)

        return transformed
示例#7
0
def preprocessing_fn(inputs):
    """Transform raw features for tf.Transform.

    Args:
      inputs: Mapping from feature key to raw, not-yet-transformed feature.

    Returns:
      Mapping from output feature key to transformed feature, each squeezed
      along its last axis.
    """
    transformed = {}

    for name in features.FEATURE_NAMES:
        if name in features.NUMERICAL_FEATURE_NAMES:
            # Numerical features are standardized.
            transformed[features.transformed_name(name)] = (
                tft.scale_to_z_score(inputs[name]))
            continue

        if name in features.categorical_feature_names():
            # Categorical features are mapped to vocabulary ids, with the
            # vocabulary stored under the raw feature name.
            transformed[features.transformed_name(name)] = (
                tft.compute_and_apply_vocabulary(
                    inputs[name],
                    num_oov_buckets=1,
                    vocab_filename=name,
                ))

    # The target keeps its raw name and value.
    transformed[features.TARGET_FEATURE_NAME] = inputs[
        features.TARGET_FEATURE_NAME]

    # Drop the trailing axis of every output.
    return {
        name: tf.squeeze(tensor, -1)
        for name, tensor in transformed.items()
    }
示例#8
0
def preprocessing_fn(inputs):
    """Transform raw taxi features and derive the binary tip label.

    Args:
      inputs: Mapping from feature key to raw, not-yet-transformed feature.

    Returns:
      Mapping from transformed feature key to transformed feature.
    """
    transformed = {}

    # Dense float features: fill missing values, then standardize.
    for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
        filled = taxi.fill_in_missing(inputs[name])
        transformed[taxi.transformed_name(name)] = tft.scale_to_z_score(filled)

    # Vocabulary features: fill missing values, then map to integer ids.
    for name in taxi.VOCAB_FEATURE_KEYS:
        filled = taxi.fill_in_missing(inputs[name])
        transformed[taxi.transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                filled, top_k=10, num_oov_buckets=10))

    # Bucket features: fill missing values, then bucketize.
    for name in taxi.BUCKET_FEATURE_KEYS:
        filled = taxi.fill_in_missing(inputs[name])
        transformed[taxi.transformed_name(name)] = tft.bucketize(
            filled, num_buckets=10)

    # Categorical features only get missing values filled in.
    for name in taxi.CATEGORICAL_FEATURE_KEYS:
        transformed[taxi.transformed_name(name)] = taxi.fill_in_missing(
            inputs[name])

    # Label: 1 when the tip exceeds 20% of the fare, 0 otherwise (including
    # rows where the fare is NaN).
    taxi_fare = taxi.fill_in_missing(inputs[taxi.FARE_KEY])
    tips = taxi.fill_in_missing(inputs[taxi.LABEL_KEY])
    label_name = taxi.transformed_name(taxi.LABEL_KEY)
    transformed[label_name] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return transformed
示例#9
0
def preprocessing_fn(inputs):
    """Project all non-label features onto two principal components.

    Args:
      inputs: Mapping from feature key to raw feature.

    Returns:
      Mapping holding the filled-in label under `LABEL_KEY` plus the two
      projected columns 'Principal Component 1' and 'Principal Component 2'.
    """
    result = {LABEL_KEY: _fill_in_missing(inputs[LABEL_KEY])}

    # Fill missing values and standardize every non-label feature; the
    # standard scaling is a prerequisite for PCA.
    scaled_columns = [
        tft.scale_to_z_score(_fill_in_missing(tensor))
        for name, tensor in inputs.items()
        if name != LABEL_KEY
    ]

    # Concatenate the columns into the feature matrix PCA runs over.
    feature_matrix = tf.concat(scaled_columns, axis=1)

    # Matrix of orthonormal vectors returned by tft.pca.
    projection = tft.pca(feature_matrix, output_dim=2, dtype=tf.float32)

    # Project the features, then split the result into its two columns.
    projected = tf.linalg.matmul(feature_matrix, projection)
    components = tf.unstack(projected, axis=1)
    result['Principal Component 1'] = components[0]
    result['Principal Component 2'] = components[1]

    return result
示例#10
0
def preprocessing_fn(input_features):
    """Transform the raw features listed in `raw_schema`.

    The target and weight features are passed through `_prep` only. Features
    whose schema type equals 1 are integerized through a vocabulary stored
    under the feature name and emitted as ``<name>_integerized``; all other
    features are z-scored and emitted as ``<name>_scaled``. Finally "age" is
    bucketized into quantile buckets as ``age_bucketized``.

    The age bucketization is loop-invariant, so it is performed once after
    the schema loop instead of once per feature; repeating it registered a
    redundant quantiles analyzer on every iteration.

    Args:
      input_features: Mapping from feature name to raw feature.

    Returns:
      Mapping from output feature name to transformed feature.
    """
    processed_features = {}
    passthrough_names = (TARGET_FEATURE_NAME, WEIGHT_FEATURE_NAME)

    for feature in raw_schema.feature:
        name = feature.name

        if name in passthrough_names:
            # Pass the target and weight features as is.
            processed_features[name] = _prep(input_features[name])
        elif feature.type == 1:
            # NOTE(review): type 1 is presumably the schema's string/bytes
            # feature type - confirm against the schema proto.
            # Extract vocabulary and integerize categorical features.
            processed_features[name + "_integerized"] = _prep(
                tft.compute_and_apply_vocabulary(input_features[name],
                                                 vocab_filename=name))
        else:
            # Normalize numeric features.
            processed_features[name + "_scaled"] = _prep(
                tft.scale_to_z_score(input_features[name]))

    # Bucketize age using quantiles (computed once for the whole dataset).
    age = input_features["age"]
    quantiles = tft.quantiles(age, num_buckets=5, epsilon=0.01)
    processed_features["age_bucketized"] = _prep(
        tft.apply_buckets(age, bucket_boundaries=quantiles))

    return processed_features
示例#11
0
def preprocessing_fn(inputs):
    """Preprocess the Covertype dataset.

    Numeric features are standardized, categorical features are mapped to
    vocabulary ids, and the label is shifted down by one.

    Args:
        inputs: A map from feature keys to raw not-yet-transformed features.

    Returns:
        A map from transformed feature keys to transformed features.
    """
    transformed = {}

    # Standardize the numeric features.
    for name in NUMERIC_FEATURES_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = tft.scale_to_z_score(filled)

    # Build a vocabulary per categorical feature (stored under the raw key
    # name) and map the values to ids.
    for name in CATEGORICAL_FEATURES_KEYS:
        filled = _fill_in_missing(inputs[name])
        transformed[_transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                x=filled, num_oov_buckets=1, vocab_filename=name))

    # Convert Cover_Type from 1-7 to 0-6.
    label = _fill_in_missing(inputs[LABEL_KEY])
    transformed[_transformed_name(LABEL_KEY)] = label - 1

    return transformed
    
示例#12
0
def preprocessing_fn(inputs):
    """Standardize every feature and pass the label through unchanged.

    Args:
      inputs: Mapping from feature key to raw feature.

    Returns:
      Mapping from transformed feature key to transformed feature.
    """
    transformed = {
        _transformed_name(name): tft.scale_to_z_score(inputs[name])
        for name in _FEATURE_KEYS
    }
    transformed[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
    return transformed
示例#13
0
def preprocessing_fn(inputs: Dict[str, Tensor],
                     custom_config: Dict[str, Any]) -> Dict[str, Tensor]:
    """tf.transform's callback function for preprocessing inputs.

    float32 features are z-scored after missing values are filled in, except
    for the label of a one-dimensional regression task, which is passed
    through untouched. All other features are mapped to vocabulary ids.
    Feature names are sanitized, label names are not.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
      custom_config: Custom configuration dictionary for passing the task's
        ProblemStatement as a text proto, since custom_config must be
        JSON-serializable. (Previously this was written as a default value,
        `custom_config=Dict[str, Any]`, rather than as an annotation.)

    Returns:
      Map from string feature key to transformed feature operations.
    """
    problem_statement = ps_pb2.ProblemStatement()
    text_format.Parse(
        text=custom_config[BasicPreprocessor.PROBLEM_STATEMENT_KEY],
        message=problem_statement)

    outputs = {}

    # First pass: float32 features.
    for key, tensor in inputs.items():
        if tensor.dtype != tf.float32:
            continue
        # TODO(weill): Handle case when an int field can actually represents
        # numeric rather than categorical values.
        task_type = problem_statement.tasks[0].type
        if task_type.HasField('one_dimensional_regression') and (
                key == task_type.one_dimensional_regression.label):
            # Skip normalizing regression labels and keep their name as is.
            outputs[key] = tensor
            continue

        # Keep this feature as a dense float: fill in missing values, then
        # standardize.
        outputs[_sanitize_feature_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(tensor))

    # Second pass: every non-float32 feature gets a vocabulary.
    for key, tensor in inputs.items():
        if tensor.dtype == tf.float32:
            continue
        # TODO(weill): Risk here to blow up computation needlessly.
        output = tft.compute_and_apply_vocabulary(
            _fill_in_missing(tensor), top_k=None, num_oov_buckets=1)

        # Don't sanitize the label key name.
        task_type = problem_statement.tasks[0].type
        if task_type.HasField('multi_class_classification') and (
                key == task_type.multi_class_classification.label):
            outputs[key] = output
            continue
        if task_type.HasField('binary_classification') and (
                key == task_type.binary_classification.label):
            outputs[key] = output
            continue

        # Do sanitize feature key names.
        outputs[_sanitize_feature_name(key)] = output

    return outputs
 def preprocess(inputs):  # inputs is a batch of input features
     """Standardize the housing median age and integerize ocean proximity.

     `tft.scale_to_z_score` is applied directly to the raw age. Subtracting
     `tft.mean` first, as was done before, costs an extra analyzer and leaves
     the z-score unchanged because it is invariant to a constant shift.

     Args:
       inputs: Mapping holding the "housing_median_age" and
         "ocean_proximity" features.

     Returns:
       Mapping with "standardized_median_age" and "ocean_proximity_id".
     """
     median_age = inputs["housing_median_age"]
     ocean_proximity = inputs["ocean_proximity"]
     standardized_age = tft.scale_to_z_score(median_age)
     ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)
     return {
         "standardized_median_age": standardized_age,
         "ocean_proximity_id": ocean_proximity_id
     }
示例#15
0
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs (dict): dict of input columns

    Returns:
        dict of transformed columns
    """
    # Numeric columns that are standardized under their original name.
    standardized_columns = (
        'ButterTemperature',
        'SugarHumidity',
        'FlourHumidity',
        'HeatingTime',
        'MixingTime',
        'Density',
        'Temperature',
        'Humidity',
    )

    transformed = {
        # Categorical column, encoded through a vocabulary.
        'MixingSpeed': tft.compute_and_apply_vocabulary(inputs['MixingSpeed']),
        'ButterMass': inputs['ButterMass'],
    }

    # Derived features: total mass and each ingredient's standardized share.
    total_mass = (
        inputs['ButterMass'] + inputs['SugarMass'] + inputs['FlourMass'])
    transformed['TotalMass'] = total_mass
    for ingredient in ('Butter', 'Sugar', 'Flour'):
        share = inputs['{}Mass'.format(ingredient)] / total_mass
        transformed['Norm{}perc'.format(ingredient)] = tft.scale_to_z_score(
            share)

    # Columns kept in absolute units.
    for column in ('TotalVolume', 'Energy'):
        transformed[column] = inputs[column]

    for column in standardized_columns:
        transformed[column] = tft.scale_to_z_score(inputs[column])

    # Flag rows whose 'Problems' text matches '.*chunk.*': such values are
    # rewritten to exactly 'chunk' and then compared against it.
    rewritten = tf.regex_replace(input=inputs['Problems'],
                                 pattern='.*chunk.*',
                                 rewrite='chunk',
                                 name='DetectChunk')
    transformed['Chunks'] = tf.cast(tf.equal(rewritten, 'chunk'), tf.float32)

    return transformed
示例#16
0
def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
    """Standardize dense features; pass binary features and label through.

    Args:
      inputs: Mapping from feature key to raw feature.

    Returns:
      Mapping with ``<name>_xf`` for each dense feature, the binary features
      under their original names, and the label under `LABEL_KEY`.
    """
    transformed = {
        f'{name}_xf': tft.scale_to_z_score(inputs[name])
        for name in DENSE_FEATURES
    }
    transformed.update((name, inputs[name]) for name in BINARY_FEATURES)
    transformed[LABEL_KEY] = inputs[LABEL_KEY]
    return transformed
示例#17
0
    def transform_to_tfrecord(self, inputs):
        """Preprocess raw input columns into transformed columns.

        Works on a shallow copy of `inputs`: numeric features are replaced by
        their z-scores, and a vocabulary is computed for every vocabulary
        feature while the feature itself is left as-is.
        """
        transformed = inputs.copy()

        for name in self.data_formatter.number_features:
            raw_column = transformed[name]
            transformed[name] = tft.scale_to_z_score(raw_column)

        for name in self.data_formatter.vocabulary_features:
            # The return value is unused; the call is made only for the
            # vocabulary it registers under `vocab_filename`.
            tft.vocabulary(inputs[name], vocab_filename=name)

        return transformed
示例#18
0
def preprocessing_fn(inputs):
    """Standardize every feature and forward the label unchanged.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key to transformed feature.
    """
    label_name = _transformed_name(_LABEL_KEY)

    transformed = {}
    for name in _FEATURE_KEYS:
        z_scored = tft.scale_to_z_score(inputs[name])
        transformed[_transformed_name(name)] = z_scored
    transformed[label_name] = inputs[_LABEL_KEY]

    return transformed
示例#19
0
def preprocessing_fn(inputs):
    """Transform raw taxi features and derive the binary tip label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key to transformed feature.
    """

    def _filled(name):
        # Shared first step for every feature below.
        return _fill_in_missing(inputs[name])

    transformed = {}

    # Dense float features are standardized.
    for name in features.DENSE_FLOAT_FEATURE_KEYS:
        transformed[features.transformed_name(name)] = tft.scale_to_z_score(
            _filled(name))

    # Vocabulary features are mapped to integer ids.
    for name in features.VOCAB_FEATURE_KEYS:
        transformed[features.transformed_name(name)] = (
            tft.compute_and_apply_vocabulary(
                _filled(name),
                top_k=features.VOCAB_SIZE,
                num_oov_buckets=features.OOV_SIZE))

    # Each bucket feature is paired with its own bucket count.
    bucket_specs = zip(features.BUCKET_FEATURE_KEYS,
                       features.BUCKET_FEATURE_BUCKET_COUNT)
    for name, bucket_count in bucket_specs:
        transformed[features.transformed_name(name)] = tft.bucketize(
            _filled(name), bucket_count, always_return_num_quantiles=False)

    # Categorical features only get missing values filled in.
    for name in features.CATEGORICAL_FEATURE_KEYS:
        transformed[features.transformed_name(name)] = _filled(name)

    # Was this passenger a big tipper? Label is 1 when the tip exceeds 20% of
    # the fare, 0 otherwise (including rows where the fare is NaN).
    fare_key = 'fare'
    taxi_fare = _filled(fare_key)
    tips = _filled(features.LABEL_KEY)
    label_name = features.transformed_name(features.LABEL_KEY)
    transformed[label_name] = tf.compat.v1.where(
        tf.math.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                tf.int64))

    return transformed
示例#20
0
    def preprocessing_fn(inputs):
        """Transform raw taxi features and derive the binary tip label.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from transformed feature key to transformed feature.
        """
        transformed = {}

        # Numeric features: fill missing values, then standardize.
        for name in my_metadata.NUMERIC_FEATURE_KEYS:
            filled = _fill_in_missing(inputs[name])
            transformed[my_metadata.transformed_name(name)] = (
                transform.scale_to_z_score(filled))

        # Vocabulary features: the vocabulary file is named after the
        # transformed feature name.
        for name in my_metadata.VOCAB_FEATURE_KEYS:
            filled = _fill_in_missing(inputs[name])
            transformed[my_metadata.transformed_name(name)] = (
                transform.compute_and_apply_vocabulary(
                    filled,
                    vocab_filename=my_metadata.transformed_name(name),
                    num_oov_buckets=my_metadata.OOV_SIZE,
                    top_k=my_metadata.VOCAB_SIZE))

        # Hashed string features, each with its own bucket count.
        for name, bucket_count in my_metadata.HASH_STRING_FEATURE_KEYS.items():
            filled = _fill_in_missing(inputs[name])
            transformed[my_metadata.transformed_name(name)] = (
                transform.hash_strings(filled, hash_buckets=bucket_count))

        # Bucketized copies, emitted under '<name>_bucketized'.
        for name, bucket_count in my_metadata.TO_BE_BUCKETIZED_FEATURE.items():
            filled = _fill_in_missing(inputs[name])
            bucketized_name = my_metadata.transformed_name(
                name +'_bucketized')
            transformed[bucketized_name] = transform.bucketize(
                filled, bucket_count)

        # Was this passenger a big tipper? Label is 1 when the tip exceeds
        # 20% of the fare, 0 otherwise (including rows with a NaN fare).
        taxi_fare = _fill_in_missing(inputs[my_metadata.FARE_KEY])
        tips = _fill_in_missing(inputs[my_metadata.LABEL_KEY])
        label_name = my_metadata.transformed_name(my_metadata.LABEL_KEY)
        transformed[label_name] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            tf.cast(
                tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                tf.int64))

        return transformed
示例#21
0
    def preprocess_fn(inputs):
        """Transform raw features; outputs keep their raw key names.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from feature key to transformed feature.
        """
        transformed = {}

        # Dense float features are densified and standardized.
        for name in DENSE_FLOAT_FEATURE_KEYS:
            transformed[name] = tft.scale_to_z_score(to_dense(inputs[name]))

        # Vocabulary features: non-string inputs are converted to strings
        # before the vocabulary is built.
        for name in VOCAB_FEATURE_KEYS:
            is_string = inputs[name].dtype == tf.string
            vocab_input = (to_dense(inputs[name]) if is_string else
                           tf.as_string(to_dense(inputs[name])))
            transformed[name] = tft.compute_and_apply_vocabulary(
                vocab_input,
                vocab_filename='vocab_' + name,
                top_k=VOCAB_SIZE,
                num_oov_buckets=OOV_SIZE)

        for name in BUCKET_FEATURE_KEYS:
            transformed[name] = tft.bucketize(to_dense(inputs[name]),
                                              FEATURE_BUCKET_COUNT)

        for name in CATEGORICAL_FEATURE_KEYS:
            transformed[name] = tf.cast(to_dense(inputs[name]), tf.int64)

        # Boolean label: fare is not NaN and the tip exceeds 20% of the fare.
        fare = to_dense(inputs[FARE_KEY])
        tip = to_dense(inputs[LABEL_KEY])
        threshold = tf.multiply(fare, tf.constant(0.2))
        transformed[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(fare)),
            tf.greater(tip, threshold))

        # Re-encode every boolean output through a string vocabulary. Only
        # values of existing keys are replaced, so the dict size is unchanged
        # while iterating.
        for name in transformed:
            if transformed[name].dtype == tf.bool:
                transformed[name] = tft.compute_and_apply_vocabulary(
                    tf.as_string(transformed[name]),
                    vocab_filename='vocab_' + name)

        return transformed
示例#22
0
def preprocessing_fn(inputs):
  """Standardize every feature and forward the label unchanged.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from transformed feature key to transformed feature.
  """
  transformed = {
      _transformed_name(name): tft.scale_to_z_score(inputs[name])
      for name in _FEATURE_KEYS
  }

  # TODO(b/157064428): Support label transformation for Keras.
  # The label is forwarded untouched; transforming it would result in wrong
  # evaluation.
  transformed[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

  return transformed
示例#23
0
    def preprocessing_fn(inputs):
        """Transform raw taxi features; outputs keep their raw key names.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from feature key to transformed feature.
        """
        transformed = {}

        # Dense float features are standardized.
        for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
            transformed[name] = transform.scale_to_z_score(inputs[name])

        # Vocabulary features are mapped to integer ids.
        for name in taxi.VOCAB_FEATURE_KEYS:
            transformed[name] = transform.string_to_int(
                inputs[name],
                top_k=taxi.VOCAB_SIZE,
                num_oov_buckets=taxi.OOV_SIZE)

        for name in taxi.BUCKET_FEATURE_KEYS:
            transformed[name] = transform.bucketize(
                inputs[name], taxi.FEATURE_BUCKET_COUNT)

        # Categorical features are passed through unchanged.
        for name in taxi.CATEGORICAL_FEATURE_KEYS:
            transformed[name] = inputs[name]

        def convert_label(label):
            """Return 1 where the tip exceeds 20% of the fare, else 0.

            Rows whose fare is NaN yield 0.
            """
            taxi_fare = inputs[taxi.FARE_KEY]
            return tf.where(
                tf.is_nan(taxi_fare),
                tf.cast(tf.zeros_like(taxi_fare), tf.int64),
                tf.cast(
                    tf.greater(
                        label, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        # Was this passenger a big tipper?
        transformed[taxi.LABEL_KEY] = transform.apply_function(
            convert_label, inputs[taxi.LABEL_KEY])

        return transformed
示例#24
0
文件: dataset.py 项目: vikrosj/tfx
        def wide_preprocessing_fn(inputs):
            """TFT preprocessing function producing a configurable number of outputs.

            The bucket, dense-float and vocabulary key lists are each cycled
            until the requested number of outputs has been produced, so the
            same raw feature may feed several outputs.

            Args:
              inputs: Map from feature keys to raw not-yet-transformed
                features.

            Returns:
              Map from string feature key to transformed feature operations.
            """

            def _cycled(keys, count):
                # Yields (index, key) for the first `count` keys of the
                # endlessly repeated key list.
                return enumerate(
                    itertools.islice(itertools.cycle(keys), count))

            transformed = {}
            # pylint: disable=protected-access
            for position, name in _cycled(taxi_utils._BUCKET_FEATURE_KEYS,
                                          self._num_bucketize):
                filled = taxi_utils._fill_in_missing(inputs[name])
                transformed["bucketized" + str(position)] = tft.bucketize(
                    filled, taxi_utils._FEATURE_BUCKET_COUNT)

            for position, name in _cycled(
                    taxi_utils._DENSE_FLOAT_FEATURE_KEYS, self._num_scale):
                filled = taxi_utils._fill_in_missing(inputs[name])
                transformed["scaled" + str(position)] = tft.scale_to_z_score(
                    filled)

            for position, name in _cycled(taxi_utils._VOCAB_FEATURE_KEYS,
                                          self._num_vocabs):
                filled = taxi_utils._fill_in_missing(inputs[name])
                transformed["vocab" + str(position)] = (
                    tft.compute_and_apply_vocabulary(
                        filled,
                        top_k=taxi_utils._VOCAB_SIZE,
                        num_oov_buckets=taxi_utils._OOV_SIZE))

            # Pass-through features.
            passthrough = taxi_utils._CATEGORICAL_FEATURE_KEYS + [
                taxi_utils._LABEL_KEY
            ]
            for name in passthrough:
                transformed[name] = inputs[name]

            return transformed
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Every non-label input feature has its missing values filled in and is
    then standardized; the label is passed through unchanged.

    The loop previously iterated over the keys of the freshly created, empty
    `outputs` dict, so it never ran and only the label was emitted. It now
    iterates over `inputs`.

    NOTE(review): this assumes every non-label input is a numeric feature
    that `_fill_in_missing` and `tft.scale_to_z_score` can handle - confirm
    against the dataset schema.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in inputs:
        if key == features.LABEL_KEY:
            # The label is forwarded untransformed below.
            continue
        outputs[features.transformed_name(key)] = tft.scale_to_z_score(
            _fill_in_missing(inputs[key]))

    outputs[features.transformed_name(
        features.LABEL_KEY)] = inputs[features.LABEL_KEY]

    return outputs
  def def_preprocessing_fn(inputs):
    """Transform raw taxi features and derive the binary tip label.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from transformed feature key to transformed feature.
    """
    transformed = {}

    # Dense float features: fill missing values, then standardize.
    for name in taxi.DENSE_FLOAT_FEATURE_KEYS:
      filled = _fill_in_missing(inputs[name])
      transformed[taxi.transformed_name(name)] = transform.scale_to_z_score(
          filled)

    # Vocabulary features: fill missing values, then map to integer ids.
    for name in taxi.VOCAB_FEATURE_KEYS:
      filled = _fill_in_missing(inputs[name])
      transformed[taxi.transformed_name(name)] = (
          transform.compute_and_apply_vocabulary(
              filled,
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE))

    # Bucket features: fill missing values, then bucketize.
    for name in taxi.BUCKET_FEATURE_KEYS:
      filled = _fill_in_missing(inputs[name])
      transformed[taxi.transformed_name(name)] = transform.bucketize(
          filled, taxi.FEATURE_BUCKET_COUNT)

    # Categorical features only get missing values filled in.
    for name in taxi.CATEGORICAL_FEATURE_KEYS:
      transformed[taxi.transformed_name(name)] = _fill_in_missing(
          inputs[name])

    # Was this passenger a big tipper? Label is 1 when the tip exceeds 20% of
    # the fare, 0 otherwise (including rows where the fare is NaN).
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    label_name = taxi.transformed_name(taxi.LABEL_KEY)
    transformed[label_name] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return transformed
def preprocessing_fn(inputs):
  """Preprocess the Covertype dataset.

  Args:
    inputs: Mapping from feature key to raw feature.

  Returns:
    Mapping from transformed feature key to transformed feature.
  """

  def _filled(name):
    # Shared first step for every feature below.
    return _fill_in_missing(inputs[name])

  transformed = {}

  # Standardize the numeric features.
  for name in features.NUMERIC_FEATURE_KEYS:
    transformed[features.transformed_name(name)] = tft.scale_to_z_score(
        _filled(name))

  # Build a vocabulary per categorical feature (stored under the raw key
  # name) and map the values to ids.
  for name in features.CATEGORICAL_FEATURE_KEYS:
    transformed[features.transformed_name(name)] = (
        tft.compute_and_apply_vocabulary(
            x=_filled(name), num_oov_buckets=1, vocab_filename=name))

  # The label only has its missing values filled in.
  label_name = features.transformed_name(features.LABEL_KEY)
  transformed[label_name] = _filled(features.LABEL_KEY)

  return transformed
示例#28
0
def preprocessing_fn(inputs, custom_config):
    """tf.transform callback that turns raw features into model features.

    Args:
        inputs: Map from feature key to the raw, not-yet-transformed feature.
        custom_config: Mapping of extra pre-processing properties. The keys
            'VOCAB_SIZE' and 'OOV_SIZE' are looked up with `.get()`; when a
            key is absent the module-level `_VOCAB_SIZE` / `_OOV_SIZE` value
            is used instead.

    Returns:
        Map from `_transformed_name(<raw key>)` to the transformed feature,
        including the derived integer label.
    """
    transformed = {}

    # Dense float features: identity -> fill in missing -> z-score.
    for float_key in _DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(_identity(inputs[float_key]))
        transformed[_transformed_name(float_key)] = tft.scale_to_z_score(
            filled)

    # Vocabulary features: compute a vocabulary and replace values by their
    # index. Size limits come from `custom_config` when it provides them.
    for vocab_key in _VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[vocab_key])
        indexed = tft.compute_and_apply_vocabulary(
            filled,
            top_k=custom_config.get('VOCAB_SIZE', _VOCAB_SIZE),
            num_oov_buckets=custom_config.get('OOV_SIZE', _OOV_SIZE))
        transformed[_transformed_name(vocab_key)] = indexed

    # Bucket features: discretize into `_FEATURE_BUCKET_COUNT` buckets.
    for bucket_key in _BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[bucket_key])
        transformed[_transformed_name(bucket_key)] = tft.bucketize(
            filled, _FEATURE_BUCKET_COUNT)

    # Categorical features: only passed through `_fill_in_missing`.
    for categorical_key in _CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[categorical_key])
        transformed[_transformed_name(categorical_key)] = filled

    # Label: was this passenger a big tipper?
    # 1 when the tip exceeds 20% of the fare, otherwise 0. Rows whose fare is
    # NaN are forced to 0.
    taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
    tips = _fill_in_missing(inputs[_LABEL_KEY])
    fare_is_nan = tf.math.is_nan(taxi_fare)
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    label = tf.compat.v1.where(fare_is_nan, zero_label, big_tipper)
    transformed[_transformed_name(_LABEL_KEY)] = label

    return transformed
示例#29
0
    def transform_to_tfrecord(self, inputs):
        """Turn the raw input columns into the transformed columns.

        Args:
            inputs: Mapping from column name to raw column. It must offer a
                `.copy()` method; the copy is what gets modified and
                returned, `inputs` itself is only read.

        Returns:
            The copy of `inputs` in which every column named in
            `enabled_number_features` has been replaced by its z-score.
        """
        transformed = inputs.copy()

        # Standardize each enabled numeric column in the copy.
        for column in enabled_number_features:
            raw_values = transformed[column]
            transformed[column] = tft.scale_to_z_score(raw_values)

        # NOTE: a disabled draft for optional (sparse) numeric columns used to
        # sit here: densify with a default of 0., squeeze to scalars, then
        # `tft.scale_to_0_1`. Such columns get no special handling now.

        # Compute a vocabulary per enabled vocabulary column, using the
        # column name as the vocabulary file name. The returned value is
        # deliberately unused and this loop does not modify the output
        # entries for these columns.
        for column in enabled_vocabulary_features:
            tft.vocabulary(inputs[column], vocab_filename=column)

        return transformed
示例#30
0
def preprocessing_fn(inputs):
    """tf.transform callback that scales the float features and casts the label.

    Args:
        inputs: Map from feature key to the raw, not-yet-transformed feature.

    Returns:
        Map from `_transformed_name(<raw key>)` to the transformed feature:
        every key in `_DENSE_FLOAT_FEATURE_KEYS` is z-scored and the label is
        cast to int64.
    """
    transformed = {}

    # Dense float features: fill in missing values, then standardize.
    for float_key in _DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[float_key])
        transformed[_transformed_name(float_key)] = tft.scale_to_z_score(
            filled)

    # Label (flower variety): fill in missing values and store as int64.
    variety = _fill_in_missing(inputs[_LABEL_KEY])
    variety_as_int = tf.cast(variety, tf.int64)
    transformed[_transformed_name(_LABEL_KEY)] = variety_as_int

    return transformed
示例#31
0
def preprocessing_fn(inputs):
    """tf.transform callback that turns raw features into model features.

    Args:
        inputs: Map from feature key to the raw, not-yet-transformed feature.

    Returns:
        Map from `_transformed_name(<raw key>)` to the transformed feature
        operation, including the derived integer label.
    """
    transformed = {}

    # Dense float features: fill in missing values, then apply a z-score.
    for float_key in _DENSE_FLOAT_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[float_key])
        transformed[_transformed_name(float_key)] = tft.scale_to_z_score(
            filled)

    # Vocabulary features: compute a vocabulary limited to the `_VOCAB_SIZE`
    # most frequent values, with `_OOV_SIZE` out-of-vocabulary buckets, and
    # replace values by their index.
    for vocab_key in _VOCAB_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[vocab_key])
        indexed = tft.compute_and_apply_vocabulary(
            filled, top_k=_VOCAB_SIZE, num_oov_buckets=_OOV_SIZE)
        transformed[_transformed_name(vocab_key)] = indexed

    # Bucket features: discretize into `_FEATURE_BUCKET_COUNT` buckets.
    for bucket_key in _BUCKET_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[bucket_key])
        transformed[_transformed_name(bucket_key)] = tft.bucketize(
            filled, _FEATURE_BUCKET_COUNT)

    # Categorical features: only passed through `_fill_in_missing`.
    for categorical_key in _CATEGORICAL_FEATURE_KEYS:
        filled = _fill_in_missing(inputs[categorical_key])
        transformed[_transformed_name(categorical_key)] = filled

    # Label: was this passenger a big tipper?
    # 1 when the tip exceeds 20% of the fare, otherwise 0. Rows whose fare is
    # NaN are forced to 0.
    taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
    tips = _fill_in_missing(inputs[_LABEL_KEY])
    fare_is_nan = tf.math.is_nan(taxi_fare)
    zero_label = tf.cast(tf.zeros_like(taxi_fare), tf.int64)
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tips, tip_threshold), tf.int64)
    transformed[_transformed_name(_LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tipper)

    return transformed