def encode_inputs(inputs, use_embedding=False):
    """Encode a dict of Keras input tensors into one concatenated feature tensor.

    Categorical features (those listed in CATEGORICAL_FEATURE_NAMES) are mapped
    to integer indices with a StringLookup built from the known vocabulary, then
    either embedded or one-hot encoded. All other features are treated as
    numerical and passed through unchanged, with a trailing axis added.

    Args:
        inputs: Mapping of feature name -> Keras input tensor.
        use_embedding: If True, encode each categorical feature as a trainable
            embedding of width ~sqrt(vocabulary size); otherwise one-hot encode.

    Returns:
        A single tensor with all encoded features concatenated on the last axis.
    """
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Create a lookup to convert string values to integer indices.
            # Since we are not using a mask token nor expecting any out of
            # vocabulary (oov) token, we set mask_token to None and
            # num_oov_indices to 0.
            index = StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
            )
            # Convert the string input values into integer indices.
            value_index = index(inputs[feature_name])
            if use_embedding:
                # Rule-of-thumb embedding width: sqrt of the vocabulary size.
                embedding_dims = int(math.sqrt(len(vocabulary)))
                # Create an embedding layer with the specified dimensions.
                # (Fixed typo: was `embedding_ecoder`.)
                embedding_encoder = layers.Embedding(
                    input_dim=len(vocabulary), output_dim=embedding_dims
                )
                # Convert the index values to embedding representations.
                encoded_feature = embedding_encoder(value_index)
            else:
                # Create a one-hot encoder adapted on the indexed vocabulary.
                onehot_encoder = CategoryEncoding(output_mode="binary")
                onehot_encoder.adapt(index(vocabulary))
                # Convert the index values to a one-hot representation.
                encoded_feature = onehot_encoder(value_index)
        else:
            # Use the numerical features as-is, adding a trailing feature axis.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)
        encoded_features.append(encoded_feature)
    all_features = layers.concatenate(encoded_features)
    return all_features
def encode_string_categorical_feature(feature, name, dataset):
    """One-hot encode a string-valued categorical feature.

    Learns the vocabulary of column `name` from `dataset` with a StringLookup,
    maps the symbolic `feature` tensor to integer indices, then applies a
    CategoryEncoding (adapted on the indexed data) to produce a binary
    one-hot representation.
    """
    # Vocabulary learner: strings -> fixed integer indices.
    index = StringLookup()
    # Isolate this feature's column and give each value a trailing axis.
    feature_ds = dataset.map(lambda x, y: x[name]).map(
        lambda x: tf.expand_dims(x, -1)
    )
    # Learn the set of possible string values.
    index.adapt(feature_ds)
    # Map the string input to integer indices.
    indexed_feature = index(feature)
    # One-hot encoder for the integer indices, adapted on the same column
    # after it has been run through the lookup.
    encoder = CategoryEncoding(output_mode="binary")
    encoder.adapt(feature_ds.map(index))
    # Apply the one-hot encoding.
    return encoder(indexed_feature)
def encode_integer_categorical_feature(feature, name, dataset):
    """One-hot encode an integer-valued categorical feature.

    Adapts a CategoryEncoding on column `name` of `dataset` and applies it
    to the symbolic `feature` tensor.
    """
    # Binary (one-hot) encoder for the integer indices.
    encoder = CategoryEncoding(output_mode='binary')
    # Pull out just this column, adding a trailing axis for adapt().
    column_ds = dataset.map(lambda x, y: x[name])
    column_ds = column_ds.map(lambda v: tf.expand_dims(v, -1))
    # Learn the space of possible indices, then encode the input.
    encoder.adapt(column_ds)
    return encoder(feature)
def encode_integer_categorical_feature(feature, name, dataset):
    """Return the one-hot (binary) encoding of an integer categorical feature."""
    # One-hot encoder whose index space is learned from the data below.
    onehot = CategoryEncoding(output_mode="binary")
    # Select only this feature's column from each (features, label) pair
    # and expand to shape (..., 1) as adapt() expects.
    values = dataset.map(lambda features, label: features[name])
    values = values.map(lambda v: tf.expand_dims(v, -1))
    onehot.adapt(values)
    # Encode the symbolic input tensor.
    encoded = onehot(feature)
    return encoded
def encode_integer_categorical_feature(feature, name, dataset):
    """One-hot encode the integer categorical column `name`.

    The encoder's index space is learned from `dataset`; the encoding is then
    applied to the symbolic `feature` tensor and returned.
    """
    # CategoryEncoding produces the binary/one-hot representation.
    cat_encoder = CategoryEncoding(output_mode="binary")
    # Dataset yielding only this feature, with a trailing axis added.
    single_feature = dataset.map(lambda x, y: x[name]).map(
        lambda x: tf.expand_dims(x, -1)
    )
    # Learn the space of possible indices from the data.
    cat_encoder.adapt(single_feature)
    # Apply one-hot encoding to the input tensor.
    return cat_encoder(feature)
def encode_string_categorical_feature(feature, name, dataset):
    """One-hot encode a string categorical feature.

    Two stages: a StringLookup (adapted on column `name` of `dataset`) turns
    strings into integer indices, then a CategoryEncoding (adapted on those
    indices) turns the indices into a binary one-hot vector.
    """
    # Stage 1: string -> integer index.
    lookup = StringLookup()
    column = dataset.map(lambda x, y: x[name])
    column = column.map(lambda v: tf.expand_dims(v, -1))
    lookup.adapt(column)
    indices = lookup(feature)
    # Stage 2: integer index -> one-hot vector.
    onehot = CategoryEncoding(output_mode="binary")
    indexed_column = column.map(lookup)
    onehot.adapt(indexed_column)
    result = onehot(indices)
    return result
def encode(self, input_feature, name, dataset):
    """One-hot encode a categorical feature via string indexing.

    A StringIndexer is adapted on column `name` of `dataset` and used to
    index-encode `input_feature`; a CategoryEncoding, adapted on the indexed
    column, then produces the binary one-hot representation.
    """
    # Dataset restricted to the single feature column.
    column_ds = _extract_feature_column(dataset, name)
    # String indexing stage: learn the vocabulary, then encode the input.
    indexer = StringIndexer()
    indexer.adapt(column_ds)
    indexed_feature = indexer.encode(input_feature, name, dataset)
    # Run the column through the indexer so the category encoder can adapt
    # on integer indices rather than raw strings.
    indexed_column = column_ds.map(indexer.encoder)
    # Categorical (one-hot) encoding stage.
    onehot = CategoryEncoding(output_mode="binary")
    onehot.adapt(indexed_column)
    return onehot(indexed_feature)
def _encode_categorical_feature( feature: KerasTensor, name: str, dataset: Optional[BatchDataset], ) -> KerasTensor: """One-hot encode categorical features. Args: - feature: The input layer of the feature. - name: The feature's name (its column name in the original dataframe). - dataset: The training data, if not specified, return a no-op layer. Returns: The one-hot encoded tensor of the input feature. """ # Return generic layer for the tuner initialization if not dataset: return KerasTensor(type_spec=TensorSpec( shape=(None, 1), dtype=tf.float32, name=None)) # Create a StringLookup layer which will turn strings into integer indices index = StringLookup() # Prepare a Dataset that only yields our feature feature_ds = dataset.map(lambda x, y: x[name]) feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1)) # Learn the set of possible string values and assign them a fixed integer index index.adapt(feature_ds) # Turn the string input into integer indices encoded_feature = index(feature) # Create a CategoryEncoding for our integer indices encoder = CategoryEncoding(output_mode="binary") # Learn the space of possible indices encoder.adapt(np.arange(index.vocab_size())) # Apply one-hot encoding to our indices{split + 1} / {n_splits} encoded_feature = encoder(encoded_feature) return encoded_feature