def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a one-hot encoding pipeline for one categorical feature.

    Args:
        name: Feature key to extract from each element of `dataset`.
        dataset: tf.data.Dataset yielding (features_dict, label) pairs.
        dtype: 'string' selects StringLookup; any other value selects
            IntegerLookup.
        max_tokens: Optional cap on the vocabulary size. Defaults to None
            (unbounded).

    Returns:
        A callable mapping a raw feature tensor to its one-hot encoding;
        it captures the adapted layers so it can be used in a functional
        model later.
    """
    # Create a lookup layer which turns raw feature values into integer
    # indices. NOTE: `max_values` was renamed to `max_tokens` in current
    # Keras, so both branches use the same keyword.
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    # Prepare a Dataset that only yields the feature of interest.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # One-hot encode the integer indices. `vocabulary_size()` replaces the
    # deprecated `vocab_size()`, and CategoryEncoding now takes `num_tokens`
    # at construction — it no longer supports adapt(), so the old
    # `feature_ds.map(index)` + `encoder.adapt(...)` steps are unnecessary.
    encoder = preprocessing.CategoryEncoding(
        num_tokens=index.vocabulary_size())

    # The lambda captures both layers so they stay paired with this feature.
    return lambda feature: encoder(index(feature))
def getCategoryEncodingLayer(self, name, dataset, dtype, max_tokens=None):
    """Return a callable that one-hot encodes the categorical feature `name`.

    A lookup layer (string or integer, chosen by `dtype`) is adapted on the
    values of the feature drawn from `dataset`, then paired with a
    CategoryEncoding layer sized to the learned vocabulary.
    """
    # Pick the lookup class matching the feature's dtype, then build it.
    lookup_cls = (preprocessing.StringLookup
                  if dtype == 'string'
                  else preprocessing.IntegerLookup)
    index = lookup_cls(max_tokens=max_tokens)

    # Learn the vocabulary from just this feature's values.
    index.adapt(dataset.map(lambda features, _label: features[name]))

    # One-hot encoder sized to the adapted vocabulary.
    encoder = preprocessing.CategoryEncoding(
        num_tokens=index.vocabulary_size())

    # Capture both layers so the pipeline can be reused in a model.
    return lambda feature: encoder(index(feature))
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Create a one-hot encoding pipeline for the categorical feature `name`.

    Args:
        name: Feature key extracted from each element of `dataset`.
        dataset: tf.data.Dataset yielding (features_dict, label) pairs.
        dtype: 'string' selects StringLookup; otherwise IntegerLookup.
        max_tokens: Optional vocabulary-size cap. Defaults to None.

    Returns:
        A callable that maps a raw feature tensor to its one-hot encoding.
    """
    # `max_values` was renamed to `max_tokens` in current Keras; use the
    # supported keyword for both lookup variants.
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    # Adapt the lookup on just this feature's values.
    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)

    # `vocabulary_size()` replaces the deprecated `vocab_size()`, and
    # CategoryEncoding takes `num_tokens` at construction; it no longer
    # supports adapt(), so the old mapping/adapt steps are dropped.
    encoder = preprocessing.CategoryEncoding(
        num_tokens=index.vocabulary_size())

    return lambda feature: encoder(index(feature))
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Creates everything that's needed for a categorical encoding input pipeline.

    Args:
        name (string): name of the feature
        dataset (tf.data.Dataset): tensorflow dataset yielding
            (features_dict, label) pairs
        dtype (string): datatype; 'string' selects StringLookup, anything
            else IntegerLookup
        max_tokens (int, optional): maximum number of tokens. Defaults to None.

    Returns:
        lambda function: categorical input pipeline
    """
    # Create a lookup layer which will turn raw values into integer indices.
    # NOTE: `max_values` was renamed to `max_tokens` in current Keras.
    if dtype == 'string':
        index = exp_preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = exp_preprocessing.IntegerLookup(max_tokens=max_tokens)

    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding (not a Discretization) for the integer
    # indices. `vocabulary_size()` replaces the deprecated `vocab_size()`,
    # and the layer now takes `num_tokens` at construction — it no longer
    # supports adapt(), so the old mapping/adapt steps are unnecessary.
    encoder = exp_preprocessing.CategoryEncoding(
        num_tokens=index.vocabulary_size())

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
string `""`), and index 1 is reserved for out-of-vocabulary values (values that were not seen during `adapt()`). You can configure this by using the `mask_token` and `oov_token` constructor arguments of `StringLookup`. You can see the `StringLookup` and `CategoryEncoding` layers in action in the example [structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/). """ """ ### Encoding integer categorical features via one-hot encoding """ # Define some toy data data = tf.constant([10, 20, 20, 10, 30, 0]) # Use IntegerLookup to build an index of the feature values indexer = preprocessing.IntegerLookup() indexer.adapt(data) # Use CategoryEncoding to encode the integer indices to a one-hot vector encoder = preprocessing.CategoryEncoding(output_mode="binary") encoder.adapt(indexer(data)) # Convert new test data (which includes unknown feature values) test_data = tf.constant([10, 10, 20, 50, 60, 0]) encoded_data = encoder(indexer(test_data)) print(encoded_data) """ Note that index 0 is reserved for missing values (which you should specify as the value 0), and index 1 is reserved for out-of-vocabulary values (values that were not seen during `adapt()`). You can configure this by using the `mask_value` and `oov_value` constructor arguments of `IntegerLookup`.
(values that were not seen during `adapt()`). You can see the `StringLookup` in action in the [Structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/) example. """ """ ### Encoding integer categorical features via one-hot encoding """ # Define some toy data data = tf.constant([[10], [20], [20], [10], [30], [0]]) # Use IntegerLookup to build an index of the feature values and encode output. lookup = preprocessing.IntegerLookup(output_mode="one_hot") lookup.adapt(data) # Convert new test data (which includes unknown feature values) test_data = tf.constant([[10], [10], [20], [50], [60], [0]]) encoded_data = lookup(test_data) print(encoded_data) """ Note that index 0 is reserved for missing values (which you should specify as the value 0), and index 1 is reserved for out-of-vocabulary values (values that were not seen during `adapt()`). You can configure this by using the `mask_token` and `oov_token` constructor arguments of `IntegerLookup`. You can see the `IntegerLookup` in action in the example [structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/).
seen during `adapt()`). You can configure this by using the `mask_token` and `oov_token` constructor arguments of `StringLookup`. You can see the `StringLookup` in action in the [Structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/) example. """ """ ### Encoding integer categorical features via one-hot encoding """ # Define some toy data data = tf.constant([[10], [20], [20], [10], [30], [0]]) # Use IntegerLookup to build an index of the feature values and encode output. lookup = preprocessing.IntegerLookup(output_mode="binary") lookup.adapt(data) # Convert new test data (which includes unknown feature values) test_data = tf.constant([[10], [10], [20], [50], [60], [0]]) encoded_data = lookup(test_data) print(encoded_data) """ Note that index 0 is reserved for missing values (which you should specify as the value 0), and index 1 is reserved for out-of-vocabulary values (values that were not seen during `adapt()`). You can configure this by using the `mask_token` and `oov_token` constructor arguments of `IntegerLookup`. You can see the `IntegerLookup` in action in the example [structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/). """