def encode_inputs(inputs, use_embedding=False):
    """Encode every model input and concatenate the results into one tensor.

    Features whose name appears in ``CATEGORICAL_FEATURE_NAMES`` are mapped
    from strings to integer indices with a ``StringLookup`` built from the
    vocabulary in ``CATEGORICAL_FEATURES_WITH_VOCABULARY``, then either
    embedded or binary-encoded. All other features only get a trailing axis.

    Args:
        inputs: Mapping from feature name to the input for that feature.
        use_embedding: When true, categorical features go through an
            ``Embedding`` layer; otherwise through a ``CategoryEncoding``
            layer in ``"binary"`` output mode.

    Returns:
        The result of ``layers.concatenate`` over all encoded features, in
        the iteration order of ``inputs``.
    """
    encoded_parts = []
    for feature_name in inputs:
        if feature_name not in CATEGORICAL_FEATURE_NAMES:
            # Non-categorical features are passed through with an extra last axis.
            encoded_parts.append(tf.expand_dims(inputs[feature_name], -1))
            continue

        vocab = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        # No mask token and no out-of-vocabulary slot: indices cover the
        # vocabulary entries only.
        lookup = StringLookup(
            vocabulary=vocab, mask_token=None, num_oov_indices=0
        )
        indices = lookup(inputs[feature_name])

        if use_embedding:
            # Embedding width is the integer part of sqrt(vocabulary size).
            width = int(math.sqrt(len(vocab)))
            embedding_layer = layers.Embedding(
                input_dim=len(vocab), output_dim=width
            )
            encoded_parts.append(embedding_layer(indices))
        else:
            # Adapt the encoder on the indices of the full vocabulary so it
            # knows every possible index before encoding the real input.
            binary_encoder = CategoryEncoding(output_mode="binary")
            binary_encoder.adapt(lookup(vocab))
            encoded_parts.append(binary_encoder(indices))

    return layers.concatenate(encoded_parts)
def encode_string_categorical_feature(feature, name, dataset):
    """Index a string feature and encode the indices in "binary" mode.

    Args:
        feature: The input to encode.
        name: Key used to pick this feature out of each dataset element.
        dataset: Dataset whose elements unpack into two components; the first
            is indexed by ``name`` (the second is ignored here, presumably
            the label).

    Returns:
        The output of a ``CategoryEncoding(output_mode="binary")`` layer
        applied to the ``StringLookup`` indices of ``feature``.
    """
    lookup = StringLookup()

    # Keep only the requested column and give each element a trailing axis.
    column_ds = dataset.map(lambda features, _unused: features[name]).map(
        lambda value: tf.expand_dims(value, -1)
    )

    # Learn the vocabulary from the data, then index the actual input.
    lookup.adapt(column_ds)
    indices = lookup(feature)

    # The encoder learns its index space from the indexed column.
    binary_encoder = CategoryEncoding(output_mode="binary")
    binary_encoder.adapt(column_ds.map(lookup))

    return binary_encoder(indices)
def encode_integer_categorical_feature(feature, name, dataset):
    """Encode an integer-coded categorical feature in "binary" mode.

    Args:
        feature: The input to encode.
        name: Key used to pick this feature out of each dataset element.
        dataset: Dataset whose elements unpack into two components; the first
            is indexed by ``name``.

    Returns:
        ``feature`` passed through a ``CategoryEncoding`` layer that was
        adapted on the named column of ``dataset``.
    """
    category_encoder = CategoryEncoding(output_mode='binary')

    # Isolate the column, then append a trailing axis to every element.
    column_only = dataset.map(lambda features, _unused: features[name])
    column_expanded = column_only.map(lambda value: tf.expand_dims(value, -1))

    # Let the encoder learn the index space before applying it.
    category_encoder.adapt(column_expanded)
    return category_encoder(feature)
def encode_integer_categorical_feature(feature, name, dataset):
    """Apply a dataset-adapted "binary" CategoryEncoding to ``feature``.

    Args:
        feature: The input to encode.
        name: Key selecting the feature inside each dataset element.
        dataset: Dataset whose elements unpack into two components, the first
            of which supports ``[name]`` lookup.

    Returns:
        The encoded feature produced by the adapted ``CategoryEncoding`` layer.
    """
    binary_encoder = CategoryEncoding(output_mode="binary")

    # Project the dataset down to the single column, with a trailing axis added.
    column_ds = dataset.map(lambda row, _unused: row[name]).map(
        lambda item: tf.expand_dims(item, -1)
    )

    binary_encoder.adapt(column_ds)
    return binary_encoder(feature)
def encode_integer_categorical_feature(feature, name, dataset):
    """Encode integer category indices with a "binary" CategoryEncoding layer.

    Args:
        feature: The input to encode.
        name: Key of the feature within each dataset element.
        dataset: Dataset whose elements unpack into two components; only the
            first is used, indexed by ``name``.

    Returns:
        The output of the encoder, adapted on ``dataset``, applied to
        ``feature``.
    """
    index_encoder = CategoryEncoding(output_mode="binary")

    def pick_column(features, _unused):
        # Keep just the requested feature from each element.
        return features[name]

    def add_last_axis(value):
        # Append a trailing axis to each element.
        return tf.expand_dims(value, -1)

    adapt_ds = dataset.map(pick_column).map(add_last_axis)

    # Learn the space of possible indices, then encode the input.
    index_encoder.adapt(adapt_ds)
    return index_encoder(feature)
def encode_string_categorical_feature(feature, name, dataset):
    """Turn a string feature into indices and encode them in "binary" mode.

    Args:
        feature: The input to encode.
        name: Key selecting the feature inside each dataset element.
        dataset: Dataset whose elements unpack into two components, the first
            of which supports ``[name]`` lookup.

    Returns:
        The ``CategoryEncoding(output_mode="binary")`` output for the
        ``StringLookup`` indices of ``feature``.
    """
    string_indexer = StringLookup()

    def pick_column(features, _unused):
        # Keep just the requested feature from each element.
        return features[name]

    def add_last_axis(value):
        # Append a trailing axis to each element.
        return tf.expand_dims(value, -1)

    strings_ds = dataset.map(pick_column).map(add_last_axis)

    # Fit the lookup on the raw strings and index the model input.
    string_indexer.adapt(strings_ds)
    indexed_feature = string_indexer(feature)

    # Fit the encoder on the indexed version of the same column.
    binary_encoder = CategoryEncoding(output_mode="binary")
    indices_ds = strings_ds.map(string_indexer)
    binary_encoder.adapt(indices_ds)

    return binary_encoder(indexed_feature)
def encode(self, input_feature, name, dataset):
    """Index ``input_feature`` with a StringIndexer, then binary-encode it.

    Args:
        input_feature: The input to encode.
        name: Name of the feature; forwarded to ``_extract_feature_column``
            and to ``StringIndexer.encode``.
        dataset: Dataset the feature column is extracted from and on which
            both encoders are adapted.

    Returns:
        The output of a ``CategoryEncoding(output_mode="binary")`` layer
        applied to the indexed feature.
    """
    column_ds = _extract_feature_column(dataset, name)

    # Stage 1: string indexing. ``StringIndexer`` is defined elsewhere;
    # presumably it wraps a string-lookup layer exposed as ``.encoder``
    # (TODO confirm).
    string_indexer = StringIndexer()
    string_indexer.adapt(column_ds)
    indexed_input = string_indexer.encode(input_feature, name, dataset)
    indexed_column_ds = column_ds.map(string_indexer.encoder)

    # Stage 2: binary category encoding, adapted on the indexed column.
    binary_encoder = CategoryEncoding(output_mode="binary")
    binary_encoder.adapt(indexed_column_ds)
    return binary_encoder(indexed_input)
def _category_indicate(self, params: dict, weight_input: Layer = None):
    """Count-encode a categorical feature with CategoryEncoding.

    Stands in for ``tf.feature_column.indicator_column``.

    :param params: feature configuration; ``params['num_buckets']`` is used
        as the encoder's ``max_tokens`` and the whole dict is forwarded to
        ``self._category_lookup``.
    :param weight_input: optional second argument passed to the encoding
        layer alongside the ids; omitted from the call when ``None``.
    :return: the output of the ``CategoryEncoding`` layer
        (``output_mode="count"``, ``sparse=True``).
    """
    id_input = self._category_lookup(params)
    count_encoder = CategoryEncoding(
        max_tokens=params['num_buckets'], output_mode="count", sparse=True)

    if weight_input is not None:
        return count_encoder(id_input, weight_input)
    return count_encoder(id_input)
def _encode_categorical_feature(
    feature: KerasTensor,
    name: str,
    dataset: Optional[BatchDataset],
) -> KerasTensor:
    """Encode a string categorical feature in "binary" mode.

    Args:
        feature: The input layer of the feature.
        name: The feature's name, used to select it from each dataset element.
        dataset: The training data. When it is falsy (e.g. ``None``), a
            placeholder ``KerasTensor`` is returned instead of an encoding.

    Returns:
        The encoded tensor for ``feature``, or a float32 ``KerasTensor`` with
        shape ``(None, 1)`` when no dataset is given.
    """
    if not dataset:
        # Placeholder used for tuner initialization, when no data is available.
        placeholder_spec = TensorSpec(
            shape=(None, 1), dtype=tf.float32, name=None)
        return KerasTensor(type_spec=placeholder_spec)

    string_lookup = StringLookup()

    # Keep only the requested column and give each element a trailing axis.
    column_ds = dataset.map(lambda features, _unused: features[name]).map(
        lambda value: tf.expand_dims(value, -1)
    )

    # Learn the vocabulary, then convert the string input to integer indices.
    string_lookup.adapt(column_ds)
    indices = string_lookup(feature)

    # The encoder is adapted on every index in [0, vocab_size) rather than on
    # the dataset itself.
    binary_encoder = CategoryEncoding(output_mode="binary")
    binary_encoder.adapt(np.arange(string_lookup.vocab_size()))

    return binary_encoder(indices)
def load_data():
    """Download the Titanic training CSV, preprocess it and split it 80/20.

    Non-object columns are concatenated and passed through a ``Normalization``
    layer adapted on the data. Object columns are indexed with a
    ``StringLookup`` built from their unique values and encoded with
    ``CategoryEncoding``. The resulting preprocessing model is plotted with
    ``utils.plot_model`` and then applied to the whole table.

    Returns:
        A tuple ``(train_features, train_labels, test_features, test_labels)``
        where the features are the preprocessed NumPy array and the labels
        come from the ``survived`` column.
    """
    from sklearn.model_selection import train_test_split

    data = pd.read_csv(
        "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
    labels = data.pop('survived')

    # One symbolic Keras input per remaining column: object columns are
    # treated as strings, everything else as float32.
    features = {}
    for name, column in data.items():
        dtype = string if column.dtype == object else float32
        features[name] = Input(shape=(1, ), name=name, dtype=dtype)

    # Numeric branch: concatenate the float inputs and normalize them.
    numeric_inputs = {
        name: feature
        for name, feature in features.items() if feature.dtype == float32
    }
    numeric_concat = Concatenate()(list(numeric_inputs.values()))
    norm = Normalization()
    # Select columns with an explicit list of labels rather than a dict view.
    norm.adapt(np.array(data[list(numeric_inputs)]))
    processed_parts = [norm(numeric_concat)]

    # String branch: index each string column, then category-encode it.
    for name, feature in features.items():
        if feature.dtype == float32:
            continue
        word = StringLookup(vocabulary=np.unique(data[name]))
        one_hot = CategoryEncoding(max_tokens=word.vocab_size())
        processed_parts.append(one_hot(word(feature)))

    preprocessing_model = Model(features, Concatenate()(processed_parts))
    utils.plot_model(model=preprocessing_model,
                     rankdir='LR',
                     dpi=72,
                     show_shapes=True)

    # Run the whole table through the preprocessing model, then split.
    feature_dict = {name: np.array(value) for name, value in data.items()}
    train_features, test_features, train_labels, test_labels = train_test_split(
        preprocessing_model(feature_dict).numpy(), labels, test_size=0.2)
    return train_features, train_labels, test_features, test_labels
def _category_onehot(self, params: dict):
    """Encode a categorical feature with CategoryEncoding in "binary" mode.

    Integer features (``params['dtype']`` is ``'int'``, ``'int32'`` or
    ``'int64'``) use the input layer from ``self._get_input_layer`` and
    ``params['num_buckets']`` tokens. All other features go through
    ``self._category_lookup`` and use the length of
    ``params['vocabulary_list']``.

    :param params: feature configuration. ``'key'`` is always read, because
        the default layer name ``<key>-onehot`` is computed even when
        ``'name'`` is supplied.
    :return: the output of the ``CategoryEncoding`` layer.
    """
    integer_dtypes = ('int', 'int32', 'int64')
    if params['dtype'] not in integer_dtypes:
        source = self._category_lookup(params)
        token_count = len(params['vocabulary_list'])
    else:
        token_count = params['num_buckets']
        # Only the layer is needed; the first returned element is unused.
        _, source = self._get_input_layer(params)

    layer_name = params.get('name', params['key'] + '-onehot')
    onehot_layer = CategoryEncoding(
        max_tokens=token_count, output_mode="binary", name=layer_name)
    return onehot_layer(source)
def __init__(self):
    """Initialize the parent with a "binary"-mode CategoryEncoding layer."""
    binary_encoder = CategoryEncoding(output_mode="binary")
    super().__init__(binary_encoder)