def build_model():
    """Build a small Keras model containing a ``MultiCategoryEncoding`` layer.

    The model takes a three-column string input, runs it through an encoding
    layer configured with the encodings ``INT``, ``INT``, ``NONE`` and feeds
    the result to a single-unit ``Dense`` layer.

    Returns:
        A ``(model, layer)`` tuple holding the ``tf.keras.Model`` and the
        ``MultiCategoryEncoding`` layer instance used inside it.
    """
    inputs = tf.keras.Input(shape=(3,), dtype=tf.string)

    column_encodings = [layer_module.INT, layer_module.INT, layer_module.NONE]
    encoder = layer_module.MultiCategoryEncoding(encoding=column_encodings)

    encoded = encoder(inputs)
    outputs = tf.keras.layers.Dense(1)(encoded)
    return tf.keras.Model(inputs, outputs), encoder
def test_multi_column_categorical_encoding(tmp_path):
    """Check the encoded output of ``MultiCategoryEncoding`` inside a model.

    The layer is configured with the encodings ``INT``, ``INT``, ``NONE``,
    adapted on the training features, and trained for one epoch as part of
    a binary classifier. A second model exposing only the encoding layer's
    output is then used to inspect the encoded values.

    Args:
        tmp_path: pytest temporary-directory fixture; not used in the body.
    """
    x_train, x_test, y_train = get_data()

    input_node = tf.keras.Input(shape=(3,), dtype=tf.string)
    layer = layer_module.MultiCategoryEncoding(
        [layer_module.INT, layer_module.INT, layer_module.NONE]
    )
    hidden_node = layer(input_node)
    output_node = tf.keras.layers.Dense(1, activation="sigmoid")(hidden_node)
    model = tf.keras.Model(input_node, output_node)
    model.compile(loss="binary_crossentropy", optimizer="adam")

    # Adapt the encoding layer on the training features, then fit the model.
    layer.adapt(tf.data.Dataset.from_tensor_slices(x_train).batch(32))
    model.fit(x_train, y_train, epochs=1)

    # Expose the encoding layer's output directly so it can be inspected.
    model2 = tf.keras.Model(input_node, hidden_node)
    result = model2.predict(x_train)

    # Column 0: rows 0 and 2 encode to the same value, row 1 differs.
    assert result[0][0] == result[2][0]
    assert result[0][0] != result[1][0]
    # Column 1: row 0 differs from both row 1 and row 2.
    assert result[0][1] != result[1][1]
    assert result[0][1] != result[2][1]
    # Column 2 of row 2 comes out as zero.
    assert result[2][2] == 0

    output = model2.predict(x_test)
    assert output.dtype == np.dtype("float32")
def test_preprocessing_adapt_with_cat_to_int_and_norm():
    """Run ``AutoTuner.adapt`` on a model with encoding and normalization.

    Builds a batched dataset of string features and targets, a ``Sequential``
    model made of a ``MultiCategoryEncoding(["int", "none"])`` layer followed
    by ``Normalization(axis=-1)``, and calls ``AutoTuner.adapt`` on them.
    The test has no assertions; it passes when no exception is raised.
    """
    # ``np.unicode`` was only an alias of the builtin ``str``; it has been
    # deprecated since NumPy 1.20 and is unavailable in later releases, so
    # use ``str`` directly to get the same unicode dtype.
    x = np.array([["a", 5], ["b", 6]]).astype(str)
    y = np.array([[1, 2], [3, 4]]).astype(str)
    dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(32)

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(2,), dtype=tf.string))
    model.add(keras_layers.MultiCategoryEncoding(["int", "none"]))
    model.add(preprocessing.Normalization(axis=-1))

    tuner_module.AutoTuner.adapt(model, dataset)
def build(self, hp, inputs=None):
    """Apply a ``MultiCategoryEncoding`` layer to the first input node.

    One encoding is chosen per entry of ``self.column_names``:
    ``keras_layers.INT`` when the column's type in ``self.column_types`` is
    ``analysers.CATEGORICAL``, otherwise ``keras_layers.NONE``.

    Args:
        hp: Hyperparameters object; not used in this method's body.
        inputs: Input node or nested structure of nodes. Only the first
            element of the flattened structure is used.

    Returns:
        The result of calling the encoding layer on the first input node.
    """
    first_input = nest.flatten(inputs)[0]
    # TODO: Search to use one-hot or int.
    encoding = [
        keras_layers.INT
        if self.column_types[name] == analysers.CATEGORICAL
        else keras_layers.NONE
        for name in self.column_names
    ]
    encoder = keras_layers.MultiCategoryEncoding(encoding)
    return encoder(first_input)
def test_model_save_load_output_same(tmp_path):
    """Check that a saved and reloaded model predicts the same values.

    An adapted ``MultiCategoryEncoding`` layer is wrapped in a ``Sequential``
    model, saved under ``tmp_path``, loaded back, and both models are asked
    to predict on the same training array.

    Args:
        tmp_path: pytest temporary-directory fixture used as the save root.
    """
    features = np.array([["a", "ab", 2.1], ["b", "bc", 1.0], ["a", "bc", "nan"]])

    encoder = layer_module.MultiCategoryEncoding(
        encoding=[layer_module.INT, layer_module.INT, layer_module.NONE]
    )
    encoder.adapt(tf.data.Dataset.from_tensor_slices(features).batch(32))

    original = tf.keras.Sequential(
        [tf.keras.Input(shape=(3,), dtype=tf.string), encoder]
    )

    # Save and reload through the same directory.
    save_dir = os.path.join(tmp_path, "model")
    original.save(save_dir)
    restored = tf.keras.models.load_model(save_dir)

    expected = original.predict(features)
    actual = restored.predict(features)
    assert np.array_equal(expected, actual)
def __init__(self, column_names, column_types, **kwargs):
    """Store the column metadata and create the encoding layer.

    One encoding is chosen per entry of ``column_names``:
    ``keras_layers.INT`` when the column's type in ``column_types`` is
    ``analysers.CATEGORICAL``, otherwise ``keras_layers.NONE``. The
    resulting ``MultiCategoryEncoding`` layer is kept in ``self.layer``.

    Args:
        column_names: Iterable of column names, in column order.
        column_types: Mapping from each column name to its column type.
        **kwargs: Forwarded unchanged to the parent class initializer.
    """
    super().__init__(**kwargs)
    self.column_names = column_names
    self.column_types = column_types

    # TODO: Search to use one-hot or int.
    per_column_encoding = [
        keras_layers.INT
        if self.column_types[name] == analysers.CATEGORICAL
        else keras_layers.NONE
        for name in self.column_names
    ]
    self.layer = keras_layers.MultiCategoryEncoding(per_column_encoding)
def test_multi_cat_encode_strings_correctly(tmp_path):
    """Check the values produced by an adapted ``MultiCategoryEncoding``.

    The layer is configured with the encodings ``INT``, ``INT``, ``NONE``,
    adapted on a three-row string array, and mapped over a batched dataset
    of the same array. The assertions run on the last batch produced.

    Args:
        tmp_path: pytest temporary-directory fixture; not used in the body.
    """
    x_train = np.array([["a", "ab", 2.1], ["b", "bc", 1.0], ["a", "bc", "nan"]])

    column_encodings = [layer_module.INT, layer_module.INT, layer_module.NONE]
    layer = layer_module.MultiCategoryEncoding(column_encodings)

    dataset = tf.data.Dataset.from_tensor_slices(x_train).batch(32)
    layer.adapt(tf.data.Dataset.from_tensor_slices(x_train).batch(32))

    # Keep the last encoded batch for the checks below.
    for encoded_batch in dataset.map(layer):
        result = encoded_batch

    # Column 0: rows 0 and 2 share the value "a"; row 1 holds "b".
    assert result[0][0] == result[2][0]
    assert result[0][0] != result[1][0]
    # Column 1: row 0 holds "ab"; rows 1 and 2 hold "bc".
    assert result[0][1] != result[1][1]
    assert result[0][1] != result[2][1]
    # Column 2 of row 2 holds "nan" and is expected to come out as zero.
    assert result[2][2] == 0
    assert result.dtype == tf.float32
def test_call_multi_with_single_column_return_right_shape():
    """Check the output shape for a single-column input.

    A ``MultiCategoryEncoding`` layer with one ``INT`` encoding is called
    directly on a ``(3, 1)`` string array, without a prior ``adapt`` call,
    and the output is expected to have shape ``(3, 1)``.
    """
    single_column = np.array([["a"], ["b"], ["a"]])
    layer = layer_module.MultiCategoryEncoding(encoding=[layer_module.INT])

    encoded = layer(single_column)
    assert encoded.shape == (3, 1)
def test_init_multi_one_hot_encode():
    """Check that the layer can be constructed with a ``ONE_HOT`` encoding.

    The test has no assertions; it passes when constructing the layer with
    the encodings ``ONE_HOT``, ``INT``, ``NONE`` raises no exception.
    """
    column_encodings = [
        layer_module.ONE_HOT,
        layer_module.INT,
        layer_module.NONE,
    ]
    layer_module.MultiCategoryEncoding(encoding=column_encodings)
def test_call_multi_with_single_column_return_right_shape():
    """Check the output shape for a single-column input after adapting.

    A ``MultiCategoryEncoding`` layer with one ``INT`` encoding is adapted
    on a ``(3, 1)`` string array and then called on that same array; the
    output is expected to have shape ``(3, 1)``.
    """
    single_column = np.array([["a"], ["b"], ["a"]])
    encoder = layer_module.MultiCategoryEncoding(encoding=[layer_module.INT])

    adapt_data = tf.data.Dataset.from_tensor_slices(single_column).batch(32)
    encoder.adapt(adapt_data)

    encoded = encoder(single_column)
    assert encoded.shape == (3, 1)