Example #1
    def _encode_numerical_feature(
        feature: KerasTensor,
        name: str,
        dataset: Optional[BatchDataset],
    ) -> KerasTensor:
        """Normalize numerical features.

        Args:
            - feature: The input layer of the feature.
            - name: The feature's name (its column name in the original dataframe).
            - dataset: The training data; if not specified, a no-op layer is returned.

        Returns:
            The normalized tensor of the input feature.

        """
        # Return generic layer for the tuner initialization
        if not dataset:
            return KerasTensor(type_spec=TensorSpec(
                shape=(None, 1), dtype=tf.float32, name=None))

        # Create a Normalization layer for our feature
        normalizer = Normalization()

        # Prepare a Dataset that only yields our feature
        feature_ds = dataset.map(lambda x, y: x[name])
        feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

        # Learn the statistics of the data
        normalizer.adapt(feature_ds)

        # Normalize the input feature
        encoded_feature = normalizer(feature)

        return encoded_feature
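
A minimal usage sketch for the helper above, with all names hypothetical: `train_ds` is
assumed to be a batched tf.data.Dataset of (feature-dict, label) pairs and "age" a
numeric column.

import tensorflow as tf
from tensorflow.keras import Input

age_input = Input(shape=(1,), name="age", dtype=tf.float32)
encoded_age = _encode_numerical_feature(age_input, "age", train_ds)
# Passing dataset=None instead returns a generic (None, 1) float32 placeholder tensor.
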
def min_normalizer():
    """ Normalizer with minimalistic data."""
    adapt_data = np.array([
        [1., 2.],
        [2., 3.],
    ], dtype=np.float32)
    normalizer = Normalization()
    normalizer.adapt(adapt_data)
    return normalizer
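
A quick sanity check of the adapted layer above (the adapt data itself should map to
roughly [[-1, -1], [1, 1]], since each column has mean 1.5 or 2.5 and variance 0.25):

norm = min_normalizer()
print(norm(np.array([[1., 2.], [2., 3.]], dtype=np.float32)))
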
Example #3
def encode_numerical_feature(feature, name, dataset):
    normalizer = Normalization()

    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    normalizer.adapt(feature_ds)

    encoded_feature = normalizer(feature)
    return encoded_feature
Example #4
def encode_numerical_feature(feature, name, dataset):
    # Create a Keras Normalization Layer for the input feature passed as argument
    normalizer = Normalization()
    # Prepare a Dataset containing only the feature
    feature_dset = dataset.map(lambda x, y: x[name])
    feature_dset = feature_dset.map(lambda x: tf.expand_dims(x, -1))
    # Learn the statistics of the data and normalise the input feature
    normalizer.adapt(feature_dset)
    encoded_feature = normalizer(feature)
    return encoded_feature
def load_data():
    data = pd.read_csv(
        "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
    from sklearn.model_selection import train_test_split
    labels = data.pop('survived')
    label_names = ["Not survived", "Survived"]
    features = {}

    # Convert the CSV columns into symbolic TensorFlow inputs

    for name, column in data.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        features[name] = Input(shape=(1, ), name=name, dtype=dtype)

    # Extracting and normalizing numeric features
    numeric_features = {
        name: feature
        for name, feature in features.items() if feature.dtype == tf.float32
    }

    x = Concatenate()(list(numeric_features.values()))
    norm = Normalization()
    norm.adapt(np.array(data[numeric_features.keys()]))
    numeric_features = norm(x)

    processed_features = [numeric_features]
    # Extracting and normalizing non-numeric features

    for name, feature in features.items():
        if feature.dtype == tf.float32:
            continue
        word = StringLookup(vocabulary=np.unique(data[name]))
        one_hot = CategoryEncoding(max_tokens=word.vocab_size())

        x = word(feature)
        x = one_hot(x)
        processed_features.append(x)

    processed_features = Concatenate()(processed_features)
    processed_features = Model(features, processed_features)

    utils.plot_model(model=processed_features,
                     rankdir='LR',
                     dpi=72,
                     show_shapes=True)

    feature_dict = {name: np.array(value) for name, value in data.items()}

    train_features, test_features, train_labels, test_labels = train_test_split(
        processed_features(feature_dict).numpy(), labels, test_size=0.2)
    return train_features, train_labels, test_features, test_labels
Example #6
def encodeMyFeature(individualFeature, name, dataset):
    # Normalize the data
    normalizer = Normalization()
    # Prepare a dataset that yields only this feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data with the built-in adapt() method
    normalizer.adapt(feature_ds)
    encodedFeature = normalizer(individualFeature)
    return encodedFeature
Example #7
def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = Normalization()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    # Normalize the input feature
    encoded_feature = normalizer(feature)
    return encoded_feature
Example #8
    def __init__(self, filename, m_input='u', m_output='y'):
        # Load data
        data = loadmat(filename)

        # Normalize input data
        self.input = data[m_input]
        self.layernorminput = Normalization()
        self.layernorminput.adapt(self.input)
        self.input_norm = self.layernorminput(self.input)

        # Normalize output data
        self.output = data[m_output]
        self.layernormoutput = Normalization()
        self.layernormoutput.adapt(self.output)
        self.output_norm = self.layernormoutput(self.output)
Example #9
def discard_model(input_shape):
    """
    Discard Model
    Network structure based on a CNN design
    :param input_shape: data shape
    :return: compiled Keras model
    """
    k_input = keras.Input(input_shape)
    x = Normalization()(k_input)

    for _ in range(3):
        x = Conv2D(256, (3, 1), padding="same", data_format="channels_last")(x)
    for _ in range(5):
        x = residual_block(x, 256, _project_shortcut=True)

    x = Conv2D(kernel_size=1, strides=1, filters=1, padding="same")(x)
    x = Flatten()(x)
    outputs = Dense(34, activation="softmax")(x)
    # model = keras.applications.ResNet50V2(weights=None, input_shape=(64, 34, 1), classes=34, include_top=True)
    model = Model(k_input, outputs)
    model.summary()
    model.compile(keras.optimizers.Adam(learning_rate=0.008),
                  keras.losses.CategoricalCrossentropy(),
                  metrics=keras.metrics.CategoricalAccuracy())
    return model
Example #10
def hypertune(hp):
    input_shape = keras.Input((16, 34, 1))
    x = input_shape
    x = Normalization()(x)

    for i in range(hp.Int('num_conv_layer', 1, 5, default=3)):
        x = Conv2D(hp.Int('filters_' + str(i), 32, 512, step=32, default=256),
                   (3, 1),
                   padding="same",
                   data_format="channels_last")(x)
    for i in range(
            hp.Choice('num_res_block', [5, 10, 20, 30, 40, 50], default=5)):
        x = residual_block(x,
                           hp.Choice('filters_res_block' + str(i),
                                     [64, 128, 256, 512],
                                     default=256),
                           _project_shortcut=True)
        x = residual_block(x,
                           hp.Choice('filters_res_block' + str(i),
                                     [64, 128, 256, 512],
                                     default=256),
                           _project_shortcut=True)
    x = Conv2D(kernel_size=1, strides=1, filters=1, padding="same")(x)
    x = Flatten()(x)
    outputs = Dense(34, activation="softmax")(x)
    model = Model(input_shape, outputs)
    model.summary()
    model.compile(hp.Choice('optimizer', ['adam', 'sgd', 'Nadam']),
                  keras.losses.CategoricalCrossentropy(),
                  metrics=keras.metrics.CategoricalAccuracy())
    return model
    def __init__(
        self,
        pretrain_dataset: str = None,
        pooling: str = "max",
        task: str = "orig_labels",
    ):
        """A ResNet50 model that can be pretrained or trained from scratch,
        adapting to each relevant variation in this project. This is the
        model class version.

        WARNING: Although this seems cleaner, it doesn't work well with
        Weights and Biases' callback. Please use func_resnet instead.

        Args:
            pretrain_dataset (str, optional): The dataset on which the model
            is pretrained. If left unspecified, the model starts with random
            weights. Available options are "imagenet" and "bigearthnet".
            Defaults to None.
            pooling (str, optional): The type of global pooling to perform
            after the ResNet layers. Available options are "max" and "avg".
            Defaults to "max".
            task (str, optional): The task the model will be trained or
            fine-tuned on. Available options are "orig_labels" (original
            labels from the Kaggle challenge) and "deforestation".
            Defaults to "orig_labels".

        Raises:
            Exception: If `task` is not one of the supported options.
        """
        super(ResNet, self).__init__()
        self.pretrain_dataset = pretrain_dataset
        self.pooling = pooling
        self.task = task
        if self.task == "orig_labels":
            self.n_outputs = 17
        elif self.task == "deforestation":
            self.n_outputs = 1
        else:
            raise Exception(
                f'ERROR: Unrecognized task "{task}". Please select one of "orig_labels" or "deforestation".'
            )
        if pretrain_dataset == "bigearthnet":
            self.core = hub.KerasLayer(
                "https://tfhub.dev/google/remote_sensing/bigearthnet-resnet50/1"
            )
            # TensorFlow Hub modules require data in a [0, 1] range
            # stats estimated from subset of data in `02_eda_amazon_planet` notebook
            self.preprocess_input = Normalization(
                mean=[79.67114306, 87.08461826, 76.46177919],
                variance=[1857.54070494, 1382.94249315, 1266.69265399],
            )
        else:
            self.core = ResNet50(
                include_top=False,
                weights=pretrain_dataset,
                pooling=self.pooling,
            )
            # Using TensorFlow's ResNet-specific preprocessing
            self.preprocess_input = preprocess_input
        self.classifier = layers.Dense(self.n_outputs, activation="sigmoid")
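
A hypothetical instantiation of the class above (its call/forward method is not shown
in this excerpt):

resnet = ResNet(pretrain_dataset="bigearthnet", pooling="max", task="deforestation")
# An unrecognized task raises:
# ResNet(task="something_else")  # -> Exception: ERROR: Unrecognized task ...
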
Example #12
def rcpk_model(input_shape):
    """
    Riichi, Chi, Pon, Kan models
    Network structure based on a CNN design
    :param input_shape: data shape
    :return: compiled Keras model
    """
    k_input = keras.Input(input_shape)
    x = Normalization()(k_input)
    for _ in range(3):
        x = Conv2D(256, (3, 1), padding="same", data_format="channels_last")(x)
    for _ in range(5):
        x = residual_block(x, 256, _project_shortcut=True)
    for _ in range(3):
        x = Conv2D(32, (3, 1), padding="same", data_format="channels_last")(x)
    x = Flatten()(x)
    x = Dense(1024)(x)
    x = Dense(256)(x)
    outputs = Dense(2, activation="softmax")(x)

    model = Model(k_input, outputs)
    model.summary()
    model.compile(keras.optimizers.Adam(learning_rate=0.008),
                  keras.losses.BinaryCrossentropy(),
                  metrics=keras.metrics.Accuracy())
    return model
def func_resnet(
    pretrain_dataset: str = None,
    pooling: str = "max",
    task: str = "orig_labels",
):
    """A ResNet50 model that can be pretrained or trained from scratch,
    adapting to each relevant variation in this project. This is the
    functional API version.

    Works well with Weights and Biases' callback.

    Args:
        pretrain_dataset (str, optional): The dataset on which the model
        is pretrained. If left unspecified, the model starts with random
        weights. Available options are "imagenet" and "bigearthnet".
        Defaults to None.
        pooling (str, optional): The type of global pooling to perform
        after the ResNet layers. Available options are "max" and "avg".
        Defaults to "max".
        task (str, optional): The task the model will be trained or
        fine-tuned on. Available options are "orig_labels" (original
        labels from the Kaggle challenge) and "deforestation".
        Defaults to "orig_labels".

    Raises:
        Exception: If `task` is not one of the supported options.
    """
    inputs = layers.Input(shape=(256, 256, 3))
    if task == "orig_labels":
        n_outputs = 17
    elif task == "deforestation":
        n_outputs = 1
    else:
        raise Exception(
            f'ERROR: Unrecognized task "{task}". Please select one of "orig_labels" or "deforestation".'
        )
    if pretrain_dataset == "bigearthnet":
        # TensorFlow Hub modules require data in a [0, 1] range
        # stats estimated from subset of data in `02_eda_amazon_planet` notebook
        x = Normalization(
            mean=[79.67114306, 87.08461826, 76.46177919],
            variance=[1857.54070494, 1382.94249315, 1266.69265399],
        )(inputs)
        x = data_augmentation(x)
        x = hub.KerasLayer(
            "https://tfhub.dev/google/remote_sensing/bigearthnet-resnet50/1")(
                x)
    else:
        # Using TensorFlow's ResNet-specific preprocessing
        x = preprocess_input(inputs)
        x = data_augmentation(x)
        x = ResNet50(
            include_top=False,
            weights=pretrain_dataset,
            pooling=pooling,
        )(x)
    outputs = layers.Dense(n_outputs, activation="sigmoid")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model
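
A short, hedged usage sketch for func_resnet, assuming the data_augmentation pipeline
and preprocess_input referenced above are defined in the same module; the compile
settings are just one reasonable choice for the multi-label sigmoid output:

model = func_resnet(pretrain_dataset="imagenet", pooling="avg", task="orig_labels")
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()
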
Example #14
def train(data):

    types = set()
    for file in data["files"]:
        types.add(file["type"])

    # Try to get >100 of each type
    training_input, training_output = convert_data_to_np(data, list(types))

    # n = tf.keras.utils.normalize(training_input, axis=-1, order=2)
    normalizer = Normalization(axis=-1)
    normalizer.adapt(training_input)
    normalized_data = normalizer(training_input)
    print("var: %.4f" % np.var(normalized_data))
    print("mean: %.4f" % np.mean(normalized_data))

    dense = keras.layers.Dense(units=16)
Example #15
 def __init__(self,
              test_size: float = 0.2,
              validation_size: float = 0.33) -> None:
     # User-defined constants
     self.num_targets = 1
     self.batch_size = 128
     # Load the data set
     dataset = fetch_california_housing()
     self.x, self.y = dataset.data, dataset.target
     self.feature_names = dataset.feature_names
     self.description = dataset.DESCR
     # Split the dataset
     x_train, x_test, y_train, y_test = train_test_split(
         self.x, self.y, test_size=test_size)
     x_train, x_val, y_train, y_val = train_test_split(
         x_train, y_train, test_size=validation_size)
     # Preprocess x data
     self.x_train = x_train.astype(np.float32)
     self.x_test = x_test.astype(np.float32)
     self.x_val = x_val.astype(np.float32)
     # Preprocess y data
     self.y_train = np.reshape(y_train,
                               (-1, self.num_targets)).astype(np.float32)
     self.y_test = np.reshape(y_test,
                              (-1, self.num_targets)).astype(np.float32)
     self.y_val = np.reshape(y_val,
                             (-1, self.num_targets)).astype(np.float32)
     # Dataset attributes
     self.train_size = self.x_train.shape[0]
     self.test_size = self.x_test.shape[0]
     self.num_features = self.x_train.shape[1]
     self.num_targets = self.y_train.shape[1]
     # Normalization variables
     self.normalization_layer = Normalization()
     self.normalization_layer.adapt(self.x_train)
     # tf.data Datasets
     self.train_dataset = tf.data.Dataset.from_tensor_slices(
         (self.x_train, self.y_train))
     self.test_dataset = tf.data.Dataset.from_tensor_slices(
         (self.x_test, self.y_test))
     self.val_dataset = tf.data.Dataset.from_tensor_slices(
         (self.x_val, self.y_val))
     self.train_dataset = self._prepare_dataset(self.train_dataset,
                                                shuffle=True)
     self.test_dataset = self._prepare_dataset(self.test_dataset)
     self.val_dataset = self._prepare_dataset(self.val_dataset)
Example #16
def cifar_standardization(x, mode='FEATURE_NORMALIZE', data_samples=None):
    mode = mode.upper()
    assert mode in ['FEATURE_NORMALIZE', 'PIXEL_MEAN_SUBTRACT']

    if mode == 'PIXEL_MEAN_SUBTRACT' and data_samples is None:
        raise ValueError('`data_samples` argument should not be `None`, '
                         'when `mode="PIXEL_MEAN_SUBTRACT"`.')

    if mode == 'FEATURE_NORMALIZE':
        cifar_mean = tf.cast(CIFAR_MEAN, tf.float32).numpy()
        cifar_std = tf.cast(CIFAR_STD, tf.float32).numpy()

        x = Rescaling(scale=1. / cifar_std,
                      offset=-(cifar_mean / cifar_std),
                      name='mean_normalization')(x)
    elif mode == 'PIXEL_MEAN_SUBTRACT':
        mean_subtraction_layer = Normalization(axis=[1, 2, 3],
                                               name='pixel_mean_subtraction')
        mean_subtraction_layer.adapt(data_samples)

        # set values of variance = 1. and keep mean values as is
        mean_pixels = mean_subtraction_layer.get_weights()[0]
        mean_subtraction_layer.set_weights(
            [mean_pixels, tf.ones_like(mean_pixels)])

        x = mean_subtraction_layer(x)
        x = Rescaling(scale=1 / 255., name='rescaling')(x)
    return x
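
A hedged usage sketch for cifar_standardization in 'PIXEL_MEAN_SUBTRACT' mode, which
avoids the CIFAR_MEAN / CIFAR_STD constants not shown in this excerpt; `sample_images`
is a hypothetical stand-in for real CIFAR images:

sample_images = np.random.randint(0, 256, size=(128, 32, 32, 3)).astype("float32")
inputs = tf.keras.Input(shape=(32, 32, 3))
x = cifar_standardization(inputs, mode='PIXEL_MEAN_SUBTRACT', data_samples=sample_images)
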
Example #17
    def create(self, params=None):
        self.set_params(params)
        inputs = Input(shape=(self.input_shape, ))
        norm = Normalization()(inputs)
        dense_1 = Dense(32, activation="relu")(norm)
        relu_1 = ReLU()(dense_1)
        dense_2 = Dense(32, activation="relu")(relu_1)
        outputs = Dense(1, activation="sigmoid")(dense_2)

        self.model = Model(inputs=inputs, outputs=outputs, name=self.name)
Example #18
 def __init__(self, excel_file_path: str, test_size: float = 0.2, validation_size: float = 0.33) -> None:
     # Load the dataset
     dataset = load_dataset(excel_file_path)
     self.x = dataset["data"]
     self.y = dataset["target"]
     self.feature_names = dataset["feature_names"]
     # User-defined constants
     self.num_targets = 1
     self.batch_size = 128
     # Split the dataset
     x_train, x_test, y_train, y_test = train_test_split(self.x, self.y, test_size=test_size)
     x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_size)
     # Preprocess x data
     self.x_train = x_train.astype(np.float32)
     self.x_test = x_test.astype(np.float32)
     self.x_val = x_val.astype(np.float32)
     # Preprocess y data
     self.y_train = np.reshape(y_train, (-1, self.num_targets)).astype(np.float32)
     self.y_test = np.reshape(y_test, (-1, self.num_targets)).astype(np.float32)
     self.y_val = np.reshape(y_val, (-1, self.num_targets)).astype(np.float32)
     # Dataset attributes
     self.train_size = self.x_train.shape[0]
     self.test_size = self.x_test.shape[0]
     self.num_features = self.x_train.shape[1]
     self.num_targets = self.y_train.shape[1]
     # Normalization variables
     self.normalization_layer = Normalization()
     self.normalization_layer.adapt(self.x_train)
     # tf.data Datasets
     self.train_dataset = tf.data.Dataset.from_tensor_slices((self.x_train, self.y_train))
     self.test_dataset = tf.data.Dataset.from_tensor_slices((self.x_test, self.y_test))
     self.val_dataset = tf.data.Dataset.from_tensor_slices((self.x_val, self.y_val))
     # Dataset preparation
     self.train_dataset = self._prepare_dataset(self.train_dataset, shuffle=True)
     self.test_dataset = self._prepare_dataset(self.test_dataset)
     self.val_dataset = self._prepare_dataset(self.val_dataset)
Example #19
inputWords = [[gloveDict.get(w, [0] * dims) for w in t] for t in inputWords] #vectorise words: if the word is unique to MTG it just gets set to 0
print("Vectorised oracle text")

#split data into training and test datasets
testNames = [data[i]["name"] for i in range(len(data)) if data[i]["set"] == "m20"]
trainVecs = [inputVecs[i] for i in range(len(inputVecs)) if not data[i]["set"] == "m20"]
testVecs = [inputVecs[i] for i in range(len(inputVecs)) if data[i]["set"] == "m20"]
trainWords = tf.ragged.constant([inputWords[i] for i in range(len(inputWords)) if not data[i]["set"] == "m20"])
testWords = tf.ragged.constant([inputWords[i] for i in range(len(inputWords)) if data[i]["set"] == "m20"])
trainCorRars = [corRars[i] for i in range(len(corRars)) if not data[i]["set"] == "m20"]
testCorRars = [corRars[i] for i in range(len(corRars)) if data[i]["set"] == "m20"]

#normalise input vectors
trainVecs = np.array(trainVecs).astype("float32") #convert to numpy format
normalizer = Normalization()
normalizer.adapt(trainVecs)
trainVecs = normalizer(trainVecs)
testVecs = normalizer(testVecs)
print("Normalised numerical data")

#build keras model
wordIn = layers.Input(shape = (None, len(inputWords[0][0]))) #input layer for var-length word vec data
numIn = layers.Input(shape = (len(inputVecs[0]), )) #input layer for fixed-length numerical data
rnn = layers.LSTM(32)(wordIn) #RNN layer for word vec data
numLayer = layers.Dense(20, activation = "relu")(numIn) #layer for numerical data
merge = layers.concatenate([rnn, numLayer]) #combine the two vectors
combLayer = layers.Dense(64, activation = "relu")(merge) #hidden intermediate layer for combined data
out = layers.Dense(3, activation = "softmax")(combLayer) #final layer: softmax ensures output is a set of probabilities
model = DualModel(inputs = [numIn, wordIn], outputs = [out])
model.compile(loss = "sparse_categorical_crossentropy", metrics = "sparse_categorical_accuracy", optimizer = "adam") #I have no idea whether these ones are the best ones to use
Example #20
 def __init__(self):
     super().__init__(Normalization())
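
The fragment below calls a `vectorizer` on `training_data`, neither of which is defined
in this excerpt; a minimal sketch of what they might look like (a bigram
TextVectorization adapted on two sample sentences, all names assumed):

import numpy as np
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

training_data = np.array([["This is the 1st sample."], ["And here's the 2nd sample."]])
vectorizer = TextVectorization(output_mode="int", ngrams=2)
vectorizer.adapt(training_data)
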
# Only n-grams present in the `adapt()` data are kept in the vocabulary; unknown
# n-grams are encoded via an "out-of-vocabulary" token.
integer_data = vectorizer(training_data)
print(integer_data)
"""
**Example: normalizing features**

"""

from tensorflow.keras.layers.experimental.preprocessing import Normalization

# Example image data, with values in the [0, 255] range
training_data = np.random.randint(0, 256,
                                  size=(64, 200, 200, 3)).astype("float32")

normalizer = Normalization(axis=-1)
normalizer.adapt(training_data)

normalized_data = normalizer(training_data)
print("var: %.4f" % np.var(normalized_data))
print("mean: %.4f" % np.mean(normalized_data))
"""
**Example: rescaling & center-cropping images**

Both the `Rescaling` layer and the `CenterCrop` layer are stateless, so it isn't
necessary to call `adapt()` in this case.
"""

from tensorflow.keras.layers.experimental.preprocessing import CenterCrop
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
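
The usage that the heading above announces appears to be missing here; a minimal
sketch, reusing the image `training_data` defined earlier in this excerpt:

cropper = CenterCrop(height=150, width=150)
scaler = Rescaling(scale=1.0 / 255)

output_data = scaler(cropper(training_data))
print("shape:", output_data.shape)
print("min:", np.min(output_data))
print("max:", np.max(output_data))
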
model.fit(x_train, y_train, epochs=1,
          validation_split=0.1, batch_size=16)
"""
In general, it's a good practice to develop models that
take raw data as input, as opposed to models that take
already-preprocessed data. The reason being that, if your
model expects preprocessed data, any time you export
your model to use it elsewhere (in a web browser, in a
mobile app), you'll need to reimplement the same exact
preprocessing pipeline. This can be a bit tricky to do.
"""
# normalize in range [0, 1]
scaling_layer = Rescaling(1.0 / 255)
# normalize in range [-1, 1]
input_ = tf.keras.Input(shape=(32, 32, 3))
norm_neg_one_to_one = Normalization()
x = norm_neg_one_to_one(input_)
import numpy as np
mean = np.array([127.5] * 3)  # per-channel mean, so (x - 127.5) / 127.5 maps [0, 255] to [-1, 1]
var = mean ** 2
norm_neg_one_to_one.set_weights([mean, var])
norm_neg_one_to_one.get_weights()

# normalize with mean 0 and std 1
norm_mean_std = Normalization()
norm_mean_std.adapt(x_train[0])

model_ = Sequential([
    tf.keras.Input(shape=(32, 32, 3)),
    norm_mean_std,
    model
])
Example #23
dept_targets = np.random.randint(2, size=(6, 4))
print(dept_targets)

dataset = keras.preprocessing.image_dataset_from_directory('mian_directory',
                                                           batch_size=64,
                                                           image_size=(2000,
                                                                       2000))
for data, labels in dataset:
    print(data.shape)
    print(data.dtype)
    print(labels.shape)
    print(labels.dtype)
# Normalization
training_data = np.random.randint(0, 256,
                                  size=(64, 200, 200, 3)).astype("float32")
normalization = Normalization(axis=-1)  # normalize along the last axis
normalization.adapt(training_data)
normalized_data = normalization(training_data)
print("var:%.4f" % np.var(normalized_data))  # np.var computes the variance
print("mean:%.4f" % np.mean(normalized_data))
# Rescaling and center-cropping images
cropper = CenterCrop(height=150, width=150)
scaler = Rescaling(scale=1.0 / 255)
# The Rescaling and CenterCrop layers are stateless, so there is no need
# to call adapt() in this case.
output_data = scaler(cropper(training_data))
print(output_data.shape)
print("min:", np.min(output_data))

# Build a model with the Keras Functional API
dense = keras.layers.Dense(units=16)  # it maps its input to a 16-dimensional feature space
Example #24
class CALIHOUSING:
    def __init__(self,
                 test_size: float = 0.2,
                 validation_size: float = 0.33) -> None:
        # User-defined constants
        self.num_targets = 1
        self.batch_size = 128
        # Load the data set
        dataset = fetch_california_housing()
        self.x, self.y = dataset.data, dataset.target
        self.feature_names = dataset.feature_names
        self.description = dataset.DESCR
        # Split the dataset
        x_train, x_test, y_train, y_test = train_test_split(
            self.x, self.y, test_size=test_size)
        x_train, x_val, y_train, y_val = train_test_split(
            x_train, y_train, test_size=validation_size)
        # Preprocess x data
        self.x_train = x_train.astype(np.float32)
        self.x_test = x_test.astype(np.float32)
        self.x_val = x_val.astype(np.float32)
        # Preprocess y data
        self.y_train = np.reshape(y_train,
                                  (-1, self.num_targets)).astype(np.float32)
        self.y_test = np.reshape(y_test,
                                 (-1, self.num_targets)).astype(np.float32)
        self.y_val = np.reshape(y_val,
                                (-1, self.num_targets)).astype(np.float32)
        # Dataset attributes
        self.train_size = self.x_train.shape[0]
        self.test_size = self.x_test.shape[0]
        self.num_features = self.x_train.shape[1]
        self.num_targets = self.y_train.shape[1]
        # Normalization variables
        self.normalization_layer = Normalization()
        self.normalization_layer.adapt(self.x_train)
        # tf.data Datasets
        self.train_dataset = tf.data.Dataset.from_tensor_slices(
            (self.x_train, self.y_train))
        self.test_dataset = tf.data.Dataset.from_tensor_slices(
            (self.x_test, self.y_test))
        self.val_dataset = tf.data.Dataset.from_tensor_slices(
            (self.x_val, self.y_val))
        self.train_dataset = self._prepare_dataset(self.train_dataset,
                                                   shuffle=True)
        self.test_dataset = self._prepare_dataset(self.test_dataset)
        self.val_dataset = self._prepare_dataset(self.val_dataset)

    def get_train_set(self) -> tf.data.Dataset:
        return self.train_dataset

    def get_test_set(self) -> tf.data.Dataset:
        return self.test_dataset

    def get_val_set(self) -> tf.data.Dataset:
        return self.val_dataset

    def _prepare_dataset(self,
                         dataset: tf.data.Dataset,
                         shuffle: bool = False) -> tf.data.Dataset:
        dataset = dataset.map(
            map_func=lambda x, y: (tf.reshape(self.normalization_layer(
                tf.reshape(x, shape=(1, self.num_features)), training=False),
                                              shape=(self.num_features, )), y),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if shuffle:
            dataset = dataset.shuffle(buffer_size=1_000)

        dataset = dataset.batch(batch_size=self.batch_size)

        return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
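
A brief, hypothetical usage of the CALIHOUSING class above:

data = CALIHOUSING()
for x_batch, y_batch in data.get_train_set().take(1):
    print(x_batch.shape, y_batch.shape)  # (128, 8) and (128, 1) for California Housing
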
 def __init__(self, data=None, axis=-1):
     """ typically the last axis is the one we normalize over """
     super().__init__(data=data)
     self.processor = Normalization(axis=axis)