def get_train_dataloader(self, batch_size=500, shuffle=True, drop_last=True):
        """Build a pytorch DataLoader over the training dataset.

        The training dataset is created from the numpy file at
        ``self._train_data_path`` the first time it is needed and is then
        cached in ``self._train_dataset``; later calls reuse the cached
        dataset and only construct a new DataLoader around it.

        Args:
            batch_size: Integer, number of samples per batch, default=500.
            shuffle: Boolean, reshuffle the data at every epoch when True,
                default=True.
            drop_last: Boolean, drop the last incomplete batch when True,
                default=True.

        Returns:
            A pytorch DataLoader object wrapping the training dataset.

        Raises:
            FileNotFoundError: Propagated from ``np.load`` when the numpy
                data file doesn't exist.
        """
        dataset = self._train_dataset
        if dataset is None:
            # First access: load the array from disk; no output data is given.
            dataset = MamoDataset(np.load(self._train_data_path), None)
            self._train_dataset = dataset
        loader_options = {
            'batch_size': batch_size,
            'shuffle': shuffle,
            'drop_last': drop_last,
        }
        return DataLoader(dataset, **loader_options)
def test_none_value_error_1d():
    """Expect a ValueError with the 'input data is None' message.

    The dataset is built with ``None`` as input data and a 1-d output array,
    and its length is then requested inside the ``pytest.raises`` context.
    """
    expected_message = 'The input data is None, please give a valid input data.'
    targets = np.array([-1, -2])
    with pytest.raises(ValueError, match=expected_message):
        broken_dataset = MamoDataset(None, targets)
        broken_dataset.__len__()
def test_length_value_error_2d():
    """Expect a ValueError when input and output lengths differ.

    The input has three rows while the output has two entries; building the
    dataset and requesting its length happens inside ``pytest.raises``.
    """
    expected_message = (
        'The length of the input data must match the length of the output data!'
    )
    features = np.array([[1, 2], [3, 4], [5, 6]])
    targets = np.array([-1, -2])
    with pytest.raises(ValueError, match=expected_message):
        mismatched_dataset = MamoDataset(features, targets)
        mismatched_dataset.__len__()
    def get_traindata_len(self):
        """Returns the number of samples in the training dataset.

        The training dataset is loaded from ``self._train_data_path`` and
        cached in ``self._train_dataset`` if it has not been created yet.

        Returns:
            Returns integer, the number of samples in the
            training dataset.
        """
        if self._train_dataset is None:
            # Lazy load on first use; the training set has no output data.
            self._train_dataset = MamoDataset(np.load(self._train_data_path),
                                              None)
        # Use the built-in instead of invoking the dunder method directly.
        return len(self._train_dataset)
    def get_testdata_len(self):
        """Returns the number of samples in the testing dataset.

        The testing dataset is loaded from ``self._test_input_data_path`` and
        ``self._test_output_data_path`` and cached in ``self._test_dataset``
        if it has not been created yet.

        Returns:
            Returns integer, the number of samples in the
            testing dataset.
        """
        if self._test_dataset is None:
            # Lazy load on first use from the input/output numpy files.
            self._test_dataset = MamoDataset(
                np.load(self._test_input_data_path),
                np.load(self._test_output_data_path))
        # Use the built-in instead of invoking the dunder method directly.
        return len(self._test_dataset)
예제 #6
0
def test_validator_mock_opposite_model():
    """Check the Validator output for the MockOpposite model.

    The dataset pairs ``input_data`` with a copy of itself. The test expects
    ``evaluate()`` to return a tuple of two lists whose first entries are 0
    and 1 respectively, and the combined objective to equal 1.
    """
    paired_dataset = MamoDataset(input_data, input_data.copy())
    loader = DataLoader(paired_dataset,
                        batch_size=1,
                        shuffle=False,
                        drop_last=False)
    validator = Validator(MockOpposite(), loader, [RecallAtK(1)], [MSELoss()])
    outcome = validator.evaluate()
    assert isinstance(outcome, tuple)

    # First element: one value per metric passed to the Validator.
    assert isinstance(outcome[0], list)
    assert outcome[0][0] == 0

    # Second element: one value per objective passed to the Validator.
    assert isinstance(outcome[1], list)
    assert outcome[1][0] == 1
    assert validator.combine_objectives(outcome[1]) == 1
예제 #7
0
def test_validator_mock_shift_right_by_one_model():
    """Check the Validator output for the MockShiftRightByOne model.

    The expected output is ``input_data`` rolled by one position along
    axis 1. The test expects ``evaluate()`` to return a tuple of two lists
    whose first entries are 1 and 0 respectively, and the combined objective
    to equal 0.
    """
    shifted_targets = np.roll(input_data.copy(), shift=1, axis=1)
    paired_dataset = MamoDataset(input_data, shifted_targets)
    loader = DataLoader(paired_dataset,
                        batch_size=1,
                        shuffle=False,
                        drop_last=False)
    validator = Validator(MockShiftRightByOne(), loader, [RecallAtK(1)],
                          [MSELoss()])
    outcome = validator.evaluate()
    assert isinstance(outcome, tuple)

    # First element: one value per metric passed to the Validator.
    assert isinstance(outcome[0], list)
    assert outcome[0][0] == 1

    # Second element: one value per objective passed to the Validator.
    assert isinstance(outcome[1], list)
    assert outcome[1][0] == 0
    assert validator.combine_objectives(outcome[1]) == 0
예제 #8
0
def test_validator_mock_no_change_model():
    """Check the Validator output for the MockNoChange model.

    The dataset pairs ``input_data`` with a copy of itself. The test expects
    a first metric value of 0, and both the first objective value and the
    combined objective to match ``np.mean(input_data)`` after rounding to
    two decimals.
    """
    paired_dataset = MamoDataset(input_data, input_data.copy())
    loader = DataLoader(paired_dataset,
                        batch_size=1,
                        shuffle=False,
                        drop_last=False)
    validator = Validator(MockNoChange(), loader, [RecallAtK(1)], [MSELoss()])
    outcome = validator.evaluate()
    assert isinstance(outcome, tuple)
    assert isinstance(outcome[0], list)
    assert outcome[0][0] == 0
    assert isinstance(outcome[1], list)

    # NOTE(review): the original comment here read "Removing chosen elements";
    # presumably the evaluation removes the chosen elements, so the loss
    # reduces to the mean of the input -- confirm against Validator.
    expected_mse = np.mean(input_data)
    assert round(outcome[1][0], 2) == round(expected_mse, 2)
    combined = validator.combine_objectives(outcome[1])
    assert round(combined, 2) == round(expected_mse, 2)
예제 #9
0
def test_validator_mock_all_zeros_model():
    """Check the Validator output for MockAllZeros with one argument omitted.

    Two validators are built over the same DataLoader: one with metrics but
    ``None`` objectives, and one with ``None`` metrics but an MSE objective.
    In each case the list for the omitted argument is expected to be empty.
    """
    paired_dataset = MamoDataset(input_data, input_data.copy())
    loader = DataLoader(paired_dataset,
                        batch_size=1,
                        shuffle=False,
                        drop_last=False)

    # Case 1: metrics only, objectives passed as None.
    validator = Validator(MockAllZeros(), loader, [RecallAtK(1)], None)
    outcome = validator.evaluate()
    assert isinstance(outcome, tuple)
    assert isinstance(outcome[0], list)
    assert outcome[0][0] == 0
    assert isinstance(outcome[1], list)
    assert outcome[1] == []

    # Case 2: objectives only, metrics passed as None.
    validator = Validator(MockAllZeros(), loader, None, [MSELoss()])
    outcome = validator.evaluate()
    assert isinstance(outcome, tuple)
    assert isinstance(outcome[0], list)
    assert outcome[0] == []
    assert isinstance(outcome[1], list)
    expected_mse = np.mean(input_data)
    assert round(outcome[1][0], 2) == round(expected_mse, 2)
    combined = validator.combine_objectives(outcome[1])
    assert round(combined, 2) == round(expected_mse, 2)
"""Copyright © 2020-present, Swisscom (Schweiz) AG.
All rights reserved.

These tests don't need any custom library or any data loading.
To run them, just execute 'pytest'.
"""
from dataloader.mamo_dataset import MamoDataset
import pytest
import numpy as np

# Fixtures for the 1-d tests: the integers 1..10 and their negations.
test_input = np.arange(1, 11)
test_output = -test_input

# Two datasets for testing: one built from the input only, one built from
# the input together with the output.
mamoDataset1 = MamoDataset(test_input)
mamoDataset2 = MamoDataset(test_input, test_output)


# Testing the none input data error
def test_none_value_error_1d():
    """Expect a ValueError with the 'input data is None' message.

    The dataset is built with ``None`` as input data and a 1-d output array,
    and its length is then requested inside the ``pytest.raises`` context.
    """
    expected_message = 'The input data is None, please give a valid input data.'
    targets = np.array([-1, -2])
    with pytest.raises(ValueError, match=expected_message):
        broken_dataset = MamoDataset(None, targets)
        broken_dataset.__len__()


# Testing the length error
# Create the temporary data directory. makedirs(exist_ok=True) replaces the
# isdir()/mkdir() pair, so there is no window between the check and the
# creation, and the directory name is defined in exactly one place.
dir_path = 'test_data_val/'
os.makedirs(dir_path, exist_ok=True)

# Generate random data; the seed is fixed so the generated arrays are the
# same on every run.
np.random.seed(42)

test_input_data_path = os.path.join(dir_path, 'movielens_small_test_input.npy')
test_output_data_path = os.path.join(dir_path, 'movielens_small_test_test.npy')

# Two float32 arrays of shape (2000, 8936) are written to disk: the input
# array first, then the output array (the order matters for the seeded RNG).
np.save(test_input_data_path, np.random.rand(2000, 8936).astype('float32'))
np.save(test_output_data_path, np.random.rand(2000, 8936).astype('float32'))

# Variables shared by the tests in this module.
dataset = MamoDataset(np.load(test_input_data_path),
                      np.load(test_output_data_path))

model = MultiVAE(params='yaml_files/params_multi_VAE.yaml')
model.initialize_model()
# NOTE(review): `data_info` is not defined in this section; it is presumably
# a mapping defined elsewhere that provides 'batch_size' -- confirm.
dataloader = DataLoader(dataset,
                        batch_size=data_info['batch_size'],
                        shuffle=True,
                        drop_last=True)
metrics = [RecallAtK(10)]
objectives = [VAELoss()]

# Sample values, one entry per objective (three objectives).
obj_results = [0.4, 0.5, 0.7]
alphas = [0.5, 0.2, 0.3]
max_normalization = [1, 0.5, 2]

class AEDataHandler(MamoDataHandler):
    """Implementation of the MAMO Data Handler for developing
        AE recommender system models.

    This class is an implementation of the Mamo Data Handler base class.
    It reads the data from already preprocessed and saved numpy arrays and
    returns DataLoaders for training, validating and testing. Each dataset
    is loaded from disk the first time it is needed and then cached on the
    instance.

    Attributes:
        _train_data_path: A string containing the path to the training numpy array.
        _validation_input_data_path: A string containing the path to the validating input numpy array.
        _validation_output_data_path: A string containing the path to the validating output numpy array.
        _test_input_data_path: A string containing the path to the testing input numpy array.
        _test_output_data_path: A string containing the path to the testing output numpy array.
        _train_dataset: A Mamo Dataset object for the training dataset, None until first use.
        _validation_dataset: A Mamo Dataset object for the validating dataset, None until first use.
        _test_dataset: A Mamo Dataset object for the testing dataset, None until first use.
    """
    def __init__(self, dataset_name, train_data_path,
                 validation_input_data_path, validation_output_data_path,
                 test_input_data_path, test_output_data_path):
        """Inits a MAMO Data Handler object.

        This constructor inits a MAMO Data Handler object from preprocessed and saved numpy
        arrays. The arrays are saved in permanent storage in 'npy' format.
        No file is read here; the datasets are created lazily by the getters.

        Args:
            dataset_name: The name of the dataset, forwarded to the parent constructor.
            train_data_path: A string containing the path to the training numpy array.
            validation_input_data_path: A string containing the path to the validating input numpy array.
            validation_output_data_path: A string containing the path to the validating output numpy array.
            test_input_data_path: A string containing the path to the testing input numpy array.
            test_output_data_path: A string containing the path to the testing output numpy array.

        Raises:
            ValueError: It is raised if one or more of the paths to the numpy arrays is None.
        """
        super().__init__(dataset_name)
        required_paths = (train_data_path, validation_input_data_path,
                          validation_output_data_path, test_input_data_path,
                          test_output_data_path)
        if any(path is None for path in required_paths):
            raise ValueError(
                'One or more of the paths is None, please specify a valid path to numpy array.'
            )
        self._train_data_path = train_data_path
        self._validation_input_data_path = validation_input_data_path
        self._validation_output_data_path = validation_output_data_path
        self._test_input_data_path = test_input_data_path
        self._test_output_data_path = test_output_data_path
        # Datasets are created on demand by the _load_*_dataset helpers.
        self._train_dataset = None
        self._validation_dataset = None
        self._test_dataset = None

    def _load_train_dataset(self):
        """Returns the training dataset, loading and caching it on first use."""
        if self._train_dataset is None:
            # The training set is built from a single array, without output data.
            self._train_dataset = MamoDataset(np.load(self._train_data_path),
                                              None)
        return self._train_dataset

    def _load_validation_dataset(self):
        """Returns the validating dataset, loading and caching it on first use."""
        if self._validation_dataset is None:
            self._validation_dataset = MamoDataset(
                np.load(self._validation_input_data_path),
                np.load(self._validation_output_data_path))
        return self._validation_dataset

    def _load_test_dataset(self):
        """Returns the testing dataset, loading and caching it on first use."""
        if self._test_dataset is None:
            self._test_dataset = MamoDataset(
                np.load(self._test_input_data_path),
                np.load(self._test_output_data_path))
        return self._test_dataset

    def get_train_dataloader(self,
                             batch_size=500,
                             shuffle=True,
                             drop_last=True):
        """Returns a pytorch DataLoader for the training dataset.

        A DataLoader represents a Python iterable over a dataset with
        additional functions like batching, shuffling of the data, etc.
        This function creates and returns a DataLoader created on
        the training dataset.

        Args:
            batch_size: Integer, how many samples per batch to load, default=500.
            shuffle: Boolean, set to True to have the data reshuffled at every epoch, default=True.
            drop_last: Boolean, set to True to drop the last incomplete batch, default=True.

        Returns:
            Returns pytorch DataLoader object.

        Raises:
            FileNotFoundError: It is raised if the numpy data file doesn't exist.
        """
        return DataLoader(self._load_train_dataset(),
                          batch_size=batch_size,
                          shuffle=shuffle,
                          drop_last=drop_last)

    def get_validation_dataloader(self,
                                  batch_size=500,
                                  shuffle=True,
                                  drop_last=False):
        """Returns a pytorch DataLoader for the validating dataset.

        A DataLoader represents a Python iterable over a dataset with
        additional functions like batching, shuffling of the data, etc.
        This function creates and returns a DataLoader created on
        the validating dataset.

        Args:
            batch_size: Integer, how many samples per batch to load, default=500.
            shuffle: Boolean, set to True to have the data reshuffled at every epoch, default=True.
            drop_last: Boolean, set to True to drop the last incomplete batch, default=False.

        Returns:
            Returns pytorch DataLoader object.

        Raises:
            FileNotFoundError: It is raised if the numpy data file doesn't exist.
        """
        return DataLoader(self._load_validation_dataset(),
                          batch_size=batch_size,
                          shuffle=shuffle,
                          drop_last=drop_last)

    def get_test_dataloader(self,
                            batch_size=500,
                            shuffle=True,
                            drop_last=True):
        """Returns a pytorch DataLoader for the testing dataset.

        A DataLoader represents a Python iterable over a dataset with
        additional functions like batching, shuffling of the data, etc.
        This function creates and returns a DataLoader created on
        the testing dataset.

        Args:
            batch_size: Integer, how many samples per batch to load, default=500.
            shuffle: Boolean, set to True to have the data reshuffled at every epoch, default=True.
            drop_last: Boolean, set to True to drop the last incomplete batch, default=True.
                With the default, samples in a final incomplete batch are skipped.

        Returns:
            Returns pytorch DataLoader object.

        Raises:
            FileNotFoundError: It is raised if the numpy data file doesn't exist.
        """
        return DataLoader(self._load_test_dataset(),
                          batch_size=batch_size,
                          shuffle=shuffle,
                          drop_last=drop_last)

    def get_traindata_len(self):
        """Returns the number of samples in the training dataset.

        Returns:
            Returns integer, the number of samples in the
            training dataset.
        """
        return len(self._load_train_dataset())

    def get_validationdata_len(self):
        """Returns the number of samples in the validating dataset.

        Returns:
            Returns integer, the number of samples in the
            validating dataset.
        """
        return len(self._load_validation_dataset())

    def get_testdata_len(self):
        """Returns the number of samples in the testing dataset.

        Returns:
            Returns integer, the number of samples in the
            testing dataset.
        """
        return len(self._load_test_dataset())

    def get_input_dim(self):
        """Returns the second dimension of the input data.

        The value is read from the testing input array, which is loaded
        from disk on every call.

        Returns:
            Returns integer, the second dimension of the input data.
        """
        return np.load(self._test_input_data_path).shape[1]

    def get_output_dim(self):
        """Returns the second dimension of the output data.

        The value is read from the testing output array, which is loaded
        from disk on every call.

        Returns:
            Returns integer, the second dimension of the output data.
        """
        return np.load(self._test_output_data_path).shape[1]