Example #1
    def test_append(self):
        """Test for .append() method."""
        ds_append = self.dataset.append(self.dataset)

        self.assertEqual(self.dataset.num_samples * 2, ds_append.num_samples)

        self.assert_array_equal(
            ds_append.X,
            CArray([[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2, 3], [4, 5, 6],
                    [7, 8, 9]]))
        self.assert_array_equal(ds_append.Y, CArray([1, 2, 2, 1, 2, 2]))

        # Test append with header
        ds = self.dataset.deepcopy()

        # Create a header to assign to the dataset
        header = CDatasetHeader(id='mydataset',
                                age=34,
                                colors=CArray([1, 2, 3]))

        ds.header = header

        # Test append with header in both ds
        ds_append = ds.append(ds)
        ds_params = ds_append.header.get_params()
        self.assertEqual(ds_params['id'], 'mydataset')
        self.assertEqual(ds_params['age'], 34)
        self.assert_array_equal(ds_params['colors'], CArray([1, 2, 3, 1, 2,
                                                             3]))

        # Create two copies now for later tests
        ds1 = self.dataset.deepcopy()
        ds2 = self.dataset.deepcopy()

        # For the following tests we cannot use CArrays as params. Use tuple
        header = CDatasetHeader(id='mydataset', age=34, colors=(1, 2, 3))
        ds1.header = header
        ds2.header = header

        # Test append with header in first ds
        ds_append = ds1.append(self.dataset)
        ds_params = ds_append.header.get_params()
        self.assertEqual(ds_params['id'], 'mydataset')
        self.assertEqual(ds_params['age'], 34)
        self.assertEqual(ds_params['colors'], (1, 2, 3))

        # Test append with header in second ds
        ds_append = self.dataset.append(ds2)
        ds_params = ds_append.header.get_params()
        self.assertEqual(ds_params['id'], 'mydataset')
        self.assertEqual(ds_params['age'], 34)
        self.assertEqual(ds_params['colors'], (1, 2, 3))
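
# A minimal usage sketch of the append/header semantics verified above
# (illustrative; assumes the public imports below are available in secml).
# CArray header attributes are concatenated on append, while immutable
# attributes such as str, int and tuple are kept unchanged when equal in
# both datasets.
from secml.array import CArray
from secml.data import CDataset, CDatasetHeader

ds_a = CDataset([[1, 2]], [0],
                header=CDatasetHeader(id='ds', colors=CArray([1])))
ds_b = CDataset([[3, 4]], [1],
                header=CDatasetHeader(id='ds', colors=CArray([2])))
ds_ab = ds_a.append(ds_b)
print(ds_ab.header.id)      # -> 'ds' (immutable, equal in both)
print(ds_ab.header.colors)  # -> CArray([1 2]) (arrays are concatenated)
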
Example #2
    def test_copy(self):
        """Test for .deepcopy() method."""
        ds_copy = self.dataset.deepcopy()
        ds_copy.X[0, :] = 100
        ds_copy.Y[0] = 100

        self.assert_array_equal(self.dataset.X[0, :], CArray([[1, 2, 3]]))
        self.assert_array_equal(self.dataset.Y[0], CArray([1]))

        self.assert_array_equal(ds_copy.X[0, :], CArray([[100, 100, 100]]))
        self.assert_array_equal(ds_copy.Y[0], CArray([100]))

        # Test deepcopy with header
        header = CDatasetHeader(id='mydataset',
                                age=34,
                                colors=CArray([1, 2, 3]))
        self.dataset.header = header

        ds_copy = self.dataset.deepcopy()

        # Now change header of original dataset
        self.dataset.header.colors[0] = 100
        ds_params = self.dataset.header.get_params()
        self.assertEqual(ds_params['id'], 'mydataset')
        self.assertEqual(ds_params['age'], 34)
        self.assert_array_equal(ds_params['colors'], CArray([100, 2, 3]))

        ds_params = ds_copy.header.get_params()
        self.assertEqual(ds_params['id'], 'mydataset')
        self.assertEqual(ds_params['age'], 34)
        self.assert_array_equal(ds_params['colors'], CArray([1, 2, 3]))
Example #3
    def setUp(self):

        self.ds = CDLRandom(n_samples=10, random_state=0).load()

        timestamps = CArray([
            '2016-02-17T10:35:58', '2014-04-04T22:24:22',
            '2016-08-07T17:10:36', '2014-05-22T11:02:58',
            '2016-07-01T07:12:34', '2016-01-03T13:10:38',
            '2014-07-28T23:42:00', '2014-07-08T09:42:42',
            '2016-05-06T18:38:08', '2015-11-03T21:07:04'
        ])

        self.ds.header = CDatasetHeader(timestamp=timestamps,
                                        timestamp_fmt='%Y-%m-%dT%H:%M:%S')
Example #4

    def load(self, min_faces_per_person=None, funneled=True, color=False):
        """Load LFW dataset.

        Extra dataset attributes:
         - 'img_w', 'img_h': size of the images in pixels.
         - 'y_names': tuple with the name string for each class.

        Parameters
        ----------
        min_faces_per_person : int or None, optional
            The extracted dataset will only retain pictures of people
            that have at least min_faces_per_person different pictures.
            Default None, so all db images are returned.
        funneled : bool, optional
            Download and use the images aligned with deep funneling.
            Default True.
        color : bool, optional
            Keep the 3 RGB channels instead of averaging them to a
            single gray level channel. Default False.

        """
        with CDataLoaderLFW.__lock:
            lfw_people = fetch_lfw_people(
                data_home=SECML_DS_DIR,
                funneled=funneled,
                resize=1,
                min_faces_per_person=min_faces_per_person,
                color=color,
                slice_=None,
                download_if_missing=True)

        x = CArray(lfw_people.data)
        y = CArray(lfw_people.target)

        img_w = lfw_people.images.shape[2]
        img_h = lfw_people.images.shape[1]

        y_names = tuple(lfw_people.target_names.tolist())

        header = CDatasetHeader(img_w=img_w, img_h=img_h, y_names=y_names)

        return CDataset(x, y, header=header)
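
# A minimal usage sketch for the loader above (illustrative; assumes
# CDataLoaderLFW is exported by secml.data.loader). The header attributes
# allow recovering the 2D image shape from the flat feature vectors.
from secml.data.loader import CDataLoaderLFW

lfw = CDataLoaderLFW().load(min_faces_per_person=70)
first_img = lfw.X[0, :].reshape((lfw.header.img_h, lfw.header.img_w))
print(lfw.header.y_names[lfw.Y[0].item()])  # name of the first subject
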
Example #5
    def test_custom_attr(self):
        """Testing for custom attributes."""
        header = CDatasetHeader(id='mydataset',
                                age=34,
                                colors=CArray([1, 2, 3]))
        ds = CDataset(self.X, self.Y, header=header)

        ds_params = ds.header.get_params()
        self.assertEqual(ds_params['id'], 'mydataset')
        self.assertEqual(ds_params['age'], 34)
        self.assert_array_equal(ds_params['colors'], CArray([1, 2, 3]))

        # Testing getitem. Immutable objects should be copied as they are.
        # Arrays should be indexed.
        ds_get = ds[[0, 2], :]
        ds_params = ds_get.header.get_params()
        self.assert_array_equal(ds_get.X, CArray([[1, 2, 3], [7, 8, 9]]))
        self.assert_array_equal(ds_get.Y, CArray([1, 2]))
        self.assertEqual(ds_params['id'], 'mydataset')
        self.assertEqual(ds_params['age'], 34)
        self.assert_array_equal(ds_params['colors'], CArray([1, 3]))
Example #6

    def _load(self, train_files, test_files, meta_file,
              labels_key, class_names_key, val_size=0):
        """Load all images of the dataset.

        Adapted from: http://dataset-loading.readthedocs.io/en/latest/_modules/dataset_loading/cifar.html

        Parameters
        ----------
        train_files : list
            List of the files where the training set is stored.
        test_files : list
            List of the files where the test set is stored.
        meta_file : str
            Name of the metafile containing the class names.
        labels_key : bytes
            Dictionary key where the labels are stored.
        class_names_key : bytes
            Dictionary key where the class names are stored.
        val_size : int, optional
            Size of the validation set.
            Default 0, so no validation dataset will be returned.

        Returns
        -------
        training_set : CDataset
            Training set.
        test_set : CDataset
            Test set.
        validation_set : CDataset, optional
            Validation set. Returned only if val_size > 0.

        """
        self.logger.info(
            "Loading {:} dataset from {:}...".format(self.class_type,
                                                     self.data_path))

        def load_files(batches_list):
            # Function that loads the data into memory
            data = None
            labels = None
            for batch in batches_list:
                with open(batch, 'rb') as bf:
                    mydict = pickle.load(bf, encoding='bytes')

                # The labels key differs between the CIFAR datasets
                new_data = np.array(mydict[b'data'], dtype='uint8')
                new_labels = np.array(mydict[labels_key], dtype='int32')
                if data is not None:
                    data = np.vstack([data, new_data])
                    labels = np.hstack([labels, new_labels])
                else:
                    data = new_data
                    labels = new_labels

            return data, labels

        # Load training and test sets
        train_data, train_labels = load_files(
            [fm.join(self.data_path, f) for f in train_files])
        test_data, test_labels = load_files(
            [fm.join(self.data_path, f) for f in test_files])

        val_data = None
        val_labels = None
        # Populate the validation set if needed
        if val_size > 0:
            train_data, val_data = np.split(
                train_data, [train_data.shape[0] - val_size])
            train_labels, val_labels = np.split(
                train_labels, [train_labels.shape[0] - val_size])

        # Load the class names from the meta file
        class_names = self._load_class_names(meta_file, class_names_key)

        header = CDatasetHeader(img_w=32, img_h=32, class_names=class_names)

        tr = CDataset(train_data, train_labels, header=header)
        ts = CDataset(test_data, test_labels, header=header)

        # Return training set and test set for sure
        out_datasets = (tr, ts)

        if val_size > 0:
            val = CDataset(val_data, val_labels, header=header)
            # Also return the validation dataset
            out_datasets += (val, )

        return out_datasets
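
# A minimal usage sketch (illustrative; assumes a concrete subclass such
# as CDataLoaderCIFAR10 wires `_load` with the proper batch file names).
# With val_size > 0, a third CDataset carved from the tail of the
# training data is also returned.
from secml.data.loader import CDataLoaderCIFAR10

tr, ts = CDataLoaderCIFAR10().load()
print(tr.num_samples, ts.num_samples)  # training and test set sizes
print(tr.header.class_names)           # class names from the meta file
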
Example #7

    def load(self,
             ds_type,
             day='day4',
             icub7=False,
             resize_shape=(128, 128),
             crop_shape=None,
             normalize=True):
        """Load the dataset.

        The pre-cropped version of the images is loaded, with size 128 x 128.
        An additional resize/crop shape can be passed as input if needed.

        Extra dataset attributes:
          - 'img_w', 'img_h': size of the images in pixels.
          - 'y_orig': CArray with the original labels of the objects.

        Parameters
        ----------
        ds_type : str
            Identifier of the dataset to download, either 'train' or 'test'.
        day : str, optional
            Acquisition day from which to load the images. Default 'day4'.
            The available options are: 'day1', 'day2', 'day3', 'day4'.
        icub7 : bool or int, optional
            If True, load a reduced dataset with 7 objects by
            taking the 3rd object for each category. Default False.
            If int, the Nth object for each category will be loaded.
        resize_shape : tuple, optional
            Images will be resized to (height, width) shape.
            Default (128, 128).
        crop_shape : tuple or None, optional
            If a tuple, a crop of (height, width) shape will be extracted
            from the center of each image. Default None.
        normalize : bool, optional
            If True, images are normalized between 0-1. Default True.

        Returns
        -------
        CDataset
            Output dataset.

        """
        if ds_type == 'train':
            data_path = self._train_path
        elif ds_type == 'test':
            data_path = self._test_path
        else:
            raise ValueError("use ds_type = {'train', 'test'}.")

        day_path = fm.join(data_path, day)
        if not fm.folder_exist(day_path):
            raise ValueError("{:} not available.".format(day))

        self.logger.info(
            "Loading iCubWorld{:} {:} {:} dataset from {:}".format(
                '7' if icub7 else '28', day, ds_type, day_path))

        icub7 = 3 if icub7 is True else icub7  # Use the 3rd sub-obj by default

        x = None
        y_orig = []
        for obj in sorted(fm.listdir(day_path)):  # Objects (cup, sponge, ..)

            obj_path = fm.join(day_path, obj)

            # Sub-objects (cup1, cup2, ...)
            for sub_obj in sorted(fm.listdir(obj_path)):

                if icub7 and sub_obj[-1] != str(icub7):
                    continue  # Load only the `icub7`th object

                self.logger.debug("Loading images for {:}".format(sub_obj))

                sub_obj_path = fm.join(obj_path, sub_obj)

                for f in sorted(fm.listdir(sub_obj_path)):

                    img = Image.open(fm.join(sub_obj_path, f))

                    if resize_shape is not None:
                        img = resize_img(img, resize_shape)
                    if crop_shape is not None:
                        img = crop_img(img, crop_shape)

                    img = CArray(img.getdata(), dtype='uint8').ravel()
                    x = x.append(img, axis=0) if x is not None else img

                    y_orig.append(sub_obj)  # Label is given by sub-obj name

        # Create the int-based array of labels. Keep original labels in y_orig
        y_orig = CArray(y_orig)
        y = y_orig.unique(return_inverse=True)[1]

        if normalize is True:
            x /= 255.0

        # Size of images is the crop shape (if any), otherwise the resize shape
        img_h, img_w = crop_shape if crop_shape is not None else resize_shape

        header = CDatasetHeader(img_w=img_w, img_h=img_h, y_orig=y_orig)

        return CDataset(x, y, header=header)
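
# A minimal usage sketch (illustrative; assumes the loader is exported as
# CDataLoaderICubWorld28). Original string labels remain available in the
# `y_orig` header attribute after the integer renumbering.
from secml.data.loader import CDataLoaderICubWorld28

ds_tr = CDataLoaderICubWorld28().load(
    'train', day='day4', icub7=True, crop_shape=(96, 96))
print(ds_tr.header.img_w, ds_tr.header.img_h)  # -> 96 96 (crop shape)
print(ds_tr.header.y_orig.unique())  # sub-object names used as labels
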
Example #8
    def load(self, ds, digits=tuple(range(0, 10)), num_samples=None):
        """Load all images of specified format inside given path.

        Adapted from: http://cvxopt.org/_downloads/mnist.py

        Extra dataset attributes:
         - 'img_w', 'img_h': size of the images in pixels.
         - 'y_original': tuple with the original labels (before renumbering).

        Parameters
        ----------
        ds : str
            Identifier of the dataset to download,
            either 'training' or 'testing'.
        digits : tuple
            Tuple with the digits to load. By default all digits are loaded.
        num_samples : int or None, optional
            Number of expected samples in resulting ds.
            If int, an equal number of samples will be taken
            from each class until `num_samples` have been loaded.
            If None, all samples will be loaded.

        """
        if ds == "training":
            data_path = self.train_data_path
            lbl_path = self.train_labels_path
        elif ds == "testing":
            data_path = self.test_data_path
            lbl_path = self.test_labels_path
        else:
            raise ValueError("ds must be 'training' or 'testing'")

        self.logger.info("Loading MNIST {:} dataset from {:}...".format(
            ds, MNIST_PATH))

        # Opening the labels data
        with open(lbl_path, 'rb') as flbl:
            magic_nr, size = struct.unpack(">II", flbl.read(8))
            if magic_nr != 2049:
                raise ValueError('Magic number mismatch, expected 2049, '
                                 'got {}'.format(magic_nr))
            lbl = array("b", flbl.read())

        # Opening the images data
        with open(data_path, 'rb') as fimg:
            magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
            if magic_nr != 2051:
                raise ValueError('Magic number mismatch, expected 2051, '
                                 'got {}'.format(magic_nr))
            img = array("B", fimg.read())

        # Convert digits to tuple in case it was passed as array/list
        digits = tuple(digits)

        # Number of samples per class
        if num_samples is not None:
            div = len(digits)
            n_samples_class = [
                int(num_samples / div) + (1 if x < num_samples % div else 0)
                for x in range(div)
            ]
            n_samples_class = {
                e: n_samples_class[e_i]
                for e_i, e in enumerate(digits)
            }
        else:  # No constraint on the number of samples
            n_samples_class = {e: size for e in digits}

        # Counter of samples already taken for each class
        count_samples_class = {e: 0 for e in digits}

        # Extract the indices of samples to load
        ind = []
        for k in range(size):
            if lbl[k] in digits:
                # Check the maximum number of samples for the current digit
                if count_samples_class[lbl[k]] < n_samples_class[lbl[k]]:
                    ind += [k]
                    count_samples_class[lbl[k]] += 1

        # Number of loaded samples
        num_loaded = sum(count_samples_class.values())

        # Check if dataset has enough samples
        if num_samples is not None and num_loaded < num_samples:
            min_val = min(count_samples_class.values())
            raise ValueError(
                "not enough samples in dataset for one ore more of the "
                "desired classes ({:} available)".format(min_val))

        images = CArray.zeros((len(ind), rows * cols), dtype=np.uint8)
        labels = CArray.zeros(len(ind), dtype=int)
        digs_array = CArray(digits)  # To use find method
        for i in range(len(ind)):
            images[i, :] = CArray(img[ind[i] * rows * cols:(ind[i] + 1) *
                                      rows * cols])
            labels[i] = CArray(digs_array.find(digs_array == lbl[ind[i]]))

        header = CDatasetHeader(img_w=28, img_h=28, y_original=digits)

        return CDataset(images, labels, header=header)
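
# A minimal usage sketch (illustrative; assumes the loader is exported as
# CDataLoaderMNIST). Requested digits are renumbered to 0..n-1, while the
# original values are kept in the `y_original` header attribute.
from secml.data.loader import CDataLoaderMNIST

mnist = CDataLoaderMNIST().load('training', digits=(3, 7), num_samples=200)
print(mnist.num_samples)        # -> 200 (100 samples per digit)
print(mnist.header.y_original)  # -> (3, 7)
print(mnist.Y.unique())         # -> CArray([0 1])
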
Example #9

    def load(self, ds_path, img_format, label_dtype=None, load_data=True):
        """Load all images of specified format inside given path.

        Extra dataset attributes:
         - 'id': last `ds_path` folder.
         - 'img_w', 'img_h': size of the images in pixels.
         - 'img_c': images number of channels.
         - Any other custom attribute is retrieved from 'attributes.txt' file.
           Only attributes of `str` type are currently supported.

        Parameters
        ----------
        ds_path : str
            Full path to dataset folder.
        img_format : str
            Format of the files to load.
        label_dtype : str or dtype, optional
            Datatype of the labels. If None, labels will be strings.
        load_data : bool, optional
            If True (default) features will be stored.
            Otherwise store the paths to the files with dtype=object.

        """
        # Labels file MUST be available
        if not fm.file_exist(fm.join(ds_path, 'clients.txt')):
            raise OSError("cannot load clients file.")

        # Ensuring 'img_format' always has an extension-like pattern
        img_ext = '.' + img_format.strip('.').lower()

        # Dimensions of each image
        img_w = CArray([], dtype=int)
        img_h = CArray([], dtype=int)
        img_c = CArray([], dtype=int)

        # Load files!
        patterns, img_w, img_h, img_c = self._load_files(ds_path,
                                                         img_w,
                                                         img_h,
                                                         img_c,
                                                         img_ext,
                                                         load_data=load_data)

        labels = CArray.load(fm.join(ds_path, 'clients.txt'),
                             dtype=label_dtype).ravel()

        if patterns.shape[0] != labels.size:
            raise ValueError("patterns ({:}) and labels ({:}) do not have "
                             "the same number of elements.".format(
                                 patterns.shape[0], labels.size))

        # Load the file with extra dataset attributes (optional)
        attributes_path = fm.join(ds_path, 'attributes.txt')
        attributes = load_dict(attributes_path) if \
            fm.file_exist(attributes_path) else dict()

        self.logger.info("Loaded {:} images from {:}...".format(
            patterns.shape[0], ds_path))

        header = CDatasetHeader(id=fm.split(ds_path)[1],
                                img_w=img_w,
                                img_h=img_h,
                                img_c=img_c,
                                **attributes)

        return CDataset(patterns, labels, header=header)
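
# A minimal usage sketch (illustrative; the loader class name below is an
# assumption). The dataset folder must contain the images, a 'clients.txt'
# file with one label per line and, optionally, an 'attributes.txt' file
# with extra header attributes.
from secml.data.loader import CDataLoaderImgClients

ds = CDataLoaderImgClients().load(
    '/path/to/ds_folder', img_format='jpeg', label_dtype=int)
print(ds.header.id)     # name of the dataset folder
print(ds.header.img_w)  # per-image widths, as a CArray
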
Example #10

    def load(self,
             file_path,
             dtype_samples=float,
             dtype_labels=float,
             n_features=None,
             zero_based=True,
             remove_all_zero=False,
             multilabel=False,
             load_infos=False):
        """Loads a dataset from the svmlight / libsvm format and
        returns a sparse dataset.

        Datasets must have only numerical feature indices and
        for every pattern indices must be ordered.

        Extra dataset attributes:
         - 'infos', CArray with inline comment for each sample.

        Parameters
        ----------
        file_path : str
            Path to the file where the dataset is stored,
            in svmlight / libsvm format.
        dtype_samples : str or dtype, optional
            Data-type to which the samples should be cast. Default is float.
        dtype_labels : str or dtype, optional
            Data-type to which the labels should be cast. Default is float.
        n_features : None or int, optional
            The number of features to use.
            If None (default), it will be inferred. This argument is useful
            to load several files that are subsets of a bigger sliced
            dataset: each subset might not have examples of every feature,
            hence the inferred shape might vary from one slice to another.
        zero_based : bool, optional
            Whether column indices are zero-based (True, default) or
            one-based (False). If column indices are set to be one-based,
            they are transformed to zero-based to match
            Python/NumPy conventions.
        remove_all_zero : bool, optional
            If True, every feature which is zero for every pattern
            will be removed from the dataset. Default False.
        multilabel : bool, optional
            True if every sample can have more than one label. Default False.
            (see http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
        load_infos : bool, optional
            If True, inline comments will be loaded from the svmlight file
            and stored in the infos CDataset parameter (as CArray).
            Default False.

        Returns
        -------
        dataset : CDataset
            Dataset object that contains patterns and labels.
            If `remove_all_zero` is set to True, the returned dataset
            will have the extra header attribute `idx_mapping` with the
            mapping of the returned features to the original features'
            indices.

        Examples
        --------
        >>> from secml.data import CDataset
        >>> from secml.data.loader import CDataLoaderSvmLight
        >>> from secml.array import CArray
        >>> patterns = CArray([[1,0,2], [4,0,5]])
        >>> labels = CArray([0, 1])
        >>> CDataLoaderSvmLight().dump(CDataset(patterns,labels), "myfile.libsvm")
        >>> new_dataset = CDataLoaderSvmLight().load("myfile.libsvm", remove_all_zero=True)
        >>> print(new_dataset.X)  # doctest: +NORMALIZE_WHITESPACE
        CArray(  (0, 1)	2.0
          (0, 0)	1.0
          (1, 1)	5.0
          (1, 0)	4.0)
        >>> print(new_dataset.Y)
        CArray([0. 1.])
        >>> print(new_dataset.header.idx_mapping)
        CArray([0 2])

        """
        # Never use zero_based='auto' in order to avoid
        # any ambiguity with the feature indices...
        patterns, labels = load_svmlight_file(file_path,
                                              n_features=n_features,
                                              dtype=float,
                                              multilabel=multilabel,
                                              zero_based=zero_based)

        patterns = CArray(patterns, tosparse=True, dtype=dtype_samples)
        labels = CArray(labels, dtype=dtype_labels)

        header = CDatasetHeader()  # Will be populated with extra attributes

        if remove_all_zero is True:
            patterns, idx_mapping = \
                CDataLoaderSvmLight._remove_all_zero_features(patterns)
            # Store reverse mapping as extra ds attribute
            header.idx_mapping = idx_mapping

        if load_infos is True:
            infos = []
            with open(file_path, 'rt') as f:
                for l_idx, l in enumerate(f):
                    i = l.split(' # ')
                    if len(i) > 2:  # Line should have only one split point
                        raise ValueError("Something wrong happened when "
                                         "extracting infos for line {:}"
                                         "".format(l_idx))
                    infos.append(i[1].rstrip() if len(i) == 2 else '')
            header.infos = CArray(infos)

        if len(header.get_params()) == 0:
            header = None  # Header is empty, store None in ds

        return CDataset(patterns, labels, header=header)
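
# A minimal sketch of the `load_infos` behavior (illustrative): the inline
# comment after ' # ' on each svmlight line is collected into the `infos`
# header attribute, one entry per sample (empty string when missing).
#
# Contents of a hypothetical "myfile.libsvm":
#   0 1:1.0 3:2.0 # first sample
#   1 1:4.0 3:5.0 # second sample
from secml.data.loader import CDataLoaderSvmLight

ds = CDataLoaderSvmLight().load("myfile.libsvm", load_infos=True)
print(ds.header.infos)  # -> CArray(['first sample' 'second sample'])
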
Example #11
    def load(self,
             ds_path,
             img_format,
             label_re=None,
             label_dtype=None,
             load_data=True):
        """Load all images of specified format inside given path.

        The following custom CDataset attributes are available:
         - 'id': last `ds_path` folder.
         - 'img_w', 'img_h': size of the images in pixels.
         - 'img_c': images number of channels.
         - Any other custom attribute is retrieved from 'attributes.txt' file.
           Only attributes of `str` type are currently supported.

        Parameters
        ----------
        ds_path : str
            Full path to dataset folder.
        img_format : str
            Format of the files to load.
        label_re : re, optional
            Regular expression that identifies the correct label.
            If None, the whole name of the leaf folder will be used as label.
        label_dtype : str or dtype, optional
            Datatype of the labels. If None, labels will be strings.
        load_data : bool, optional
            If True (default) features will be stored.
            Otherwise store the paths to the files with dtype=object.

        """
        # Ensuring 'img_format' always has an extension-like pattern
        img_ext = '.' + img_format.strip('.').lower()

        # Dimensions of each image
        img_w = CArray([], dtype=int)
        img_h = CArray([], dtype=int)
        img_c = CArray([], dtype=int)

        # Each directory inside the provided path will be explored recursively
        # and, if leaf, contained images will be loaded
        patterns, labels, img_w, img_h, img_c = self._explore_dir(
            ds_path,
            img_w,
            img_h,
            img_c,
            img_ext,
            label_re=label_re,
            load_data=load_data)

        if label_dtype is not None:  # Converting labels if requested
            labels = labels.astype(label_dtype)

        if patterns.shape[0] != labels.size:
            raise ValueError("patterns ({:}) and labels ({:}) do not have "
                             "the same number of elements.".format(
                                 patterns.shape[0], labels.size))

        # Load the file with extra dataset attributes (optional)
        attributes_path = fm.join(ds_path, 'attributes.txt')
        attributes = load_dict(attributes_path) if \
            fm.file_exist(attributes_path) else dict()

        self.logger.info("Loaded {:} images from {:}...".format(
            patterns.shape[0], ds_path))

        header = CDatasetHeader(id=fm.split(ds_path)[1],
                                img_w=img_w,
                                img_h=img_h,
                                img_c=img_c,
                                **attributes)

        return CDataset(patterns, labels, header=header)
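
# A minimal usage sketch (illustrative; assumes the loader is exported as
# CDataLoaderImgFolders). Each leaf folder is treated as a class; with
# `label_re`, only the matching part of the folder name is used as label.
import re

from secml.data.loader import CDataLoaderImgFolders

ds = CDataLoaderImgFolders().load(
    '/path/to/ds_root', img_format='png',
    label_re=re.compile(r'\d+'), label_dtype=int)
print(ds.num_samples, ds.header.id)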