def test_append(self):
    """Check that .append() stacks samples/labels and merges the headers."""
    expected_x = CArray([[1, 2, 3], [4, 5, 6], [7, 8, 9],
                         [1, 2, 3], [4, 5, 6], [7, 8, 9]])
    expected_y = CArray([1, 2, 2, 1, 2, 2])

    # Appending a dataset to itself must double the samples
    doubled = self.dataset.append(self.dataset)
    self.assertEqual(self.dataset.num_samples * 2, doubled.num_samples)
    self.assert_array_equal(doubled.X, expected_x)
    self.assert_array_equal(doubled.Y, expected_y)

    # Header available in both datasets: the array attribute is expected
    # to be concatenated, the other attributes to be kept unchanged
    with_header = self.dataset.deepcopy()
    with_header.header = CDatasetHeader(
        id='mydataset', age=34, colors=CArray([1, 2, 3]))

    params = with_header.append(with_header).header.get_params()
    self.assertEqual(params['id'], 'mydataset')
    self.assertEqual(params['age'], 34)
    self.assert_array_equal(
        params['colors'], CArray([1, 2, 3, 1, 2, 3]))

    # Two fresh copies sharing a header for the remaining cases.
    # CArrays cannot be used as params here, so a tuple is stored instead
    first = self.dataset.deepcopy()
    second = self.dataset.deepcopy()
    tuple_header = CDatasetHeader(
        id='mydataset', age=34, colors=(1, 2, 3))
    first.header = tuple_header
    second.header = tuple_header

    # Header available only in the first dataset
    params = first.append(self.dataset).header.get_params()
    self.assertEqual(params['id'], 'mydataset')
    self.assertEqual(params['age'], 34)
    self.assertEqual(params['colors'], (1, 2, 3))

    # Header available only in the second dataset
    params = self.dataset.append(second).header.get_params()
    self.assertEqual(params['id'], 'mydataset')
    self.assertEqual(params['age'], 34)
    self.assert_array_equal(params['colors'], (1, 2, 3))
def test_copy(self):
    """Check that .deepcopy() returns a dataset independent of the source."""
    clone = self.dataset.deepcopy()
    clone.X[0, :] = 100
    clone.Y[0] = 100

    # Changes made on the copy must not be visible in the original
    self.assert_array_equal(self.dataset.X[0, :], CArray([[1, 2, 3]]))
    self.assert_array_equal(self.dataset.Y[0], CArray([1]))

    # ... while the copy holds the new values
    self.assert_array_equal(clone.X[0, :], CArray([[100, 100, 100]]))
    self.assert_array_equal(clone.Y[0], CArray([100]))

    # Same check for the header: copy first, then alter the original
    self.dataset.header = CDatasetHeader(
        id='mydataset', age=34, colors=CArray([1, 2, 3]))
    clone = self.dataset.deepcopy()
    self.dataset.header.colors[0] = 100

    original_params = self.dataset.header.get_params()
    self.assertEqual(original_params['id'], 'mydataset')
    self.assertEqual(original_params['age'], 34)
    self.assert_array_equal(
        original_params['colors'], CArray([100, 2, 3]))

    clone_params = clone.header.get_params()
    self.assertEqual(clone_params['id'], 'mydataset')
    self.assertEqual(clone_params['age'], 34)
    self.assert_array_equal(clone_params['colors'], CArray([1, 2, 3]))
def setUp(self):
    """Create a random 10-sample dataset with one timestamp per sample."""
    timestamp_fmt = '%Y-%m-%dT%H:%M:%S'
    sample_timestamps = CArray([
        '2016-02-17T10:35:58',
        '2014-04-04T22:24:22',
        '2016-08-07T17:10:36',
        '2014-05-22T11:02:58',
        '2016-07-01T07:12:34',
        '2016-01-03T13:10:38',
        '2014-07-28T23:42:00',
        '2014-07-08T09:42:42',
        '2016-05-06T18:38:08',
        '2015-11-03T21:07:04',
    ])

    self.ds = CDLRandom(n_samples=10, random_state=0).load()
    # Timestamps and their format are stored as extra header attributes
    self.ds.header = CDatasetHeader(
        timestamp=sample_timestamps, timestamp_fmt=timestamp_fmt)
def load(self, min_faces_per_person=None, funneled=True, color=False):
    """Load the LFW (people) dataset.

    Extra dataset attributes:
     - 'img_w', 'img_h': size of the images in pixels.
     - 'y_names': tuple with the name string for each class.

    Parameters
    ----------
    min_faces_per_person : int or None, optional
        Only the pictures of people having at least
        `min_faces_per_person` different pictures are retained.
        Default None, so all the images in the db are returned.
    funneled : bool, optional
        Download and use the images aligned with deep funneling.
        Default True.
    color : bool, optional
        Keep the 3 RGB channels instead of averaging them to a
        single gray level channel. Default False.

    Returns
    -------
    CDataset
        Dataset with the images as samples, the people ids as labels
        and the extra attributes listed above in the header.

    """
    # The loader's lock is held while fetching the data
    with CDataLoaderLFW.__lock:
        bunch = fetch_lfw_people(
            data_home=SECML_DS_DIR,
            funneled=funneled,
            resize=1,
            min_faces_per_person=min_faces_per_person,
            color=color,
            slice_=None,
            download_if_missing=True)

    # Axis 1 of the images stack is the height, axis 2 the width
    img_h, img_w = bunch.images.shape[1:3]

    ds_header = CDatasetHeader(
        img_w=img_w, img_h=img_h,
        y_names=tuple(bunch.target_names.tolist()))

    return CDataset(
        CArray(bunch.data), CArray(bunch.target), header=ds_header)
def test_custom_attr(self):
    """Check custom header attributes at creation and after indexing."""
    ds = CDataset(
        self.X, self.Y,
        header=CDatasetHeader(
            id='mydataset', age=34, colors=CArray([1, 2, 3])))

    # The attributes must be returned as they were stored
    params = ds.header.get_params()
    self.assertEqual(params['id'], 'mydataset')
    self.assertEqual(params['age'], 34)
    self.assert_array_equal(params['colors'], CArray([1, 2, 3]))

    # Getitem: immutable objects should be copied as they are,
    # while arrays should be indexed along with the samples
    subset = ds[[0, 2], :]
    subset_params = subset.header.get_params()

    self.assert_array_equal(subset.X, CArray([[1, 2, 3], [7, 8, 9]]))
    self.assert_array_equal(subset.Y, CArray([1, 2]))

    self.assertEqual(subset_params['id'], 'mydataset')
    self.assertEqual(subset_params['age'], 34)
    self.assert_array_equal(subset_params['colors'], CArray([1, 3]))
def _load(self, train_files, test_files, meta_file,
          labels_key, class_names_key, val_size=0):
    """Load all images of the dataset.

    Adapted from:
     http://dataset-loading.readthedocs.io/en/latest/_modules/dataset_loading/cifar.html

    Parameters
    ----------
    train_files : list
        List of the files where the training set is stored.
    test_files : list
        List of the files where the test set is stored.
    meta_file : str
        Name of the metafile containing the class names.
    labels_key : bytes
        Dictionary key where the labels are stored.
    class_names_key : bytes
        Dictionary key where the class names are stored.
    val_size : int, optional
        Size of the validation set.
        Default 0, so no validation dataset will be returned.

    Returns
    -------
    training_set : CDataset
        Training set.
    test_set : CDataset
        Test set.
    validation_set : CDataset, optional
        Validation set. Returned only if val_size > 0.

    """
    self.logger.info(
        "Loading {:} dataset from {:}...".format(self.class_type,
                                                 self.data_path))

    def load_files(batches_list):
        """Read each batch file and return the stacked (data, labels).

        Returns (None, None) if `batches_list` is empty.
        """
        data_chunks = []
        label_chunks = []
        for batch in batches_list:
            with open(batch, 'rb') as bf:
                # NOTE: unpickling can execute arbitrary code. Only batch
                # files obtained from a trusted source should be loaded.
                mydict = pickle.load(bf, encoding='bytes')
            # The labels have different names in the two datasets
            data_chunks.append(np.array(mydict[b'data'], dtype='uint8'))
            label_chunks.append(
                np.array(mydict[labels_key], dtype='int32'))

        if not data_chunks:
            return None, None
        if len(data_chunks) == 1:
            # Nothing to stack, return the single batch as it is
            return data_chunks[0], label_chunks[0]
        # Stack once at the end instead of re-copying the accumulated
        # arrays for every batch
        return np.vstack(data_chunks), np.hstack(label_chunks)

    # Load training and test sets
    train_data, train_labels = load_files(
        [fm.join(self.data_path, f) for f in train_files])
    test_data, test_labels = load_files(
        [fm.join(self.data_path, f) for f in test_files])

    val_data = None
    val_labels = None
    # Populate the validation set if needed, using the last
    # `val_size` samples of the training data
    if val_size > 0:
        train_data, val_data = np.split(
            train_data, [train_data.shape[0] - val_size])
        train_labels, val_labels = np.split(
            train_labels, [train_labels.shape[0] - val_size])

    # Load the class names from the meta file
    class_names = self._load_class_names(meta_file, class_names_key)

    # The same header is shared by all the returned datasets
    header = CDatasetHeader(img_w=32, img_h=32, class_names=class_names)

    tr = CDataset(train_data, train_labels, header=header)
    ts = CDataset(test_data, test_labels, header=header)

    # Return training set and test set for sure
    out_datasets = (tr, ts)

    if val_size > 0:
        val = CDataset(val_data, val_labels, header=header)
        # Also return the validation dataset
        out_datasets += (val, )

    return out_datasets
def load(self, ds_type, day='day4', icub7=False,
         resize_shape=(128, 128), crop_shape=None, normalize=True):
    """Load the dataset.

    The pre-cropped version of the images is loaded, with size 128 x 128.
    An additional resize/crop shape could be passed as input if needed.

    Extra dataset attributes:
     - 'img_w', 'img_h': size of the images in pixels.
     - 'y_orig': CArray with the original labels of the objects.

    Parameters
    ----------
    ds_type : str
        Identifier of the dataset to download, either 'train' or 'test'.
    day : str, optional
        Acquisition day from which to load the images. Default 'day4'.
        The available options are: 'day1', 'day2', 'day3', 'day4'.
    icub7 : bool or int, optional
        If True, load a reduced dataset with 7 objects by
        taking the 3rd object for each category. Default False.
        If int, the Nth object for each category will be loaded.
    resize_shape : tuple or None, optional
        Images will be resized to (height, width) shape.
        Default (128, 128). If None, no resize is performed.
    crop_shape : tuple or None, optional
        If a tuple, a crop of (height, width) shape will be extracted
        from the center of each image. Default None.
    normalize : bool, optional
        If True, images are normalized between 0-1. Default True.

    Returns
    -------
    CDataset
        Output dataset.

    Raises
    ------
    ValueError
        If `ds_type` is not 'train' or 'test', if the folder of the
        requested `day` does not exist, or if no image has been loaded.

    """
    if ds_type == 'train':
        data_path = self._train_path
    elif ds_type == 'test':
        data_path = self._test_path
    else:
        raise ValueError("use ds_type = {'train', 'test'}.")

    day_path = fm.join(data_path, day)
    if not fm.folder_exist(day_path):
        raise ValueError("{:} not available.".format(day))

    self.logger.info(
        "Loading iCubWorld{:} {:} {:} dataset from {:}".format(
            '7' if icub7 else '28', day, ds_type, day_path))

    icub7 = 3 if icub7 is True else icub7  # Use the 3rd sub-obj by default

    # Without resize and crop the size of the images is only known
    # after opening them, so it must be recorded while loading
    keep_native_size = resize_shape is None and crop_shape is None
    native_size = None

    x = None
    y_orig = []
    for obj in sorted(fm.listdir(day_path)):  # Objects (cup, sponge, ..)

        obj_path = fm.join(day_path, obj)

        # Sub-objects (cup1, cup2, ...)
        for sub_obj in sorted(fm.listdir(obj_path)):

            if icub7 and sub_obj[-1] != str(icub7):
                continue  # Load only the `icub7`th object

            self.logger.debug("Loading images for {:}".format(sub_obj))

            sub_obj_path = fm.join(obj_path, sub_obj)

            for f in sorted(fm.listdir(sub_obj_path)):

                img = Image.open(fm.join(sub_obj_path, f))

                if keep_native_size:
                    native_size = img.size  # PIL size is (width, height)

                if resize_shape is not None:
                    img = resize_img(img, resize_shape)
                if crop_shape is not None:
                    img = crop_img(img, crop_shape)

                img = CArray(img.getdata(), dtype='uint8').ravel()
                x = x.append(img, axis=0) if x is not None else img

                y_orig.append(sub_obj)  # Label is given by sub-obj name

    if x is None:
        # Fail with a clear message instead of an obscure error later
        raise ValueError("no images found in {:}".format(day_path))

    # Create the int-based array of labels. Keep original labels in y_orig
    y_orig = CArray(y_orig)
    y = CArray(y_orig).unique(return_inverse=True)[1]

    if normalize is True:
        x /= 255.0

    # Size of images is the crop shape (if any), otherwise the resize
    # shape (if any), otherwise the native size of the loaded images
    if crop_shape is not None:
        img_h, img_w = crop_shape
    elif resize_shape is not None:
        img_h, img_w = resize_shape
    else:
        img_w, img_h = native_size

    header = CDatasetHeader(img_w=img_w, img_h=img_h, y_orig=y_orig)

    return CDataset(x, y, header=header)
def load(self, ds, digits=tuple(range(0, 10)), num_samples=None):
    """Load all images of specified format inside given path.

    Adapted from: http://cvxopt.org/_downloads/mnist.py

    Extra dataset attributes:
     - 'img_w', 'img_h': size of the images in pixels.
     - 'y_original': array with the original labels (before renumbering)

    Parameters
    ----------
    ds : str
        Identifier of the dataset to download,
        either 'training' or 'testing'.
    digits : tuple
        Tuple with the digits to load. By default all digits are loaded.
    num_samples : int or None, optional
        Number of expected samples in resulting ds.
        If int, an equal number of samples will be taken
        from each class until `num_samples` have been loaded.
        If None, all samples will be loaded.

    Returns
    -------
    CDataset
        Dataset with one flattened image per row. Each label is the
        position of the sample's digit inside `digits`.

    Raises
    ------
    ValueError
        If `ds` is not 'training' or 'testing', if the magic number of
        one of the data files is not the expected one, or if the dataset
        does not have enough samples to load `num_samples` of them.

    """
    if ds == "training":
        data_path = self.train_data_path
        lbl_path = self.train_labels_path
    elif ds == "testing":
        data_path = self.test_data_path
        lbl_path = self.test_labels_path
    else:
        raise ValueError("ds must be 'training' or 'testing'")

    self.logger.info("Loading MNIST {:} dataset from {:}...".format(
        ds, MNIST_PATH))

    # Opening the labels data. The context manager closes the file
    # even if the magic number check fails
    with open(lbl_path, 'rb') as flbl:
        magic_nr, size = struct.unpack(">II", flbl.read(8))
        if magic_nr != 2049:
            raise ValueError('Magic number mismatch, expected 2049, '
                             'got {}'.format(magic_nr))
        lbl = array("b", flbl.read())

    # Opening the images data. `size` is overwritten with the number
    # of images declared by this file
    with open(data_path, 'rb') as fimg:
        magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
        if magic_nr != 2051:
            raise ValueError('Magic number mismatch, expected 2051, '
                             'got {}'.format(magic_nr))
        img = array("B", fimg.read())

    # Convert digits to tuple in case was passed as array/list
    digits = tuple(digits)

    # Number of samples per class
    if num_samples is not None:
        div = len(digits)
        # The remainder of the division is spread over the first classes
        per_class = [
            int(num_samples / div) + (1 if x < num_samples % div else 0)
            for x in range(div)]
        n_samples_class = {e: per_class[e_i]
                           for e_i, e in enumerate(digits)}
    else:  # No constraint on the number of samples
        n_samples_class = {e: size for e in digits}

    # Counter of already taken sample for a class
    count_samples_class = {e: 0 for e in digits}

    # Extract the indices of samples to load
    ind = []
    for k in range(size):
        digit = lbl[k]
        # Skip undesired digits and classes that are already full
        if digit in n_samples_class and \
                count_samples_class[digit] < n_samples_class[digit]:
            ind.append(k)
            count_samples_class[digit] += 1

    # Number of loaded samples
    num_loaded = sum(count_samples_class.values())

    # Check if dataset has enough samples
    if num_samples is not None and num_loaded < num_samples:
        min_val = min(count_samples_class.values())
        raise ValueError(
            "not enough samples in dataset for one or more of the "
            "desired classes ({:} available)".format(min_val))

    img_size = rows * cols
    images = CArray.zeros((len(ind), img_size), dtype=np.uint8)
    labels = CArray.zeros(len(ind), dtype=int)
    digs_array = CArray(digits)  # To use find method
    for i, sample_idx in enumerate(ind):
        start = sample_idx * img_size
        images[i, :] = CArray(img[start:start + img_size])
        # The new label is the position of the digit inside `digits`
        labels[i] = CArray(digs_array.find(digs_array == lbl[sample_idx]))

    # Report the image size read from the data file header
    header = CDatasetHeader(img_w=cols, img_h=rows, y_original=digits)

    return CDataset(images, labels, header=header)
def load(self, ds_path, img_format, label_dtype=None, load_data=True):
    """Load all images of specified format inside given path.

    The labels are read from the 'clients.txt' file, which must be
    available inside the dataset folder.

    Extra dataset attributes:
     - 'id': last `ds_path` folder.
     - 'img_w', 'img_h': size of the images in pixels.
     - 'img_c': images number of channels.
     - Any other custom attribute is retrieved from 'attributes.txt' file.
       Only attributes of `str` type are currently supported.

    Parameters
    ----------
    ds_path : str
        Full path to dataset folder.
    img_format : str
        Format of the files to load.
    label_dtype : str or dtype, optional
        Datatype of the labels. If None, labels will be strings.
    load_data : bool, optional
        If True (default) features will be stored.
        Otherwise store the paths to the files with dtype=object.

    Returns
    -------
    CDataset
        Dataset with the loaded patterns and labels.

    Raises
    ------
    OSError
        If the 'clients.txt' file is not available.
    ValueError
        If the number of patterns differs from the number of labels.

    """
    # Labels file MUST be available
    clients_path = fm.join(ds_path, 'clients.txt')
    if not fm.file_exist(clients_path):
        raise OSError("cannot load clients file.")

    # Ensuring 'img_format' always has an extension-like pattern
    img_ext = '.{:}'.format(img_format.strip('.').lower())

    # Empty arrays to be filled with the dimensions of each image
    img_w, img_h, img_c = (CArray([], dtype=int) for _ in range(3))

    # Load files!
    patterns, img_w, img_h, img_c = self._load_files(
        ds_path, img_w, img_h, img_c, img_ext, load_data=load_data)

    labels = CArray.load(clients_path, dtype=label_dtype).ravel()

    n_patterns = patterns.shape[0]
    if n_patterns != labels.size:
        raise ValueError("patterns ({:}) and labels ({:}) do not have "
                         "the same number of elements.".format(
                             n_patterns, labels.size))

    # Load the file with extra dataset attributes (optional)
    attributes_path = fm.join(ds_path, 'attributes.txt')
    if fm.file_exist(attributes_path):
        attributes = load_dict(attributes_path)
    else:
        attributes = dict()

    self.logger.info("Loaded {:} images from {:}...".format(
        patterns.shape[0], ds_path))

    ds_header = CDatasetHeader(
        id=fm.split(ds_path)[1],
        img_w=img_w, img_h=img_h, img_c=img_c,
        **attributes)

    return CDataset(patterns, labels, header=ds_header)
def load(self, file_path, dtype_samples=float, dtype_labels=float,
         n_features=None, zero_based=True, remove_all_zero=False,
         multilabel=False, load_infos=False):
    """Load a dataset stored in svmlight / libsvm format.

    The returned dataset stores the patterns in sparse format.

    Datasets must have only numerical feature indices and
    for every pattern indices must be ordered.

    Extra dataset attributes:
     - 'infos', CArray with inline comment for each sample.

    Parameters
    ----------
    file_path : str
        Path to the file where the dataset is stored
        in svmlight or libsvm format.
    dtype_samples : str or dtype, optional
        Data-type to which the samples should be casted. Default is float.
    dtype_labels : str or dtype, optional
        Data-type to which the labels should be casted. Default is float.
    n_features : None or int, optional
        The number of features to use.
        If None (default), it will be inferred. This argument is useful
        to load several files that are subsets of a bigger sliced
        dataset: each subset might not have examples of every feature,
        hence the inferred shape might vary from one slice to another.
    zero_based : bool, optional
        Whether column indices are zero-based (True, default) or
        one-based (False). If column indices are set to be one-based,
        they are transformed to zero-based to match
        Python/NumPy conventions.
    remove_all_zero : bool, optional
        If True every feature which is zero for every pattern
        will be removed from dataset. Default False.
    multilabel : bool, optional
        True if every sample can have more than one label. Default False.
        (see http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)
    load_infos : bool, optional
        If True, inline comments will be loaded from the svmlight file
        and stored in the infos CDataset parameter (as CArray).
        Default False.

    Returns
    -------
    dataset : CDataset
        Dataset object that contain patterns and labels.
        If `remove_all_zero` is set to True, the returned dataset
        will have the new argument `idx_mapping` with the mapping of
        the returned features to the original features's indices.

    Raises
    ------
    ValueError
        If `load_infos` is True and a line of the file contains
        the ' # ' separator more than once.

    Examples
    --------
    >>> from secml.data.loader import CDataLoaderSvmLight
    >>> from secml.array import CArray
    >>> patterns = CArray ([[1,0,2], [4,0,5]])
    >>> labels = CArray ([0, 1])
    >>> CDataLoaderSvmLight().dump(CDataset(patterns,labels), "myfile.libsvm")
    >>> new_dataset = CDataLoaderSvmLight().load("myfile.libsvm", remove_all_zero=True)
    >>> print(new_dataset.X)  # doctest: +NORMALIZE_WHITESPACE
    CArray( (0, 1) 2.0
      (0, 0) 1.0
      (1, 1) 5.0
      (1, 0) 4.0)
    >>> print(new_dataset.Y)
    CArray([0. 1.])
    >>> print(new_dataset.header.idx_mapping)
    CArray([0 2])

    """
    # Never use zero_based='auto' in order to avoid
    # any ambiguity with the features indices...
    raw_samples, raw_labels = load_svmlight_file(
        file_path,
        n_features=n_features,
        dtype=float,
        multilabel=multilabel,
        zero_based=zero_based)

    samples = CArray(raw_samples, tosparse=True, dtype=dtype_samples)
    targets = CArray(raw_labels, dtype=dtype_labels)

    ds_header = CDatasetHeader()  # Extra attributes are added below

    if remove_all_zero is True:
        samples, kept_idx = \
            CDataLoaderSvmLight._remove_all_zero_features(samples)
        # Store reverse mapping as extra ds attribute
        ds_header.idx_mapping = kept_idx

    if load_infos is True:
        comments = []
        with open(file_path, 'rt') as f:
            for line_idx, line in enumerate(f):
                pieces = line.split(' # ')
                # Line should have only one split point
                if len(pieces) > 2:
                    raise ValueError("Something wrong happened when "
                                     "extracting infos for line {:}"
                                     "".format(line_idx))
                if len(pieces) == 2:
                    comments.append(pieces[1].rstrip())
                else:  # No inline comment for this line
                    comments.append('')
        ds_header.infos = CArray(comments)

    # If the header is empty, None is stored in the dataset
    has_attributes = len(ds_header.get_params()) != 0

    return CDataset(samples, targets,
                    header=ds_header if has_attributes else None)
def load(self, ds_path, img_format, label_re=None,
         label_dtype=None, load_data=True):
    """Load all images of specified format inside given path.

    Each directory inside `ds_path` is explored recursively and
    the images stored in the leaf folders are loaded.

    The following custom CDataset attributes are available:
     - 'id': last `ds_path` folder.
     - 'img_w', 'img_h': size of the images in pixels.
     - 'img_c': images number of channels.
     - Any other custom attribute is retrieved from 'attributes.txt' file.
       Only attributes of `str` type are currently supported.

    Parameters
    ----------
    ds_path : str
        Full path to dataset folder.
    img_format : str
        Format of the files to load.
    label_re : re, optional
        Regular expression that identify the correct label.
        If None, the whole name of the leaf folder will be used as label.
    label_dtype : str or dtype, optional
        Datatype of the labels. If None, labels will be strings.
    load_data : bool, optional
        If True (default) features will be stored.
        Otherwise store the paths to the files with dtype=object.

    Returns
    -------
    CDataset
        Dataset with the loaded patterns and labels.

    Raises
    ------
    ValueError
        If the number of patterns differs from the number of labels.

    """
    # Ensuring 'img_format' always has an extension-like pattern
    img_ext = '.{:}'.format(img_format.strip('.').lower())

    # Empty arrays to be filled with the dimensions of each image
    img_w, img_h, img_c = (CArray([], dtype=int) for _ in range(3))

    # Each directory inside the provided path will be explored recursively
    # and, if leaf, contained images will be loaded
    patterns, labels, img_w, img_h, img_c = self._explore_dir(
        ds_path, img_w, img_h, img_c, img_ext,
        label_re=label_re, load_data=load_data)

    # Converting labels if requested
    if label_dtype is not None:
        labels = labels.astype(label_dtype)

    n_patterns = patterns.shape[0]
    if n_patterns != labels.size:
        raise ValueError("patterns ({:}) and labels ({:}) do not have "
                         "the same number of elements.".format(
                             n_patterns, labels.size))

    # Load the file with extra dataset attributes (optional)
    attributes_path = fm.join(ds_path, 'attributes.txt')
    if fm.file_exist(attributes_path):
        attributes = load_dict(attributes_path)
    else:
        attributes = dict()

    self.logger.info("Loaded {:} images from {:}...".format(
        patterns.shape[0], ds_path))

    ds_header = CDatasetHeader(
        id=fm.split(ds_path)[1],
        img_w=img_w, img_h=img_h, img_c=img_c,
        **attributes)

    return CDataset(patterns, labels, header=ds_header)