def load_data(self): """ Fetch the MNIST dataset and load it into memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. Returns: tuple: Both training and test sets are returned. """ workdir, filepath = valid_path_append(self.path, '', self.filename) if not os.path.exists(filepath): fetch_file(self.url, self.filename, filepath, self.size) with gzip.open(filepath, 'rb') as f: self.train_set, self.valid_set = pickle_load(f) self.train_set = {'image': {'data': self.train_set[0].reshape(60000, 28, 28), 'axes': ('N', 'H', 'W')}, 'label': {'data': self.train_set[1], 'axes': ('N',)}} self.valid_set = {'image': {'data': self.valid_set[0].reshape(10000, 28, 28), 'axes': ('N', 'H', 'W')}, 'label': {'data': self.valid_set[1], 'axes': ('N',)}} return self.train_set, self.valid_set
def download_lsun(self, category, dset, tag='latest', overwrite=False):
    """
    Download LSUN data and unpack.

    Arguments:
        category (str): LSUN category (valid selections: lsun_categories)
        dset (str): dataset, "train", "val", or "test"
        tag (str, optional): version tag, defaults to most recent
        overwrite (bool): whether to overwrite existing data
    """
    dfile = 'test_lmdb' if dset == 'test' else '{0}_{1}_lmdb'.format(category, dset)

    self.filepath = filepath = valid_path_append(self.path, dfile)
    if not os.path.exists(filepath) or overwrite:
        filepath += '.zip'
        if not os.path.exists(filepath):
            url = LSUN.url + 'download.cgi?tag={0}&category={1}&set={2}'.format(tag, category, dset)
            print('Data download might take a long time.')
            print('Downloading {0} {1} set...'.format(category, dset))
            subprocess.call(['curl', url, '-o', filepath])
            # TODO: should change to fetch_file, but currently did not get the
            # correct "Content-length" or total_size
            # fetch_file(url, 'bedroom_train_lmdb.zip', filepath)

        print('Extracting {0} {1} set...'.format(category, dset))
        with zipfile.ZipFile(filepath, 'r') as zf:
            zf.extractall(self.path)
        print('Deleting {} ...'.format(filepath))
        os.remove(filepath)
    else:
        pass  # data already downloaded

    print("LSUN {0} {1} dataset downloaded and unpacked.".format(category, dset))
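# Hedged usage sketch for the downloader above. The constructor signature is
# an assumption; the "bedroom"/"train" values mirror the commented-out
# fetch_file call, and ``self.filepath`` is set by download_lsun itself.
def _example_lsun_download(path='.'):
    lsun = LSUN(path=path)                       # hypothetical constructor
    lsun.download_lsun(category='bedroom', dset='train', overwrite=False)
    return lsun.filepath                         # e.g. '<path>/bedroom_train_lmdb'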
def load_data(self):
    self.data_dict = {}
    workdir, filepath = valid_path_append(self.path, '', self.filename)
    if not os.path.exists(filepath):
        fetch_file(self.url, self.filename, filepath)

    with open(filepath, 'r') as f:
        tokens = f.read()

    train_samples = int(self.train_split * len(tokens))
    train = tokens[:train_samples]
    test = tokens[train_samples:]

    return train, test
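# Hedged sketch of the split performed above: ``train_split`` is a fraction
# and the raw character stream is cut at that point. The 0.9 default below is
# only an example value, not taken from the loader.
def _example_text_split(tokens, train_split=0.9):
    train_samples = int(train_split * len(tokens))
    return tokens[:train_samples], tokens[train_samples:]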
def load_data(self): """ Fetch the CIFAR-100 dataset and load it into memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. normalize (bool, optional): Whether to scale values between 0 and 1. Defaults to True. Returns: tuple: Both training and test sets are returned. """ workdir, filepath = valid_path_append(self.path, '', self.filename) if not os.path.exists(filepath): fetch_file(self.url, self.filename, filepath, self.size) batchdir = os.path.join(workdir, 'cifar-100-python') if not os.path.exists(os.path.join(batchdir)): assert os.path.exists(filepath), "Must have cifar-100-python.tar.gz" with tarfile.open(filepath, 'r:gz') as f: f.extractall(workdir) train_batches = [os.path.join(batchdir, 'train')] Xlist, ylist = [], [] for batch in train_batches: with open(batch, 'rb') as f: train_dict = pickle_load(f) Xlist.append(train_dict['data']) ylist.append(train_dict['coarse_labels']) X_train = np.vstack(Xlist).reshape(-1, 3, 32, 32) y_train = np.vstack(ylist).ravel() with open(os.path.join(batchdir, 'test'), 'rb') as f: test_dict = pickle_load(f) X_test, y_test = test_dict['data'], test_dict['coarse_labels'] X_test = X_test.reshape(-1, 3, 32, 32) self.train_set = {'image': {'data': X_train, 'axes': ('N', 'C', 'H', 'W')}, 'label': {'data': y_train, 'axes': ('N',)}} self.valid_set = {'image': {'data': X_test, 'axes': ('N', 'C', 'H', 'W')}, 'label': {'data': np.array(y_test), 'axes': ('N',)}} return self.train_set, self.valid_set
def load_data(self, test_split=0.2):
    self.data_dict = {}
    self.vocab = None
    workdir, filepath = valid_path_append(self.path, '', self.filename)
    if not os.path.exists(filepath):
        fetch_file(self.url, self.filename, filepath, self.filesize)

    with open(filepath, 'rb') as f:
        X, y = pickle_load(f)

    X = preprocess_text(X, self.vocab_size)
    X = pad_sentences(X,
                      pad_idx=self.pad_idx,
                      pad_to_len=self.sentence_length,
                      pad_from='left')

    if self.shuffle:
        indices = np.arange(len(y))
        np.random.shuffle(indices)
        X = X[indices]
        y = np.asarray(y)[indices]

    # split the data
    split_idx = int(len(X) * (1 - test_split))
    X_train = X[:split_idx]
    y_train = y[:split_idx]
    X_test = X[split_idx:]
    y_test = y[split_idx:]

    y_train = np.array(y_train)
    y_test = np.array(y_test)

    self.nclass = 1 + max(np.max(y_train), np.max(y_test))

    self.data_dict['train'] = {'review': {'data': X_train,
                                          'axes': ('N', 'REC')},
                               'label': {'data': y_train,
                                         'axes': ('N',)}}
    self.data_dict['valid'] = {'review': {'data': X_test,
                                          'axes': ('N', 'REC')},
                               'label': {'data': y_test,
                                         'axes': ('N',)}}
    return self.data_dict
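# Hedged sketch of the padding step used above, assuming pad_from='left' means
# padding is added on the left so tokens end up right-aligned. This helper
# reimplements that behaviour with plain numpy purely for illustration; it is
# not the library's pad_sentences.
def _pad_from_left(sentences, pad_idx, pad_to_len):
    out = np.full((len(sentences), pad_to_len), pad_idx, dtype=np.int32)
    for i, sent in enumerate(sentences):
        trimmed = sent[-pad_to_len:]                   # keep at most the last pad_to_len tokens
        out[i, pad_to_len - len(trimmed):] = trimmed   # right-align, pad on the left
    return out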
def load_data(self):
    self.data_dict = {}
    for phase in ['train', 'test']:
        filename = self.filemap[phase]['filename']
        workdir, filepath = valid_path_append(self.path, '', filename)
        if not os.path.exists(filepath):
            for file_name, file_id in GOOGLE_DRIVE_IDS.items():
                destination = './' + file_name
                print('\nDownloading and unzipping traveling salesman data {} released '
                      'with Pointer Networks paper\n'.format(file_name))
                self.download_file_from_google_drive(file_id, destination)
                with zipfile.ZipFile(destination, 'r') as z:
                    z.extractall('./')

        cities = int(re.search(r'\d+', filename).group())
        print('Loading and preprocessing tsp{} {} data...'.format(cities, phase))

        with open(filepath, 'r') as f:
            X, y, y_teacher = [], [], []
            for i, line in tqdm(enumerate(f)):
                inputs, outputs = line.split('output')
                X.append(np.array([float(j) for j in inputs.split()]).reshape([-1, 2]))
                y.append(np.array([int(j) - 1 for j in outputs.split()])[:-1])  # delete last
                # teacher forcing array as decoder's input while training
                y_teacher.append([X[i][j - 1] for j in y[i]])

        X = np.array(X)
        y = np.array(y)
        y_teacher = np.array(y_teacher)

        self.data_dict[phase] = {'inp_txt': X,
                                 'tgt_txt': y,
                                 'teacher_tgt': y_teacher}

    return self.data_dict
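# Hedged sketch of the line format parsed above: flattened 2-D city
# coordinates, the literal token "output", then a 1-based tour. The sample
# line below is made up for illustration; only the parsing mirrors load_data.
def _example_tsp_line():
    line = '0.2 0.7 0.8 0.1 0.5 0.5 output 1 3 2 1'
    inputs, outputs = line.split('output')
    X = np.array([float(j) for j in inputs.split()]).reshape([-1, 2])  # (3, 2) city coordinates
    y = np.array([int(j) - 1 for j in outputs.split()])[:-1]           # 0-based tour, last entry dropped
    return X, y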
def load_data(self, data_directory=None, manifest_file=None):
    """
    Create a manifest file for the requested dataset. First downloads the
    dataset and extracts it, if necessary.

    Arguments:
        data_directory (str): Path to data directory. Defaults to <path>/<version>
        manifest_file (str): Path to manifest file. Defaults to <path>/manifest.tsv

    Returns:
        Path to manifest file
    """
    if manifest_file is None:
        if self.manifest_file is not None:
            manifest_file = self.manifest_file
        else:
            manifest_file = os.path.join(self.path, "manifest.tsv")

    if os.path.exists(manifest_file):
        return manifest_file

    # Download the file
    workdir, filepath = valid_path_append(self.path, '', self.source_file)
    if not os.path.exists(filepath):
        fetch_file(self.url, self.source_file, filepath)

    # Untar the file
    if data_directory is None:
        data_directory = os.path.join(self.path, self.version)
    if not os.path.exists(data_directory):
        print("Extracting tar file to {}".format(data_directory))
        with contextlib.closing(tarfile.open(filepath)) as tf:
            tf.extractall(data_directory)

    # Ingest the file
    ingest_librispeech(data_directory, manifest_file)

    return manifest_file
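# Hedged usage sketch: the class name ``Librispeech`` and its constructor are
# assumptions. load_data above only returns the path to the manifest .tsv
# written by ingest_librispeech, downloading and extracting the tarball first
# if needed.
def _example_librispeech_manifest(path='.'):
    dataset = Librispeech(path=path)             # hypothetical constructor
    manifest = dataset.load_data()               # e.g. '<path>/manifest.tsv'
    return manifest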
def load_data(self):
    self.data_dict = {}
    self.vocab = None
    for phase in ['train', 'test', 'valid']:
        filename, filesize = self.filemap[phase]['filename'], self.filemap[phase]['size']
        workdir, filepath = valid_path_append(self.path, '', filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, filename, filepath, filesize)

        with open(filepath, 'r') as f:
            tokens = f.read()  # add tokenization here if necessary
        if self.use_words:
            tokens = tokens.strip().split()

        self.vocab = sorted(set(tokens)) if self.vocab is None else self.vocab

        # vocab dicts
        self.token_to_index = dict((t, i) for i, t in enumerate(self.vocab))
        self.index_to_token = dict((i, t) for i, t in enumerate(self.vocab))

        # map tokens to indices
        X = np.asarray([self.token_to_index[t] for t in tokens], dtype=np.uint32)
        if self.shift_target:
            y = np.concatenate((X[1:], X[:1]))
        else:
            y = X.copy()

        self.data_dict[phase] = {'inp_txt': X, 'tgt_txt': y}

    return self.data_dict
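# Hedged sketch of the input/target relationship built above: with
# shift_target enabled the target sequence is the input rotated left by one
# token, so tgt_txt[i] is the token that follows inp_txt[i]. The indices below
# are made up for illustration, not real corpus indices.
def _example_shifted_target():
    X = np.asarray([5, 9, 2, 7], dtype=np.uint32)
    y = np.concatenate((X[1:], X[:1]))           # -> [9, 2, 7, 5]
    return X, y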