def pickle_save(variable, filename='result', sliceno=None, temp=None):
    """Pickle ``variable`` into a file opened through ``FileWriteMove``.

    The file name is whatever ``full_filename(filename, '.pickle', sliceno)``
    returns.  Nothing is written when ``temp`` compares equal to
    ``Temp.DEBUG`` (and is not the literal ``True``) while ``--debug`` is
    absent from ``argv``.
    """
    target = full_filename(filename, '.pickle', sliceno)
    debug_only = temp == Temp.DEBUG and temp is not True
    if debug_only and '--debug' not in argv:
        return
    # Protocol 2 keeps the pickle readable from python2 as well.
    with FileWriteMove(target, temp) as out:
        pickle.dump(variable, out, 2)
def _download_norb_small(dataset):
    """
    Download the small resized Norb dataset and cache it as a pickle.

    Four ``.mat`` files are fetched into ``dataset``, loaded with ``loadmat``
    and written to ``<dataset>/norbsmall32x32.cpkl`` as the list
    ``[train_x, train_t, test_x, test_t]``.

    :param dataset: Directory path the downloaded files and the pickle are
        written into (file names are appended after a ``/``).
    :return: None
    """
    print('Downloading small resized norb data')

    base_url = 'http://dl.dropbox.com/u/13294233/smallnorb/smallnorb-'
    # (remote file suffix, local file name), in download order.
    downloads = (
        ('5x46789x9x18x6x2x32x32-training-dat-matlab-bicubic.mat',
         'smallnorb_train_x.mat'),
        ('5x46789x9x18x6x2x96x96-training-cat-matlab.mat',
         'smallnorb_train_t.mat'),
        ('5x01235x9x18x6x2x32x32-testing-dat-matlab-bicubic.mat',
         'smallnorb_test_x.mat'),
        ('5x01235x9x18x6x2x96x96-testing-cat-matlab.mat',
         'smallnorb_test_t.mat'),
    )
    for remote_suffix, local_name in downloads:
        urllib.urlretrieve(base_url + remote_suffix,
                           dataset + '/' + local_name)

    def _load_images(local_name, key):
        data = loadmat(dataset + '/' + local_name)[key]
        # NOTE(review): the same slice (index 0 on axis 1) is concatenated
        # with itself.  If the second slice (index 1) was intended for the
        # second half this is a bug -- confirm before changing.
        return np.concatenate(
            [data[:, 0, :].T, data[:, 0, :].T]).astype('float32')

    def _load_labels(local_name, key):
        labels = loadmat(dataset + '/' + local_name)[key]
        labels = labels.flatten().astype('float32')
        # Labels are repeated to line up with the doubled image array.
        return np.concatenate([labels, labels])

    train_x = _load_images('smallnorb_train_x.mat', 'traindata')
    train_t = _load_labels('smallnorb_train_t.mat', 'trainlabels')
    test_x = _load_images('smallnorb_test_x.mat', 'testdata')
    test_t = _load_labels('smallnorb_test_t.mat', 'testlabels')

    # HIGHEST_PROTOCOL is a binary pickle format: the file must be opened in
    # binary mode or the data is corrupted (Windows) / rejected (python3).
    with open(dataset + '/norbsmall32x32.cpkl', 'wb') as f:
        cPkl.dump([train_x, train_t, test_x, test_t], f,
                  protocol=cPkl.HIGHEST_PROTOCOL)
def save_state():
    """Save the program state, for debugging purposes.

    The relax data store singleton is pickled (protocol 1) into a file named
    ``relax_state_<YYYYMMDD>_<HHMMSS>`` using the local time.  When the name
    ``bz2`` (defined outside this function) is true, the file is written
    through ``BZ2File`` and gets a ``.bz2`` suffix.  A message naming the
    file is written to stderr.

    If the ``data_store`` module cannot be imported, the function returns
    without doing anything.
    """

    # relax data store singleton import.  Must be done here!
    try:
        from data_store import Relax_data_store; ds = Relax_data_store()

    # Ok, this is not relax so don't do anything!
    except ImportError:
        return

    # Append the date and time to the save file.
    now = time.localtime()
    file_name = "relax_state_%i%02i%02i_%02i%02i%02i" % tuple(now[:6])

    # Open the file for writing.
    if bz2:
        sys.stderr.write("\n\nStoring the relax state in the file '%s.bz2'.\n\n\n" % file_name)
        state_file = BZ2File(file_name+'.bz2', 'w')
    else:
        sys.stderr.write("\n\nStoring the relax state in the file '%s'.\n\n\n" % file_name)
        # Pickle protocol 1 is a binary format, so open in binary mode.
        state_file = open(file_name, 'wb')

    # Pickle the data class and write it to file, always closing the handle
    # even if pickling fails.
    try:
        pickle.dump(ds, state_file, 1)
    finally:
        state_file.close()
def _download_omniglot(dataset):
    """
    Download the omniglot dataset and cache it as ``<dataset>/omniglot.cpkl``.

    Both image archives are downloaded and extracted into ``dataset``.  Every
    PNG found below the extracted directories is read, resized to 32x32
    (bicubic), flattened to one dimension and mapped through
    ``abs(image - 255) / 255``.  Images whose path contains ``16.png`` ..
    ``20.png`` go to the test set, all others to the train set.  The extracted
    directories are removed afterwards and ``[train, test]`` is pickled.

    :param dataset: Directory path used for the downloads, the extraction and
        the resulting pickle.
    :return: None (the data is only written to disk).
    """
    # NOTE(review): imread/imresize were deprecated in SciPy 1.0 and are gone
    # from recent releases; this import needs an old SciPy -- confirm the
    # pinned version.
    from scipy.misc import imread, imresize

    origin_eval = (
        "https://github.com/brendenlake/omniglot/"
        "raw/master/python/images_evaluation.zip"
    )
    origin_back = (
        "https://github.com/brendenlake/omniglot/"
        "raw/master/python/images_background.zip"
    )
    eval_zip = dataset + '/images_evaluation.zip'
    back_zip = dataset + '/images_background.zip'

    print('Downloading data from %s' % origin_eval)
    urllib.urlretrieve(origin_eval, eval_zip)
    print('Downloading data from %s' % origin_back)
    urllib.urlretrieve(origin_back, back_zip)

    for archive in (eval_zip, back_zip):
        with zipfile.ZipFile(archive, "r") as z:
            z.extractall(dataset)

    background = dataset + '/images_background'
    evaluation = dataset + '/images_evaluation'

    # Collect every PNG, background images first, then evaluation images.
    matches = []
    for top in (background, evaluation):
        for root, _dirnames, filenames in os.walk(top):
            matches.extend(os.path.join(root, filename)
                           for filename in fnmatch.filter(filenames, '*.png'))

    def _load_image(fn):
        image = imread(fn, True)
        image = imresize(image, (32, 32), interp='bicubic')
        image = image.reshape((-1))
        image = np.abs(image-255.)/255.
        return image

    test_markers = ('16.png', '17.png', '18.png', '19.png', '20.png')
    train = []
    test = []
    for p in matches:
        if any(marker in p for marker in test_markers):
            test.append(_load_image(p))
        else:
            train.append(_load_image(p))

    shutil.rmtree(background+'/')
    shutil.rmtree(evaluation+'/')

    test = np.asarray(test)
    train = np.asarray(train)

    # HIGHEST_PROTOCOL is a binary pickle format: open the file in binary
    # mode so the data is not corrupted (Windows) or rejected (python3).
    with open(dataset+'/omniglot.cpkl', 'wb') as f:
        cPkl.dump([train, test], f, protocol=cPkl.HIGHEST_PROTOCOL)
def _download_lwf(dataset, size):
    """
    Fetch the LFW people dataset and store it as a gzipped pickle.

    The pickle holds the list ``[images, target]`` where the images are cast
    to ``uint8``.

    :param dataset: Path of the gzip file to write.
    :param size: Passed as ``resize`` to ``fetch_lfw_people``.
    :return: None
    """
    from sklearn.datasets import fetch_lfw_people

    lfw_people = fetch_lfw_people(color=True, resize=size)
    payload = [lfw_people.images.astype('uint8'), lfw_people.target]

    # The context manager closes the gzip stream even if pickling fails, so
    # no half-open handle is left behind.
    with gzip.open(dataset, 'wb') as f:
        cPkl.dump(payload, f, protocol=cPkl.HIGHEST_PROTOCOL)
def store(self, key, value):
    """Pickle ``(time.time(), value)`` into the file for ``key``.

    The target path comes from ``self._get_path(key)``.  The write happens
    while holding ``self.lock`` and the file lock returned by
    ``self._lock_file(path)``; both are released, and the data file is
    closed, even when opening the file or pickling ``value`` raises.

    :param key: Identifier passed to ``self._get_path``.
    :param value: Object to pickle together with the current timestamp.
    :raises: Whatever ``open`` or ``pickle.dump`` raise; the exception is
        propagated after cleanup.
    """
    path = self._get_path(key)
    self.lock.acquire()
    try:
        # acquire the file lock; it must be released no matter what happens
        # while writing
        f_lock = self._lock_file(path)
        try:
            with open(path, 'wb') as datafile:
                pickle.dump((time.time(), value), datafile)
        finally:
            self._unlock_file(f_lock)
    finally:
        self.lock.release()
def _download_svhn(dataset, extra):
    """
    Download the SVHN dataset and cache it as pickle files.

    :param dataset: Path prefix every file name is appended to directly; no
        separator is inserted, so it presumably ends with a path separator
        (confirm against the caller).
    :param extra: If true, only the "extra" split is downloaded and written to
        ``svhn_extra.cpkl``.  Otherwise the train and test splits are
        downloaded and written to ``svhn_train.cpkl`` and ``svhn_test.cpkl``.
    :return: None.  The downloaded ``.mat`` files are deleted after the
        pickles have been written.
    """
    base_url = 'http://ufldl.stanford.edu/housenumbers/'

    def _load_split(mat_path):
        mat = loadmat(mat_path)
        # The swapaxes chain moves the last axis of 'X' to the front.
        x = mat['X'].swapaxes(2,3).swapaxes(1,2).swapaxes(0,1)
        # Labels are flattened and shifted down by one.
        y = mat['y'].reshape((-1)) - 1
        return x, y

    def _dump(payload, pickle_path):
        # HIGHEST_PROTOCOL is a binary pickle format: the file must be opened
        # in binary mode.  cPkl is used throughout, matching the protocol
        # constant and the other download helpers in this file.
        with open(pickle_path, 'wb') as f:
            cPkl.dump(payload, f, protocol=cPkl.HIGHEST_PROTOCOL)

    print('Downloading data from http://ufldl.stanford.edu/housenumbers/, ' \
          'this may take a while...')
    if extra:
        extra_mat = dataset+'extra_32x32.mat'
        print("Downloading extra data...")
        urllib.urlretrieve(base_url + 'extra_32x32.mat', extra_mat)

        extra_x, extra_y = _load_split(extra_mat)

        print("Saving extra data")
        _dump([extra_x,extra_y], dataset +'svhn_extra.cpkl')
        os.remove(extra_mat)
    else:
        train_mat = dataset+'train_32x32.mat'
        test_mat = dataset+'test_32x32.mat'
        print("Downloading train data...")
        urllib.urlretrieve(base_url + 'train_32x32.mat', train_mat)
        print("Downloading test data...")
        urllib.urlretrieve(base_url + 'test_32x32.mat', test_mat)

        train_x, train_y = _load_split(train_mat)
        test_x, test_y = _load_split(test_mat)

        print("Saving train data")
        _dump([train_x,train_y], dataset +'svhn_train.cpkl')
        print("Saving test data")
        _dump([test_x,test_y], dataset +'svhn_test.cpkl')
        os.remove(train_mat)
        os.remove(test_mat)

# helper function for converting chars to matrix format def create_matrix(reviews, y_cls): num_seqs = len(reviews) X = np.zeros((num_seqs, max_len), dtype='int32') -1 # set all to -1 for row in range(num_seqs): review = reviews[row] for col in range(len(review)): # try to look up key otherwise use unk_idx if review[col] in char2idx: char_idx = char2idx[review[col]] else: char_idx = unk_idx X[row, col] = char_idx mask = (X != -1).astype('float32') 
X[X==-1] = 0 y = np.ones(num_seqs, dtype='int32')*y_cls return X, y, mask X_pos, y_pos, mask_pos = create_matrix(pos_lst, 1) X_neg, y_neg, mask_neg = create_matrix(neg_lst, 0) X = np.concatenate([X_pos, X_neg], axis=0) y = np.concatenate([y_pos, y_neg], axis=0) mask = np.concatenate([mask_pos, mask_neg]) print("-"*40) print("Minium length filter :", minimum_len) print("Maximum length filter:", maximum_len) if minimum_len is not None: seq_lens = mask.sum(axis=1) keep = seq_lens >= minimum_len print("Seqs below minimum : %i" % np.invert(keep).sum()) X = X[keep, :] y = y[keep] mask = mask[keep, :] if maximum_len is not None: seq_lens = mask.sum(axis=1) keep = seq_lens <= maximum_len print("Seqs above maximum : %i" % np.invert(keep).sum()) X = X[keep, :] y = y[keep] mask = mask[keep, :] np.random.seed(seed) p = np.random.permutation(X.shape[0]) X = X[p] y = y[p] mask = mask[p] seq_lens = mask.sum(axis=1).astype('int32') print("X :", X.shape, X.dtype) print("y :", y.shape, y.dtype) print("mask :", mask.shape, mask.dtype) print("MIN length : ", seq_lens.min()) print("MAX length : ", seq_lens.max()) print("MEAN length : ", seq_lens.mean()) print("UNKOWN chars : ", np.sum(X==unk_idx)) print("-"*40) # check that idx's in X is the number of vocab_size + unk_idx n = vocab_size if isinstance(vocab_size, int) else len(vocab_size) assert len(np.unique(X)) == n + 1 assert sum(np.unique(y)) == 1 # check that y is 0,1 return X, y, mask, vocab