import os
from gzip import GzipFile as gfile

import numpy
import skdata.mnist.dataset


def main(params_, **kwargs):
    # `data_util` and `logistic` are provided by the surrounding benchmark package.
    print 'Params: ', params_
    dataset = skdata.mnist.dataset.MNIST()
    dataset.fetch(True)
    convex_inputs = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "train-images-idx3-ubyte.gz")))
    convex_labels = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "train-labels-idx1-ubyte.gz")))
    convex_test = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "t10k-images-idx3-ubyte.gz")))
    convex_test_labels = skdata.mnist.dataset.read(
        gfile(os.path.join(dataset.home(), "t10k-labels-idx1-ubyte.gz")))

    # Flatten the 28x28 images into 784-dimensional float32 vectors.
    convex_inputs = numpy.array(convex_inputs.reshape((-1, 784)),
                                dtype=numpy.float32)
    convex_test = numpy.array(convex_test.reshape((-1, 784)),
                              dtype=numpy.float32)

    fold = kwargs['fold']
    folds = kwargs['folds']
    downsize = kwargs['downsize']
    if folds == 1:
        # Fixed 50k/10k train/validation split, optionally downsized.
        train = convex_inputs[:int(50000 * downsize)]
        valid = convex_inputs[50000:60000]
        test = convex_test
        train_targets = convex_labels[:int(50000 * downsize)]
        valid_targets = convex_labels[50000:60000]
        test_targets = convex_test_labels
    elif folds > 1:
        # Cross-validate over the full 60k training set.
        cv_data = numpy.copy(convex_inputs[:60000])
        train, valid = data_util.prepare_cv_for_fold(cv_data, fold, folds)
        cv_labels = numpy.copy(convex_labels[:60000])
        train_targets, valid_targets = data_util.prepare_cv_for_fold(
            cv_labels, fold, folds)
        test = convex_test
        test_targets = convex_test_labels
    else:
        raise ValueError("Folds cannot be less than 1")

    # I do not know why 256, but this gives the same data as the one
    # obtained by Jasper Snoek.
    train /= 256
    valid /= 256
    kwargs['train'] = train
    kwargs['train_targets'] = train_targets
    kwargs['valid'] = valid
    kwargs['valid_targets'] = valid_targets
    y = logistic(params_, **kwargs)
    print 'Result: ', y
    return y
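# For folds > 1, the split above is delegated to data_util.prepare_cv_for_fold,
# whose source is not shown here. Below is a minimal, hypothetical sketch of the
# contiguous k-fold split it is assumed to perform (the real HPOlib helper may
# shuffle or interleave differently); the function name is ours, not the library's.
def prepare_cv_for_fold_sketch(data, fold, folds):
    """Hypothetical stand-in: hold out the `fold`-th of `folds` contiguous
    blocks as validation and return (train, valid)."""
    fold_size = len(data) // folds
    lo, hi = fold * fold_size, (fold + 1) * fold_size
    valid = data[lo:hi]
    train = numpy.concatenate((data[:lo], data[hi:]))
    return train, valid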
import cPickle
from gzip import GzipFile as gfile


def gsave(p, filename):
    """
    Same as ``save(p, filename)``, but saves into a gzipped file.
    """
    f = gfile(filename, 'wb')
    cPickle.dump(p, f, cPickle.HIGHEST_PROTOCOL)
    f.close()
def gload(filename):
    """
    Same as ``load(filename)``, but loads from a gzipped file.
    """
    f = gfile(filename, 'rb')
    y = cPickle.load(f)
    f.close()
    return y
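# A round-trip usage example for the two helpers above; the object and
# file name are arbitrary placeholders, not part of the original module.
params = {'learning_rate': 0.01, 'epochs': 10}
gsave(params, 'params.pkl.gz')
restored = gload('params.pkl.gz')
assert restored == params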
def obtain(dir_path):
    """
    Downloads the dataset to ``dir_path``.
    """
    dir_path = os.path.expanduser(dir_path)
    print 'Downloading the dataset'
    import urllib
    urllib.urlretrieve('http://ai.stanford.edu/~btaskar/ocr/letter.data.gz',
                       os.path.join(dir_path, 'letter.data.gz'))

    print 'Splitting dataset into training/validation/test sets'
    data_file = gfile(os.path.join(dir_path, 'letter.data.gz'))
    train_file, valid_file, test_file = [
        open(os.path.join(dir_path, 'ocr_letters_' + ds + '.txt'), 'w')
        for ds in ['train', 'valid', 'test']]
    letters = 'abcdefghijklmnopqrstuvwxyz'
    all_data = []
    s = ''
    # Putting all data in memory: one string per word, where each letter
    # contributes its pixel values followed by its target index.
    for line in data_file:
        tokens = line.strip('\n').strip('\t').split('\t')
        s += ' '.join(tokens[6:])
        target = letters.find(tokens[1])
        if target < 0:
            print 'Target ' + tokens[1] + ' not found!'
        s = s + ' ' + str(target)
        if int(tokens[2]) == -1:  # new word starts next
            s = s + '\n'
            all_data += [s]
            s = ''
        else:
            s = s + ' '

    # Shuffle data
    import random
    random.seed(12345)
    perm = range(len(all_data))
    random.shuffle(perm)
    line_id = 0
    train_valid_split = 5502
    valid_test_split = 5502 + 688
    for i in perm:
        s = all_data[i]
        if line_id < train_valid_split:
            train_file.write(s)
        elif line_id < valid_test_split:
            valid_file.write(s)
        else:
            test_file.write(s)
        line_id += 1
    train_file.close()
    valid_file.close()
    test_file.close()
    print 'Done'
def obtain(dir_path):
    """
    Downloads the dataset to ``dir_path``.
    """
    dir_path = os.path.expanduser(dir_path)
    print 'Downloading the dataset'
    import urllib
    urllib.urlretrieve(
        'http://info.usherbrooke.ca/hlarochelle/public/letter.data.gz',
        os.path.join(dir_path, 'letter.data.gz'))

    print 'Splitting dataset into training/validation/test sets'
    data_file = gfile(os.path.join(dir_path, 'letter.data.gz'))
    train_file, valid_file, test_file = [
        open(os.path.join(dir_path, 'ocr_letters_' + ds + '.txt'), 'w')
        for ds in ['train', 'valid', 'test']]
    letters = 'abcdefghijklmnopqrstuvwxyz'
    all_data = []
    # Putting all data in memory: one line per letter, holding its pixel
    # values followed by its target index.
    for line in data_file:
        tokens = line.strip('\n').strip('\t').split('\t')
        s = ''
        for t in range(6, len(tokens)):
            s = s + tokens[t] + ' '
        target = letters.find(tokens[1])
        if target < 0:
            print 'Target ' + tokens[1] + ' not found!'
        s = s + str(target) + '\n'
        all_data += [s]

    # Shuffle data
    import random
    random.seed(12345)
    perm = range(len(all_data))
    random.shuffle(perm)
    line_id = 0
    train_valid_split = 32152
    valid_test_split = 42152
    for i in perm:
        s = all_data[i]
        if line_id < train_valid_split:
            train_file.write(s)
        elif line_id < valid_test_split:
            valid_file.write(s)
        else:
            test_file.write(s)
        line_id += 1
    train_file.close()
    valid_file.close()
    test_file.close()
    print 'Done'
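# A small sketch of how the per-letter files written by the second `obtain`
# variant could be read back. It is not part of the original module: the
# function name is ours, and the 128-pixels-per-letter layout (16x8 binary
# images in the Taskar OCR data) is an assumption inferred from the format
# written above, where each line ends with the integer target index.
import numpy as np

def read_ocr_letters_sketch(path):
    """Hypothetical reader for the ocr_letters_*.txt files written above:
    each line holds the pixel values of one letter, then its target index."""
    inputs, targets = [], []
    for line in open(path):
        values = line.split()
        inputs.append([float(v) for v in values[:-1]])
        targets.append(int(values[-1]))
    return np.array(inputs, dtype=np.float32), np.array(targets)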
import logging
import pickle

# Module-level logger, assumed by the function below.
logger = logging.getLogger(__name__)


def load_file(filename, file_format, use_percentage):
    if not os.path.exists(filename):
        raise IOError("File %s not found" % filename)

    if file_format == "gfile":
        logger.info("Loading file: %s", filename)
        fh = gfile(filename, "rb")
        data = pickle.load(fh)
        if use_percentage < 100.:
            max_data = int(len(data) / 100. * use_percentage)
            data = data[:max_data]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints",
                    filename, len(data))
    elif file_format == "pickle":
        logger.info("Loading file: %s", filename)
        fh = open(filename, "rb")
        data = pickle.load(fh)
        if use_percentage < 100.:
            data = data[:int(len(data) / 100. * use_percentage)]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints",
                    filename, len(data))
    elif file_format == "numpy":
        logger.info("Loading file: %s", filename)
        fh = open(filename, "rb")
        data = np.load(fh)
        if use_percentage < 100.:
            data = data[:int(len(data) / 100. * use_percentage)]
        fh.close()
        logger.info("Done loading file: %s has %d datapoints",
                    filename, len(data))
    else:
        raise ValueError("%s is an unknown training_data_format" % file_format)
    return data
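# A usage sketch for load_file, assuming the module is importable as
# `data_util` (as the unit test below does); the file names are placeholders.
import data_util

# Load a gzipped pickle in full, and a plain pickle at 10% of its datapoints.
full_data = data_util.load_file('train_data.pkl.gz', 'gfile', 100)
small_data = data_util.load_file('train_data.pkl', 'pickle', 10)
print 'Loaded', len(full_data), 'and', len(small_data), 'datapoints'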
def test_load_file(self):
    # Test numpy arrays
    train_data = np.zeros((100, 100))
    np.save("train_data.npy", train_data)
    data = data_util.load_file("train_data.npy", "numpy", 100)
    self.assertTrue((train_data == data).all())
    data = data_util.load_file("train_data.npy", "numpy", 10)
    self.assertTrue((train_data[:10] == data).all())
    os.remove("train_data.npy")

    # Test pickle files
    train_data = np.zeros((100, 100))
    fh = open("train_data.pkl", "wb")
    cPickle.dump(train_data, fh)
    fh.close()
    data = data_util.load_file("train_data.pkl", "pickle", 100)
    self.assertTrue((train_data == data).all())
    data = data_util.load_file("train_data.pkl", "pickle", 10)
    self.assertTrue((train_data[:10] == data).all())
    os.remove("train_data.pkl")

    # Test gzipped pickle files
    train_data = np.zeros((100, 100))
    from gzip import GzipFile as gfile
    fh = gfile("train_data.pkl.gz", "wb")
    cPickle.dump(train_data, fh)
    fh.close()
    data = data_util.load_file("train_data.pkl.gz", "gfile", 100)
    self.assertTrue((train_data == data).all())
    data = data_util.load_file("train_data.pkl.gz", "gfile", 10)
    self.assertTrue((train_data[:10] == data).all())
    os.remove("train_data.pkl.gz")

    # Missing files must raise an IOError, whatever the format string.
    self.assertRaises(IOError, data_util.load_file, "test.sh", "uditare", 1)
    self.assertRaises(IOError, data_util.load_file, "", "uditare", 1)