def load_test(CVsplit):
    """Load the background/signal test arrays for one cross-validation split.

    Parameters
    ----------
    CVsplit : str or int
        Split identifier interpolated into the .npy.gz file names.

    Returns
    -------
    tuple
        (xb_test, tb_test, xs_test, ts_test) — background features with
        all-zero targets, signal features with all-one targets, all float32
        with targets of shape (n, 1).
    """
    # print() call form (single argument) behaves identically under
    # Python 2 and 3 and matches the print() usage elsewhere in this file;
    # the original used a Python-2-only print statement.
    print("loadTest started!")
    xb_test = utils.load_gz('data/numpy/test/Btst%s.npy.gz' % CVsplit).astype('float32')
    tb_test = np.zeros((xb_test.shape[0], 1), dtype='float32')
    xs_test = utils.load_gz('data/numpy/test/Stst%s.npy.gz' % CVsplit).astype('float32')
    ts_test = np.ones((xs_test.shape[0], 1), dtype='float32')
    return xb_test, tb_test, xs_test, ts_test
def load_test(CVsplit):
    """Load the background/signal test arrays for one cross-validation split.

    Parameters
    ----------
    CVsplit : str or int
        Split identifier interpolated into the .npy.gz file names.

    Returns
    -------
    tuple
        (xb_test, tb_test, xs_test, ts_test) — background features with
        all-zero targets, signal features with all-one targets, all float32
        with targets of shape (n, 1).
    """
    # Replaced the Python-2-only print statement with the print() call
    # form, which is valid in both Python 2 and 3 for a single argument.
    print("loadTest started!")
    xb_test = utils.load_gz('data/numpy/test/Btst%s.npy.gz' % CVsplit).astype('float32')
    tb_test = np.zeros((xb_test.shape[0], 1), dtype='float32')
    xs_test = utils.load_gz('data/numpy/test/Stst%s.npy.gz' % CVsplit).astype('float32')
    ts_test = np.ones((xs_test.shape[0], 1), dtype='float32')
    return xb_test, tb_test, xs_test, ts_test
def load_data(CVsplit):
    """Load train and validation background/signal arrays for a CV split.

    Parameters
    ----------
    CVsplit : str or int
        Split identifier interpolated into the .npy.gz file names.

    Returns
    -------
    tuple
        (xb_train, xb_valid, tb_train, tb_valid,
         xs_train, xs_valid, ts_train, ts_valid) — background (B*) arrays
        get all-zero targets, signal (S*) arrays all-one targets, targets
        shaped (n, 1), everything float32.
    """
    # Replaced the Python-2-only print statement with the print() call
    # form, matching the print() usage elsewhere in this file.
    print("loadData started!")
    xb_train = utils.load_gz('data/numpy/train/Btrn%s.npy.gz' % CVsplit).astype('float32')
    tb_train = np.zeros((xb_train.shape[0], 1), dtype='float32')
    xs_train = utils.load_gz('data/numpy/train/Strn%s.npy.gz' % CVsplit).astype('float32')
    ts_train = np.ones((xs_train.shape[0], 1), dtype='float32')
    xb_valid = utils.load_gz('data/numpy/train/Bval%s.npy.gz' % CVsplit).astype('float32')
    tb_valid = np.zeros((xb_valid.shape[0], 1), dtype='float32')
    xs_valid = utils.load_gz('data/numpy/train/Sval%s.npy.gz' % CVsplit).astype('float32')
    ts_valid = np.ones((xs_valid.shape[0], 1), dtype='float32')
    return xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid, ts_train, ts_valid
def load_test(CVsplit):
    """Download the test data if missing, then load B/S test arrays.

    Parameters
    ----------
    CVsplit : str or int
        Split identifier interpolated into the .npy.gz file names.

    Returns
    -------
    tuple
        (xb_test, tb_test, xs_test, ts_test) — background features with
        1-D all-zero targets, signal features with 1-D all-one targets.
    """
    if not os.path.isfile(TEST_PATH):
        print("Downloading and extracting test ...")
        # Fixed helper scripts, not untrusted input, so shell=True is
        # tolerable here; kept as-is to preserve behavior.
        subprocess.call("bash download_test.sh", shell=True)
        subprocess.call("python create_data.py test", shell=True)
    else:
        # Was a Python-2-only print statement, inconsistent with the
        # print() calls above; single-argument print() works in both.
        print("Test already downloaded ...")
    xb_test = utils.load_gz('data/numpy/test/Btst%s.npy.gz' % CVsplit).astype('float32')
    # Note: targets here are 1-D (shape (n,)), unlike the (n, 1) variants
    # elsewhere in this file — preserved as-is.
    tb_test = np.zeros((xb_test.shape[0]), dtype='float32')
    xs_test = utils.load_gz('data/numpy/test/Stst%s.npy.gz' % CVsplit).astype('float32')
    ts_test = np.ones((xs_test.shape[0]), dtype='float32')
    return xb_test, tb_test, xs_test, ts_test
def load_data(CVsplit):
    """Load train and validation background/signal arrays for a CV split.

    Parameters
    ----------
    CVsplit : str or int
        Split identifier interpolated into the .npy.gz file names.

    Returns
    -------
    tuple
        (xb_train, xb_valid, tb_train, tb_valid,
         xs_train, xs_valid, ts_train, ts_valid) — background (B*) arrays
        get all-zero targets, signal (S*) arrays all-one targets, targets
        shaped (n, 1), everything float32.
    """
    # Replaced the Python-2-only print statement with the cross-version
    # print() call form used by the other loaders in this file.
    print("loadData started!")
    xb_train = utils.load_gz('data/numpy/train/Btrn%s.npy.gz' % CVsplit).astype('float32')
    tb_train = np.zeros((xb_train.shape[0], 1), dtype='float32')
    xs_train = utils.load_gz('data/numpy/train/Strn%s.npy.gz' % CVsplit).astype('float32')
    ts_train = np.ones((xs_train.shape[0], 1), dtype='float32')
    xb_valid = utils.load_gz('data/numpy/train/Bval%s.npy.gz' % CVsplit).astype('float32')
    tb_valid = np.zeros((xb_valid.shape[0], 1), dtype='float32')
    xs_valid = utils.load_gz('data/numpy/train/Sval%s.npy.gz' % CVsplit).astype('float32')
    ts_valid = np.ones((xs_valid.shape[0], 1), dtype='float32')
    return xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid, ts_train, ts_valid
def get_train():
    """Build the training and validation datasets from the raw train file.

    Reshapes the gzipped input to (5534, 700, 57), converts it into
    per-sequence instances, then uses the first 90% of sequences for
    training and the remainder for validation.

    Returns:
        (X_train, labels_train, X_valid, labels_valid), each as a
        Python list.
    """
    print('[Info] Build training and validation dataset ...')
    raw = load_gz(TRAIN_PATH)
    cube = np.reshape(raw, (5534, 700, 57))
    del raw
    cube = cube[:, :, :]
    instances, instance_labels = get_instances_from_file(cube)
    # getting meta
    total = np.size(instances, 0)
    order = np.arange(0, total)
    print('[Info] Num of training and validation dataset: ', total)
    # 90/10 split point (historically the hard-coded value 5278)
    cut = int(0.9 * total)
    train_idx = order[0:cut]
    valid_idx = order[cut:total]
    X_train = instances[train_idx]
    labels_train = instance_labels[train_idx]
    X_valid = instances[valid_idx]
    labels_valid = instance_labels[valid_idx]
    return X_train.tolist(), labels_train.tolist(), X_valid.tolist(), labels_valid.tolist()
def load_train(CVsplit):
    """Download the training data if missing, then load train/valid arrays.

    Parameters
    ----------
    CVsplit : str or int
        Split identifier interpolated into the .npy.gz file names.

    Returns
    -------
    tuple
        (xb_train, xb_valid, tb_train, tb_valid,
         xs_train, xs_valid, ts_train, ts_valid) — background (B*) arrays
        get 1-D all-zero targets, signal (S*) arrays 1-D all-one targets.
    """
    if not os.path.isfile(TRAIN_PATH):
        print("Downloading and extracting train ...")
        # Fixed helper scripts, not untrusted input, so shell=True is
        # tolerable here; kept as-is to preserve behavior.
        subprocess.call("bash download_train.sh", shell=True)
        subprocess.call("python create_data.py train", shell=True)
    else:
        # Was a Python-2-only print statement, inconsistent with the
        # print() calls above; single-argument print() works in both.
        print("Train already downloaded ...")
    xb_train = utils.load_gz('data/numpy/train/Btrn%s.npy.gz' % CVsplit).astype('float32')
    tb_train = np.zeros((xb_train.shape[0]), dtype='float32')
    xs_train = utils.load_gz('data/numpy/train/Strn%s.npy.gz' % CVsplit).astype('float32')
    ts_train = np.ones((xs_train.shape[0]), dtype='float32')
    xb_valid = utils.load_gz('data/numpy/train/Bval%s.npy.gz' % CVsplit).astype('float32')
    tb_valid = np.zeros((xb_valid.shape[0]), dtype='float32')
    xs_valid = utils.load_gz('data/numpy/train/Sval%s.npy.gz' % CVsplit).astype('float32')
    ts_valid = np.ones((xs_valid.shape[0]), dtype='float32')
    return xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid, ts_train, ts_valid
def get_train(path=TRAIN_PATH):
    """Load the CullPDB training data and split it 5278/256 train/valid.

    Parameters
    ----------
    path : str
        Path to the gzipped training array (defaults to TRAIN_PATH); the
        download script is invoked if the file is absent.

    Returns
    -------
    tuple
        (X_train, X_valid, labels_train, labels_valid, mask_train,
        mask_valid, num_seq_train): float features, int32 class labels,
        float masks (1 = real residue), and the training-set size.
    """
    if not os.path.isfile(path):
        print("Train path is not downloaded ...")
        subprocess.call("./download_train.sh", shell=True)
    else:
        print("Train path is downloaded ...")
    print("Loading train data ...")
    X_in = utils.load_gz(path)
    # Each row is one protein: 700 residue positions x 57 features.
    X = np.reshape(X_in, (5534, 700, 57))
    del X_in
    # Columns 22-29: one-hot secondary-structure labels.
    labels = X[:, :, 22:30]
    # Column 30 flags padding positions; flip so 1 marks real residues.
    mask = X[:, :, 30] * -1 + 1
    # Feature columns: amino-acid one-hot (0..20) + sequence profile (35..55).
    cols = np.hstack((np.arange(0, 21), np.arange(35, 56)))
    X = X[:, :, cols]
    # getting meta
    num_seqs = np.size(X, 0)
    #### REMAKING LABELS ####
    X = X.astype(theano.config.floatX)
    mask = mask.astype(theano.config.floatX)
    # Collapse the one-hot labels to integer class ids. np.dot with a 1-D
    # vector broadcasts over the leading axes, replacing the previous
    # per-sequence Python loop with a single vectorized call.
    labels = np.dot(labels, np.arange(0, 8)).astype('int32')
    print("Loading splits ...")
    ##### SPLITS #####
    # Fixed historical split; no shuffling (matches the original).
    seq_names = np.arange(0, num_seqs)
    X_train = X[seq_names[0:5278]]
    X_valid = X[seq_names[5278:5534]]
    labels_train = labels[seq_names[0:5278]]
    labels_valid = labels[seq_names[5278:5534]]
    mask_train = mask[seq_names[0:5278]]
    mask_valid = mask[seq_names[5278:5534]]
    num_seq_train = np.size(X_train, 0)
    # NOTE(review): num_seq_valid was computed but never returned in the
    # original; dropped here to avoid a dead local. Add it to the return
    # tuple (a backward-incompatible change) if callers need it.
    return X_train, X_valid, labels_train, labels_valid, mask_train, \
        mask_valid, num_seq_train
def get_test():
    """Build the testing dataset from the raw gzipped test file.

    Reshapes the input to (514, 700, 57) and converts it into
    per-sequence instances.

    Returns:
        (X_test, labels_test), each as a Python list.
    """
    print('[Info] Build testing dataset ...')
    raw = load_gz(TEST_PATH)
    cube = np.reshape(raw, (514, 700, 57))
    del raw
    cube = cube[:, :, :]
    instances, instance_labels = get_instances_from_file(cube)
    print('[Info] Num of test dataset: ', np.size(instances, 0))
    return instances.tolist(), instance_labels.tolist()
def get_train():
    """Load the CullPDB training data and split it 5278/256 train/valid.

    Returns
    -------
    tuple
        (x_train, x_valid, labels_train, labels_valid, mask_train,
        mask_valid, num_seq_train): float features, int32 class labels,
        float masks (1 = real residue), and the training-set size.
    """
    if not os.path.isfile(TRAIN_PATH):
        print("No Training Data Available ...")
    else:
        print("Training Data is Available ...")
    print("Loading train data ...")
    x_in = utils.load_gz(TRAIN_PATH)
    # Each row is one protein: 700 residue positions x 57 features.
    x = np.reshape(x_in, (5534, 700, 57))
    del x_in
    x = x[:, :, :]
    # Columns 22-29: one-hot secondary-structure labels.
    labels = x[:, :, 22:30]
    # Column 30 flags padding positions; flip so 1 marks real residues.
    mask = x[:, :, 30] * -1 + 1
    amino_acid_residues = np.arange(0, 21)
    sequence_profile = np.arange(35, 56)
    horizontal_stack = np.hstack((amino_acid_residues, sequence_profile))
    x = x[:, :, horizontal_stack]
    print('x: ', x)
    # getting meta
    # BUG FIX: this assignment was commented out in the original even
    # though num_seqs_row is used below, which raised NameError at runtime.
    num_seqs_row = np.size(x, 0)
    seqlen_column = np.size(x, 1)
    # REMAKING LABELS #
    x = x.astype(theano.config.floatX)
    mask = mask.astype(theano.config.floatX)
    # Dummy -> concat: collapse one-hot labels to integer class ids.
    vals = np.arange(0, 8)
    labels_new = np.zeros((num_seqs_row, seqlen_column))
    for i in xrange(np.size(labels, axis=0)):
        labels_new[i, :] = np.dot(labels[i, :, :], vals)
    labels_new = labels_new.astype('int32')
    labels = labels_new
    print("labels: ", labels)
    print("Loading splits ...")
    # SPLITS #
    seq_names = np.arange(0, num_seqs_row)
    x_train = x[seq_names[0:5278]]
    x_valid = x[seq_names[5278:5534]]
    labels_train = labels[seq_names[0:5278]]
    labels_valid = labels[seq_names[5278:5534]]
    mask_train = mask[seq_names[0:5278]]
    mask_valid = mask[seq_names[5278:5534]]
    num_seq_train = np.size(x_train, 0)
    return x_train, x_valid, labels_train, labels_valid, mask_train, mask_valid, num_seq_train
def get_test(seq_len=None):
    """Load and prepare the CB513 test set, padding the batch dimension.

    Parameters
    ----------
    seq_len : int, optional
        If given, truncate all sequences to this length.

    Returns
    -------
    tuple
        (X_test, mask_test, labels_test, num_seq_test, len_test) where
        num_seq_test is the number of real (unpadded) sequences and
        len_test is the per-sequence residue count from the mask.
    """
    if not os.path.isfile(TEST_PATH):
        subprocess.call("./download_test.sh", shell=True)
    print("Loading test data ...")
    X_test_in = utils.load_gz(TEST_PATH)
    # Each row is one protein: 700 residue positions x 57 features.
    X_test = np.reshape(X_test_in, (514, 700, 57))
    del X_test_in
    X_test = X_test[:, :, :].astype("float32")
    # Columns 22-29: one-hot secondary-structure labels.
    labels_test = X_test[:, :, 22:30].astype('int32')
    # Column 30 flags padding positions; flip so 1 marks real residues.
    mask_test = X_test[:, :, 30].astype("float32") * -1 + 1
    # Feature columns: amino-acid one-hot (0..20) + sequence profile (35..55).
    a = np.arange(0, 21)
    b = np.arange(35, 56)
    c = np.hstack((a, b))
    X_test = X_test[:, :, c]
    # getting meta
    seqlen = np.size(X_test, 1)
    d = np.size(X_test, 2)
    num_classes = 8
    num_seq_test = np.size(X_test, 0)
    del a, b, c
    ## DUMMY -> CONCAT ##
    # Collapse the one-hot labels to integer class ids, row by row.
    vals = np.arange(0, 8)
    labels_new = np.zeros((num_seq_test, seqlen))
    for i in xrange(np.size(labels_test, axis=0)):
        labels_new[i, :] = np.dot(labels_test[i, :, :], vals)
    labels_new = labels_new.astype('int32')
    labels_test = labels_new
    ### ADDING BATCH PADDING ###
    # 126 all-zero sequences round the 514 real sequences up to 640.
    X_add = np.zeros((126, seqlen, d))
    label_add = np.zeros((126, seqlen))
    mask_add = np.zeros((126, seqlen))
    # NOTE(review): the X_test concatenation is disabled while labels_test
    # and mask_test ARE padded, so X_test keeps 514 rows but the others
    # grow to 640 — confirm downstream code pads X separately, otherwise
    # this looks like a batch-dimension mismatch.
    # X_test = np.concatenate((X_test, X_add), axis=0).astype("float32")
    labels_test = np.concatenate((labels_test, label_add), axis=0).astype('int32')
    mask_test = np.concatenate((mask_test, mask_add), axis=0).astype("float32")
    if seq_len is not None:
        X_test = X_test[:, :seq_len]
        labels_test = labels_test[:, :seq_len]
        mask_test = mask_test[:, :seq_len]
    len_test = np.sum(mask_test, axis=1)
    # Padded rows get length 1 instead of 0, presumably to avoid
    # divide-by-zero downstream — TODO confirm against callers.
    len_test[-126:] = np.ones((126, ), dtype='int32')
    return X_test, mask_test, labels_test, num_seq_test, len_test
def make_dataset(path):
    """Load a gzipped CullPDB-style array and split it into model inputs.

    Args:
        path: path to the gzipped .npy file.

    Returns:
        (X, y, seq_len): X is (N, 42, 700) float32 features (channels
        first), y is (N, 700) float32 class ids, seq_len is (N,) float32
        per-sequence residue counts.
    """
    raw = load_gz(path).reshape(-1, 700, 57)
    # Features: amino-acid one-hot (cols 0-20) + sequence profile (cols 35-55),
    # transposed to channels-first layout.
    feature_cols = np.append(np.arange(21), np.arange(35, 56))
    features = raw[:, :, feature_cols].transpose(0, 2, 1).astype('float32')
    # Labels: collapse the 8-way one-hot (cols 22-29) to a class index.
    onehot = raw[:, :, 22:30]
    class_ids = np.arange(8)
    targets = np.array([np.dot(row, class_ids) for row in onehot]).astype('float32')
    # Mask: column 30 is the padding flag; 1 - flag marks real residues.
    valid = raw[:, :, 30] * -1 + 1
    lengths = valid.sum(axis=1).astype('float32')
    return features, targets, lengths
def get_test():
    """Load and prepare the test set, zero-padding the batch dimension.

    Loads the gzipped test file, reshapes it to (514, 700, 57), extracts
    features (amino-acid one-hot + profile), integer class labels and a
    residue mask, then appends 126 all-zero sequences so the batch
    dimension becomes 640.

    Returns
    -------
    tuple
        (x_test, mask_test, labels_test, num_seq_test) where
        num_seq_test is the number of REAL (unpadded) sequences, 514.
    """
    if not os.path.isfile(TEST_PATH):
        print("Test Data Unavailable")
    print("Loading test data ...")
    x_test_in = utils.load_gz(TEST_PATH)
    # Each row is one protein: 700 residue positions x 57 features.
    x_test = np.reshape(x_test_in, (514, 700, 57))
    del x_test_in
    x_test = x_test[:, :, :].astype(theano.config.floatX)
    # Columns 22-29: one-hot secondary-structure labels.
    labels_test = x_test[:, :, 22:30].astype('int32')
    # Column 30 flags padding positions; flip so 1 marks real residues.
    mask_test = x_test[:, :, 30].astype(theano.config.floatX) * -1 + 1
    # Feature columns: amino-acid one-hot (0..20) + sequence profile (35..55).
    cols = np.hstack((np.arange(0, 21), np.arange(35, 56)))
    x_test = x_test[:, :, cols]
    # getting meta
    seqlen = np.size(x_test, 1)
    d = np.size(x_test, 2)
    num_seq_test = np.size(x_test, 0)
    # DUMMY -> CONCAT #
    # Collapse the one-hot labels to integer class ids. np.dot with a 1-D
    # vector broadcasts over the leading axes, replacing the previous
    # per-sequence Python loop with a single vectorized call.
    labels_test = np.dot(labels_test, np.arange(0, 8)).astype('int32')
    # ADDING BATCH PADDING #
    # 126 all-zero sequences (mask 0) round the 514 real ones up to 640.
    x_add = np.zeros((126, seqlen, d))
    label_add = np.zeros((126, seqlen))
    mask_add = np.zeros((126, seqlen))
    x_test = np.concatenate((x_test, x_add), axis=0).astype(theano.config.floatX)
    labels_test = np.concatenate((labels_test, label_add), axis=0).astype('int32')
    mask_test = np.concatenate((mask_test, mask_add), axis=0).astype(theano.config.floatX)
    return x_test, mask_test, labels_test, num_seq_test
def load_data(CVsplit):
    """Load all six B/S splits (train/test/valid), converting CSV on demand.

    Parameters
    ----------
    CVsplit : str or int
        Split identifier interpolated into the .npy.gz file names.

    Returns
    -------
    tuple
        (xb_train, xb_valid, xb_test, tb_train, tb_valid, tb_test,
         xs_train, xs_valid, xs_test, ts_train, ts_valid, ts_test) —
        background (B*) arrays get all-zero targets, signal (S*) arrays
        all-one targets, targets shaped (n, 1), everything float32.
    """
    # Python-2-only print statements replaced by the cross-version
    # print() call form; also dropped a stray semicolon and the
    # redundant parentheses around the if-condition.
    print("loadData started!")
    # A mismatch between CSV and numpy file counts means conversion
    # has not (fully) run yet.
    if len(glob.glob('./data/csv/*')) != len(glob.glob('./data/numpy/*')):
        print("converting data ...")
        convert_data()
    xb_train = utils.load_gz('data/numpy/Btrn%s.npy.gz' % CVsplit).astype('float32')
    tb_train = np.zeros((xb_train.shape[0], 1), dtype='float32')
    xs_train = utils.load_gz('data/numpy/Strn%s.npy.gz' % CVsplit).astype('float32')
    ts_train = np.ones((xs_train.shape[0], 1), dtype='float32')
    xb_test = utils.load_gz('data/numpy/Btst%s.npy.gz' % CVsplit).astype('float32')
    tb_test = np.zeros((xb_test.shape[0], 1), dtype='float32')
    xs_test = utils.load_gz('data/numpy/Stst%s.npy.gz' % CVsplit).astype('float32')
    ts_test = np.ones((xs_test.shape[0], 1), dtype='float32')
    xb_valid = utils.load_gz('data/numpy/Bval%s.npy.gz' % CVsplit).astype('float32')
    tb_valid = np.zeros((xb_valid.shape[0], 1), dtype='float32')
    xs_valid = utils.load_gz('data/numpy/Sval%s.npy.gz' % CVsplit).astype('float32')
    ts_valid = np.ones((xs_valid.shape[0], 1), dtype='float32')
    return xb_train, xb_valid, xb_test, tb_train, tb_valid, tb_test, xs_train, xs_valid, xs_test, ts_train, ts_valid, ts_test
def _prepare_data(self):
    """Populate self.samples, building the cached archive on first use.

    If the gzipped training archive already exists it is loaded directly;
    otherwise the raw data is loaded and preprocessed (which is expected
    to produce the archive for subsequent runs).
    """
    cache_path = "data/train.npy.gz"
    if os.path.exists(cache_path):
        self.samples = utils.load_gz(cache_path)
    else:
        self._load_data()
        self._preprocess_data()
""" Created on Wed Jun 17 12:00:27 2015 @author: alexander """ import numpy as np import cPickle as pickle import sklearn import sklearn.cross_validation import theano import utils ## From data print("Loading train data ...") X_in = utils.load_gz('data/cullpdb+profile_6133_filtered.npy.gz') X = np.reshape(X_in, (5534, 700, 57)) del X_in X = X[:, 0:100, :] labels = X[:, :, 22:30] mask = X[:, :, 30] a = np.arange(0, 21) b = np.arange(35, 56) c = np.hstack((a, b)) X = X[:, :, c] # getting meta num_seqs = np.size(X, 0) seqlen = np.size(X, 1) d = np.size(X, 2)
paths_train = glob.glob("data/train/*/*") paths_train.sort() paths_test = glob.glob("data/test/*") paths_test.sort() paths = { 'train': paths_train, 'test': paths_test, } # labels_train = np.zeros(len(paths['train']), dtype='int32') # for k, path in enumerate(paths['train']): # class_name = os.path.basename(os.path.dirname(path)) # labels_train[k] = class_names.index(class_name) labels_train = utils.load_gz("data/labels_train.npy.gz") default_augmentation_params = { 'zoom_range': (1 / 1.1, 1.1), 'rotation_range': (0, 360), 'shear_range': (0, 0), 'translation_range': (-4, 4), 'do_flip': True, 'allow_stretch': False, } no_augmentation_params = { 'zoom_range': (1.0, 1.0), 'rotation_range': (0, 0), 'shear_range': (0, 0), 'translation_range': (0, 0),
""" Created on Wed Jun 17 12:00:27 2015 @author: alexander """ import numpy as np import cPickle as pickle import sklearn import sklearn.cross_validation import theano import utils ## From data print("Loading train data ...") X_in = utils.load_gz('data/cullpdb+profile_6133_filtered.npy.gz') X = np.reshape(X_in,(5534,700,57)) del X_in X = X[:,0:100,:] labels = X[:,:,22:30] mask = X[:,:,30] a = np.arange(0,21) b = np.arange(35,56) c = np.hstack((a,b)) X = X[:,:,c] # getting meta num_seqs = np.size(X,0) seqlen = np.size(X,1)
import numpy as np
import theano
import utils
import load_protvec

##### TRAIN DATA #####
print("Loading train data ...")
protein_vector_file = "data/protVec_100d_3grams_clean.csv"
# Toggle for appending ProtVec embeddings to the feature channels.
addProtVec = False
X_in = utils.load_gz("data/cullpdb+profile_6133_filtered.npy.gz")
# Each row is one protein: 700 residue positions x 57 per-residue features.
X = np.reshape(X_in, (5534, 700, 57))
del X_in
X = X[:, :, :]
# Columns 22-29 hold the one-hot secondary-structure labels.
labels = X[:, :, 22:30]
# Column 30 flags padding positions; flip so 1 marks real residues.
mask = X[:, :, 30] * -1 + 1
# Feature columns: amino-acid one-hot (0..20) plus sequence profile (35..55).
a = np.arange(0, 21)
b = np.arange(35, 56)
c = np.hstack((a, b))
X = X[:, :, c]
# If using ProtVec
# http://arxiv.org/abs/1503.05140
if addProtVec:
    # Compute 100-d ProtVec embeddings and append them as extra channels.
    ProtVec_train = load_protvec.load_protvec_encoding(X, mask, protein_vector_file, protvec_dim=100)
    newX = np.zeros((X.shape[0], X.shape[1], X.shape[2] + ProtVec_train.shape[2]))
    newX[:, :, : X.shape[2]] = X
    newX[:, :, X.shape[2] :] = ProtVec_train
    X = newX
# Sort the per-class training images and the flat test images.
# NOTE(review): paths_train is defined above the visible chunk.
paths_train.sort()
paths_test = glob.glob("data/test/*")
paths_test.sort()
paths = {
    'train': paths_train,
    'test': paths_test,
}

# Labels were previously derived from the directory names; they are now
# loaded from a precomputed archive instead.
# labels_train = np.zeros(len(paths['train']), dtype='int32')
# for k, path in enumerate(paths['train']):
# class_name = os.path.basename(os.path.dirname(path))
# labels_train[k] = class_names.index(class_name)
labels_train = utils.load_gz("data/labels_train.npy.gz")

# Augmentation ranges applied during training.
default_augmentation_params = {
    'zoom_range': (1 / 1.1, 1.1),
    'rotation_range': (0, 360),
    'shear_range': (0, 0),
    'translation_range': (-4, 4),
    'do_flip': True,
    'allow_stretch': False,
}
# Identity transform used when augmentation is disabled.
# NOTE(review): this dict literal is truncated in the visible source.
no_augmentation_params = {
    'zoom_range': (1.0, 1.0),
    'rotation_range': (0, 0),
    'shear_range': (0, 0),
def load(subset='train'):
    """Load every image of the given subset into memory.

    Reading the whole gzipped array once avoids repeated disk access
    during later processing.

    Args:
        subset: dataset partition name, e.g. 'train' or 'test'.

    Returns:
        The array stored in data/images_<subset>.npy.gz.
    """
    archive = "data/images_%s.npy.gz" % subset
    return utils.load_gz(archive)
import numpy as np
import theano
import utils

##### TRAIN DATA #####
print("Loading train data ...")
X_in = utils.load_gz("data/cullpdb+profile_6133_filtered.npy.gz")
# Each row is one protein: 700 residue positions x 57 per-residue features.
X = np.reshape(X_in, (5534, 700, 57))
del X_in
X = X[:, :, :]
# Columns 22-29 hold the one-hot secondary-structure labels.
labels = X[:, :, 22:30]
# Column 30 flags padding positions; flip so 1 marks real residues.
mask = X[:, :, 30] * -1 + 1
# Feature columns: amino-acid one-hot (0..20) plus sequence profile (35..55).
a = np.arange(0, 21)
b = np.arange(35, 56)
c = np.hstack((a, b))
X = X[:, :, c]
# If using ProtVec
# http://arxiv.org/abs/1503.05140
addProtVec = True
if addProtVec:
    # NOTE(review): load_gz is called on a ".npy" (not ".npy.gz") path —
    # confirm utils.load_gz handles uncompressed files or that the file
    # name is correct.
    ProtVec = utils.load_gz("data/X_train_protvec.npy")
    # Append the precomputed ProtVec embeddings as extra feature channels.
    newX = np.zeros((X.shape[0], X.shape[1], X.shape[2] + ProtVec.shape[2]))
    newX[:, :, : X.shape[2]] = X
    newX[:, :, X.shape[2] :] = ProtVec
    X = newX
    del newX
# getting meta
# Class names are the basenames of the per-class directories.
# NOTE(review): `directories` is defined above the visible chunk.
class_names = [os.path.basename(d) for d in directories]
class_names.sort()
num_classes = len(class_names)

DEFAULT_VALIDATION_SPLIT = './data/validation_split_v1.pkl'

# Collect and sort the per-class training images and the flat test images.
paths_train = glob.glob("data/train/*/*")
paths_train.sort()
paths_test = glob.glob("data/test/*")
paths_test.sort()
paths = {'train': paths_train, 'test': paths_test}

labels_train_path = 'data/labels_train.npy.gz'
labels_train = utils.load_gz(labels_train_path)


class LoadMethod(object):
    """Callable that loads one greyscale image by index.

    When labels are supplied, calling an instance returns the
    (image, label) pair; otherwise only the image is returned.
    """

    def __init__(self, paths, labels=None):
        # paths: indexable collection of image file paths.
        # labels: optional per-path label array; presence switches the
        # return value of __call__ to an (image, label) tuple.
        self.paths = paths
        self.labels = labels

    def __call__(self, idx):
        """Load image `idx` as float32, with its int32 label if available."""
        if self.labels is not None:
            return skimage.io.imread(self.paths[idx], as_grey=True).astype(
                'float32'), self.labels[idx].astype('int32')
        else:
            return skimage.io.imread(self.paths[idx], as_grey=True).astype('float32')