Пример #1
0
def load_test(CVsplit):
    """Load the 'B' and 'S' test arrays for one cross-validation split.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btst``/``Stst``
            archive names under ``data/numpy/test/``.

    Returns:
        Tuple ``(xb_test, tb_test, xs_test, ts_test)``. The ``x*`` arrays are
        the loaded data cast to float32; ``tb_test`` is an all-zero and
        ``ts_test`` an all-one float32 column of shape ``(n_rows, 1)``.
    """
    # A parenthesised single-argument print behaves the same on Python 2
    # and Python 3; the bare print statement is a SyntaxError on Python 3.
    print("loadTest started!")
    xb_test = utils.load_gz('data/numpy/test/Btst%s.npy.gz' % CVsplit).astype('float32')
    tb_test = np.zeros((xb_test.shape[0], 1), dtype='float32')
    xs_test = utils.load_gz('data/numpy/test/Stst%s.npy.gz' % CVsplit).astype('float32')
    ts_test = np.ones((xs_test.shape[0], 1), dtype='float32')

    return xb_test, tb_test, xs_test, ts_test
Пример #2
0
def load_test(CVsplit):
    """Load the 'B' and 'S' test arrays for one cross-validation split.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btst``/``Stst``
            archive names under ``data/numpy/test/``.

    Returns:
        Tuple ``(xb_test, tb_test, xs_test, ts_test)``: the two loaded arrays
        cast to float32, each paired with a float32 target column of shape
        ``(n_rows, 1)`` (zeros for the 'B' array, ones for the 'S' array).
    """
    # print(...) with one argument is valid on both Python 2 and Python 3,
    # unlike the print statement used previously.
    print("loadTest started!")
    xb_test = utils.load_gz('data/numpy/test/Btst%s.npy.gz' %
                            CVsplit).astype('float32')
    tb_test = np.zeros((xb_test.shape[0], 1), dtype='float32')
    xs_test = utils.load_gz('data/numpy/test/Stst%s.npy.gz' %
                            CVsplit).astype('float32')
    ts_test = np.ones((xs_test.shape[0], 1), dtype='float32')

    return xb_test, tb_test, xs_test, ts_test
Пример #3
0
def load_data(CVsplit):
    """Load the 'B' and 'S' training and validation arrays for one split.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btrn``/``Strn``/
            ``Bval``/``Sval`` archive names under ``data/numpy/train/``.

    Returns:
        Tuple ``(xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid,
        ts_train, ts_valid)``. The ``x*`` arrays are the loaded data cast to
        float32. The ``tb_*`` targets are all-zero and the ``ts_*`` targets
        all-one float32 columns of shape ``(n_rows, 1)``.
    """
    # A parenthesised single-argument print works on Python 2 and Python 3.
    print("loadData started!")
    xb_train = utils.load_gz('data/numpy/train/Btrn%s.npy.gz' % CVsplit).astype('float32')
    tb_train = np.zeros((xb_train.shape[0], 1), dtype='float32')
    xs_train = utils.load_gz('data/numpy/train/Strn%s.npy.gz' % CVsplit).astype('float32')
    ts_train = np.ones((xs_train.shape[0], 1), dtype='float32')
    xb_valid = utils.load_gz('data/numpy/train/Bval%s.npy.gz' % CVsplit).astype('float32')
    tb_valid = np.zeros((xb_valid.shape[0], 1), dtype='float32')
    xs_valid = utils.load_gz('data/numpy/train/Sval%s.npy.gz' % CVsplit).astype('float32')
    ts_valid = np.ones((xs_valid.shape[0], 1), dtype='float32')

    return xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid, ts_train, ts_valid
Пример #4
0
def load_test(CVsplit):
    """Fetch the test data if needed, then load one split's 'B'/'S' arrays.

    When ``TEST_PATH`` is not an existing file, ``download_test.sh`` and
    ``create_data.py test`` are run first. Their exit status is not checked,
    so a failed download surfaces as an error from the loads below.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btst``/``Stst``
            archive names under ``data/numpy/test/``.

    Returns:
        Tuple ``(xb_test, tb_test, xs_test, ts_test)``. The ``x*`` arrays are
        the loaded data cast to float32; ``tb_test`` is an all-zero and
        ``ts_test`` an all-one 1-D float32 vector with one entry per row.
    """
    if not os.path.isfile(TEST_PATH):
        print("Downloading and extracting test ...")
        # Fixed commands are passed as argument lists, so no shell is spawned.
        subprocess.call(["bash", "download_test.sh"])
        subprocess.call(["python", "create_data.py", "test"])
    else:
        # Parenthesised form matches the branch above and runs on Python 3.
        print("Test already downloaded ...")
    xb_test = utils.load_gz('data/numpy/test/Btst%s.npy.gz' % CVsplit).astype('float32')
    tb_test = np.zeros(xb_test.shape[0], dtype='float32')
    xs_test = utils.load_gz('data/numpy/test/Stst%s.npy.gz' % CVsplit).astype('float32')
    ts_test = np.ones(xs_test.shape[0], dtype='float32')

    return xb_test, tb_test, xs_test, ts_test
Пример #5
0
def load_test(CVsplit):
    """Fetch the test data if needed, then load one split's 'B'/'S' arrays.

    When ``TEST_PATH`` is not an existing file, ``download_test.sh`` and
    ``create_data.py test`` are run first. Their exit status is not checked,
    so a failed download surfaces as an error from the loads below.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btst``/``Stst``
            archive names under ``data/numpy/test/``.

    Returns:
        Tuple ``(xb_test, tb_test, xs_test, ts_test)``: the two loaded arrays
        cast to float32, each paired with a 1-D float32 target vector (zeros
        for the 'B' array, ones for the 'S' array).
    """
    if not os.path.isfile(TEST_PATH):
        print("Downloading and extracting test ...")
        # Argument lists avoid going through a shell for fixed commands.
        subprocess.call(["bash", "download_test.sh"])
        subprocess.call(["python", "create_data.py", "test"])
    else:
        # Same print form as the branch above; also valid on Python 3.
        print("Test already downloaded ...")
    xb_test = utils.load_gz('data/numpy/test/Btst%s.npy.gz' %
                            CVsplit).astype('float32')
    tb_test = np.zeros(xb_test.shape[0], dtype='float32')
    xs_test = utils.load_gz('data/numpy/test/Stst%s.npy.gz' %
                            CVsplit).astype('float32')
    ts_test = np.ones(xs_test.shape[0], dtype='float32')

    return xb_test, tb_test, xs_test, ts_test
Пример #6
0
def load_data(CVsplit):
    """Load the 'B' and 'S' training and validation arrays for one split.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btrn``/``Strn``/
            ``Bval``/``Sval`` archive names under ``data/numpy/train/``.

    Returns:
        Tuple ``(xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid,
        ts_train, ts_valid)``. Each ``x*`` array is the loaded data cast to
        float32. Each ``t*`` array is a float32 column of shape
        ``(n_rows, 1)``, zeros for 'B' data and ones for 'S' data.
    """
    # print(...) with one argument is valid on both Python 2 and Python 3.
    print("loadData started!")
    xb_train = utils.load_gz('data/numpy/train/Btrn%s.npy.gz' %
                             CVsplit).astype('float32')
    tb_train = np.zeros((xb_train.shape[0], 1), dtype='float32')
    xs_train = utils.load_gz('data/numpy/train/Strn%s.npy.gz' %
                             CVsplit).astype('float32')
    ts_train = np.ones((xs_train.shape[0], 1), dtype='float32')
    xb_valid = utils.load_gz('data/numpy/train/Bval%s.npy.gz' %
                             CVsplit).astype('float32')
    tb_valid = np.zeros((xb_valid.shape[0], 1), dtype='float32')
    xs_valid = utils.load_gz('data/numpy/train/Sval%s.npy.gz' %
                             CVsplit).astype('float32')
    ts_valid = np.ones((xs_valid.shape[0], 1), dtype='float32')

    return xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid, ts_train, ts_valid
Пример #7
0
def get_train():
    """Build the training and validation instance lists.

    The archive at ``TRAIN_PATH`` is reshaped to ``(5534, 700, 57)`` and
    handed to ``get_instances_from_file``. The first 90% of the returned
    instances (rounded down) form the training set and the rest form the
    validation set.

    Returns:
        Tuple ``(X_train, labels_train, X_valid, labels_valid)``, each
        converted to nested Python lists with ``tolist()``.
    """
    print('[Info] Build training and validation dataset ...')
    raw = np.reshape(load_gz(TRAIN_PATH), (5534, 700, 57))

    instances, targets = get_instances_from_file(raw)

    total = np.size(instances, 0)
    print('[Info] Num of training and validation dataset: ', total)

    # 90/10 split in file order (no shuffling).
    cut = int(0.9 * total)
    order = np.arange(0, total)
    train_rows = order[0:cut]
    valid_rows = order[cut:total]

    train_x = instances[train_rows]
    train_y = targets[train_rows]
    valid_x = instances[valid_rows]
    valid_y = targets[valid_rows]

    return (train_x.tolist(), train_y.tolist(),
            valid_x.tolist(), valid_y.tolist())
Пример #8
0
def load_train(CVsplit):
    """Fetch the training data if needed, then load one split's arrays.

    When ``TRAIN_PATH`` is not an existing file, ``download_train.sh`` and
    ``create_data.py train`` are run first. Their exit status is not checked,
    so a failed download surfaces as an error from the loads below.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btrn``/``Strn``/
            ``Bval``/``Sval`` archive names under ``data/numpy/train/``.

    Returns:
        Tuple ``(xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid,
        ts_train, ts_valid)``. The ``x*`` arrays are the loaded data cast to
        float32. The ``t*`` arrays are 1-D float32 vectors with one entry per
        row, zeros for 'B' data and ones for 'S' data.
    """
    if not os.path.isfile(TRAIN_PATH):
        print("Downloading and extracting train ...")
        # Fixed commands are passed as argument lists, so no shell is spawned.
        subprocess.call(["bash", "download_train.sh"])
        subprocess.call(["python", "create_data.py", "train"])
    else:
        # Parenthesised form matches the branch above and runs on Python 3.
        print("Train already downloaded ...")
    xb_train = utils.load_gz('data/numpy/train/Btrn%s.npy.gz' % CVsplit).astype('float32')
    tb_train = np.zeros(xb_train.shape[0], dtype='float32')
    xs_train = utils.load_gz('data/numpy/train/Strn%s.npy.gz' % CVsplit).astype('float32')
    ts_train = np.ones(xs_train.shape[0], dtype='float32')
    xb_valid = utils.load_gz('data/numpy/train/Bval%s.npy.gz' % CVsplit).astype('float32')
    tb_valid = np.zeros(xb_valid.shape[0], dtype='float32')
    xs_valid = utils.load_gz('data/numpy/train/Sval%s.npy.gz' % CVsplit).astype('float32')
    ts_valid = np.ones(xs_valid.shape[0], dtype='float32')

    return xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid, ts_train, ts_valid
Пример #9
0
def get_train(path=TRAIN_PATH):
    """Load the training archive and split it into train/validation sets.

    If ``path`` is not an existing file, ``./download_train.sh`` is run
    first (its exit status is not checked). The archive is reshaped to
    ``(5534, 700, 57)``. Rows 0-5277 form the training set and rows
    5278-5533 the validation set, in file order.

    Args:
        path: Location of the gzipped training archive.

    Returns:
        Tuple ``(X_train, X_valid, labels_train, labels_valid, mask_train,
        mask_valid, num_seq_train)``:

        * ``X_*``: feature columns 0-20 and 35-55, cast to
          ``theano.config.floatX``.
        * ``labels_*``: int32, one value per position (see below).
        * ``mask_*``: ``1 - column 30``, cast to ``theano.config.floatX``.
        * ``num_seq_train``: number of rows in ``X_train``.
    """
    if not os.path.isfile(path):
        print("Train path is not downloaded ...")
        subprocess.call("./download_train.sh", shell=True)
    else:
        print("Train path is downloaded ...")
    print("Loading train data ...")
    X = np.reshape(utils.load_gz(path), (5534, 700, 57))
    labels = X[:, :, 22:30]
    mask = X[:, :, 30] * -1 + 1

    # Keep feature columns 0-20 and 35-55 only.
    feature_idx = np.hstack((np.arange(0, 21), np.arange(35, 56)))
    X = X[:, :, feature_idx].astype(theano.config.floatX)
    mask = mask.astype(theano.config.floatX)

    # Collapse the 8 label columns into one integer per position with a
    # single dot product over the last axis. This is the class index when
    # the columns are one-hot (assumed -- TODO confirm against the dataset).
    labels = np.dot(labels, np.arange(0, 8)).astype('int32')

    print("Loading splits ...")
    # Fixed, unshuffled split; plain slices avoid copying the large arrays.
    X_train, X_valid = X[0:5278], X[5278:5534]
    labels_train, labels_valid = labels[0:5278], labels[5278:5534]
    mask_train, mask_valid = mask[0:5278], mask[5278:5534]
    num_seq_train = np.size(X_train, 0)
    return X_train, X_valid, labels_train, labels_valid, mask_train, \
           mask_valid, num_seq_train
Пример #10
0
def get_test():
    """Build the test instance lists.

    The archive at ``TEST_PATH`` is reshaped to ``(514, 700, 57)`` and handed
    to ``get_instances_from_file``.

    Returns:
        Tuple ``(X_test, labels_test)``, each converted to nested Python
        lists with ``tolist()``.
    """
    print('[Info] Build testing dataset ...')
    raw = np.reshape(load_gz(TEST_PATH), (514, 700, 57))

    instances, targets = get_instances_from_file(raw)

    print('[Info] Num of test dataset: ', np.size(instances, 0))

    return instances.tolist(), targets.tolist()
Пример #11
0
def load_train(CVsplit):
    """Fetch the training data if needed, then load one split's arrays.

    When ``TRAIN_PATH`` is not an existing file, ``download_train.sh`` and
    ``create_data.py train`` are run first. Their exit status is not checked,
    so a failed download surfaces as an error from the loads below.

    Args:
        CVsplit: Value substituted for ``%s`` in the ``Btrn``/``Strn``/
            ``Bval``/``Sval`` archive names under ``data/numpy/train/``.

    Returns:
        Tuple ``(xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid,
        ts_train, ts_valid)``. Each ``x*`` array is the loaded data cast to
        float32. Each ``t*`` array is a 1-D float32 vector with one entry per
        row, zeros for 'B' data and ones for 'S' data.
    """
    if not os.path.isfile(TRAIN_PATH):
        print("Downloading and extracting train ...")
        # Argument lists avoid going through a shell for fixed commands.
        subprocess.call(["bash", "download_train.sh"])
        subprocess.call(["python", "create_data.py", "train"])
    else:
        # Same print form as the branch above; also valid on Python 3.
        print("Train already downloaded ...")
    xb_train = utils.load_gz('data/numpy/train/Btrn%s.npy.gz' %
                             CVsplit).astype('float32')
    tb_train = np.zeros(xb_train.shape[0], dtype='float32')
    xs_train = utils.load_gz('data/numpy/train/Strn%s.npy.gz' %
                             CVsplit).astype('float32')
    ts_train = np.ones(xs_train.shape[0], dtype='float32')
    xb_valid = utils.load_gz('data/numpy/train/Bval%s.npy.gz' %
                             CVsplit).astype('float32')
    tb_valid = np.zeros(xb_valid.shape[0], dtype='float32')
    xs_valid = utils.load_gz('data/numpy/train/Sval%s.npy.gz' %
                             CVsplit).astype('float32')
    ts_valid = np.ones(xs_valid.shape[0], dtype='float32')

    return xb_train, xb_valid, tb_train, tb_valid, xs_train, xs_valid, ts_train, ts_valid
Пример #12
0
def get_train():
    """Load the training archive and split it into train/validation sets.

    The archive at ``TRAIN_PATH`` is reshaped to ``(5534, 700, 57)``. Rows
    0-5277 form the training set and rows 5278-5533 the validation set, in
    file order. A missing file is only reported; loading is still attempted.

    Returns:
        Tuple ``(x_train, x_valid, labels_train, labels_valid, mask_train,
        mask_valid, num_seq_train)``:

        * ``x_*``: feature columns 0-20 and 35-55, cast to
          ``theano.config.floatX``.
        * ``labels_*``: int32, one value per position (see below).
        * ``mask_*``: ``1 - column 30``, cast to ``theano.config.floatX``.
        * ``num_seq_train``: number of rows in ``x_train``.
    """
    if not os.path.isfile(TRAIN_PATH):
        print("No Training Data Available ...")
    else:
        print("Training Data is Available ...")
    print("Loading train data ...")
    x = np.reshape(utils.load_gz(TRAIN_PATH), (5534, 700, 57))
    labels = x[:, :, 22:30]
    mask = x[:, :, 30] * -1 + 1

    # Keep columns 0-20 and 35-55 only.
    amino_acid_residues = np.arange(0, 21)
    sequence_profile = np.arange(35, 56)
    x = x[:, :, np.hstack((amino_acid_residues, sequence_profile))]
    print('x: ', x)

    x = x.astype(theano.config.floatX)
    mask = mask.astype(theano.config.floatX)

    # Collapse the 8 label columns into one integer per position with a
    # single dot product over the last axis. This replaces the Python-2-only
    # xrange loop. The result is the class index when the columns are
    # one-hot (assumed -- TODO confirm against the dataset).
    labels = np.dot(labels, np.arange(0, 8)).astype('int32')
    print("labels: ", labels)

    print("Loading splits ...")
    # Fixed, unshuffled split; plain slices avoid copying the large arrays.
    x_train, x_valid = x[0:5278], x[5278:5534]
    labels_train, labels_valid = labels[0:5278], labels[5278:5534]
    mask_train, mask_valid = mask[0:5278], mask[5278:5534]
    num_seq_train = np.size(x_train, 0)
    return x_train, x_valid, labels_train, labels_valid, mask_train, mask_valid, num_seq_train
Пример #13
0
def get_test(seq_len=None):
    """Load the test archive, pad it with empty rows and return its parts.

    If ``TEST_PATH`` is not an existing file, ``./download_test.sh`` is run
    first (its exit status is not checked). The archive is reshaped to
    ``(514, 700, 57)`` and 126 all-zero rows are appended as batch padding.

    Args:
        seq_len: If not ``None``, features, labels and mask are truncated to
            the first ``seq_len`` positions along axis 1.

    Returns:
        Tuple ``(X_test, mask_test, labels_test, num_seq_test, len_test)``:

        * ``X_test``: float32 feature columns 0-20 and 35-55, padding
          included.
        * ``mask_test``: float32, ``1 - column 30``; zero on padding rows.
        * ``labels_test``: int32, one value per position; zero on padding
          rows.
        * ``num_seq_test``: number of rows before padding.
        * ``len_test``: per-row sum of ``mask_test``, with the padding rows
          set to 1.
    """
    # Number of all-zero rows appended after the real sequences.
    pad_rows = 126

    if not os.path.isfile(TEST_PATH):
        subprocess.call("./download_test.sh", shell=True)
    print("Loading test data ...")
    X_test = np.reshape(utils.load_gz(TEST_PATH),
                        (514, 700, 57)).astype("float32")
    labels_test = X_test[:, :, 22:30].astype('int32')
    mask_test = X_test[:, :, 30].astype("float32") * -1 + 1

    # Keep feature columns 0-20 and 35-55 only.
    feature_idx = np.hstack((np.arange(0, 21), np.arange(35, 56)))
    X_test = X_test[:, :, feature_idx]
    num_seq_test, seqlen, d = X_test.shape

    # Collapse the 8 label columns into one integer per position with a
    # single dot product over the last axis. This replaces the Python-2-only
    # xrange loop. The result is the class index when the columns are
    # one-hot (assumed -- TODO confirm against the dataset).
    labels_test = np.dot(labels_test, np.arange(0, 8)).astype('int32')

    # Batch padding: append all-zero rows to features, labels and mask.
    X_test = np.concatenate(
        (X_test, np.zeros((pad_rows, seqlen, d))), axis=0).astype("float32")
    labels_test = np.concatenate(
        (labels_test, np.zeros((pad_rows, seqlen))), axis=0).astype('int32')
    mask_test = np.concatenate(
        (mask_test, np.zeros((pad_rows, seqlen))), axis=0).astype("float32")
    if seq_len is not None:
        X_test = X_test[:, :seq_len]
        labels_test = labels_test[:, :seq_len]
        mask_test = mask_test[:, :seq_len]
    len_test = np.sum(mask_test, axis=1)
    # Padding rows have an all-zero mask; report length 1 for them instead.
    len_test[-pad_rows:] = 1
    return X_test, mask_test, labels_test, num_seq_test, len_test
def make_dataset(path):
    """Load an archive and return features, labels and sequence lengths.

    The loaded data is reshaped to ``(-1, 700, 57)``.

    Args:
        path: Location of the archive passed to ``load_gz``.

    Returns:
        Tuple ``(X, y, seq_len)``:

        * ``X``: float32, columns 0-20 and 35-55, transposed to
          ``(n_seqs, 42, 700)``.
        * ``y``: float32 of shape ``(n_seqs, 700)``, one value per position
          (see below).
        * ``seq_len``: float32 of shape ``(n_seqs,)``, the per-sequence sum
          of ``1 - column 30``.
    """
    data = load_gz(path).reshape(-1, 700, 57)

    feature_idx = np.append(np.arange(21), np.arange(35, 56))
    X = data[:, :, feature_idx].transpose(0, 2, 1).astype('float32')

    # One dot product over the last axis replaces the per-sequence list
    # comprehension and keeps the (n_seqs, 700) shape for an empty archive.
    # The result is the class index when the 8 label columns are one-hot
    # (assumed -- TODO confirm against the dataset).
    y = np.dot(data[:, :, 22:30], np.arange(8)).astype('float32')

    mask = data[:, :, 30] * -1 + 1
    seq_len = mask.sum(axis=1).astype('float32')

    return X, y, seq_len
Пример #15
0
def get_test():
    """Load the test archive, pad it with empty rows and return its parts.

    The archive at ``TEST_PATH`` is reshaped to ``(514, 700, 57)`` and 126
    all-zero rows are appended as batch padding. A missing file is only
    reported; loading is still attempted.

    Returns:
        Tuple ``(x_test, mask_test, labels_test, num_seq_test)``:

        * ``x_test``: feature columns 0-20 and 35-55 as
          ``theano.config.floatX``, padding included.
        * ``mask_test``: ``1 - column 30`` as ``theano.config.floatX``; zero
          on padding rows.
        * ``labels_test``: int32, one value per position; zero on padding
          rows.
        * ``num_seq_test``: number of rows before padding.
    """
    # Number of all-zero rows appended after the real sequences.
    pad_rows = 126

    if not os.path.isfile(TEST_PATH):
        print("Test Data Unavailable")
    print("Loading test data ...")
    x_test = np.reshape(utils.load_gz(TEST_PATH),
                        (514, 700, 57)).astype(theano.config.floatX)
    labels_test = x_test[:, :, 22:30].astype('int32')
    # x_test is already floatX, so no further cast is needed here.
    mask_test = x_test[:, :, 30] * -1 + 1

    # Keep feature columns 0-20 and 35-55 only.
    feature_idx = np.hstack((np.arange(0, 21), np.arange(35, 56)))
    x_test = x_test[:, :, feature_idx]
    num_seq_test, seqlen, d = x_test.shape

    # Collapse the 8 label columns into one integer per position with a
    # single dot product over the last axis. This replaces the Python-2-only
    # xrange loop. The result is the class index when the columns are
    # one-hot (assumed -- TODO confirm against the dataset).
    labels_test = np.dot(labels_test, np.arange(0, 8)).astype('int32')

    # Batch padding: append all-zero rows to features, labels and mask.
    x_test = np.concatenate(
        (x_test, np.zeros((pad_rows, seqlen, d))),
        axis=0).astype(theano.config.floatX)
    labels_test = np.concatenate(
        (labels_test, np.zeros((pad_rows, seqlen))), axis=0).astype('int32')
    mask_test = np.concatenate(
        (mask_test, np.zeros((pad_rows, seqlen))),
        axis=0).astype(theano.config.floatX)
    return x_test, mask_test, labels_test, num_seq_test
Пример #16
0
def load_data(CVsplit):
    """Load the 'B' and 'S' train, test and validation arrays for one split.

    ``convert_data()`` is called first whenever ``./data/csv`` and
    ``./data/numpy`` do not contain the same number of entries.

    Args:
        CVsplit: Value substituted for ``%s`` in the archive names under
            ``data/numpy/``.

    Returns:
        Tuple ``(xb_train, xb_valid, xb_test, tb_train, tb_valid, tb_test,
        xs_train, xs_valid, xs_test, ts_train, ts_valid, ts_test)``. The
        ``x*`` arrays are the loaded data cast to float32. The ``t*`` arrays
        are float32 columns of shape ``(n_rows, 1)``, zeros for 'B' data and
        ones for 'S' data.
    """
    # Parenthesised single-argument prints work on Python 2 and Python 3.
    print("loadData started!")
    if len(glob.glob('./data/csv/*')) != len(glob.glob('./data/numpy/*')):
        print("converting data ...")
        convert_data()
    xb_train = utils.load_gz('data/numpy/Btrn%s.npy.gz' % CVsplit).astype('float32')
    tb_train = np.zeros((xb_train.shape[0], 1), dtype='float32')
    xs_train = utils.load_gz('data/numpy/Strn%s.npy.gz' % CVsplit).astype('float32')
    ts_train = np.ones((xs_train.shape[0], 1), dtype='float32')
    xb_test = utils.load_gz('data/numpy/Btst%s.npy.gz' % CVsplit).astype('float32')
    tb_test = np.zeros((xb_test.shape[0], 1), dtype='float32')
    xs_test = utils.load_gz('data/numpy/Stst%s.npy.gz' % CVsplit).astype('float32')
    ts_test = np.ones((xs_test.shape[0], 1), dtype='float32')
    xb_valid = utils.load_gz('data/numpy/Bval%s.npy.gz' % CVsplit).astype('float32')
    tb_valid = np.zeros((xb_valid.shape[0], 1), dtype='float32')
    xs_valid = utils.load_gz('data/numpy/Sval%s.npy.gz' % CVsplit).astype('float32')
    ts_valid = np.ones((xs_valid.shape[0], 1), dtype='float32')

    return xb_train, xb_valid, xb_test, tb_train, tb_valid, tb_test, xs_train, xs_valid, xs_test, ts_train, ts_valid, ts_test
Пример #17
0
 def _prepare_data(self):
     """Populate the sample data, reusing the cached archive when present.

     If ``data/train.npy.gz`` exists it is loaded into ``self.samples``;
     otherwise ``self._load_data()`` and ``self._preprocess_data()`` are run,
     in that order.
     """
     cache_path = "data/train.npy.gz"
     if os.path.exists(cache_path):
         self.samples = utils.load_gz(cache_path)
         return
     self._load_data()
     self._preprocess_data()
Пример #18
0
"""
Created on Wed Jun 17 12:00:27 2015

@author: alexander
"""
import numpy as np
import cPickle as pickle
import sklearn
import sklearn.cross_validation
import theano

import utils

## From data
# Load the training archive and view it as (5534, 700, 57).
print("Loading train data ...")
X_in = utils.load_gz('data/cullpdb+profile_6133_filtered.npy.gz')
X = np.reshape(X_in, (5534, 700, 57))
del X_in
# Keep only the first 100 positions along axis 1.
X = X[:, 0:100, :]
# Columns 22-29 are taken as labels and column 30 as the mask.
labels = X[:, :, 22:30]
# NOTE(review): column 30 is used as-is here, without inversion -- confirm
# that this is the polarity the downstream code expects.
mask = X[:, :, 30]

# Keep feature columns 0-20 and 35-55 only.
a = np.arange(0, 21)
b = np.arange(35, 56)
c = np.hstack((a, b))
X = X[:, :, c]

# getting meta: sizes of X along axes 0, 1 and 2
num_seqs = np.size(X, 0)
seqlen = np.size(X, 1)
d = np.size(X, 2)
Пример #19
0
# Files two levels below data/train and directly below data/test. glob
# returns them in arbitrary order, so each list is sorted in place
# (presumably so indices line up with labels_train -- confirm).
paths_train = glob.glob("data/train/*/*")
paths_train.sort()

paths_test = glob.glob("data/test/*")
paths_test.sort()

# Path lists keyed by subset name.
paths = {
    'train': paths_train,
    'test': paths_test,
}

# Training labels are read from a precomputed archive. The commented-out loop
# is an alternative that derives each label from the file's parent directory.
# labels_train = np.zeros(len(paths['train']), dtype='int32')
# for k, path in enumerate(paths['train']):
#     class_name = os.path.basename(os.path.dirname(path))
#     labels_train[k] = class_names.index(class_name)
labels_train = utils.load_gz("data/labels_train.npy.gz")

# Default augmentation settings. They are consumed by code outside this
# section, so the exact meaning of each key is not visible here.
default_augmentation_params = {
    'zoom_range': (1 / 1.1, 1.1),
    'rotation_range': (0, 360),
    'shear_range': (0, 0),
    'translation_range': (-4, 4),
    'do_flip': True,
    'allow_stretch': False,
}

no_augmentation_params = {
    'zoom_range': (1.0, 1.0),
    'rotation_range': (0, 0),
    'shear_range': (0, 0),
    'translation_range': (0, 0),
"""
Created on Wed Jun 17 12:00:27 2015

@author: alexander
"""
import numpy as np 
import cPickle as pickle
import sklearn
import sklearn.cross_validation
import theano

import utils

## From data
# Load the training archive and view it as (5534, 700, 57).
print("Loading train data ...")
X_in = utils.load_gz('data/cullpdb+profile_6133_filtered.npy.gz')
X = np.reshape(X_in,(5534,700,57))
del X_in
# Keep only the first 100 positions along axis 1.
X = X[:,0:100,:]
# Columns 22-29 are taken as labels and column 30 as the mask.
labels = X[:,:,22:30]
# NOTE(review): column 30 is used as-is here, without inversion -- confirm
# that this is the polarity the downstream code expects.
mask = X[:,:,30]

# Keep feature columns 0-20 and 35-55 only.
a = np.arange(0,21)
b = np.arange(35,56)
c = np.hstack((a,b))
X = X[:,:,c]


# getting meta: sizes of X along axes 0 and 1
num_seqs = np.size(X,0)
seqlen = np.size(X,1)
Пример #21
0
import numpy as np
import theano

import utils
import load_protvec

##### TRAIN DATA #####
print("Loading train data ...")
# CSV passed to load_protvec when addProtVec is enabled (disabled here).
protein_vector_file = "data/protVec_100d_3grams_clean.csv"
addProtVec = False
# Load the training archive and view it as (5534, 700, 57).
X_in = utils.load_gz("data/cullpdb+profile_6133_filtered.npy.gz")
X = np.reshape(X_in, (5534, 700, 57))
del X_in
X = X[:, :, :]
# Columns 22-29 are taken as labels; the mask is 1 - column 30.
labels = X[:, :, 22:30]
mask = X[:, :, 30] * -1 + 1

# Keep feature columns 0-20 and 35-55 only.
a = np.arange(0, 21)
b = np.arange(35, 56)
c = np.hstack((a, b))
X = X[:, :, c]

# If using ProtVec
# http://arxiv.org/abs/1503.05140

# When enabled, the ProtVec encoding is appended to X along the last axis.
if addProtVec:
    ProtVec_train = load_protvec.load_protvec_encoding(X, mask, protein_vector_file, protvec_dim=100)
    newX = np.zeros((X.shape[0], X.shape[1], X.shape[2] + ProtVec_train.shape[2]))
    newX[:, :, : X.shape[2]] = X
    newX[:, :, X.shape[2] :] = ProtVec_train
    X = newX
Пример #22
0
# paths_train is defined before this section; it is sorted in place here.
paths_train.sort()

# Files directly below data/test. glob returns them in arbitrary order, so
# the list is sorted in place.
paths_test = glob.glob("data/test/*")
paths_test.sort()

# Path lists keyed by subset name.
paths = {
    'train': paths_train,
    'test': paths_test,
}


# Training labels are read from a precomputed archive. The commented-out loop
# is an alternative that derives each label from the file's parent directory.
# labels_train = np.zeros(len(paths['train']), dtype='int32')
# for k, path in enumerate(paths['train']):
#     class_name = os.path.basename(os.path.dirname(path))
#     labels_train[k] = class_names.index(class_name)
labels_train = utils.load_gz("data/labels_train.npy.gz")


# Default augmentation settings. They are consumed by code outside this
# section, so the exact meaning of each key is not visible here.
default_augmentation_params = {
    'zoom_range': (1 / 1.1, 1.1),
    'rotation_range': (0, 360),
    'shear_range': (0, 0),
    'translation_range': (-4, 4),
    'do_flip': True,
    'allow_stretch': False,
}

no_augmentation_params = {
    'zoom_range': (1.0, 1.0),
    'rotation_range': (0, 0),
    'shear_range': (0, 0),
Пример #23
0
def load(subset='train'):
    """Return the image archive for ``subset``, loaded fully into memory.

    The archive read is ``data/images_<subset>.npy.gz``.
    """
    archive_path = "data/images_%s.npy.gz" % subset
    return utils.load_gz(archive_path)
def load(subset='train'):
    """Read every image of ``subset`` into memory in one call.

    The data comes from the archive ``data/images_<subset>.npy.gz``.
    """
    images_file = "data/images_%s.npy.gz" % subset
    images = utils.load_gz(images_file)
    return images
Пример #25
0
import numpy as np
import theano

import utils

##### TRAIN DATA #####
# Load the training archive and view it as (5534, 700, 57).
print("Loading train data ...")
X_in = utils.load_gz("data/cullpdb+profile_6133_filtered.npy.gz")
X = np.reshape(X_in, (5534, 700, 57))
del X_in
X = X[:, :, :]
# Columns 22-29 are taken as labels; the mask is 1 - column 30.
labels = X[:, :, 22:30]
mask = X[:, :, 30] * -1 + 1

# Keep feature columns 0-20 and 35-55 only.
a = np.arange(0, 21)
b = np.arange(35, 56)
c = np.hstack((a, b))
X = X[:, :, c]

# If using ProtVec
# http://arxiv.org/abs/1503.05140
addProtVec = True
# When enabled, the precomputed ProtVec array is appended to X along the
# last axis.
if addProtVec:
    # NOTE(review): this path has no ".gz" suffix although it is read with
    # load_gz -- confirm the file name and format.
    ProtVec = utils.load_gz("data/X_train_protvec.npy")
    newX = np.zeros((X.shape[0], X.shape[1], X.shape[2] + ProtVec.shape[2]))
    newX[:, :, : X.shape[2]] = X
    newX[:, :, X.shape[2] :] = ProtVec
    X = newX
    del newX

# getting meta
Пример #26
0
# Class names are the sorted base names of the entries in `directories`,
# which is defined before this section.
class_names = [os.path.basename(d) for d in directories]
class_names.sort()
num_classes = len(class_names)

DEFAULT_VALIDATION_SPLIT = './data/validation_split_v1.pkl'

# Files two levels below data/train and directly below data/test. glob
# returns them in arbitrary order, so each list is sorted in place.
paths_train = glob.glob("data/train/*/*")
paths_train.sort()

paths_test = glob.glob("data/test/*")
paths_test.sort()

# Path lists keyed by subset name.
paths = {'train': paths_train, 'test': paths_test}

# Training labels are read from a precomputed archive.
labels_train_path = 'data/labels_train.npy.gz'
labels_train = utils.load_gz(labels_train_path)


class LoadMethod(object):
    """Callable that reads one image, and optionally its label, by index."""

    def __init__(self, paths, labels=None):
        """Store the image paths and, optionally, the matching labels.

        Args:
            paths: Indexable collection of image file paths.
            labels: Optional indexable collection of labels aligned with
                ``paths``; each element must support ``astype``.
        """
        self.labels = labels
        self.paths = paths

    def __call__(self, idx):
        """Read the image at position ``idx`` as float32.

        Returns:
            The image alone when no labels were given, otherwise the tuple
            ``(image, label)`` with the label cast to int32.
        """
        # NOTE(review): `as_grey` is the keyword spelling this code relies
        # on -- confirm it matches the installed scikit-image version.
        image = skimage.io.imread(self.paths[idx], as_grey=True).astype(
            'float32')
        if self.labels is None:
            return image
        return image, self.labels[idx].astype('int32')