示例#1
0
def load_dataset(organism):
    """Load the negative/positive FASTA pair of *organism* as an int32 matrix.

    The files ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``
    are read through two ``ml_data`` loaders (``SequenceNucsData`` with k=1
    and ``SimpleHistData`` with k=3, ``upto=True``); their feature matrices
    are stacked column-wise.

    Side effects:
        * sets the module globals ``max_features`` (``4 ** 1``) and
          ``maxlen`` (number of columns of the returned ``X``);
        * reseeds NumPy's global RNG from the wall clock;
        * prints progress information to stdout.

    Returns:
        tuple: ``(X, y)``, both cast to ``int32``.

    NOTE(review): ``np.random.randint(2, ...)`` yields 0/1, and adding 1
    makes every entry non-zero, so the boolean column mask is always all
    True and no column is ever dropped. Confirm whether the ``+ 1`` is a
    deliberate way to disable random feature selection.
    """
    from ml_data import SequenceNucsData, SimpleHistData
    global max_features, maxlen

    print('Load organism: {}'.format(organism))
    npath = './fasta/{}_neg.fa'.format(organism)
    ppath = './fasta/{}_pos.fa'.format(organism)
    print(npath, ppath)

    # Column that is always kept by the mask, whatever the random draw says.
    forced_column = 59

    kmer_size = 1
    max_features = 4 ** kmer_size

    nuc_data = SequenceNucsData(npath, ppath, k=kmer_size)
    hist_data = SimpleHistData(npath, ppath, k=3, upto=True)

    # Labels come from the first loader only; features from both.
    features = nuc_data.getX()
    y = nuc_data.getY()
    features = np.hstack((features, hist_data.getX()))

    # Time-based seed: the draw differs between runs (see NOTE above on why
    # this currently has no effect on the selected columns).
    np.random.seed(int(time.time()))
    draws = np.random.randint(2, size=features.shape[1])
    mask = np.array(draws + 1, dtype=bool)
    mask[forced_column] = 1
    print(mask)

    X = features[:, mask].astype('int32')
    y = y.astype('int32')
    print('Input Shapes\nX: {} | y: {}'.format(X.shape, y.shape))

    maxlen = X.shape[1]
    return X, y
示例#2
0
    def setup_data(self, npath, ppath):
        """Create the feature loaders for one file pair and keep them on self.

        Builds a ``SimpleHistData`` and a ``DinucAutoCovarData`` (both with
        ``k=4``) from ``npath`` / ``ppath`` — presumably the negative and
        positive sample files; confirm against the caller.

        Returns:
            list: ``self.data_list``, holding the two loaders in that order.
        """
        from ml_data import DinucAutoCovarData, SimpleHistData

        # Construction order matters to callers indexing the list:
        # histogram loader first, auto-covariance loader second.
        loader_classes = (SimpleHistData, DinucAutoCovarData)

        self.data_list = []
        for loader_cls in loader_classes:
            self.data_list.append(loader_cls(npath, ppath, k=4))

        return self.data_list
示例#3
0
 def setup_data(self, npath, ppath):
     """Load both feature sources and cache their layout on the instance.

     A ``SimpleHistData`` (k=4) and a ``DinucAutoCovarData`` are built from
     ``npath`` / ``ppath`` and handed to ``self.join_data``. The joined
     result is then passed to ``self.get_input_lengths`` and
     ``self.calc_limits``.

     Attributes set:
         data: the joined data returned by ``self.join_data``.
         lengths: result of ``self.get_input_lengths(data)``.
         limits: result of ``self.calc_limits(data)``.
         single_input_length: ``limits[-1][-1]`` — presumably the end
             offset of the last data type in the single vector; confirm
             against ``calc_limits``.
     """
     from ml_data import DinucAutoCovarData, SimpleHistData

     # Merge every feature source into one input vector.
     joined = self.join_data(
         SimpleHistData(npath, ppath, k=4),
         DinucAutoCovarData(npath, ppath),
     )

     # Per-source vector lengths, then the limits of each source
     # inside the joined vector.
     input_lengths = self.get_input_lengths(joined)
     bounds = self.calc_limits(joined)

     self.single_input_length = bounds[-1][-1]
     self.data = joined
     self.limits = bounds
     self.lengths = input_lengths
示例#4
0
def setup_data(npath, ppath):
    """Populate the module-level data globals from one file pair.

    Builds a ``SimpleHistData`` (k=3) from ``npath`` / ``ppath``, passes it
    through ``join_data`` and derives the layout information from the
    result.

    Globals set:
        data: value returned by ``join_data``.
        lengths: ``get_input_lengths(data)``.
        limits: ``calc_limits(data)``.
        shared_input_length: the fixed value 64 (presumably the 4**3
            histogram bins for k=3 — TODO confirm).

    Returns:
        None
    """
    # SequenceDinucProperties is not referenced below; the import is kept so
    # the function still fails the same way if ml_data lacks it.
    from ml_data import SequenceDinucProperties, SimpleHistData

    global data, limits, lengths, shared_input_length

    # Single feature source, still routed through join_data so the
    # downstream helpers see the same structure as in multi-source setups.
    data = join_data(SimpleHistData(npath, ppath, k=3))

    # Vector lengths first, then the limits of each data type.
    lengths = get_input_lengths(data)
    limits = calc_limits(data)

    # Hard-coded rather than derived from ``limits``.
    shared_input_length = 64
示例#5
0
def setup_data(npath, ppath):
    """Populate the module-level data globals from one file pair.

    Two loaders are built from ``npath`` / ``ppath`` — ``SimpleHistData``
    (k=4, ``upto=True``) and ``DinucCrossCovarData`` (k=3, ``upto=True``) —
    and merged by ``join_data``.

    Globals set:
        data: value returned by ``join_data``.
        lengths: ``get_input_lengths(data)``.
        limits: ``calc_limits(data)``.
        shared_input_length: ``limits[-1][-1]`` — presumably the end offset
            of the last data type in the joined vector; confirm against
            ``calc_limits``.

    Returns:
        None
    """
    from ml_data import DinucCrossCovarData, SimpleHistData

    global data, limits, lengths, shared_input_length

    # Merge both feature sources into one input vector
    # (histogram first, cross-covariance second).
    data = join_data(
        SimpleHistData(npath, ppath, k=4, upto=True),
        DinucCrossCovarData(npath, ppath, k=3, upto=True),
    )

    # Vector lengths first, then the limits of each data type.
    lengths = get_input_lengths(data)
    limits = calc_limits(data)

    # Length of the shared input vector, taken from the final limit entry.
    shared_input_length = limits[-1][-1]
示例#6
0
                             epsilon=0.001,
                             cooldown=0,
                             min_lr=0)

#npath = "fasta/Bacillus_non_prom.fa"
#ppath = "fasta/Bacillus_prom.fa"

#npath = "fasta/Arabidopsis_non_prom_big.fa"
#ppath = "fasta/Arabidopsis_non_tata.fa"

# Active dataset: judging by the file names, npath holds the negative
# (non-promoter) sequences and ppath the positive (promoter) ones.
npath = "fasta/Ecoli_non_prom.fa"
ppath = "fasta/Ecoli_prom.fa"

#mldata = SequenceNucsData(npath, ppath, k=3)
# Two loaders are built over the same file pair; each exposes its own
# feature matrix through getX().
mldata = SequenceDinucProperties(npath, ppath)
mldata2 = SimpleHistData(npath, ppath, k=4, upto=True)

X = mldata.getX()
X2 = mldata2.getX()
# Labels are read from the first loader only.
# NOTE(review): assumes both loaders return rows in the same order — confirm.
Y = mldata.getY()

# NOTE: Python 2 print statements; this script does not parse under Python 3.
print X.shape
print Y.shape

# Row indices of the samples labelled 1 (positive) and 0 (negative).
posIndex = numpy.where(Y[:] == 1)[0]
negIndex = numpy.where(Y[:] == 0)[0]

# NOTE(review): after the second assignment diff equals
# len(negIndex) - (len(negIndex) - len(posIndex)) == len(posIndex),
# i.e. the number of positive samples, not a difference.
diff = len(negIndex) - len(posIndex)
diff = len(negIndex) - diff
print 'DIFF', diff
示例#7
0
                              patience=10,
                              verbose=0,
                              mode='auto')
# Learning-rate callback configured on 'val_loss' with factor 0.1 and
# patience 5. Presumably Keras' ReduceLROnPlateau — the import is outside
# this view.
# NOTE(review): newer Keras releases renamed `epsilon` to `min_delta`;
# confirm against the installed version.
reduceLR = ReduceLROnPlateau(monitor='val_loss',
                             factor=0.1,
                             patience=5,
                             verbose=0,
                             mode='auto',
                             epsilon=0.001,
                             cooldown=0,
                             min_lr=0)

# Active dataset: judging by the file names, npath holds the negative
# (non-promoter) sequences and ppath the positive (promoter) ones.
npath = "fasta/Bacillus_non_prom.fa"
ppath = "fasta/Bacillus_prom.fa"
#mldata = SequenceNucsData(npath, ppath, k=3)
# Three loaders are built over the same file pair.
mldata = SimpleHistData(npath, ppath, k=4)
mldata2 = SequenceNucsData(npath, ppath, k=3)
mldata3 = DinucAutoCovarData(npath, ppath)

# X / Y come from the first loader; Y is used for all three feature sets.
# NOTE(review): assumes all loaders return rows in the same order — confirm.
X = mldata.getX()
Y = mldata.getY()

X2 = mldata2.getX()
X3 = mldata3.getX()

# 10-fold splitter with shuffling and a fixed random_state (1234).
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# NOTE(review): the return value is discarded; if this is scikit-learn's
# StratifiedKFold the call has no effect — confirm and consider removing.
kf.get_n_splits(X, Y)

cvscores = []
for train_index, test_index in kf.split(X, Y):