def load_dataset(organism):
    """Load the feature matrix and labels for one organism.

    Reads ``./fasta/<organism>_neg.fa`` and ``./fasta/<organism>_pos.fa``,
    horizontally stacks the ``SequenceNucsData`` (k=1) features with the
    ``SimpleHistData`` (k=3, upto=True) features, selects columns through a
    boolean mask and casts both results with ``astype('int32')``.

    Side effects:
        * sets the module globals ``max_features`` (``4 ** k`` with k=1) and
          ``maxlen`` (number of columns kept after masking);
        * reseeds NumPy's global RNG from the current wall-clock time;
        * prints the paths, the mask and the resulting shapes.

    Args:
        organism: name interpolated into the two FASTA paths.

    Returns:
        Tuple ``(X, y)`` after the ``int32`` cast.

    Note:
        Column 59 is forced on in the mask, so the stacked matrix must have
        at least 60 columns or the assignment raises ``IndexError``.
    """
    from ml_data import SequenceNucsData, SimpleHistData

    global max_features
    global maxlen

    print('Load organism: {}'.format(organism))
    npath = './fasta/{}_neg.fa'.format(organism)
    ppath = './fasta/{}_pos.fa'.format(organism)
    print(npath, ppath)

    forced_column = 59
    kmer_size = 1
    max_features = 4**kmer_size

    nuc_samples = SequenceNucsData(npath, ppath, k=kmer_size)
    hist_samples = SimpleHistData(npath, ppath, k=3, upto=True)

    features = nuc_samples.getX()
    labels = nuc_samples.getY()
    features = np.hstack((features, hist_samples.getX()))

    np.random.seed(int(time.time()))
    # randint(2, ...) draws 0 or 1; after "+ 1" every entry is 1 or 2, so the
    # bool cast yields True everywhere and the mask keeps every column.
    draws = np.random.randint(2, size=features.shape[1])
    mask = np.array(draws + 1, dtype=bool)
    mask[forced_column] = 1
    print(mask)

    features = features[:, mask]
    features = features.astype('int32')
    labels = labels.astype('int32')

    print('Input Shapes\nX: {} | y: {}'.format(features.shape, labels.shape))
    maxlen = features.shape[1]
    return features, labels
def setup_data(self, npath, ppath):
    """Create the data objects used by this instance and return them.

    Builds a ``SimpleHistData`` and a ``DinucAutoCovarData`` (both with
    ``k=4``) from the two FASTA paths, stores them in ``self.data_list`` in
    that order, and returns the same list.

    Args:
        npath: path passed as the first argument to each data class.
        ppath: path passed as the second argument to each data class.

    Returns:
        ``self.data_list`` containing the two freshly built objects.
    """
    from ml_data import DinucAutoCovarData, SimpleHistData

    # Reset first so the attribute exists even if a constructor fails.
    self.data_list = []
    for data_cls in (SimpleHistData, DinucAutoCovarData):
        self.data_list.append(data_cls(npath, ppath, k=4))
    return self.data_list
def setup_data(self, npath, ppath):
    """Build the joined input data and cache its layout on the instance.

    A ``SimpleHistData`` (``k=4``) and a ``DinucAutoCovarData`` (default
    arguments) are combined through ``self.join_data``.  The joined result is
    then passed to ``self.get_input_lengths`` and ``self.calc_limits``.

    Attributes set:
        data: value returned by ``self.join_data``.
        lengths: value returned by ``self.get_input_lengths(data)``.
        limits: value returned by ``self.calc_limits(data)``.
        single_input_length: ``limits[-1][-1]``, i.e. the last element of the
            last limits entry.

    Args:
        npath: path passed as the first argument to each data class.
        ppath: path passed as the second argument to each data class.
    """
    from ml_data import DinucAutoCovarData, SimpleHistData

    # Combine both data sources into a single input vector.
    joined = self.join_data(
        SimpleHistData(npath, ppath, k=4),
        DinucAutoCovarData(npath, ppath),
    )

    # Per-source input lengths, then the limits of each source within the
    # single joined vector.
    input_lengths = self.get_input_lengths(joined)
    source_limits = self.calc_limits(joined)

    self.lengths = input_lengths
    self.limits = source_limits
    self.data = joined
    self.single_input_length = source_limits[-1][-1]
def setup_data(npath, ppath):
    """Populate the module-level data globals from the two FASTA paths.

    Builds a ``SimpleHistData`` with ``k=3``, passes it through
    ``join_data`` and derives the layout information from the result.

    Globals set:
        data: value returned by ``join_data``.
        lengths: value returned by ``get_input_lengths(data)``.
        limits: value returned by ``calc_limits(data)``.
        shared_input_length: fixed at 64 (not derived from ``limits``).

    Args:
        npath: path passed as the first argument to ``SimpleHistData``.
        ppath: path passed as the second argument to ``SimpleHistData``.
    """
    # Only SimpleHistData is used here; the previously imported
    # SequenceDinucProperties was never referenced in this function.
    from ml_data import SimpleHistData

    global data
    global limits
    global lengths
    global shared_input_length

    # Join data into a single input vector
    data = join_data(SimpleHistData(npath, ppath, k=3))

    # Get lengths of input vectors
    lengths = get_input_lengths(data)

    # Define limits of each data type on the single vector
    limits = calc_limits(data)

    # Hard-coded shared input vector length.
    # NOTE(review): 64 == 4**3, which matches k=3 above — presumably the
    # histogram width; confirm before changing k.
    shared_input_length = 64
def setup_data(npath, ppath):
    """Populate the module-level data globals from the two FASTA paths.

    Joins a ``SimpleHistData`` (``k=4, upto=True``) with a
    ``DinucCrossCovarData`` (``k=3, upto=True``) via ``join_data`` and derives
    the layout information from the joined result.

    Globals set:
        data: value returned by ``join_data``.
        lengths: value returned by ``get_input_lengths(data)``.
        limits: value returned by ``calc_limits(data)``.
        shared_input_length: last element of the last ``limits`` entry.

    Args:
        npath: path passed as the first argument to each data class.
        ppath: path passed as the second argument to each data class.
    """
    from ml_data import DinucCrossCovarData, SimpleHistData

    global data, limits, lengths, shared_input_length

    # Single input vector made of both data sources.
    data = join_data(
        SimpleHistData(npath, ppath, k=4, upto=True),
        DinucCrossCovarData(npath, ppath, k=3, upto=True),
    )

    # Input lengths first, then the limits of each source in the vector.
    lengths = get_input_lengths(data)
    limits = calc_limits(data)

    # The shared length is read from the end of the final limits entry.
    final_limits = limits[-1]
    shared_input_length = final_limits[-1]
# NOTE(review): the next line is the tail of a call whose opening lies above
# this view; it is reproduced unchanged.
epsilon=0.001, cooldown=0, min_lr=0)

# Dataset selection: the E. coli files are active, the other path pairs are
# kept commented out as alternatives.
#npath = "fasta/Bacillus_non_prom.fa"
#ppath = "fasta/Bacillus_prom.fa"
#npath = "fasta/Arabidopsis_non_prom_big.fa"
#ppath = "fasta/Arabidopsis_non_tata.fa"
npath = "fasta/Ecoli_non_prom.fa"
ppath = "fasta/Ecoli_prom.fa"

# Two data objects are built from the same pair of files.
#mldata = SequenceNucsData(npath, ppath, k=3)
mldata = SequenceDinucProperties(npath, ppath)
mldata2 = SimpleHistData(npath, ppath, k=4, upto=True)

# X and Y come from mldata; X2 comes from mldata2.
X = mldata.getX()
X2 = mldata2.getX()
Y = mldata.getY()
print X.shape
print Y.shape

# Positions where Y equals 1 and where Y equals 0, respectively.
posIndex = numpy.where(Y[:] == 1)[0]
negIndex = numpy.where(Y[:] == 0)[0]

# After these two steps diff == len(negIndex) - (len(negIndex) - len(posIndex)),
# i.e. it ends up equal to len(posIndex).
diff = len(negIndex) - len(posIndex)
diff = len(negIndex) - diff
print 'DIFF', diff
# NOTE(review): the next line is the tail of a call whose opening lies above
# this view; it is reproduced unchanged.
patience=10, verbose=0, mode='auto')

# Callback configured to watch 'val_loss' with factor=0.1 and patience=5.
# NOTE(review): presumably the Keras ReduceLROnPlateau callback — confirm
# against the file's imports.
reduceLR = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5,
                             verbose=0, mode='auto', epsilon=0.001,
                             cooldown=0, min_lr=0)

# Input files for this run.
npath = "fasta/Bacillus_non_prom.fa"
ppath = "fasta/Bacillus_prom.fa"

# Three data objects are built from the same pair of files.
#mldata = SequenceNucsData(npath, ppath, k=3)
mldata = SimpleHistData(npath, ppath, k=4)
mldata2 = SequenceNucsData(npath, ppath, k=3)
mldata3 = DinucAutoCovarData(npath, ppath)

# X and Y come from mldata; X2 and X3 come from mldata2 and mldata3.
X = mldata.getX()
Y = mldata.getY()
X2 = mldata2.getX()
X3 = mldata3.getX()

# 10 shuffled splits with a fixed random_state of 1234.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# The return value of get_n_splits is discarded here.
kf.get_n_splits(X, Y)

# Starts empty; nothing in this view appends to it.
cvscores = []
# The splits are generated from X and Y only; the loop body lies below this view.
for train_index, test_index in kf.split(X, Y):