def test_janggu_use_dnaconv_max(tmpdir):
    """Check ``DnaConv2D`` with ``merge_mode='max'`` against a hand-built model.

    Two models are created on the same DNA input: one wraps a ``Conv2D`` in
    ``DnaConv2D(..., merge_mode='max')``, the other applies a single ``Conv2D``
    to the input and to its reversed complement and merges both results with
    ``Maximum``.  After copying the weights from the first model into the
    second, both must yield (numerically) the same predictions.  Finally the
    first model is compiled, saved and reloaded by name.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file, order=1)

    @inputlayer
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            layer = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                              merge_mode='max', name='bothstrands')(layer)
        return inputs, layer

    bwm1 = Janggu.create(_cnn_model1, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn1')

    p1 = bwm1.predict(dna[1:2])
    # Weights of the wrapped layer; reused below for the single-strand conv.
    w = bwm1.kerasmodel.get_layer('bothstrands').get_weights()

    @inputlayer
    def _cnn_model2(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            # The same conv layer (shared weights) scans both orientations.
            conv = Conv2D(5, (3, 1), name='singlestrand')
            fl = conv(layer)
            rl = Reverse()(conv(Complement()(Reverse()(inlayer))))
            layer = Maximum()([fl, rl])
        return inputs, layer

    bwm2 = Janggu.create(_cnn_model2, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn2')

    bwm2.kerasmodel.get_layer('singlestrand').set_weights(w)

    p2 = bwm2.predict(dna[1:2])
    np.testing.assert_allclose(p1, p2, rtol=1e-4, atol=1e-3)

    # Round trip: the saved model must exist on disk and be loadable by name.
    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm1._storage_path(bwm1.name, outputdir=tmpdir.strpath)

    bwm1.save()
    bwm1.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn1')
def test_janggu_generate_name(tmpdir):
    """Create a model without an explicit name, then save and reload it.

    ``Janggu.create`` is called without ``name``; the name found on the
    resulting object is then used to locate the storage path and to reload
    the model via ``Janggu.create_by_name``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def _cnn_model(inputs, inp, oup, params):
        # The template builds its own input layer, replacing the argument.
        inputs = Input((10, 1))
        flat = Flatten()(inputs)
        return inputs, Dense(params[0])(flat)

    model = Janggu.create(_cnn_model, modelparams=(2, ))
    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage_path = model._storage_path(model.name, outputdir=model.outputdir)

    model.save()
    model.summary()

    assert os.path.exists(storage_path)

    Janggu.create_by_name(model.name)
def objective(params):
    """Build, train and evaluate one model configuration.

    Parameters
    ----------
    params : dict
        Configuration of the run.  This function reads the keys ``'inputs'``,
        ``'pretrained'``, ``'val_chrom'``, ``'opt'`` and ``'epochs'``; the
        whole dict is additionally handed to ``get_data`` and (unless
        ``params['inputs'] == 'epi_dna'``) to ``get_model``.

    Returns
    -------
    dict
        ``{'status': 'fail'}`` if a ``ValueError`` is raised while loading
        the data or building the model (the exception is logged).  Otherwise
        a dict with ``'status': 'ok'``, the validation loss under ``'loss'``,
        the training history, the train/val/test correlations, the model
        config (JSON) and weights, and the ``params`` that were used.
    """
    print(params)
    try:
        train_data = get_data(params)
        # First hold out the test chromosome, then split off the validation
        # chromosome; `train_data` keeps train+val and is what fit() receives.
        train_data, test = split_train_test(train_data, [test_chrom])
        train, val = split_train_test(train_data, [params['val_chrom']])

        # Reset the Keras session before a model is built or loaded.
        K.clear_session()
        if params['inputs'] == 'epi_dna':
            # Joint model: reuse the two previously stored models, cut off
            # their output layers and put a new output on the concatenation.
            dnam = Janggu.create_by_name('cage_promoters_dna_only')
            epim = Janggu.create_by_name('cage_promoters_epi_only')
            layer = Concatenate()([
                dnam.kerasmodel.layers[-2].output,
                epim.kerasmodel.layers[-2].output
            ])
            layer = Dense(1, name='geneexpr')(layer)
            # `.inputs` is always a list, so the concatenation does not
            # depend on how many inputs each sub-model happens to have.
            joint_inputs = (list(dnam.kerasmodel.inputs) +
                            list(epim.kerasmodel.inputs))
            model = Janggu(joint_inputs, layer,
                           name='cage_promoters_epi_dna')

            if not params['pretrained']:
                # This part randomly reinitializes the network
                # so that we can train it from scratch
                newjointmodel = model_from_json(model.kerasmodel.to_json())
                model = Janggu(
                    newjointmodel.inputs,
                    newjointmodel.outputs,
                    name='cage_promoters_epi_dna_randominit')
        else:
            model = Janggu.create(get_model, params,
                                  train_data[0], train_data[1],
                                  name='cage_promoters_{}'.format(
                                      params['inputs']))
    except ValueError:
        main_logger.exception('objective:')
        return {'status': 'fail'}

    model.compile(optimizer=get_opt(params['opt']), loss='mae',
                  metrics=['mse'])
    # NOTE(review): validation_data is a one-element list holding the
    # validation chromosome name -- presumably Janggu holds that chromosome
    # out of `train_data` itself; confirm against the Janggu.fit docs.
    hist = model.fit(
        train_data[0],
        train_data[1],
        epochs=params['epochs'],
        batch_size=64,
        validation_data=[params['val_chrom']],
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

    # Report the last recorded value of every tracked quantity.
    print('#' * 40)
    for key, values in hist.history.items():
        print('{}: {}'.format(key, values[-1]))
    print('#' * 40)

    pred_train = model.predict(train[0])
    pred_val = model.predict(val[0])
    pred_test = model.predict(test[0])

    scorers = ('var_explained', 'mse', 'mae', 'cor')
    model.evaluate(train[0], train[1], callbacks=list(scorers),
                   datatags=['train'])
    # The first entry of the evaluation result is used as the loss below.
    mae_val = model.evaluate(val[0], val[1], callbacks=list(scorers),
                             datatags=['val'])
    mae_val = mae_val[0]
    model.evaluate(test[0], test[1], callbacks=list(scorers),
                   datatags=['test'])

    cor_train = np.corrcoef(train[1][:][:, 0], pred_train[:, 0])[0, 1]
    cor_val = np.corrcoef(val[1][:][:, 0], pred_val[:, 0])[0, 1]
    cor_test = np.corrcoef(test[1][:][:, 0], pred_test[:, 0])[0, 1]

    model.summary()
    main_logger.info('cor [train/val/test]: {:.2f}/{:.2f}/{:.2f}'.format(
        cor_train, cor_val, cor_test))
    return {
        'loss': mae_val,
        'status': 'ok',
        'all_losses': hist.history,
        'cor_train': cor_train,
        'cor_val': cor_val,
        'cor_test': cor_test,
        'model_config': model.kerasmodel.to_json(),
        'model_weights': model.kerasmodel.get_weights(),
        'concrete_params': params
    }
shared_space['pretrained'] = False res = objective(shared_space) write_results(shared_space, res) else: print('no training') shared_space['val_chrom'] = "chr22" shared_space['order'] = dnaorder shared_space['pretrained'] = False shared_space['seq_dropout'] = 0.2 shared_space['inputs'] = 'epi_dna' params = shared_space train_data = get_data(params) train, test = split_train_test(train_data, [test_chrom]) model = Janggu.create_by_name('cage_promoters_epi_dna') testpred = model.predict(test[0]) fig, ax = plt.subplots() ax.scatter(test[1][:], testpred) ax.set_xlabel('Observed normalized CAGE signal') ax.set_ylabel('Predicted normalized CAGE signal') fig.savefig( os.path.join(os.environ['JANGGU_OUTPUT'], 'cage_promoter_testchrom_agreement.png')) fig, ax = plt.subplots() ax.scatter(test[1][:], testpred) ax.set_xlabel('Observed normalized CAGE signal') ax.set_ylabel('Predicted normalized CAGE signal')
def test_janggu_instance_dense(tmpdir):
    """Test Janggu creation by shape and name.

    Exercises the different ways a model template can pick its input layer
    (by name, by position, by calling the inputs object), checks that invalid
    selectors and a non-string model name raise, and finally serializes,
    saves and reloads the model.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    csvfile = os.path.join(data_path, 'sample.csv')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file, order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    def _make_template(select_input):
        """Return a dense model template; *select_input* picks the input layer."""
        @inputlayer
        @outputdense('sigmoid')
        def _cnn_model(inputs, inp, oup, params):
            layer = select_input(inputs)
            layer = Complement()(layer)
            layer = Reverse()(layer)
            layer = Flatten()(layer)
            output = Dense(params[0])(layer)
            return inputs, output
        return _cnn_model

    def _create(template, name='dna_ctcf_HepG2-cnn'):
        """Create a Janggu model from *template* on the shared test data."""
        return Janggu.create(template, modelparams=(2, ),
                             inputs=dna,
                             outputs=ctcf,
                             name=name)

    with pytest.raises(Exception):
        # due to No input name . defined
        _create(_make_template(lambda inputs: inputs['.']))

    with pytest.raises(Exception):
        # due to Wrong type for indexing
        _create(_make_template(lambda inputs: inputs[list()]))

    by_call = _make_template(lambda inputs: inputs()[0])

    with pytest.raises(Exception):
        # the model name must be a string
        _create(by_call, name=12342134)

    # test with given model name
    _create(by_call)
    # creating a second model under the same explicit name must also work
    _create(by_call)

    # select the input layer by position
    _create(_make_template(lambda inputs: inputs[0]))

    # select the input layer by name
    bwm = _create(_make_template(lambda inputs: inputs['dna']))

    # The keras model must survive a JSON and a YAML round trip.
    model_from_json(bwm.kerasmodel.to_json())
    model_from_yaml(bwm.kerasmodel.to_yaml())

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
def test_janggu_instance_conv(tmpdir):
    """Test Janggu creation by shape and name.

    Builds a model whose output is attached via ``outputconv`` on top of a
    DNA input and a BED-derived coverage target, then compiles, saves and
    reloads it by name.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    posfile = os.path.join(data_path, 'scored_sample.bed')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file, order=1,
                                       binsize=200, stepsize=50)

    # The coverage is created with and without whole-genome storage; the
    # model below is built on the last one (store_whole_genome=True).
    for whole_genome in (False, True):
        ctcf = Cover.create_from_bed(
            "positives",
            bedfiles=posfile,
            roi=bed_file,
            binsize=200, stepsize=50,
            resolution=50,
            store_whole_genome=whole_genome,
            flank=0,
            collapser=None,
            storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            layer = Complement()(layer)
            layer = Reverse()(layer)
        return inputs, layer

    bwm = Janggu.create(_cnn_model, modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
# Validation and test splits are taken from positions 1 and 2 of DATA.
val_data = DATA[1]
test_data = DATA[2]

# Result accumulators; presumably auPRC scores for the pretrained ("pre")
# and randomly initialized ("rand") joint models on the validation and test
# sets -- they are not filled within this excerpt, confirm against the rest
# of the loop.
auprc_pre_val = []
auprc_pre_test = []
auprc_rand_val = []
auprc_rand_test = []

# Next, we concatenate the individual models and fine-tune them.
# Furthermore, the combined models are reset with random weights and trained from scratch
# as a comparison.
# Both run indices advance in lockstep: (1, 1), (2, 2), ..., (5, 5).
for dnarun, dnaserun in zip([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]):
    # load pre-trained models
    dnaname = dnamodelname.format(dnarun)
    dnasename = dnasemodelname.format(dnaserun)
    dnamodel = Janggu.create_by_name(dnaname)
    dnasemodel = Janggu.create_by_name(dnasename)

    # remove output layer, concatenate the top-hidden layers, append output
    # (layers[-2] is the layer directly before each model's last layer)
    hidden_dna = dnamodel.kerasmodel.layers[-2].output
    hidden_dnase = dnasemodel.kerasmodel.layers[-2].output
    joint_hidden = Concatenate(name='concat')([hidden_dna, hidden_dnase])
    output = Dense(1, activation='sigmoid', name='peaks')(joint_hidden)

    # fit the model with preinitialized weights
    # The joint model takes the inputs of both sub-models (DNA first) and is
    # named after the two pre-trained models it was assembled from.
    jointmodel = Janggu(dnamodel.kerasmodel.inputs + dnasemodel.kerasmodel.inputs,
                        output,
                        name='pretrained_dnase_dna_joint_model_{}_{}'.format(
                            dnasename, dnaname))
pars, inputs=DATA[0][0], outputs=DATA[0][1], name=mname) model.summary() model.compile(optimizer=get_opt('amsgrad'), loss='binary_crossentropy', metrics=['accuracy']) train_data = DATA[0] val_data = DATA[1] test_data = DATA[2] hist = model.fit( train_data[0], train_data[1], epochs=epochs, batch_size=128, validation_data=val_data, callbacks=[EarlyStopping(patience=5, restore_best_weights=True)]) model.evaluate(test_data[0], test_data[1], callbacks=['auc', 'auprc']) if evaluate: test_data = DATA[2] model = Janggu.create_by_name(mname) model.compile(optimizer=get_opt('amsgrad'), loss='binary_crossentropy', metrics=['accuracy']) model.evaluate(test_data[0], test_data[1], callbacks=['auc', 'auprc'])