示例#1
0
 def _model():
     """Build and compile a minimal Janggu model.

     The network maps a 10-dimensional input named 'x' through a single
     sigmoid unit named 'y'; it is compiled with adadelta and binary
     cross-entropy and reports accuracy.
     """
     features = Input((10,), name='x')
     sigmoid_unit = Dense(1, name='y', activation='sigmoid')
     net = Janggu(inputs=features,
                  outputs=sigmoid_unit(features),
                  name='test_model')
     net.compile(loss='binary_crossentropy',
                 optimizer='adadelta',
                 metrics=['accuracy'])
     return net
示例#2
0
def test_janggu_use_dnaconv_max(tmpdir):
    """Compare DnaConv2D(merge_mode='max') with a hand-assembled equivalent.

    The reference model applies one shared Conv2D to the input and to its
    Reverse/Complement-transformed copy, reverses that second result again
    and merges both with Maximum. After copying the weights of the
    'bothstrands' layer into the reference convolution, both models must
    produce (almost) identical predictions. Finally, the DnaConv2D model is
    saved and re-created by name.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file, order=1)

    @inputlayer
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            layer = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                              merge_mode='max', name='bothstrands')(layer)
        return inputs, layer

    bwm1 = Janggu.create(_cnn_model1, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn1')

    p1 = bwm1.predict(dna[1:2])
    # weights of the wrapped convolution, reused by the reference model below
    w = bwm1.kerasmodel.get_layer('bothstrands').get_weights()

    @inputlayer
    def _cnn_model2(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            # the same Conv2D instance is applied twice, so weights are shared
            conv = Conv2D(5, (3, 1), name='singlestrand')
            fl = conv(layer)
            rl = Reverse()(conv(Complement()(Reverse()(inlayer))))
            layer = Maximum()([fl, rl])
        return inputs, layer

    bwm2 = Janggu.create(_cnn_model2, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn2')

    bwm2.kerasmodel.get_layer('singlestrand').set_weights(w)

    p2 = bwm2.predict(dna[1:2])
    np.testing.assert_allclose(p1, p2, rtol=1e-4, atol=1e-3)

    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm1._storage_path(bwm1.name, outputdir=tmpdir.strpath)

    bwm1.save()
    bwm1.summary()

    assert os.path.exists(storage)

    # re-creating the model by name must succeed after saving
    Janggu.create_by_name('dna_ctcf_HepG2-cnn1')
示例#3
0
def test_create_from_array_whole_genome_false(tmpdir):
    """Wrap model predictions in a Cover via create_from_array.

    Predictions from a DnaConv2D model are converted into a Cover dataset
    with ``store_whole_genome=False``. The Cover must agree with the raw
    prediction array in shape, content and number of regions.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Pseudo genome: per the original note, a concatenation of all sequences
    # in sample.fa and sample2.fa, so results should closely resemble the
    # models obtained from classify_fasta.py.
    genome_fasta = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # Regions of interest spanning positive and negative examples.
    roi_bed = resource_filename('janggu', 'resources/roi_train.bed')
    # Peak file containing only the positive examples.
    peak_bed = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_fasta,
                                       roi=roi_bed,
                                       binsize=200,
                                       stepsize=200,
                                       order=1,
                                       store_whole_genome=False,
                                       datatags=['ref'])

    labels = Cover.create_from_bed('peaks',
                                   roi=roi_bed,
                                   bedfiles=peak_bed,
                                   binsize=200,
                                   stepsize=200,
                                   resolution=200,
                                   store_whole_genome=False,
                                   datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        n_filters = params[0]
        filter_length = params[1]
        activation = params[2]
        with inputs.use('dna') as layer:
            conv = Conv2D(n_filters, (filter_length, 1),
                          activation=activation)
            layer = DnaConv2D(conv)(layer)
        # pool over the full extent of axis 1 of the convolution output
        pool_window = layer.shape.as_list()[1]
        output = LocalAveragePooling2D(window_size=pool_window,
                                       name='motif')(layer)
        return inputs, output

    K.clear_session()

    # instantiate the model from the template defined above
    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)

    model.compile(loss='binary_crossentropy',
                  optimizer='adadelta',
                  metrics=['acc'])

    pred = model.predict(dna)

    cov_out = Cover.create_from_array('BindingProba',
                                      pred,
                                      labels.gindexer,
                                      store_whole_genome=False)

    assert pred.shape == cov_out.shape

    np.testing.assert_equal(pred, cov_out[:])

    n_regions = len(pred)
    assert len(cov_out.gindexer) == n_regions
    assert len(cov_out.garray.handle) == n_regions
示例#4
0
def test_janggu_train_predict_option1(tmpdir):
    """Train, predict and evaluate on dummy data.

    create: by_shape
    Input args: Dataset
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    inputs = Array("X", np.random.random((100, 10)))
    outputs = Array('y', np.random.randint(2, size=(100, 1)),
                    conditions=['random'])

    @inputlayer
    @outputdense('sigmoid')
    def test_model(inputs, inp, oup, params):
        return inputs, inputs[0]

    bwm = Janggu.create(test_model,
                        inputs=inputs,
                        outputs=outputs,
                        name='nptest')

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    # nothing may be stored before fitting ...
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)
    assert not os.path.exists(storage)

    bwm.fit(inputs, outputs, epochs=2, batch_size=32)

    # ... but the storage path must exist afterwards
    assert os.path.exists(storage)

    pred = bwm.predict(inputs)
    np.testing.assert_equal(len(pred[:, np.newaxis]), len(inputs))
    np.testing.assert_equal(pred.shape, outputs.shape)
    bwm.evaluate(inputs, outputs)
示例#5
0
def test_output_export_tsne(tmpdir):
    """Export t-SNE plots of the 'hidden' layer as png and eps.

    The 'hidden' layer is predicted twice with a Scorer that uses
    ``export_tsne``: once with default exporter settings and once with
    ``fform='eps'``. Both plot files must exist afterwards.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    features = Array("x", numpy.random.random((100, 10)))
    labels = Array('y',
                   numpy.random.randint(2, size=(100, 1)),
                   conditions=['random'])

    @inputlayer
    @outputdense('sigmoid')
    def _model(inputs, inp, oup, params):
        with inputs.use('x') as layer:
            hidden = Dense(3, name='hidden')(layer)
        return inputs, hidden

    bwm = Janggu.create(_model, inputs=features, outputs=labels, name='nptest')

    bwm.compile(loss='binary_crossentropy', optimizer='adadelta')

    tsne_scorer = Scorer('tsne', exporter=export_tsne)

    # first run with the exporter defaults, second run requesting eps
    bwm.predict(features, layername='hidden', callbacks=[tsne_scorer])
    bwm.predict(features,
                layername='hidden',
                callbacks=[tsne_scorer],
                exporter_kwargs={'fform': 'eps'})

    # both plots must have been written
    plot_dir = os.path.join(tmpdir.strpath, "evaluation", bwm.name, 'hidden')
    assert os.path.exists(os.path.join(plot_dir, "tsne.png"))
    assert os.path.exists(os.path.join(plot_dir, "tsne.eps"))
示例#6
0
def test_janggu_variant_prediction(tmpdir):
    """Test predict_variant_effect for sequence orders 1, 2 and 3.

    For every order a small dense model is created and
    ``predict_variant_effect`` is run on the sample VCF file. The test
    checks that ``scores.hdf5`` and ``snps.bed.gz`` are written and that
    the 'diffscore' coverage is non-zero at chr2:59-60 while it is zero
    at chr2:54-55.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_folder = os.environ['JANGGU_OUTPUT']

    # input files do not depend on the order, so resolve them once
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    scores_file = os.path.join(output_folder, 'scores.hdf5')
    snps_file = os.path.join(output_folder, 'snps.bed.gz')

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        def _cnn_model(inputs, inp, oup, params):
            # the input length shrinks and the channel count grows with order
            inputs = Input(
                (50 - params['order'] + 1, 1, pow(4, params['order'])))
            layer = Flatten()(inputs)
            layer = Dense(params['hiddenunits'])(layer)
            output = Dense(4, activation='sigmoid')(layer)
            return inputs, output

        model = Janggu.create(_cnn_model,
                              modelparams={
                                  'hiddenunits': 2,
                                  'order': order
                              },
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna,
            vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_folder)
        assert os.path.exists(scores_file)
        assert os.path.exists(snps_file)

        gindexer = GenomicIndexer.create_from_file(snps_file, None, None)

        # the context manager closes the HDF5 file even if an assertion fails
        with h5py.File(scores_file, 'r') as f:
            cov = Cover.create_from_array('snps',
                                          f['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
示例#7
0
def test_janggu_generate_name(tmpdir):
    """Create a model without an explicit name, save it and reload it.

    No ``name`` is passed to ``Janggu.create``; the model is then saved and
    re-created via the name found in ``bwm.name``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    def _cnn_model(inputs, inp, oup, params):
        in_layer = Input((10, 1))
        flat = Flatten()(in_layer)
        return in_layer, Dense(params[0])(flat)

    bwm = Janggu.create(_cnn_model, modelparams=(2, ))
    bwm.compile(loss='binary_crossentropy', optimizer='adadelta')

    model_path = bwm._storage_path(bwm.name, outputdir=bwm.outputdir)

    bwm.save()
    bwm.summary()

    # saving must have produced the storage path
    assert os.path.exists(model_path)

    Janggu.create_by_name(bwm.name)
示例#8
0
def test_janggu_influence_genomic(tmpdir):
    """Test input_attribution on genomic intervals.

    The attribution is computed for the first interval of the DNA dataset
    and for the same interval widened by one position on each side. After
    dropping the first and last entry along axis 1 of the widened result,
    both attributions must be equal.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    csvfile = os.path.join(data_path, 'sample.csv')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       roi=bed_file,
                                       order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['dna']
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn')

    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    # check with an interval that coincides with a dataset interval
    iv = dna.gindexer[0]
    chrom, start, end = iv.chrom, iv.start, iv.end
    influence = input_attribution(model,
                                  dna,
                                  chrom=chrom,
                                  start=start,
                                  end=end)

    # check with an odd offset: widen the interval by one on both sides
    influence2 = input_attribution(model,
                                   dna,
                                   chrom=chrom,
                                   start=start - 1,
                                   end=end + 1)
    np.testing.assert_equal(influence[0][:], influence2[0][:][:, 1:-1])
示例#9
0
def test_localaveragepooling2D(tmpdir):
    """Exercise LocalAveragePooling2D with window size 3 on two inputs.

    Case 1: an input that is constant along axis 1 must come back unchanged,
    shortened to 8 positions. Case 2: a length-3 input with values 0, 1, 2
    (channel 1 shifted by one) must pool to a single position holding 1 and
    2 for the two channels.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # --- case 1: channels hold the constants 1, 2 and 3 ---
    testin = np.ones((1, 10, 1, 3))
    testin[..., 1] += 1
    testin[..., 2] += 2

    lin = Input((10, 1, 3))
    out = LocalAveragePooling2D(3)(lin)
    m = Janggu(lin, out)

    testout = m.predict(testin)
    np.testing.assert_equal(testout, testin[:, :8, :, :])

    # --- case 2: values 0, 1, 2 along axis 1; channel 1 is offset by +1 ---
    testin = np.ones((1, 3, 1, 2))
    testin[:, 0] = 0
    testin[:, 2] = 2
    testin[..., 1] += 1

    lin = Input((3, 1, 2))
    out = LocalAveragePooling2D(3)(lin)
    m = Janggu(lin, out)

    testout = m.predict(testin)
    np.testing.assert_equal(testout.shape, (1, 1, 1, 2))
    np.testing.assert_equal(testout[0, 0, 0, 0], 1)
    np.testing.assert_equal(testout[0, 0, 0, 1], 2)
示例#10
0
def get_janggu(inputs, outputs):
    """Return a compiled Janggu model named 'nptest' for the given datasets.

    The model feeds the first input straight into a sigmoid dense output.
    The function asserts that the model's storage path does not exist yet.
    """

    @inputlayer
    @outputdense('sigmoid')
    def _model(inputs, inp, oup, params):
        return inputs, inputs[0]

    model = Janggu.create(_model, inputs=inputs, outputs=outputs,
                          name='nptest')
    model.compile(loss='binary_crossentropy', optimizer='adadelta')

    # a freshly created model must not have been stored yet
    model_path = model._storage_path(model.name, outputdir=model.outputdir)
    assert not os.path.exists(model_path)
    return model
示例#11
0
def test_janggu_chr2_validation(tmpdir):
    """Fit a DnaConv2D model passing ``validation_data=['chr2']``.

    The test passes if fitting completes; the return value of ``fit`` is
    not inspected.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_bed = os.path.join(resources, 'sample.bed')
    score_bed = os.path.join(resources, 'scored_sample.bed')
    genome_fasta = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_fasta,
                                       binsize=200,
                                       stepsize=50,
                                       roi=roi_bed,
                                       order=1)

    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=score_bed,
                                 roi=roi_bed,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=None,
                                 flank=0,
                                 collapser='max',
                                 storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            strand_conv = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                                    merge_mode='max',
                                    name='bothstrands')
            layer = strand_conv(inlayer)
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    model = Janggu.create(_cnn_model1,
                          modelparams=(2, ),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn1')

    model.compile(loss='binary_crossentropy', optimizer='adadelta')
    model.fit(dna, ctcf, validation_data=['chr2'])
示例#12
0
def test_janggu_train_predict_sequence(tmpdir):
    """Train, predict and evaluate on dummy data via a JangguSequence.

    create: YES
    Input args: Dataset
    validation_set: YES
    batch_size: None
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    features = Array("x", np.random.random((100, 10)))
    targets = Array('y',
                    np.random.randint(2, size=(100, 1)),
                    conditions=['random'])

    # the sequence serves batches of size 10 from the two datasets
    jseq = JangguSequence(10, {'x': features}, {'y': targets})

    @inputlayer
    @outputdense('sigmoid')
    def _model(inputs, inp, oup, params):
        return inputs, inputs[0]

    bwm = Janggu.create(_model,
                        inputs=jseq.inputs['x'],
                        outputs=jseq.outputs['y'],
                        name='nptest')

    bwm.compile(loss='binary_crossentropy', optimizer='adadelta')

    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)
    print('storage', storage)
    print('env', os.environ['JANGGU_OUTPUT'])
    print('name', bwm.name)
    print('outputdir', bwm.outputdir)
    # nothing may be stored before fitting
    assert not os.path.exists(storage)

    bwm.fit(jseq, epochs=2, validation_data=jseq, use_multiprocessing=False)

    assert os.path.exists(storage)

    pred = bwm.predict(jseq, use_multiprocessing=False)
    np.testing.assert_equal(len(pred[:, np.newaxis]), len(features))
    np.testing.assert_equal(pred.shape, targets.shape)
    bwm.evaluate(jseq, use_multiprocessing=False)
示例#13
0
def test_janggu_train_predict_option0(tmpdir):
    """Train, predict and evaluate on dummy data.

    create: by_shape
    Input args: Dataset

    The labels are wrapped in ReduceDim, and the evaluation output table
    is checked for the condition name.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    inputs = Array("X", np.random.random((100, 10)))
    # the label array gets an extra axis which ReduceDim(axis=(1,)) processes
    outputs = ReduceDim(Array('y',
                              np.random.randint(2, size=(100, 1))[:, None],
                              conditions=['random']),
                        axis=(1, ))

    @inputlayer
    @outputdense('sigmoid')
    def test_model(inputs, inp, oup, params):
        return inputs, inputs[0]

    bwm = Janggu.create(test_model,
                        inputs=inputs,
                        outputs=outputs,
                        name='nptest')

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    # nothing may be stored before fitting ...
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)
    assert not os.path.exists(storage)

    bwm.fit(inputs, outputs, epochs=2, batch_size=32)

    # ... but the storage path must exist afterwards
    assert os.path.exists(storage)

    pred = bwm.predict(inputs)
    np.testing.assert_equal(len(pred[:, np.newaxis]), len(inputs))
    np.testing.assert_equal(pred.shape, outputs.shape)

    # test if the condition name is correctly used in the output table
    bwm.evaluate(inputs, outputs, callbacks=['auc'])

    outputauc = os.path.join(tmpdir.strpath, 'evaluation', 'nptest', 'auc.tsv')
    assert os.path.exists(outputauc)
    assert pd.read_csv(outputauc).columns[0] == 'random'
示例#14
0
def test_janggu_influence_fasta(tmpdir):
    """Compare input_attribution addressed by interval and by index.

    The attribution for the first interval of a fasta-derived dataset,
    requested via chrom/start/end, must equal the one requested via
    ``idx=0``.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    fasta_path = os.path.join(resources, 'sample.fa')

    dna = Bioseq.create_from_seq('dna',
                                 fastafile=fasta_path,
                                 order=1,
                                 cache=False)

    @inputlayer
    def _cnn_model(inputs, inp, oup, params):
        flat = Flatten()(inputs['dna'])
        hidden = Dense(params[0])(flat)
        output = Dense(1, activation='sigmoid')(hidden)
        return inputs, output

    # the model is intentionally left uncompiled here
    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          name='dna_ctcf_HepG2-cnn')

    # attribution addressed through the coordinates of the first interval
    first = dna.gindexer[0]
    by_interval = input_attribution(model,
                                    dna,
                                    chrom=first.chrom,
                                    start=first.start,
                                    end=first.end)

    # attribution addressed through the dataset index
    by_index = input_attribution(model, dna, idx=0)
    np.testing.assert_equal(by_interval[0][:], by_index[0][:])
示例#15
0
def objective(params):
    """Train and evaluate one model for the given hyper-parameters.

    Parameters
    ----------
    params : dict
        Read keys: 'name' (model name), 'opt' (passed to ``get_opt``) and
        'epochs'. The dict is also handed to ``get_data`` and to
        ``Janggu.create`` as model parameters.

    Returns
    -------
    dict
        ``{'status': 'fail'}`` if model creation, compilation or fitting
        raised a ValueError. Otherwise a dict with 'status' set to 'ok',
        the last validation loss under 'loss', the full training history,
        the auPRC on validation and test data, the model config (JSON),
        the model weights, the parameters and the model name.
    """
    train_data, val_data, test_data = get_data(params)
    # define a keras model only based on DNA

    try:
        K.clear_session()
        model = Janggu.create(get_model, params, train_data[0], train_data[1], name=params['name'])
        model.compile(optimizer=get_opt(params['opt']), loss='binary_crossentropy',
                      metrics=['acc'])
        hist = model.fit(train_data[0], train_data[1], epochs=params['epochs'], batch_size=64,
                         validation_data=val_data,
                         callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])
    except ValueError:
        # print_exc reports the caught exception (type, message, traceback);
        # print_stack would only show the current call stack.
        traceback.print_exc()
        return {'status': 'fail'}

    # report the final value of every tracked training metric
    print('#' * 40)
    for key in hist.history:
        print('{}: {}'.format(key, hist.history[key][-1]))
    print('#' * 40)
    pred_test = model.predict(test_data[0])
    pred_val = model.predict(val_data[0])

    model.evaluate(val_data[0], val_data[1], callbacks=['auprc', 'auroc'], datatags=['val'])
    model.evaluate(test_data[0], test_data[1], callbacks=['auprc', 'auroc'], datatags=['test'])

    auprc_val = average_precision_score(val_data[1][:], pred_val)
    auprc_test = average_precision_score(test_data[1][:], pred_test)
    model.summary()
    print('auprc_val: {:.2%}'.format(auprc_val))
    print('auprc_test: {:.2%}'.format(auprc_test))
    return {'loss': hist.history['val_loss'][-1], 'status': 'ok', 'all_losses': hist.history,
            'auprc_val': auprc_val,
            'auprc_test': auprc_test,
            'model_config': model.kerasmodel.to_json(),
            'model_weights': model.kerasmodel.get_weights(),
            'concrete_params': params,
            'modelname': model.name}
示例#16
0
    """
    with inputs.use('dna') as layer:
        # the name in inputs.use() should be the same as the dataset name.
        layer = DnaConv2D(
            Conv2D(params[0], (params[1], 1), activation=params[2]))(layer)
    output = GlobalAveragePooling2D(name='motif')(layer)
    return inputs, output


# Template function used to construct the network below.
modeltemplate = double_stranded_model_dnaconv

# Reset the Keras backend session before building a new model.
K.clear_session()

# create a new model object
# NOTE(review): modelparams presumably arrive in the template as
# params[0], params[1] and params[2] -- confirm against the template.
model = Janggu.create(template=modeltemplate,
                      modelparams=(30, 21, 'relu'),
                      inputs=DNA,
                      outputs=ReduceDim(LABELS))

model.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['acc'])

# Fit for 100 epochs without shuffling; the labels are wrapped in ReduceDim
# exactly as they were when the model was created.
hist = model.fit(DNA, ReduceDim(LABELS), epochs=100, shuffle=False)

# Report loss and accuracy of the last epoch.
print('#' * 40)
print('loss: {}, acc: {}'.format(hist.history['loss'][-1],
                                 hist.history['acc'][-1]))
print('#' * 40)

# clustering plots based on hidden features
heatmap_eval = Scorer('heatmap', exporter=ExportClustermap(z_score=1.))
示例#17
0
def test_janggu_instance_dense(tmpdir):
    """Test Janggu creation with dense outputs, including failure modes.

    Covered cases:
    * addressing an undefined input name or indexing the inputs with an
      unsupported type makes ``Janggu.create`` raise,
    * a non-string model name makes ``Janggu.create`` raise,
    * inputs can be addressed via ``inputs()[0]``, ``inputs[0]`` and
      ``inputs['dna']``,
    * the Keras model can be rebuilt from its JSON and YAML descriptions,
    * a saved model can be re-created by name.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    csvfile = os.path.join(data_path, 'sample.csv')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file,
                                       order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['.']
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    with pytest.raises(Exception):
        # fails because no input named '.' is defined
        bwm = Janggu.create(_cnn_model,
                            modelparams=(2, ),
                            inputs=dna,
                            outputs=ctcf,
                            name='dna_ctcf_HepG2-cnn')

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs[list()]
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    with pytest.raises(Exception):
        # fails because a list is the wrong type for indexing the inputs
        bwm = Janggu.create(_cnn_model,
                            modelparams=(2, ),
                            inputs=dna,
                            outputs=ctcf,
                            name='dna_ctcf_HepG2-cnn')

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs()[0]
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    with pytest.raises(Exception):
        # fails because the model name must be a string
        bwm = Janggu.create(_cnn_model,
                            modelparams=(2, ),
                            inputs=dna,
                            outputs=ctcf,
                            name=12342134)

    # test with given model name
    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')
    # creating a second model under the same explicit name must also work
    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    # address the input by position
    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs[0]
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    # address the input by dataset name
    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['dna']
        layer = Complement()(layer)
        layer = Reverse()(layer)
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')
    # rebuilding the keras model from its JSON/YAML description must not
    # raise; the rebuilt models themselves are not needed
    model_from_json(bwm.kerasmodel.to_json())
    model_from_yaml(bwm.kerasmodel.to_yaml())

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    assert os.path.exists(storage)

    # re-creating the model by name must succeed after saving
    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
示例#18
0
    'order': order,
    'stranded': strand,
    'flank': flank,
    'rep': 'r{}'.format(rep),
    'flatten': flatten
}

# Load the datasets for the parameter set assembled above.
DATA = get_data(pars)
# Build the model name from the base name plus the strandedness, order,
# flank, flatten and repetition settings.
mname = '{}_s{}_o{}_f{}_a{}_r{}'.format(modelname, pars['stranded'],
                                        pars['order'], pars['flank'],
                                        pars['flatten'], pars['rep'])

if not evaluate:
    model = Janggu.create(dna_model,
                          pars,
                          inputs=DATA[0][0],
                          outputs=DATA[0][1],
                          name=mname)
    model.summary()

    model.compile(optimizer=get_opt('amsgrad'),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    train_data = DATA[0]
    val_data = DATA[1]
    test_data = DATA[2]
    hist = model.fit(
        train_data[0],
        train_data[1],
        epochs=epochs,
示例#19
0
# Entries 1 and 2 of DATA are used as validation and test data.
val_data = DATA[1]
test_data = DATA[2]

# Result lists; judging by the names they collect auPRC values for the
# pre-trained and the randomly initialised joint models -- TODO confirm
# against the code that fills them.
auprc_pre_val = []
auprc_pre_test = []
auprc_rand_val = []
auprc_rand_test = []

# Next, we concatenate the individual models and fine-tune them.
# Furthermore, the combined models are reset with random weights and trained from scratch
# as a comparison.
# Run i of the DNA model is paired with run i of the DNase model.
for dnarun, dnaserun in zip([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]):
    # load pre-trained models
    dnaname = dnamodelname.format(dnarun)
    dnasename = dnasemodelname.format(dnaserun)
    dnamodel = Janggu.create_by_name(dnaname)
    dnasemodel = Janggu.create_by_name(dnasename)

    # remove output layer, concatenate the top-hidden layers, append output
    # (layers[-2] is the layer directly before each model's last layer)
    hidden_dna = dnamodel.kerasmodel.layers[-2].output
    hidden_dnase = dnasemodel.kerasmodel.layers[-2].output

    joint_hidden = Concatenate(name='concat')([hidden_dna, hidden_dnase])
    output = Dense(1, activation='sigmoid', name='peaks')(joint_hidden)

    # fit the model with preinitialized weights
    # The joint model takes the inputs of both sub-models and is named after
    # the two models it was built from.
    jointmodel = Janggu(dnamodel.kerasmodel.inputs +
                        dnasemodel.kerasmodel.inputs,
                        output,
                        name='pretrained_dnase_dna_joint_model_{}_{}'.format(
                            dnasename, dnaname))
示例#20
0
        shared_space['pretrained'] = False
        res = objective(shared_space)
        write_results(shared_space, res)
else:
    print('no training')

# Parameter set used to load the data for the combined 'epi_dna' inputs.
shared_space['val_chrom'] = "chr22"
shared_space['order'] = dnaorder
shared_space['pretrained'] = False
shared_space['seq_dropout'] = 0.2
shared_space['inputs'] = 'epi_dna'
params = shared_space
train_data = get_data(params)
# Hold out the test chromosome(s) from the loaded data.
train, test = split_train_test(train_data, [test_chrom])

# Load the previously stored joint model by its name.
model = Janggu.create_by_name('cage_promoters_epi_dna')

testpred = model.predict(test[0])

# Scatter plot of observed versus predicted signal on the held-out data,
# written to the JANGGU_OUTPUT directory.
fig, ax = plt.subplots()
ax.scatter(test[1][:], testpred)
ax.set_xlabel('Observed normalized CAGE signal')
ax.set_ylabel('Predicted normalized CAGE signal')
fig.savefig(
    os.path.join(os.environ['JANGGU_OUTPUT'],
                 'cage_promoter_testchrom_agreement.png'))

# NOTE(review): second figure with the same data and labels as the first;
# nothing in view saves it -- confirm how it is used further below.
fig, ax = plt.subplots()
ax.scatter(test[1][:], testpred)
ax.set_xlabel('Observed normalized CAGE signal')
ax.set_ylabel('Predicted normalized CAGE signal')
示例#21
0
def objective(params):
    """Build, train and score one CAGE-promoter model.

    For ``params['inputs'] == 'epi_dna'`` the stored 'cage_promoters_dna_only'
    and 'cage_promoters_epi_only' models are joined at their second-to-last
    layers; unless ``params['pretrained']`` is set, the joint network is
    rebuilt from its JSON description. For any other input setting a model
    is created from ``get_model``.

    Returns ``{'status': 'fail'}`` if data loading or model construction
    raises a ValueError. Otherwise returns a dict with the validation loss
    (first entry of the evaluation result), the training history, the
    train/val/test correlations, the model config and weights, and the
    parameters.
    """
    print(params)
    try:
        train_data = get_data(params)
        train_data, test = split_train_test(train_data, [test_chrom])
        train, val = split_train_test(train_data, [params['val_chrom']])
        # define a keras model only based on DNA
        K.clear_session()
        if params['inputs'] != 'epi_dna':
            model_name = 'cage_promoters_{}'.format(params['inputs'])
            model = Janggu.create(get_model,
                                  params,
                                  train_data[0],
                                  train_data[1],
                                  name=model_name)
        else:
            dnam = Janggu.create_by_name('cage_promoters_dna_only')
            epim = Janggu.create_by_name('cage_promoters_epi_only')
            merge = Concatenate()
            joint = merge([
                dnam.kerasmodel.layers[-2].output,
                epim.kerasmodel.layers[-2].output
            ])
            prediction = Dense(1, name='geneexpr')(joint)
            joint_inputs = [dnam.kerasmodel.input] + epim.kerasmodel.input
            model = Janggu(joint_inputs,
                           prediction,
                           name='cage_promoters_epi_dna')

            if not params['pretrained']:
                # Rebuilding the network from its JSON description randomly
                # reinitializes it, so that it is trained from scratch.
                fresh = model_from_json(model.kerasmodel.to_json())
                model = Janggu(fresh.inputs,
                               fresh.outputs,
                               name='cage_promoters_epi_dna_randominit')
    except ValueError:
        main_logger.exception('objective:')
        return {'status': 'fail'}

    model.compile(optimizer=get_opt(params['opt']),
                  loss='mae',
                  metrics=['mse'])
    early_stop = EarlyStopping(patience=5, restore_best_weights=True)
    hist = model.fit(train_data[0],
                     train_data[1],
                     epochs=params['epochs'],
                     batch_size=64,
                     validation_data=[params['val_chrom']],
                     callbacks=[early_stop])

    # report the final value of every tracked training metric
    print('#' * 40)
    for key, values in hist.history.items():
        print('{}: {}'.format(key, values[-1]))
    print('#' * 40)

    pred_train = model.predict(train[0])
    pred_val = model.predict(val[0])
    pred_test = model.predict(test[0])

    scorer_names = ('var_explained', 'mse', 'mae', 'cor')
    # a fresh list is handed to every evaluate call
    model.evaluate(train[0],
                   train[1],
                   callbacks=list(scorer_names),
                   datatags=['train'])
    val_scores = model.evaluate(val[0],
                                val[1],
                                callbacks=list(scorer_names),
                                datatags=['val'])
    mae_val = val_scores[0]
    model.evaluate(test[0],
                   test[1],
                   callbacks=list(scorer_names),
                   datatags=['test'])

    def _correlation(split, predicted):
        # correlation between the first label column and the first prediction column
        return np.corrcoef(split[1][:][:, 0], predicted[:, 0])[0, 1]

    cor_train = _correlation(train, pred_train)
    cor_val = _correlation(val, pred_val)
    cor_test = _correlation(test, pred_test)

    model.summary()
    main_logger.info('cor [train/val/test]: {:.2f}/{:.2f}/{:.2f}'.format(
        cor_train, cor_val, cor_test))

    result = {
        'loss': mae_val,
        'status': 'ok',
        'all_losses': hist.history,
        'cor_train': cor_train,
        'cor_val': cor_val,
        'cor_test': cor_test,
        'model_config': model.kerasmodel.to_json(),
        'model_weights': model.kerasmodel.get_weights(),
        'concrete_params': params
    }
    return result
示例#22
0
def test_janggu_instance_conv(tmpdir):
    """Test Janggu creation by shape and name.

    A model is created from a DNA input dataset and BED-derived coverage
    labels, compiled, saved and summarized.  The test then checks that the
    model's storage path exists and that the model can be re-created from
    its name alone.

    Parameters
    ----------
    tmpdir
        Temporary directory object (pytest ``tmpdir`` fixture); its
        ``strpath`` is used as the ``JANGGU_OUTPUT`` directory.
    """
    # Redirect all Janggu output into the per-test temporary directory.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    posfile = os.path.join(data_path, 'scored_sample.bed')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file,
                                       order=1,
                                       binsize=200,
                                       stepsize=50)

    # Build the coverage labels twice so that creation is exercised with
    # store_whole_genome=False as well as True.  The model below uses the
    # dataset from the last iteration (store_whole_genome=True).
    for whole_genome in (False, True):
        ctcf = Cover.create_from_bed("positives",
                                     bedfiles=posfile,
                                     roi=bed_file,
                                     binsize=200,
                                     stepsize=50,
                                     resolution=50,
                                     store_whole_genome=whole_genome,
                                     flank=0,
                                     collapser=None,
                                     storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        # Network body: pass the 'dna' input through a Complement layer
        # followed by a Reverse layer.
        # NOTE(review): the input and output layers are presumably supplied
        # by the inputlayer/outputconv decorators -- confirm against janggu.
        with inputs.use('dna') as inlayer:
            layer = inlayer
        layer = Complement()(layer)
        layer = Reverse()(layer)
        return inputs, layer

    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    # The storage path must exist once the model has been saved.
    assert os.path.exists(storage)

    # Re-creating the model by name must not raise.
    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
示例#23
0
    return inputs, output


# Select the network template requested on the command line.  Any value
# other than 'single' or 'double' falls back to the DnaConv2D-based variant.
if args.model not in ('single', 'double'):
    modeltemplate = double_stranded_model_dnaconv
elif args.model == 'double':
    modeltemplate = double_stranded_model
else:
    modeltemplate = single_stranded_model

K.clear_session()

# Instantiate the model from the chosen template; the model name encodes
# the selected template and the sequence order.
model = Janggu.create(
    template=modeltemplate,
    modelparams=(30, 21, 'relu'),
    inputs=DNA,
    outputs=LABELS,
    name='fasta_seqs_m{}_o{}'.format(args.model, args.order),
)

model.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

# Train on the full dataset and keep the training history.
hist = model.fit(DNA, LABELS, epochs=100)

# Report the loss and accuracy of the last epoch between two banner lines.
banner = '#' * 40
print(banner)
history = hist.history
print('loss: {}, acc: {}'.format(history['loss'][-1],
                                 history['acc'][-1]))
print(banner)
示例#24
0
    performs the convolution operation with the normal kernel weights
    and the reverse complemented weights.
    """
    with inputs.use('dna') as layer:
        # the name in inputs.use() should be the same as the dataset name.
        layer = DnaConv2D(
            Conv2D(params[0], (params[1], 1), activation=params[2]))(layer)
    output = GlobalAveragePooling2D(name='motif')(layer)
    return inputs, output


# NOTE(review): K is presumably the Keras backend module, in which case
# clear_session() resets Keras' global state before the model is built --
# confirm against the file's imports.
K.clear_session()

# create a new model object
# modelparams are consumed by the template as
# (Conv2D first argument, kernel length, activation).
# NOTE(review): ReduceDim presumably wraps the label dataset so its shape
# matches the pooled model output -- confirm against the janggu docs.
model = Janggu.create(template=double_stranded_model_dnaconv,
                      modelparams=(30, 21, 'relu'),
                      inputs=DNA,
                      outputs=ReduceDim(LABELS))

model.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['acc'])

# fit the model for 100 epochs
# NOTE(review): validation_data=['pseudo2'] presumably names the
# chromosome/sequence to hold out for validation -- confirm that 'pseudo2'
# exists in DNA/LABELS.
model.fit(DNA, ReduceDim(LABELS), epochs=100, validation_data=['pseudo2'])

# do the evaluation on the independent test data
# the callbacks are referenced by name; results are tagged with 'test'
model.evaluate(DNA_TEST,
               ReduceDim(LABELS_TEST),
               datatags=['test'],
               callbacks=['auc', 'auprc', 'roc', 'prc'])

# predictions of the trained model for the test sequences
pred = model.predict(DNA_TEST)