Exemplo n.º 1
0
def test_create_from_array_whole_genome_false(tmpdir):
    """Wrap model predictions in a Cover with ``store_whole_genome=False``.

    A small DnaConv2D model is created on the bundled pseudo genome, its
    predictions are converted into a coverage track through
    ``Cover.create_from_array`` and the track is compared against the raw
    prediction array.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Bundled example resources.
    genome_path = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # Regions of interest spanning positive and negative examples.
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    # Peak file holding only the positive examples.
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    dna_seqs = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=False,
        datatags=['ref'])

    peak_labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=peak_path,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=False,
        datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params is (number of filters, kernel length, activation name).
        n_filters, kernel_length, activation = params
        with inputs.use('dna') as dna_input:
            scanned = DnaConv2D(
                Conv2D(n_filters, (kernel_length, 1),
                       activation=activation))(dna_input)
        # Average over the full extent of axis 1 of the convolution output.
        pooled = LocalAveragePooling2D(
            window_size=scanned.shape.as_list()[1],
            name='motif')(scanned)
        return inputs, pooled

    K.clear_session()

    # Instantiate and compile the model from the template above.
    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna_seqs,
                          outputs=peak_labels)
    model.compile(optimizer='adadelta',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    predictions = model.predict(dna_seqs)

    cov_out = Cover.create_from_array('BindingProba',
                                      predictions,
                                      peak_labels.gindexer,
                                      store_whole_genome=False)

    # The coverage track must mirror the prediction array exactly.
    assert predictions.shape == cov_out.shape
    np.testing.assert_equal(predictions, cov_out[:])

    n_predictions = len(predictions)
    assert len(cov_out.gindexer) == n_predictions
    assert len(cov_out.garray.handle) == n_predictions
Exemplo n.º 2
0
def test_create_from_array_whole_genome_true(tmpdir):
    """Round-trip a label array through ``Cover.create_from_array``.

    The labels are loaded with ``store_whole_genome=True``, converted back
    into a Cover for each storage backend, and the result is compared with
    the original labels in both content and shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Regions of interest spanning positive and negative examples.
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    # Peak file holding only the positive examples.
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    # The same peak file is supplied five times as the list of bed files.
    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=[peak_path] * 5,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=True)

    pred = labels[:]

    for storage in ('ndarray', 'sparse', 'hdf5'):
        print(storage)
        cov_out = Cover.create_from_array(
            'BindingProba',
            pred,
            labels.gindexer,
            cache=True,
            storage=storage,
            store_whole_genome=True)

        np.testing.assert_equal(cov_out[:], labels[:])
        np.testing.assert_equal(cov_out.shape, labels.shape)
Exemplo n.º 3
0
def test_janggu_variant_prediction(tmpdir):
    """Check the files written by ``predict_variant_effect``.

    For sequence orders 1, 2 and 3 a small dense model is created, variant
    effects are predicted for the bundled VCF file, and the resulting
    ``scores.hdf5`` / ``snps.bed.gz`` files are loaded back into a Cover
    to verify that a difference score is present at the position
    chr2:59-60 and absent at chr2:54-55.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_dir = os.environ['JANGGU_OUTPUT']

    # The input files do not depend on the order, so resolve them once.
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    scores_file = os.path.join(output_dir, 'scores.hdf5')
    snps_file = os.path.join(output_dir, 'snps.bed.gz')

    def _cnn_model(inputs, inp, oup, params):
        # The input shape is derived from the 50 bp bin size and the
        # sequence order passed via params.
        inputs = Input(
            (50 - params['order'] + 1, 1, pow(4, params['order'])))
        layer = Flatten()(inputs)
        layer = Dense(params['hiddenunits'])(layer)
        output = Dense(4, activation='sigmoid')(layer)
        return inputs, output

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        model = Janggu.create(_cnn_model,
                              modelparams={
                                  'hiddenunits': 2,
                                  'order': order
                              },
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna,
            vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_dir)
        assert os.path.exists(scores_file)
        assert os.path.exists(snps_file)

        # The context manager closes the HDF5 file even when one of the
        # assertions below fails.
        with h5py.File(scores_file, 'r') as f:
            gindexer = GenomicIndexer.create_from_file(snps_file, None, None)

            cov = Cover.create_from_array('snps',
                                          f['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
Exemplo n.º 4
0
# Build the model from `xin` and `output`, which are defined outside this
# excerpt.
model = Model(xin, output)

model.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

# Fit on the training data for 100 epochs, using the test data as
# validation data.
hist = model.fit(DNA,
                 LABELS,
                 epochs=100,
                 validation_data=(DNA_TEST, LABELS_TEST))

# Report the loss and accuracy recorded for the final epoch.
print('#' * 40)
print('loss: {}, acc: {}'.format(hist.history['loss'][-1],
                                 hist.history['acc'][-1]))
print('#' * 40)

# convert the prediction to a cover object
pred = model.predict(DNA_TEST)
cov_pred = Cover.create_from_array('BindingProba', pred, LABELS_TEST.gindexer)

# Print the first four entries of the prediction track.
print('Prediction score examples for Oct4')
for i in range(4):
    print('{}.: {}'.format(i, cov_pred[i]))
# Print the last four entries, walking backwards from the end via
# negative indices.
print('Prediction score examples for Mafk')
for i in range(1, 5):
    print('{}.: {}'.format(i, cov_pred[-i]))

# predictions (or feature activities) can finally be exported to bigwig
cov_pred.export_to_bigwig(output_dir=args.path)
Exemplo n.º 5
0
                      inputs=DNA,
                      outputs=ReduceDim(LABELS))

# Compile the model created above (its construction begins outside this
# excerpt).
model.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['acc'])

# Fit on the training data; the labels are wrapped in ReduceDim and the
# number of epochs is taken from the parsed arguments.
hist = model.fit(DNA, ReduceDim(LABELS), epochs=args.epochs)

# Report the loss and accuracy recorded for the final epoch.
print('#' * 40)
print('loss: {}, acc: {}'.format(hist.history['loss'][-1],
                                 hist.history['acc'][-1]))
print('#' * 40)

# Predict on the test sequences and wrap the result in a Cover that is
# indexed by the genomic intervals of the test labels.
pred = model.predict(DNA_TEST)
cov_pred = Cover.create_from_array('BindingProba', pred, LABELS_TEST.gindexer)

print('Oct4 predictions scores should be greater than Mafk scores:')
# Print the first four entries of the prediction track.
print('Prediction score examples for Oct4')
for i in range(4):
    print('{}.: {}'.format(i, cov_pred[i]))
# Print the last four entries, walking backwards from the end via
# negative indices.
print('Prediction score examples for Mafk')
for i in range(1, 5):
    print('{}.: {}'.format(i, cov_pred[-i]))

# Extract the 4th interval to perform input feature importance attribution
# which represents an Oct4 bound region
gi = DNA.gindexer[3]
# Keep the interval coordinates as separate variables.
chrom = gi.chrom
start = gi.start
end = gi.end