示例#1
0
def test_dna_genomic_interval_access(tmpdir):
    """Check the different ways a Bioseq dataset can be indexed.

    A dataset created without ``store_whole_genome=True`` is expected to
    raise when indexed with an interval taken from its own ``gindexer``.
    With ``store_whole_genome=True``, indexing by integer position, by
    interval object and by ``(chrom, start, end)`` must yield equal arrays.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.gtf')
    genome_path = os.path.join(resources, 'sample_genome.fa')
    common = dict(refgenome=genome_path,
                  roi=roi_path,
                  storage='ndarray',
                  order=2)

    partial = Bioseq.create_from_refgenome('train', **common)
    with pytest.raises(Exception):
        # interval lookup is expected to fail here because the dataset
        # was not created with store_whole_genome=True
        partial[partial.gindexer[0]]

    whole = Bioseq.create_from_refgenome('train',
                                         store_whole_genome=True,
                                         **common)

    # integer index vs. interval object
    np.testing.assert_equal(whole[0], whole[whole.gindexer[0]])

    # integer index vs. explicit coordinates, spelled as a tuple and as
    # comma-separated subscripts
    first = whole.gindexer[0]
    chrom = first.chrom
    start = whole.gindexer[0].start
    end = whole.gindexer[0].end
    coordinates = (chrom, start, end)
    np.testing.assert_equal(whole[0], whole[coordinates])
    np.testing.assert_equal(whole[0], whole[chrom, start, end])
示例#2
0
def test_dnaconv():
    """Compare DnaConv2D with a manually assembled two-strand scan.

    The output of the wrapped layer is checked against the element-wise
    maximum of (a) its ``forward_layer`` applied to the input and (b) the
    reversed output of the same layer applied to the reversed, complemented
    input.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_path,
                                       storage='ndarray',
                                       roi=roi_path,
                                       order=1)

    xin = Input(dna.shape[1:])

    # reference result: the DnaConv2D wrapper itself
    wrapped_out = DnaConv2D(Conv2D(30, (21, 1), activation='relu'))(xin)
    wrapped_model = Model(xin, wrapped_out)
    both_strands = wrapped_model.predict(dna[0])[0, 0, 0, :]

    # reuse the convolution held by the wrapper so the weights are shared
    shared_conv = wrapped_model.layers[1].forward_layer

    # forward strand only
    forward_out = shared_conv(xin)
    forward_model = Model(xin, forward_out)
    forward_only = forward_model.predict(dna[0])[0, 0, 0, :]

    # reverse strand: reverse-complement the input, scan, reverse back
    revcomp_in = Reverse()(Complement()(xin))
    reverse_out = shared_conv(revcomp_in)
    reverse_out = Reverse()(reverse_out)
    reverse_model = Model(xin, reverse_out)
    reverse_only = reverse_model.predict(dna[0])[0, 0, 0, :]

    expected = np.maximum(reverse_only, forward_only)
    np.testing.assert_allclose(both_strands, expected, rtol=1e-4)
示例#3
0
def test_dnaconv2():
    """Check DnaConv2D when it wraps an already-called Conv2D layer.

    The convolution is applied to the input once before being wrapped.
    Afterwards the wrapper's ``forward_layer`` must carry the same kernel
    weights as the original layer, and the original layer must still expose
    exactly two weight tensors.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_path,
                                       storage='ndarray',
                                       roi=roi_path,
                                       order=1)

    xin = Input(dna.shape[1:])
    conv = Conv2D(30, (21, 1), activation='relu')

    # call the layer up front so it is instantiated before being wrapped
    conv(xin)

    wrapped_out = DnaConv2D(conv)(xin)
    wrapped_model = Model(xin, wrapped_out)
    # the prediction is not inspected; it only has to run and be indexable
    _ = wrapped_model.predict(dna[0])[0, 0, 0, :]

    original_kernel = conv.get_weights()[0]
    wrapped_kernel = wrapped_model.layers[1].forward_layer.get_weights()[0]
    np.testing.assert_allclose(original_kernel, wrapped_kernel)
    assert len(conv.weights) == 2
示例#4
0
def test_create_from_array_whole_genome_false(tmpdir):
    """Round-trip model predictions through Cover.create_from_array.

    A small DnaConv2D model is built on DNA and peak labels that were both
    loaded with ``store_whole_genome=False``.  Its predictions are wrapped
    into a Cover object, which must reproduce the prediction array in shape,
    content and length.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # Original author's note: the pseudo genome is a concatenation of the
    # sequences in sample.fa and sample2.fa, so results should be almost
    # identical to the models obtained from classify_fasta.py.
    genome_path = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # regions spanning positive and negative examples
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    # positive examples only
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_path,
                                       roi=roi_path,
                                       binsize=200,
                                       stepsize=200,
                                       order=1,
                                       store_whole_genome=False,
                                       datatags=['ref'])

    labels = Cover.create_from_bed('peaks',
                                   roi=roi_path,
                                   bedfiles=peak_path,
                                   binsize=200,
                                   stepsize=200,
                                   resolution=200,
                                   store_whole_genome=False,
                                   datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params is (number of filters, filter length, activation)
        with inputs.use('dna') as layer:
            conv = Conv2D(params[0], (params[1], 1), activation=params[2])
            layer = DnaConv2D(conv)(layer)
        pooled = LocalAveragePooling2D(
            window_size=layer.shape.as_list()[1], name='motif')(layer)
        return inputs, pooled

    K.clear_session()

    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)

    model.compile(optimizer='adadelta',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    predictions = model.predict(dna)

    coverage = Cover.create_from_array('BindingProba',
                                       predictions,
                                       labels.gindexer,
                                       store_whole_genome=False)

    assert predictions.shape == coverage.shape

    np.testing.assert_equal(predictions, coverage[:])

    assert len(coverage.gindexer) == len(predictions)
    assert len(coverage.garray.handle) == len(predictions)
示例#5
0
def get_data(refgenome, flank):
    """Load the regions of ``input.bed`` from ``refgenome`` as a 4-D array.

    Parameters
    ----------
    refgenome
        Reference genome passed through to ``Bioseq.create_from_refgenome``.
    flank
        Flank passed through to ``Bioseq.create_from_refgenome``; also
        determines the second dimension of the result.

    Returns
    -------
    The dataset reshaped with ``np.reshape`` to
    ``(number of regions, flank * 2 + 1, 4, 1)``.  The number of regions is
    printed as a side effect.
    """
    sequences = Bioseq.create_from_refgenome(name='dna',
                                             refgenome=refgenome,
                                             roi="input.bed",
                                             flank=flank)
    num_regions = sequences.shape[0]
    print(num_regions)
    target_shape = (num_regions, flank * 2 + 1, 4, 1)
    return np.reshape(sequences, target_shape)
示例#6
0
def complement_layer(order):
    """Check that the Complement layer changes its input and is an involution.

    Applying the layer once must produce an array different from the input;
    applying it twice must reproduce the input exactly.

    Parameters
    ----------
    order
        Sequence order forwarded to ``Bioseq.create_from_refgenome``.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    # NOTE(review): binsize and flank are not defined in this function;
    # presumably they are module-level settings - confirm.
    sequences = Bioseq.create_from_refgenome('train',
                                             refgenome=genome_path,
                                             roi=roi_path,
                                             storage='ndarray',
                                             binsize=binsize,
                                             flank=flank,
                                             order=order)

    dna_in = Input(shape=sequences.shape[1:], name='dna')
    complemented = Complement()(dna_in)
    complement_model = Model(dna_in, complemented)

    original = sequences[0]

    once = complement_model.predict(original)
    twice = complement_model.predict(once)

    # a single complement must differ from the input ...
    with pytest.raises(Exception):
        np.testing.assert_equal(original, once)
    # ... while complementing twice restores it
    np.testing.assert_equal(original, twice)
示例#7
0
def test_split_train_test():
    """Check split_train_test with chr2 held out.

    The split is exercised once with a single dataset and once with a list
    of datasets.  In both cases 50 regions must end up in each part and the
    two parts together must account for every region.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_path,
                                       storage='ndarray',
                                       roi=roi_path,
                                       binsize=200,
                                       stepsize=200,
                                       order=1,
                                       store_whole_genome=True)

    # single dataset
    train_part, test_part = split_train_test(dna, holdout_chroms='chr2')

    assert len(train_part) == 50
    assert len(test_part) == 50
    assert len(dna) == len(train_part) + len(test_part)

    # list of datasets: each part is a sequence, inspect its first entry
    train_parts, test_parts = split_train_test([dna, dna],
                                               holdout_chroms='chr2')

    assert len(train_parts[0]) == 50
    assert len(test_parts[0]) == 50
    assert len(dna) == len(train_parts[0]) + len(test_parts[0])
示例#8
0
def get_data(params):
    """Build (sequence, label) pairs for training, validation and test.

    Labels and sequences are loaded once for ``train_roi`` with
    ``store_whole_genome=True``; the validation and test sets are obtained
    with ``view`` on ``val_roi`` and ``test_roi``.  ``bedfiles``,
    ``refgenome`` and the three ROI names are taken from the enclosing
    scope.

    Parameters
    ----------
    params
        Mapping providing ``'order'`` and ``'flank'`` for the DNA dataset.

    Returns
    -------
    A 3-tuple ``(train, validation, test)`` where each element is a pair
    ``(sequences, ReduceDim(labels))``.
    """
    labels_train = Cover.create_from_bed('labels',
                                         bedfiles=bedfiles,
                                         roi=train_roi,
                                         resolution=200,
                                         store_whole_genome=True,
                                         storage='sparse',
                                         cache=True,
                                         dtype='int8',
                                         minoverlap=.5,
                                         verbose=True)
    labels_test = view(labels_train, test_roi)
    labels_val = view(labels_train, val_roi)

    seq_train = Bioseq.create_from_refgenome('dna',
                                             refgenome=refgenome,
                                             roi=train_roi,
                                             store_whole_genome=True,
                                             storage='ndarray',
                                             cache=True,
                                             order=params['order'],
                                             flank=params['flank'],
                                             verbose=True)
    seq_test = view(seq_train, test_roi)
    seq_val = view(seq_train, val_roi)

    train_pair = (seq_train, ReduceDim(labels_train))
    val_pair = (seq_val, ReduceDim(labels_val))
    test_pair = (seq_test, ReduceDim(labels_test))
    return (train_pair, val_pair, test_pair)
示例#9
0
def test_dna_dims_order_2(tmpdir):
    """Check shape and content of a dinucleotide (order 2) Bioseq dataset.

    With a binsize of 200 the dataset must hold 100 regions of shape
    ``(199, 1, 16)``.  The first ten positions of region 0 and the last ten
    positions of region 50 are compared with the expected one-hot encoded
    dinucleotides.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    def _dinucleotide_onehot(kmers):
        # Row i is the int8 one-hot vector (length 16) of kmers[i].  The hot
        # index is 4 * rank(first base) + rank(second base), with ranks
        # taken from the ordering 'ACGT'.
        bases = 'ACGT'
        hot = [4 * bases.index(kmer[0]) + bases.index(kmer[1])
               for kmer in kmers]
        return np.eye(16, dtype='int8')[hot]

    dataset = Bioseq.create_from_refgenome('train',
                                           refgenome=genome_path,
                                           roi=roi_path,
                                           binsize=200,
                                           storage='ndarray',
                                           order=2)

    # for order 2: one position fewer per bin, 4**2 channels
    assert len(dataset) == 100
    assert dataset.shape == (100, 199, 1, 16)

    # the correctness of the sequence extraction was also
    # validated using:
    # >bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGAC...
    expected_head = _dinucleotide_onehot(
        ['AT', 'TT', 'TG', 'GT', 'TG', 'GG', 'GT', 'TG', 'GA', 'AC'])
    np.testing.assert_equal(dataset[0][0, :10, 0, :], expected_head)

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaag...
    # this sequence is read from the reverse strand
    # so we have ...cttgcttcccc
    expected_tail = _dinucleotide_onehot(
        ['CT', 'TT', 'TG', 'GC', 'CT', 'TT', 'TC', 'CC', 'CC', 'CC'])
    np.testing.assert_equal(dataset[50][0, -10:, 0, :], expected_tail)
示例#10
0
def test_janggu_variant_prediction(tmpdir):
    """Run predict_variant_effect for orders 1-3 and inspect its output.

    For each sequence order a small dense model is created and
    ``predict_variant_effect`` is run on ``sample.vcf``.  The call must
    write ``scores.hdf5`` and ``snps.bed.gz`` into the output folder.  The
    stored ``diffscore`` values, wrapped in a Cover object, must be non-zero
    at ``chr2:59-60`` and zero at ``chr2:54-55``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    output_dir = os.environ['JANGGU_OUTPUT']
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')
    scores_path = os.path.join(output_dir, 'scores.hdf5')
    snps_path = os.path.join(output_dir, 'snps.bed.gz')

    def _cnn_model(inputs, inp, oup, params):
        # input shape follows from binsize 50 and the sequence order
        inputs = Input(
            (50 - params['order'] + 1, 1, pow(4, params['order'])))
        layer = Flatten()(inputs)
        layer = Dense(params['hiddenunits'])(layer)
        output = Dense(4, activation='sigmoid')(layer)
        return inputs, output

    for order in [1, 2, 3]:
        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        model = Janggu.create(_cnn_model,
                              modelparams={
                                  'hiddenunits': 2,
                                  'order': order
                              },
                              name='dna_ctcf_HepG2-cnn')

        model.predict_variant_effect(
            dna,
            vcffile,
            conditions=['m' + str(i) for i in range(4)],
            output_folder=output_dir)
        assert os.path.exists(scores_path)
        assert os.path.exists(snps_path)

        # The context manager closes the HDF5 file even when one of the
        # assertions below fails, so later iterations can rewrite it.
        with h5py.File(scores_path, 'r') as scores:
            gindexer = GenomicIndexer.create_from_file(snps_path, None, None)

            cov = Cover.create_from_array('snps',
                                          scores['diffscore'],
                                          gindexer,
                                          store_whole_genome=True)

            print(cov['chr2', 55, 65].shape)
            print(cov['chr2', 55, 65])

            assert np.abs(cov['chr2', 59, 60]).sum() > 0.0
            assert np.abs(cov['chr2', 54, 55]).sum() == 0.0
示例#11
0
def test_janggu_use_dnaconv_max(tmpdir):
    """Compare DnaConv2D(merge_mode='max') with a hand-built equivalent.

    The first model uses the DnaConv2D wrapper.  The second model applies
    one shared Conv2D to the input and to its reverse complement, reverses
    the latter output and merges both with ``Maximum``.  After copying the
    wrapper's weights into the shared convolution, both models must give
    (almost) the same prediction.  Finally the first model is compiled,
    saved and reloaded by name; the storage path must exist after saving.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna', refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file, order=1)

    @inputlayer
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            layer = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                              merge_mode='max', name='bothstrands')(layer)
        return inputs, layer

    bwm1 = Janggu.create(_cnn_model1, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn1')

    p1 = bwm1.predict(dna[1:2])
    w = bwm1.kerasmodel.get_layer('bothstrands').get_weights()

    @inputlayer
    def _cnn_model2(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            # one convolution shared between both strands
            conv = Conv2D(5, (3, 1), name='singlestrand')
            fl = conv(layer)
            rl = Reverse()(conv(Complement()(Reverse()(inlayer))))
            layer = Maximum()([fl, rl])
        return inputs, layer

    bwm2 = Janggu.create(_cnn_model2, modelparams=(2,),
                         inputs=dna,
                         name='dna_ctcf_HepG2-cnn2')

    # give the manual model the same weights as the wrapper
    bwm2.kerasmodel.get_layer('singlestrand').set_weights(w)

    p2 = bwm2.predict(dna[1:2])
    np.testing.assert_allclose(p1, p2, rtol=1e-4, atol=1e-3)

    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm1._storage_path(bwm1.name, outputdir=tmpdir.strpath)

    bwm1.save()
    bwm1.summary()

    assert os.path.exists(storage)

    # reloading by name only has to succeed; the result is not inspected
    Janggu.create_by_name('dna_ctcf_HepG2-cnn1')
示例#12
0
def test_dna_dims_order_1_from_reference(tmpdir):
    """Check a whole-genome Bioseq whose gindexer is assigned afterwards.

    The dataset is created without a ROI and with
    ``store_whole_genome=True``; a GenomicIndexer built from ``sample.gtf``
    is then attached.  The test checks the stored chromosomes, the
    resulting length and shape, and the one-hot content of the first ten
    positions of region 0 and the last ten positions of region 50.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.gtf')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    def _onehot(sequence):
        # Row i is the int8 one-hot vector of sequence[i], with channels
        # ordered A, C, G, T.
        ranks = ['ACGT'.index(base) for base in sequence]
        return np.eye(4, dtype='int8')[ranks]

    indexer = GenomicIndexer.create_from_file(roi_path, 200, 200)

    dataset = Bioseq.create_from_refgenome('train',
                                           refgenome=genome_path,
                                           storage='ndarray',
                                           order=1,
                                           store_whole_genome=True)
    dataset.gindexer = indexer

    chromosomes = dataset.garray.handle
    assert len(chromosomes) == 2
    assert 'chr1' in dataset.garray.handle
    assert 'chr2' in dataset.garray.handle

    # for order 1
    assert len(dataset) == 100
    assert dataset.shape == (100, 200, 1, 4)

    # the correctness of the sequence extraction was also
    # validated using:
    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr1:15000-25000
    # ATTGTGGTGA...
    # this sequence is read from the forward strand
    np.testing.assert_equal(dataset[0][0, :10, 0, :],
                            _onehot('ATTGTGGTGA'))

    # bedtools getfasta -fi sample_genome.fa -bed sample.bed
    # >chr2:15000-25000
    # ggggaagcaa...
    # this sequence is read from the reverse strand
    # so we have ...ttgcttcccc
    np.testing.assert_equal(dataset[50][0, -10:, 0, :],
                            _onehot('TTGCTTCCCC'))
示例#13
0
def get_data(params):
    """Assemble the training inputs and labels selected by ``params``.

    Labels are always the ``'geneexpr'`` BAM coverage wrapped in
    ``ReduceDim(..., aggregator="mean")``.  Depending on
    ``params['inputs']`` the input list contains the DNA sequence
    (``'dna_only'``, ``'epi_dna'``) and/or the DNase and H3K4me3 coverage
    tracks (``'epi_only'``, ``'epi_dna'``).  File templates and the ROI are
    read from names defined outside this function.

    Parameters
    ----------
    params
        Mapping with the keys ``'traincell'``, ``'trainrep'``,
        ``'cageflank'`` and ``'inputs'``; additionally ``'dnaflank'`` and
        ``'order'`` for DNA input and ``'dnaseflank'`` for the epigenetic
        inputs.

    Returns
    -------
    A pair ``(train_input, train_labels)`` where ``train_input`` is a list.

    Raises
    ------
    ValueError
        If ``params['inputs']`` selects none of the known input types.
    """
    label_zscore = ZScore()
    expression = Cover.create_from_bam(
        'geneexpr',
        bamfiles=RNA.format(params['traincell'], params['trainrep']),
        roi=ROI_INPUT_TRAIN,
        flank=params['cageflank'],
        conditions=['GeneExpr'],
        resolution=None,
        store_whole_genome=False,
        storage='ndarray',
        normalizer=[LogTransform(), label_zscore],
        stranded=False,
        cache=True)
    train_labels = ReduceDim(expression, aggregator="mean")

    train_input = []

    if params['inputs'] in ['dna_only', 'epi_dna']:
        # DNA
        sequence = Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=False)
        train_input.append(sequence)

    if params['inputs'] in ['epi_only', 'epi_dna']:
        # each track gets its own ZScore instance
        dnase_zscore = ZScore()
        dnase = Cover.create_from_bam(
            'dnase',
            bamfiles=DNASE.format(params['traincell']),
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            resolution=None,
            store_whole_genome=False,
            normalizer=[LogTransform(), dnase_zscore],
            cache=True)
        train_input.append(ReduceDim(dnase, aggregator="mean"))

        h3k4_zscore = ZScore()
        h3k4 = Cover.create_from_bigwig(
            'h3k4',
            bigwigfiles=[H3K4me3.format(params['traincell'])],
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            store_whole_genome=False,
            normalizer=[LogTransform(), h3k4_zscore],
            cache=True)
        train_input.append(ReduceDim(h3k4, aggregator="mean"))

    if not train_input:
        raise ValueError('no input')
    return (train_input, train_labels)
示例#14
0
def test_janggu_influence_genomic(tmpdir):
    """Check input_attribution for a region and a one-base-wider region.

    The attribution is computed for the first interval of the dataset and
    again for the same interval extended by one position on each side.  The
    second result with its first and last position trimmed must equal the
    first result.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    csvfile = os.path.join(data_path, 'sample.csv')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       roi=bed_file,
                                       order=1)

    df = pd.read_csv(csvfile, header=None)
    ctcf = Array('ctcf', df.values, conditions=['peaks'])

    @inputlayer
    @outputdense('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        layer = inputs['dna']
        layer = Flatten()(layer)
        output = Dense(params[0])(layer)
        return inputs, output

    model = Janggu.create(_cnn_model,
                          modelparams=(2, ),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn')

    model.compile(optimizer='adadelta', loss='binary_crossentropy')

    # attribution for exactly the first interval of the dataset
    iv = dna.gindexer[0]
    chrom, start, end = iv.chrom, iv.start, iv.end
    influence = input_attribution(model,
                                  dna,
                                  chrom=chrom,
                                  start=start,
                                  end=end)

    # attribution for the same interval widened by one position per side
    influence2 = input_attribution(model,
                                   dna,
                                   chrom=chrom,
                                   start=start - 1,
                                   end=end + 1)

    # dropping the two extra positions must recover the first result
    np.testing.assert_equal(influence[0][:], influence2[0][:][:, 1:-1])
示例#15
0
def test_dna_loading_from_seqrecord(tmpdir):
    """Create a Bioseq from sequences that were loaded from a FASTA file.

    The reference genome is first read with ``sequences_from_fasta`` and the
    result is passed as ``refgenome``.  The test only requires dataset
    creation to succeed; nothing about the dataset is asserted.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.gtf')
    genome_path = os.path.join(resources, 'sample_genome.fa')
    records = sequences_from_fasta(genome_path)

    Bioseq.create_from_refgenome('train',
                                 refgenome=records,
                                 roi=roi_path,
                                 storage='ndarray',
                                 order=2)
示例#16
0
def test_dna_first_last_channel():
    """Check the ``channel_last`` option of Bioseq.create_from_refgenome.

    With ``channel_last=True`` the four nucleotide channels are the last
    dimension, with ``channel_last=False`` they directly follow the region
    dimension.  Transposing the channel-first data must reproduce the
    channel-last data.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.gtf')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    def _load(channel_last):
        # identical settings apart from the channel layout
        return Bioseq.create_from_refgenome('train',
                                            refgenome=genome_path,
                                            roi=roi_path,
                                            storage='ndarray',
                                            channel_last=channel_last)

    channels_last = _load(True)
    assert channels_last.shape == (2, 10000, 1, 4)
    assert channels_last[0].shape == (1, 10000, 1, 4)

    channels_first = _load(False)
    assert channels_first.shape == (2, 4, 10000, 1)
    assert channels_first[0].shape == (1, 4, 10000, 1)

    # move the channel axis of the channel-first data to the end
    np.testing.assert_equal(channels_last[0],
                            np.transpose(channels_first[0], (0, 2, 3, 1)))
示例#17
0
def test_janggu_variant_streamer_order_12_ignore_ref_match(tmpdir):
    """Check VariantStreamer with ``ignore_reference_match=True``.

    For sequence orders 1 and 2, and for an even and an odd streamer
    binsize, the first record yielded by ``flow()`` must be named
    ``'refmismatch'`` and its reference and alternative arrays must differ
    by a total absolute amount of ``2 * order``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    for order in [1, 2]:

        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=refgenome,
                                           storage='ndarray',
                                           binsize=50,
                                           store_whole_genome=True,
                                           order=order)

        # exercise an even and an odd window size
        for binsize in (10, 3):
            vcf = VariantStreamer(dna,
                                  vcffile,
                                  binsize=binsize,
                                  batch_size=1,
                                  ignore_reference_match=True)
            it_vcf = iter(vcf.flow())
            names, chroms, poss, ra, aa, reference, alternative = \
                next(it_vcf)
            # original author's note for this record: C to T
            print(names, chroms, poss, ra, aa)
            print(reference)
            print(alternative)

            assert names[0] == 'refmismatch'
            np.testing.assert_equal(
                np.abs(reference - alternative).sum(), 2 * order)
示例#18
0
def test_dnabed_overreaching_ends_partial_genome():
    """Check flanks that reach beyond the sequence ends (partial genome).

    With a binsize of 2 and a flank of 20 every region spans 42 positions.
    The dataset must contain nine regions, and the one-hot sums of the first
    and last region must be 22 and 38 respectively, i.e. fewer than 42.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "bed_test.bed")
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dataset = Bioseq.create_from_refgenome('test',
                                           refgenome=genome_path,
                                           roi=roi_path,
                                           binsize=2,
                                           flank=20,
                                           store_whole_genome=False,
                                           storage='ndarray')
    assert len(dataset) == 9
    assert dataset.shape == (9, 2 + 2 * 20, 1, 4)

    first_region = dataset[0]
    np.testing.assert_equal(first_region.sum(), 22)

    last_region = dataset[-1]
    np.testing.assert_equal(last_region.sum(), 42 - 4)
示例#19
0
def test_janggu_chr2_validation(tmpdir):
    """Fit a model while passing a chromosome name as validation data.

    DNA and label coverage are loaded for the same ROI with matching
    binsize and stepsize.  A small DnaConv2D model is compiled and fitted
    with ``validation_data=['chr2']``.  The test only requires the call to
    succeed; the fit result is not inspected.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    positives_path = os.path.join(resources, 'scored_sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=genome_path,
                                       binsize=200,
                                       stepsize=50,
                                       roi=roi_path,
                                       order=1)

    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=positives_path,
                                 roi=roi_path,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=None,
                                 flank=0,
                                 collapser='max',
                                 storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            both = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                             merge_mode='max',
                             name='bothstrands')
            layer = both(layer)
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    model = Janggu.create(_cnn_model1,
                          modelparams=(2, ),
                          inputs=dna,
                          outputs=ctcf,
                          name='dna_ctcf_HepG2-cnn1')

    model.compile(optimizer='adadelta', loss='binary_crossentropy')
    model.fit(dna, ctcf, validation_data=['chr2'])
示例#20
0
def test_subset_exclude_chrname_test():
    """Check ``subset`` when a chromosome name is excluded.

    After excluding ``'chr2'`` from the dataset, 50 regions must remain.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    full_dna = Bioseq.create_from_refgenome('dna',
                                            refgenome=genome_path,
                                            storage='ndarray',
                                            roi=roi_path,
                                            binsize=200,
                                            stepsize=200,
                                            order=1,
                                            store_whole_genome=True)

    remaining = subset(full_dna, exclude_regions='chr2')

    assert len(remaining) == 50
示例#21
0
def test_dna_dims_order_1_from_subset(tmpdir):
    """Check an order-1 Bioseq restricted to the regions of ``sample.gtf``.

    Indexing by position and by the matching interval must agree, two
    entries must be stored in the underlying handle, and the dataset must
    consist of two regions of shape ``(10000, 1, 4)``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.gtf')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dataset = Bioseq.create_from_refgenome('train',
                                           refgenome=genome_path,
                                           roi=roi_path,
                                           storage='ndarray',
                                           order=1)

    first_interval = dataset.gindexer[0]
    np.testing.assert_equal(dataset[0], dataset[first_interval])
    assert len(dataset.garray.handle) == 2

    # for order 1
    assert len(dataset) == 2
    assert dataset.shape == (2, 10000, 1, 4)
示例#22
0
def test_view_bed_test():
    """Check ``view`` with a BED file passed as ``use_regions``.

    The dataset is built over ``sample.bed`` (200 bp bins, whole genome
    stored); viewing it through ``scored_sample.bed`` must yield 4 entries.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')

    def resource(filename):
        # Absolute path of a file shipped in janggu's resources folder.
        return os.path.join(resources, filename)

    full_roi = resource('sample.bed')
    sub_roi = resource('scored_sample.bed')
    genome = resource('sample_genome.fa')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome,
        storage='ndarray',
        roi=full_roi,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=True,
    )

    restricted = view(dna, use_regions=sub_roi)

    assert len(restricted) == 4
示例#23
0
def test_dnabed_overreaching_ends_whole_genome():
    """Check padding when flanked bins reach past the stored sequence ends.

    With ``binsize=2`` and ``flank=20`` every entry spans 42 positions.
    The test pins the element sums of the first entry and of the explicit
    interval ``('chr1', 29990, 30010)``, which the original comments
    describe as the padded beginning and end.
    """
    binsize = 2
    flank = 20
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "bed_test.bed")
    genome_path = os.path.join(resources, 'sample_genome.fa')

    bioseq = Bioseq.create_from_refgenome(
        'test',
        refgenome=genome_path,
        roi=roi_path,
        binsize=binsize,
        flank=flank,
        store_whole_genome=True,
        storage='ndarray',
        cache=False,
    )

    assert len(bioseq) == 9
    assert bioseq.shape == (9, binsize + 2 * flank, 1, 4)

    # beginning must be correctly padded
    head_total = bioseq[0].sum()
    np.testing.assert_equal(head_total, 22)

    # end must be correctly padded
    tail_total = bioseq['chr1', 29990, 30010].sum()
    np.testing.assert_equal(tail_total, 10)
示例#24
0
def test_dna_props_extraction(tmpdir):
    """Check ``_data_props`` on a ``Bioseq`` dataset and on invalid input.

    For a dataset named ``'dna'`` with 200 bp bins the returned mapping
    must contain ``'dna'`` with shape ``(200, 1, 4)``; passing the tuple
    ``(0, )`` must raise.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        storage='ndarray',
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
    )

    extracted = _data_props(dna)
    assert 'dna' in extracted
    assert extracted['dna']['shape'] == (200, 1, 4)

    # anything that is not a dataset must be rejected
    not_a_dataset = (0, )
    with pytest.raises(Exception):
        _data_props(not_a_dataset)
示例#25
0
def test_dna_loading_from_seqrecord(tmpdir):
    """Check ``Bioseq`` creation from sequences already loaded from FASTA.

    The reference genome is passed as the return value of
    ``sequences_from_fasta`` rather than as a file name.  The first entry
    must then be retrievable by integer index, by ``gindexer`` interval
    and by an explicit ``(chrom, start, end)`` key.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.gtf')
    fasta_path = os.path.join(resources, 'sample_genome.fa')
    records = sequences_from_fasta(fasta_path)

    data = Bioseq.create_from_refgenome(
        'train',
        refgenome=records,
        roi=roi_path,
        storage='ndarray',
        store_whole_genome=True,
        order=2,
    )

    np.testing.assert_equal(data[0], data[data.gindexer[0]])

    chrom, start, end = (
        data.gindexer[0].chrom,
        data.gindexer[0].start,
        data.gindexer[0].end,
    )
    # Both spellings hand the same 3-tuple to __getitem__.
    np.testing.assert_equal(data[0], data[(chrom, start, end)])
    np.testing.assert_equal(data[0], data[chrom, start, end])
示例#26
0
def reverse_layer(order):
    """Check that the ``Reverse`` layer flips its input along axis 1.

    A ``Bioseq`` dataset of the requested ``order`` is loaded over
    ``sample.bed``, a model ``Input -> Reverse`` is built, and its
    prediction for ``data[0]`` is compared with ``data[0][:, ::-1, :, :]``.

    NOTE(review): ``binsize`` and ``flank`` are not defined inside this
    function; presumably they are module-level names -- confirm.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    data = Bioseq.create_from_refgenome(
        'train',
        refgenome=genome_path,
        roi=roi_path,
        storage='ndarray',
        binsize=binsize,
        flank=flank,
        order=order,
    )

    # Minimal model consisting of the layer under test only.
    model_input = Input(shape=data.shape[1:], name='dna')
    reversed_output = Reverse()(model_input)
    reverse_model = Model(model_input, reversed_output)

    # actual shape of DNA
    batch = data[0]
    expected = batch[:, ::-1, :, :]
    np.testing.assert_equal(expected, reverse_model.predict(batch))
示例#27
0
def test_janggu_variant_streamer_order_1_revcomp(tmpdir):
    """Check ``VariantStreamer`` output with and without an annotation.

    The third variant of ``sample.vcf`` is streamed twice from a
    whole-genome, order-1 ``Bioseq``: first without ``annotation`` and then
    with an annotation built from ``Interval('chr2', 110, 130, '-')``.
    The reference and alternative arrays of the plain run must equal those
    of the annotated run flipped along axes 1 and 3 (presumably the reverse
    complement referred to by the test name -- confirm against
    ``VariantStreamer``).
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    annot = BedTool([Interval('chr2', 110, 130, '-')])

    # even binsize, no annotation
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())

    # First variant (C to T) is skipped.
    # NOTE: the following expectations are disabled in the original test:
    #assert names[0] == 'refmismatch'
    #np.testing.assert_equal(reference, alternative)
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))
    next(it_vcf)

    # Second variant (C to T) is skipped.
    # NOTE: the following expectations are disabled in the original test:
    #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))
    next(it_vcf)

    # Third variant (T to C) is the one that gets compared below.
    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # NOTE: the following expectations are disabled in the original test:
    #    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
    #    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    # even binsize, with annotation
    vcf = VariantStreamer(dna,
                          vcffile,
                          binsize=10,
                          batch_size=1,
                          annotation=annot)
    it_vcf = iter(vcf.flow())

    # Skip the same two leading variants (both C to T).
    next(it_vcf)
    next(it_vcf)

    # Third variant (T to C), this time from the annotated streamer.
    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    # Print the arrays of THIS run; the un-annotated ones were shown above.
    print(reference2)
    print(alternative2)

    np.testing.assert_equal(reference, reference2[:, ::-1, :, ::-1])
    np.testing.assert_equal(alternative, alternative2[:, ::-1, :, ::-1])
示例#28
0
def test_dna_dims_order_1_from_subset_dataframe(tmpdir):
    """Check that ``roi`` may be a DataFrame, a ``BedTool`` or an interval list.

    The regions of ``sample.gtf`` are handed to
    ``Bioseq.create_from_refgenome`` in three forms, one after the other:
    a pandas DataFrame, a ``BedTool`` and a plain list of its intervals.
    Each resulting dataset must pass the same dimension checks.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    gtf_path = os.path.join(resources, 'sample.gtf')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    def check_dataset_for(region_source):
        # Build the order-1 dataset for one ROI representation and verify it.
        data = Bioseq.create_from_refgenome(
            'train',
            refgenome=genome_path,
            roi=region_source,
            storage='ndarray',
            store_whole_genome=True,
            order=1,
        )

        np.testing.assert_equal(data[0], data[data.gindexer[0]])
        assert len(data.garray.handle) == 2

        # expected dimensions for order 1
        assert len(data) == 2
        assert data.shape == (2, 10000, 1, 4)
        assert data[:].sum() == 20000

    # 1) ROI as a DataFrame read from selected GTF columns.
    frame = pandas.read_csv(
        gtf_path,
        sep='\t',
        header=None,
        usecols=[0, 2, 3, 4, 5, 6],
        skiprows=2,
        names=['chrom', 'name', 'start', 'end', 'score', 'strand'])
    # The start column is shifted down by one before use.
    frame.start -= 1
    print(frame)
    check_dataset_for(frame)

    # 2) ROI as a BedTool object.
    check_dataset_for(BedTool(gtf_path))

    # 3) ROI as a list of the BedTool's intervals.
    check_dataset_for([interval for interval in BedTool(gtf_path)])
示例#29
0
def test_dna_dataset_sanity(tmpdir):
    """Exercise argument checks and caching of ``Bioseq.create_from_refgenome``.

    In order, the test expects that

    * a series of invalid argument combinations raise,
    * ``datatags`` and ``overwrite`` each trigger a ``FutureWarning``,
    * an unknown ``storage`` value raises and no
      ``<JANGGU_OUTPUT>/train/storage.h5`` exists afterwards,
    * three further argument combinations raise ``ValueError``,
    * a whole-genome ``ndarray`` build without ``cache`` leaves no ``*.h5``
      under ``<JANGGU_OUTPUT>/datasets/train``,
    * after an ``ndarray`` and an ``hdf5`` build with ``cache=True`` exactly
      one ``*.h5`` file is present, and the ``hdf5`` build can be repeated.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(resources, 'sample.bed')
    refgenome = os.path.join(resources, 'sample_genome.fa')

    create = Bioseq.create_from_refgenome

    def cached_h5_files():
        # All *.h5 files currently in the 'train' dataset folder.
        pattern = os.path.join(tmpdir.strpath, 'datasets', 'train', '*.h5')
        return glob.glob(pattern)

    # (name, keyword arguments) pairs that must raise some exception.
    must_raise = [
        # name must be a string
        (1.23, dict(refgenome='', storage='ndarray', roi=bed_file, order=1)),
        # refgenome=''
        ('train', dict(refgenome='', storage='ndarray', roi=bed_file,
                       order=1)),
        # refgenome='test'
        ('train', dict(refgenome='test', storage='ndarray', roi=bed_file,
                       order=1)),
        # order=0
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       order=0)),
        # flank=-1
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       flank=-1)),
        # binsize=0
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       binsize=0)),
        # stepsize=0
        ('train', dict(refgenome=refgenome, storage='ndarray', roi=bed_file,
                       stepsize=0)),
    ]
    for name, kwargs in must_raise:
        with pytest.raises(Exception):
            create(name, **kwargs)

    # Options that must trigger a FutureWarning.
    for warned_option in (dict(datatags=['help']), dict(overwrite=True)):
        with pytest.warns(FutureWarning):
            create('train',
                   refgenome=refgenome,
                   storage='ndarray',
                   roi=bed_file,
                   **warned_option)

    # storage='step' must be rejected ...
    with pytest.raises(Exception):
        create('train',
               refgenome=refgenome,
               storage='step',
               roi=bed_file,
               order=1)

    # ... and no storage file may have been left behind.
    stray_file = os.path.join(tmpdir.strpath, 'train', 'storage.h5')
    assert not os.path.exists(stray_file)

    # Argument combinations that must raise ValueError specifically.
    must_raise_value_error = [
        dict(refgenome=refgenome, storage='sparse', roi=None, order=1,
             store_whole_genome=True),
        dict(refgenome=refgenome, roi=bed_file, order=0,
             store_whole_genome=True),
        dict(refgenome=refgenome, roi=None, store_whole_genome=False),
    ]
    for kwargs in must_raise_value_error:
        with pytest.raises(ValueError):
            create('train', **kwargs)

    # Whole-genome ndarray build without cache: no cache file expected.
    create('train',
           refgenome=refgenome,
           storage='ndarray',
           roi=None,
           order=1,
           store_whole_genome=True)
    assert len(cached_h5_files()) == 0

    print(refgenome)
    print(bed_file)

    # Build with cache=True, first as ndarray and then as hdf5.
    for storage in ('ndarray', 'hdf5'):
        create('train',
               refgenome=refgenome,
               storage=storage,
               roi=bed_file,
               order=1,
               cache=True)

    # a cache file must exist now
    assert len(cached_h5_files()) == 1

    # reload the cached file
    create('train',
           refgenome=refgenome,
           storage='hdf5',
           roi=bed_file,
           order=1,
           cache=True)
示例#30
0
def test_janggu_variant_streamer_order_2(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    """Test Janggu creation by shape and name. """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 2

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0, 3, 0, 1], 1)
    np.testing.assert_equal(reference[0, 4, 0, 7], 1)
    np.testing.assert_equal(alternative[0, 3, 0, 3], 1)
    np.testing.assert_equal(alternative[0, 4, 0, 15], 1)

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CTC -> CCC
    np.testing.assert_equal(reference[0, 3, 0, 7], 1)
    np.testing.assert_equal(reference[0, 4, 0, 13], 1)
    np.testing.assert_equal(alternative[0, 3, 0, 5], 1)
    np.testing.assert_equal(alternative[0, 4, 0, 5], 1)

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # GAC -> GGC
    np.testing.assert_equal(reference[0, 3, 0, 8], 1)
    np.testing.assert_equal(reference[0, 4, 0, 1], 1)
    np.testing.assert_equal(alternative[0, 3, 0, 10], 1)
    np.testing.assert_equal(alternative[0, 4, 0, 9], 1)

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CGG -> CAG
    np.testing.assert_equal(reference[0, 3, 0, 6], 1)
    np.testing.assert_equal(reference[0, 4, 0, 10], 1)
    np.testing.assert_equal(alternative[0, 3, 0, 4], 1)
    np.testing.assert_equal(alternative[0, 4, 0, 2], 1)

    vcf = VariantStreamer(dna, vcffile, binsize=5, batch_size=1)
    it_vcf = iter(vcf.flow())

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    assert names[0] == 'refmismatch'
    np.testing.assert_equal(reference, alternative)

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # ACT -> ATT
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    np.testing.assert_equal(reference[0, 1, 0, 1], 1)
    np.testing.assert_equal(reference[0, 2, 0, 7], 1)
    np.testing.assert_equal(alternative[0, 1, 0, 3], 1)
    np.testing.assert_equal(alternative[0, 2, 0, 15], 1)

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    # CTC -> CCC
    np.testing.assert_equal(reference[0, 1, 0, 7], 1)
    np.testing.assert_equal(reference[0, 2, 0, 13], 1)
    np.testing.assert_equal(alternative[0, 1, 0, 5], 1)
    np.testing.assert_equal(alternative[0, 2, 0, 5], 1)