예제 #1
0
def test_bed_unsync_roi_targets():
    """Check covers built from positive_shift.bed over the ROI positive.bed.

    Five covers are created with different ``resolution`` /
    ``store_whole_genome`` settings. For each of them the number of
    regions, the array shape and the total sum are verified.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "positive.bed")
    shifted_path = os.path.join(resources, "positive_shift.bed")

    # (call-specific keyword arguments, expected shape, expected total)
    scenarios = [
        (dict(resolution=None),
         (25, 1, 1, 1), 25),
        (dict(resolution=50),
         (25, 4, 1, 1), 25 * 4),
        (dict(resolution=50, store_whole_genome=True),
         (25, 4, 1, 1), 25 * 4),
        (dict(resolution=1, store_whole_genome=False),
         (25, 200, 1, 1), 25 * 200 - 2),
        (dict(resolution=1, store_whole_genome=True),
         (25, 200, 1, 1), 25 * 200 - 2),
    ]

    for extra, expected_shape, expected_total in scenarios:
        cover = Cover.create_from_bed(
            'test',
            bedfiles=shifted_path,
            roi=roi_path,
            storage='ndarray',
            **extra)
        assert len(cover) == 25
        assert cover.shape == expected_shape
        assert cover[:].sum() == expected_total
예제 #2
0
def test_load_cover_bed_binary(tmpdir):
    """Load scored_sample.bed in 'binary' mode for every storage backend.

    For each of the 'ndarray', 'hdf5' and 'sparse' storages three cached
    covers are created (resolution 200, resolution 50, and no explicit
    resolution with ``dimmode='first'``). Length, shape and the sums of
    regions 0 and 4 are verified for each of them.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    regions_path = pkg_resources.resource_filename(
        'janggu', 'resources/sample.bed')
    scored_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    def _verify(cov, expected_shape, expected_hit):
        # Region 0 must sum to zero, region 4 to the expected value.
        np.testing.assert_equal(len(cov), 100)
        np.testing.assert_equal(cov.shape, expected_shape)
        np.testing.assert_equal(cov[0].sum(), 0)
        np.testing.assert_equal(cov[4].sum(), expected_hit)

    for store in ['ndarray', 'hdf5', 'sparse']:
        print('store', store)
        # Keyword arguments shared by all three covers of this backend.
        common = dict(bedfiles=scored_path,
                      regions=regions_path,
                      binsize=200,
                      stepsize=200,
                      storage=store,
                      mode='binary',
                      cache=True)

        _verify(Cover.create_from_bed("cov", resolution=200, **common),
                (100, 1, 1, 1), 1)

        _verify(Cover.create_from_bed("cov50", resolution=50, **common),
                (100, 4, 1, 1), 4 * 1)

        # No resolution is passed for this variant.
        _verify(Cover.create_from_bed("cov50_firstdim",
                                      dimmode='first',
                                      **common),
                (100, 1, 1, 1), 1)
예제 #3
0
def test_load_cover_bed_categorical():
    """Load scored_sample.bed in 'categorical' mode for two storages.

    For the 'ndarray' and 'sparse' storages three covers are created
    (resolution 200, resolution 50, and no explicit resolution with
    ``dimmode='first'``). Each is checked for its length, its shape (with a
    last dimension of 6) and the sums of regions 0 and 4.
    """
    regions_path = pkg_resources.resource_filename(
        'janggu', 'resources/sample.bed')
    scored_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    # (dataset name, call-specific kwargs, expected shape, sum of region 4)
    variants = [
        ("cov", dict(resolution=200), (100, 1, 1, 6), 1),
        ("cov50", dict(resolution=50), (100, 4, 1, 6), 4 * 1),
        ("cov50", dict(dimmode='first'), (100, 1, 1, 6), 1),
    ]

    for store in ['ndarray', 'sparse']:
        for name, extra, expected_shape, expected_hit in variants:
            cover = Cover.create_from_bed(name,
                                          bedfiles=scored_path,
                                          regions=regions_path,
                                          binsize=200,
                                          stepsize=200,
                                          storage=store,
                                          mode='categorical',
                                          **extra)

            np.testing.assert_equal(len(cover), 100)
            np.testing.assert_equal(cover.shape, expected_shape)
            np.testing.assert_equal(cover[0].sum(), 0)
            np.testing.assert_equal(cover[4].sum(), expected_hit)
예제 #4
0
def test_load_cover_bed_scored():
    """Load scored_sample.bed in 'score' mode for two storages.

    For the 'ndarray' and 'sparse' storages three covers are created:
    resolution 200, resolution 50, and ``resolution=None`` combined with
    ``collapser='max'``. Length, shape and the sums of regions 0 and 4 are
    verified for each of them.
    """
    roi_path = pkg_resources.resource_filename(
        'janggu', 'resources/sample.bed')
    scored_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    # (dataset name, call-specific kwargs, expected shape, sum of region 4)
    variants = [
        ("cov", dict(resolution=200), (100, 1, 1, 1), 5),
        ("cov50", dict(resolution=50), (100, 4, 1, 1), 4*5),
        ("cov50", dict(resolution=None, collapser='max'), (100, 1, 1, 1), 5),
    ]

    for store in ['ndarray', 'sparse']:
        for name, extra, expected_shape, expected_score in variants:
            cover = Cover.create_from_bed(
                name,
                bedfiles=scored_path,
                roi=roi_path,
                binsize=200,
                stepsize=200,
                storage=store,
                mode='score',
                **extra)

            np.testing.assert_equal(len(cover), 100)
            np.testing.assert_equal(cover.shape, expected_shape)
            np.testing.assert_equal(cover[0].sum(), 0)
            np.testing.assert_equal(cover[4].sum(), expected_score)
예제 #5
0
def get_data(params):
    """Build the (sequence, label) pairs for training, validation and test.

    The labels and the sequences are each loaded once over ``train_roi``
    with ``store_whole_genome=True``; the validation and test datasets are
    obtained from them via ``view`` on ``val_roi`` and ``test_roi``.
    The names ``bedfiles``, ``refgenome``, ``train_roi``, ``val_roi`` and
    ``test_roi`` are taken from the enclosing module.

    Parameters
    ----------
    params : mapping
        Must provide ``'order'`` and ``'flank'``, which are forwarded to
        ``Bioseq.create_from_refgenome``.

    Returns
    -------
    tuple
        ``((train_seq, train_labels), (val_seq, val_labels),
        (test_seq, test_labels))`` where every label dataset is wrapped
        in ``ReduceDim``.
    """
    labels_train = Cover.create_from_bed(
        'labels',
        bedfiles=bedfiles,
        roi=train_roi,
        resolution=200,
        store_whole_genome=True,
        storage='sparse',
        cache=True,
        dtype='int8',
        minoverlap=.5,
        verbose=True)
    labels_test = view(labels_train, test_roi)
    labels_val = view(labels_train, val_roi)

    seq_train = Bioseq.create_from_refgenome(
        'dna',
        refgenome=refgenome,
        roi=train_roi,
        store_whole_genome=True,
        storage='ndarray',
        cache=True,
        order=params['order'],
        flank=params['flank'],
        verbose=True)
    seq_test = view(seq_train, test_roi)
    seq_val = view(seq_train, val_roi)

    train_pair = (seq_train, ReduceDim(labels_train))
    val_pair = (seq_val, ReduceDim(labels_val))
    test_pair = (seq_test, ReduceDim(labels_test))
    return (train_pair, val_pair, test_pair)
예제 #6
0
def test_create_from_array_whole_genome_true(tmpdir):
    """Round-trip a whole-genome cover through ``Cover.create_from_array``.

    A reference cover is loaded from five copies of scores.bed with
    ``store_whole_genome=True``. Its content is then fed back into
    ``create_from_array`` for each storage backend, and the rebuilt cover
    must match the reference in both values and shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    reference = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=[peak_path]*5,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=True)

    values = reference[:]

    for storage in ['ndarray', 'sparse', 'hdf5']:
        print(storage)
        rebuilt = Cover.create_from_array(
            'BindingProba',
            values,
            reference.gindexer,
            cache=True,
            storage=storage,
            store_whole_genome=True)

        np.testing.assert_equal(rebuilt[:], reference[:])
        np.testing.assert_equal(rebuilt.shape, reference.shape)
예제 #7
0
def test_create_from_array_whole_genome_false(tmpdir):
    """Store model predictions in a cover with ``store_whole_genome=False``.

    A DNA dataset and a label cover are loaded over roi_train.bed, a small
    convolutional model is created and compiled, and its predictions are
    converted with ``Cover.create_from_array``. The resulting cover must
    reproduce the prediction array, and its index and storage handle must
    have one entry per prediction.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    genome_path = resource_filename('janggu', 'resources/pseudo_genome.fa')
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=False,
        datatags=['ref'])

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=peak_path,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=False,
        datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params is (number of filters, filter length, activation).
        with inputs.use('dna') as layer:
            conv = Conv2D(params[0], (params[1], 1), activation=params[2])
            layer = DnaConv2D(conv)(layer)
        pooling = LocalAveragePooling2D(
            window_size=layer.shape.as_list()[1], name='motif')
        return inputs, pooling(layer)

    K.clear_session()

    # Instantiate the model from the template defined above.
    model = Janggu.create(template=double_stranded_model_dnaconv,
                          modelparams=(30, 21, 'relu'),
                          inputs=dna,
                          outputs=labels)

    model.compile(optimizer='adadelta',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    predictions = model.predict(dna)

    cov_out = Cover.create_from_array('BindingProba',
                                      predictions,
                                      labels.gindexer,
                                      store_whole_genome=False)

    assert predictions.shape == cov_out.shape

    np.testing.assert_equal(predictions, cov_out[:])

    n_predictions = len(predictions)
    assert len(cov_out.gindexer) == n_predictions
    assert len(cov_out.garray.handle) == n_predictions
예제 #8
0
def test_bed_store_whole_genome_option():
    """Compare covers created with and without ``store_whole_genome``.

    positive.bed is used both as the signal and as the regions. Both covers
    must have 25 regions of shape (25, 200, 1, 1) and consist only of ones.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    positive_path = os.path.join(resources, "positive.bed")

    whole = Cover.create_from_bed(
        'test',
        bedfiles=positive_path,
        regions=positive_path,
        store_whole_genome=True,
        storage='ndarray')
    partial = Cover.create_from_bed(
        'test2',
        bedfiles=positive_path,
        regions=positive_path,
        store_whole_genome=False,
        storage='ndarray')

    assert len(whole) == 25
    assert len(partial) == len(whole)
    assert whole.shape == (25, 200, 1, 1)
    assert whole.shape == partial.shape

    for cover in (whole, partial):
        np.testing.assert_equal(cover[:], np.ones(whole.shape))
예제 #9
0
def test_bed_inferred_binsize():
    """Create a cover without passing ``binsize`` and check its dimensions.

    positive.bed serves as both signal and regions; with ``resolution=1``
    the cover must contain 25 regions of shape (25, 200, 1, 1).
    """
    positive_path = os.path.join(
        pkg_resources.resource_filename('janggu', 'resources/'),
        "positive.bed")

    cover = Cover.create_from_bed(
        'test',
        bedfiles=positive_path,
        regions=positive_path,
        resolution=1,
        storage='ndarray')

    assert len(cover) == 25
    assert cover.shape == (25, 200, 1, 1)
예제 #10
0
def test_bed_genomic_interval_access_part_genome():
    """Check that index access and interval access to a cover agree.

    The cover is created with ``store_whole_genome=False`` at resolutions
    1 and 50. For every region, ``cover[i]`` (repeated ``resolution`` times
    along axis 1) is compared against the lookups ``cover[interval]`` and
    ``cover[chrom, start, end, strand]``. For ``shift == 1`` the lookup
    window is additionally moved by ``shift * reso`` and the overlapping
    parts are compared.
    """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, "sample.bed")

    # NOTE: despite its name, bamfile_ points to the same BED file as
    # bed_file.
    bamfile_ = os.path.join(data_path, "sample.bed")

    # Passed as store_whole_genome below.
    storage = False
    for reso in [1, 50]:
        for shift in [0, 1]:
            # The cover does not depend on shift; it is rebuilt for each
            # shift value anyway.
            cover = Cover.create_from_bed(
                'test',
                bedfiles=bamfile_,
                roi=bed_file,
                flank=0,
                storage='ndarray',
                store_whole_genome=storage,
                resolution=reso)

            for i in range(len(cover)):
                print('storage :',storage,'/ resolution :',reso,'/ shift :',shift)
                print(i, cover.gindexer[i])


                # Index access, expanded along axis 1 by the resolution,
                # must equal the lookup with the interval object.
                np.testing.assert_equal(np.repeat(cover[i],
                                    cover.garray.resolution,
                                    axis=1), cover[cover.gindexer[i]])

                chrom, start, end, strand = cover.gindexer[i].chrom, \
                    cover.gindexer[i].start, \
                    cover.gindexer[i].end, \
                    cover.gindexer[i].strand

                # The same must hold for the lookup with explicit
                # coordinates.
                np.testing.assert_equal(np.repeat(cover[i],
                                    cover.garray.resolution, axis=1),
                                    cover[chrom, start, end, strand])

                if shift != 0:
                    # Move the lookup window by shift * reso positions.
                    start += shift * reso
                    end += shift * reso

                    # In both branches gicov is reshaped so that axis 1 is
                    # split into chunks of length reso, and only the first
                    # element of each chunk is compared with cover[i].
                    if strand != '-':
                        # Drop the trailing shift * reso positions of the
                        # shifted lookup and the leading shift bins of
                        # cover[i].
                        gicov = cover[chrom, start, end, strand][:, :(-shift*reso),:,:]
                        np.testing.assert_equal(cover[i][:, shift:,:, :],
                            gicov.reshape((1, gicov.shape[1]//reso, reso, 1, 1))[:, :, 0, :, :])
                    else:
                        # For strand '-' the slices are taken from the
                        # opposite ends.
                        gicov = cover[chrom, start, end, strand][:, (shift*reso):,:,:]
                        np.testing.assert_equal(cover[i][:, :-shift,:, :],
                        gicov.reshape((1, gicov.shape[1]//reso, reso, 1, 1))[:, :, 0, :, :])
예제 #11
0
def test_janggu_chr2_validation(tmpdir):
    """Fit a small model while passing ``['chr2']`` as validation data.

    A DNA dataset and a label cover are loaded over sample.bed with a bin
    size of 200 and a step size of 50, a convolution + max-pooling model is
    created and compiled, and ``fit`` is called with
    ``validation_data=['chr2']``.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    positives_path = os.path.join(resources, 'scored_sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        binsize=200,
        stepsize=50,
        roi=roi_path,
        order=1)

    ctcf = Cover.create_from_bed(
        "positives",
        bedfiles=positives_path,
        roi=roi_path,
        binsize=200,
        stepsize=50,
        resolution=None,
        flank=0,
        collapser='max',
        storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            both_strands = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                                     merge_mode='max',
                                     name='bothstrands')
            layer = both_strands(inlayer)
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    bwm1 = Janggu.create(_cnn_model1,
                         modelparams=(2, ),
                         inputs=dna,
                         outputs=ctcf,
                         name='dna_ctcf_HepG2-cnn1')

    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')
    # The return value of fit is not inspected; the call must simply succeed.
    bwm1.fit(dna, ctcf, validation_data=['chr2'])
예제 #12
0
def test_bed_overreaching_ends_part_genome():
    """Check a flanked cover created with ``store_whole_genome=False``.

    bed_test.bed is used as both signal and ROI with ``binsize=2`` and
    ``flank=20``. For the 'ndarray' and 'sparse' storages the cover must
    have 9 regions of width ``2 + 2 * 20``, each summing to 18.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "bed_test.bed")

    binsize = 2
    flank = 20
    per_region_sum = 18

    for store in ['ndarray', 'sparse']:
        print(store)
        cover = Cover.create_from_bed(
            'test',
            bedfiles=bed_path,
            roi=bed_path,
            binsize=binsize,
            flank=flank,
            resolution=1,
            store_whole_genome=False,
            storage=store)

        assert len(cover) == 9
        assert cover.shape == (9, binsize + 2 * flank, 1, 1)
        np.testing.assert_equal(cover[0].sum(), per_region_sum)
        np.testing.assert_equal(cover[:].sum(), 9 * per_region_sum)
예제 #13
0
def test_bed_genomic_interval_access():
    """Check that index access and interval access to a cover agree.

    Covers are created for every combination of ``store_whole_genome``
    (True/False) and resolution (1/50). For each region, ``cover[i]`` must
    equal the lookup with the interval object and the lookup with explicit
    coordinates. For ``shift == 1`` the lookup window is moved by
    ``shift * reso`` and the overlapping parts are compared.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "sample.bed")
    # The signal file is the same BED file as the regions file.
    signal_path = os.path.join(resources, "sample.bed")

    for storage in [True, False]:
        for reso in [1, 50]:
            for shift in [0, 1]:
                cover = Cover.create_from_bed(
                    'test',
                    bedfiles=signal_path,
                    regions=roi_path,
                    flank=0,
                    storage='ndarray',
                    store_whole_genome=storage,
                    resolution=reso)

                for i in range(len(cover)):
                    region = cover.gindexer[i]
                    print('storage :', storage, '/ resolution :', reso,
                          '/ shift :', shift)
                    print(i, region)

                    by_index = cover[i]
                    np.testing.assert_equal(by_index, cover[region])

                    chrom, strand = region.chrom, region.strand
                    start, end = region.start, region.end
                    np.testing.assert_equal(
                        by_index, cover[chrom, start, end, strand])

                    if shift == 0:
                        continue

                    # Move the lookup window and compare the overlap.
                    offset = shift * reso
                    start += offset
                    end += offset
                    shifted = cover[chrom, start, end, strand]

                    if strand == '-':
                        np.testing.assert_equal(
                            by_index[:, :-shift, :, :],
                            shifted[:, shift:, :, :])
                    else:
                        np.testing.assert_equal(
                            by_index[:, shift:, :, :],
                            shifted[:, :-shift, :, :])
예제 #14
0
def test_bed_overreaching_ends():
    """Check a whole-genome cover whose flanks are wider than the regions.

    positive.bed is loaded with ``flank=2000``. The first stored position
    of 'chr1' is set to 1, after which the first region must be zero up to
    offset 550, carry the value 1 at offset 550, and reproduce the stored
    'chr1' array from offset 550 onwards.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    positive_path = os.path.join(resources, "positive.bed")

    cover = Cover.create_from_bed(
        'test',
        bedfiles=positive_path,
        regions=positive_path,
        flank=2000,
        resolution=1,
        store_whole_genome=True,
        storage='ndarray')

    # Mark the first stored position so it can be located in the output.
    cover.garray.handle['chr1'][0] = 1

    assert len(cover) == 25
    assert cover.shape == (25, 200 + 2 * 2000, 1, 1)

    offset = 550
    np.testing.assert_equal(cover[0][0, :offset, 0, 0].sum(), 0)
    np.testing.assert_equal(cover[0][0, offset, 0, 0], 1.)

    chr1_len = len(cover.garray.handle['chr1'])
    np.testing.assert_equal(
        cover[0][0, offset:(offset + chr1_len), :, :],
        cover.garray.handle['chr1'])
예제 #15
0
# to enforce synchronized datasets.
# Training sequences over ROI_TRAIN_FILE, stored as cached hdf5.
# DNA and LABELS below share the same roi, binsize and random_state=43.
DNA = Bioseq.create_from_refgenome('dna',
                                   refgenome=REFGENOME,
                                   roi=ROI_TRAIN_FILE,
                                   binsize=200,
                                   order=args.order,
                                   storage='hdf5',
                                   cache=True,
                                   store_whole_genome=False,
                                   random_state=43)

# Training labels from PEAK_FILE over the same ROI, stored as a cached
# sparse dataset.
# NOTE(review): store_whole_genome is True here but False for DNA above;
# confirm that this difference is intended.
LABELS = Cover.create_from_bed('peaks',
                               roi=ROI_TRAIN_FILE,
                               bedfiles=PEAK_FILE,
                               binsize=200,
                               resolution=200,
                               storage='sparse',
                               cache=True,
                               store_whole_genome=True,
                               random_state=43)

# Test sequences over ROI_TEST_FILE; no storage, cache or random_state
# arguments are passed for this dataset.
DNA_TEST = Bioseq.create_from_refgenome('dna',
                                        refgenome=REFGENOME,
                                        roi=ROI_TEST_FILE,
                                        binsize=200,
                                        order=args.order)

LABELS_TEST = Cover.create_from_bed('peaks',
                                    bedfiles=PEAK_FILE,
                                    roi=ROI_TEST_FILE,
                                    binsize=200,
예제 #16
0
def test_janggu_instance_conv(tmpdir):
    """Test Janggu creation by shape and name.

    A DNA dataset and a label cover are loaded over sample.bed, a model
    consisting of a ``Complement`` and a ``Reverse`` layer is created,
    compiled and saved, and finally re-created by its name. The test
    verifies that the model's storage path exists after saving.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')

    posfile = os.path.join(data_path, 'scored_sample.bed')

    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       roi=bed_file,
                                       order=1,
                                       binsize=200,
                                       stepsize=50)

    # The result of this first call is overwritten by the next one, so
    # only its successful creation (store_whole_genome=False) is checked.
    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=posfile,
                                 roi=bed_file,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=50,
                                 store_whole_genome=False,
                                 flank=0,
                                 collapser=None,
                                 storage='ndarray')

    # Same arguments with store_whole_genome=True; this cover is the one
    # used as the model output below.
    ctcf = Cover.create_from_bed("positives",
                                 bedfiles=posfile,
                                 roi=bed_file,
                                 binsize=200,
                                 stepsize=50,
                                 resolution=50,
                                 store_whole_genome=True,
                                 flank=0,
                                 collapser=None,
                                 storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
        layer = Complement()(layer)
        layer = Reverse()(layer)
        return inputs, layer

    bwm = Janggu.create(_cnn_model,
                        modelparams=(2, ),
                        inputs=dna,
                        outputs=ctcf,
                        name='dna_ctcf_HepG2-cnn')

    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')
    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    # Saving must have produced the model file at the storage path.
    assert os.path.exists(storage)

    # Re-creating the model by name must succeed.
    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
예제 #17
0
def get_data(params):
    """Assemble train/validation/test inputs and labels for a model type.

    The datasets are loaded over the module-level ``ROI`` and then split
    with ``split_train_test`` on 'chr3' and on 'chr2'. The names ``PEAKS``,
    ``ROI``, ``REFGENOME``, ``DNASE_STAM_ENCODE`` and ``DNASE_STAM_ROADMAP``
    are taken from the enclosing module.

    Parameters
    ----------
    params : dict
        Configuration with the following keys:

        * ``'binsize'`` -- bin size used for all datasets.
        * ``'type'`` -- ``'dna_only'``, ``'dnase_dna'`` or
          ``'dnase_bam_only'``.
        * ``'dnaflank'``, ``'order'`` -- read for the types that use DNA.
        * ``'dnaseflank'``, ``'normalize'`` -- read for the types that use
          the accessibility tracks.
        * ``'augment'`` -- optional; ``'orient'``, ``'scale'`` or
          ``'both'`` wraps the accessibility training data in the
          corresponding random augmentation. Any other value (or a
          missing key) leaves it unchanged.

    Returns
    -------
    tuple or None
        ``((train_inputs, train_labels), (val_inputs, val_labels),
        (test_inputs, test_labels))`` for the three known types. For any
        other ``params['type']`` no return statement is reached and the
        function returns None.
    """
    binsize = params['binsize']

    # PEAKS
    LABELS = ReduceDim(Cover.create_from_bed('peaks',
                                             bedfiles=PEAKS,
                                             roi=ROI,
                                             binsize=binsize,
                                             conditions=['JunD'],
                                             resolution=binsize,
                                             store_whole_genome=True,
                                             storage='sparse',
                                             cache=True),
                       aggregator='max')

    # training on chr1, validation on chr2, test on chr3 with swapped Dnase samples
    LABELS, LABELS_TEST = split_train_test(LABELS, 'chr3')
    LABELS_TRAIN, LABELS_VAL = split_train_test(LABELS, 'chr2')
    if params['type'] in ['dna_only', 'dnase_dna']:
        dnaflank = params['dnaflank']
        order = params['order']
        # DNA
        DNA = Bioseq.create_from_refgenome('dna',
                                           refgenome=REFGENOME,
                                           roi=ROI,
                                           binsize=binsize,
                                           flank=dnaflank,
                                           order=order,
                                           cache=True,
                                           store_whole_genome=True)

        DNA, DNA_TEST = split_train_test(DNA, 'chr3')
        DNA_TRAIN, DNA_VAL = split_train_test(DNA, 'chr2')
    if params['type'] in ['dnase_bam_only', 'dnase_dna']:

        dnaseflank = params['dnaseflank']
        # ACCESSIBILITY
        # The test dataset lists the two BAM files in the opposite order
        # from the training dataset below.
        ACCESS_TEST = Cover.create_from_bam(
            'dnase',
            bamfiles=[DNASE_STAM_ENCODE, DNASE_STAM_ROADMAP],
            roi=ROI,
            binsize=binsize,
            conditions=['Encode', 'Roadmap'],
            flank=dnaseflank,
            resolution=50,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)
        ACCESS = Cover.create_from_bam(
            'dnase',
            roi=ROI,
            bamfiles=[DNASE_STAM_ROADMAP, DNASE_STAM_ENCODE],
            binsize=binsize,
            conditions=['Roadmap', 'Encode'],
            resolution=50,
            flank=dnaseflank,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)

        _, ACCESS_TEST = split_train_test(ACCESS_TEST, 'chr3')
        ACCESS, _ = split_train_test(ACCESS, 'chr3')
        ACCESS_TRAIN, ACCESS_VAL = split_train_test(ACCESS, 'chr2')

    # Augment the accessibility training data. This must cover exactly the
    # types for which ACCESS_TRAIN was created above; the previous check
    # for 'dna_dnase' never matched the 'dnase_dna' type used elsewhere in
    # this function.
    if params['type'] in ['dnase_dna', 'dnase_bam_only']:
        # 'augment' is optional so that configurations without it keep
        # working unchanged.
        augment = params.get('augment')
        if augment == 'orient':
            ACCESS_TRAIN = RandomOrientation(ACCESS_TRAIN)
        elif augment == 'scale':
            ACCESS_TRAIN = RandomSignalScale(ACCESS_TRAIN, 0.1)
        elif augment == 'both':
            ACCESS_TRAIN = RandomSignalScale(RandomOrientation(ACCESS_TRAIN),
                                             0.1)

    if params['type'] == 'dna_only':
        return (DNA_TRAIN, LABELS_TRAIN), (DNA_VAL, LABELS_VAL), \
               (DNA_TEST, LABELS_TEST)
    elif params['type'] == 'dnase_dna':
        return ([DNA_TRAIN, ACCESS_TRAIN], LABELS_TRAIN), \
                ([DNA_VAL, ACCESS_VAL], LABELS_VAL),\
               ([DNA_TEST, ACCESS_TEST], LABELS_TEST)
    elif params['type'] in ['dnase_bam_only']:
        return ([ACCESS_TRAIN], LABELS_TRAIN), \
               ([ACCESS_VAL], LABELS_VAL), \
               ([ACCESS_TEST], LABELS_TEST)
예제 #18
0
# ROI_FILE contains regions spanning positive and negative examples
ROI_FILE = resource_filename('janggu', 'resources/roi_train.bed')
# PEAK_FILE only contains positive examples
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')


# Training input and labels are defined purely by genomic coordinates:
# both datasets are loaded over the same ROI_FILE with binsize=200.
# NOTE(review): REFGENOME and args are not defined in the visible part of
# this snippet -- presumably they are set earlier in the script.
DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                   roi=ROI_FILE,
                                   binsize=200,
                                   order=args.order,
                                   datatags=['ref'])

# Labels derived from PEAK_FILE at a resolution equal to the bin size.
LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE,
                               bedfiles=PEAK_FILE,
                               binsize=200,
                               resolution=200,
                               datatags=['train'])


# evaluation metrics from sklearn.metrics
def wrap_roc(y_true, y_pred):
    """Compute the ROC curve and a formatted ROC AUC label.

    The label is the value of ``roc_auc_score`` formatted as a percentage
    with two decimals inside parentheses; it is also printed.

    Returns
    -------
    tuple
        ``(fpr, tpr, label)`` where ``fpr`` and ``tpr`` are the first two
        values returned by ``roc_curve``.
    """
    fpr, tpr, _thresholds = roc_curve(y_true, y_pred)
    auc_label = '({:.2%})'.format(roc_auc_score(y_true, y_pred))
    print('roc', auc_label)
    return fpr, tpr, auc_label


def wrap_prc(y_true, y_pred):
    """Compute the precision-recall curve and a formatted average-precision label.

    NOTE(review): the visible excerpt ends right after ``aux`` is built, with
    no print or return statement. The sibling ``wrap_roc`` prints and returns
    its curve and label, so the remainder of this function may have been cut
    off here -- confirm against the full source.
    """
    # The third value returned by precision_recall_curve is discarded.
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    # Average precision rendered as a parenthesised percentage, two decimals.
    aux = str('({:.2%})'.format(average_precision_score(y_true, y_pred)))
예제 #19
0
ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')
# Peak annotations: positive examples only.
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')

# Load sequence and labels once over ROI_FILE with store_whole_genome=True
# so that they can be subset by region further down.
DNA = Bioseq.create_from_refgenome(
    'dna',
    refgenome=REFGENOME,
    roi=ROI_FILE,
    order=args.order,
    binsize=200,
    store_whole_genome=True,
)

LABELS = Cover.create_from_bed(
    'peaks',
    roi=ROI_FILE,
    bedfiles=PEAK_FILE,
    binsize=200,
    resolution=200,
    storage='sparse',
    store_whole_genome=True,
)

# Because the datasets were loaded with store_whole_genome=True, the same
# underlying data can be reused for several splits: each view restricts
# the dataset to the regions listed in the given ROI file.
DNA_TRAIN, LABELS_TRAIN = view(DNA, ROI_TRAIN_FILE), view(LABELS, ROI_TRAIN_FILE)
DNA_TEST, LABELS_TEST = view(DNA, ROI_TEST_FILE), view(LABELS, ROI_TEST_FILE)

# Define the model templates

예제 #20
0
# identical to the models obtained from classify_fasta.py.
REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
# Train/test regions of interest; they span positive and negative examples.
ROI_TRAIN = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST = resource_filename('janggu', 'resources/roi_test.bed')
# Peak annotations: positive examples only.
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')

# Held-out data: sequence and labels over the test regions.
DNA_TEST = Bioseq.create_from_refgenome(
    'dna',
    refgenome=REFGENOME,
    roi=ROI_TEST,
    binsize=200,
)

LABELS_TEST = Cover.create_from_bed(
    'peaks',
    bedfiles=PEAK_FILE,
    roi=ROI_TEST,
    binsize=200,
    resolution=None,
)

# Training data: same construction, but over the training regions.
DNA = Bioseq.create_from_refgenome(
    'dna',
    refgenome=REFGENOME,
    roi=ROI_TRAIN,
    binsize=200,
)

LABELS = Cover.create_from_bed(
    'peaks',
    roi=ROI_TRAIN,
    bedfiles=PEAK_FILE,
    binsize=200,
    resolution=None,
)
예제 #21
0
def test_cover_from_bed_sanity():
    """Check argument validation of ``Cover.create_from_bed``.

    Two well-formed calls must succeed (the first result is also indexed
    once). Every argument set listed in ``rejected`` must raise.
    """
    resource_dir = pkg_resources.resource_filename('janggu', 'resources/')
    roi_bed = os.path.join(resource_dir, 'sample.bed')
    scored_bed = os.path.join(resource_dir, "scored_sample.bed")
    csv_path = os.path.join(resource_dir, 'ctcf_sample.csv')

    # Arguments shared by (almost) every call below.
    shared = dict(bedfiles=scored_bed, regions=roi_bed, storage='ndarray')
    windowed = dict(shared, binsize=200, stepsize=50, resolution=50)

    # Valid configuration with an explicit flank; indexing must work as well.
    cover = Cover.create_from_bed('test', flank=0, **windowed)
    cover[0]
    # Same configuration with the flank left at its default.
    Cover.create_from_bed('test', **windowed)

    # Each entry is (name, keyword overrides) and is expected to be rejected.
    rejected = [
        # name must be a string
        (1.2, dict(binsize=1, stepsize=1)),
        # negative flank
        ('test', dict(binsize=1, stepsize=1, flank=-1)),
        # negative stepsize
        ('test', dict(binsize=1, stepsize=-1, flank=0)),
        # negative binsize
        ('test', dict(binsize=-1, stepsize=1, flank=0)),
        # resolution (300) larger than stepsize (50)
        ('test', dict(binsize=200, stepsize=50, resolution=300, flank=0)),
        # bedfiles must be a bed file, not a csv
        ('test', dict(bedfiles=csv_path, binsize=1, stepsize=1)),
    ]
    for name, overrides in rejected:
        with pytest.raises(Exception):
            Cover.create_from_bed(name, **dict(shared, **overrides))
예제 #22
0
ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')
# Peak annotations: positive examples only.
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')


# Training data over the training regions; both datasets are created with
# cache=True.
DNA = Bioseq.create_from_refgenome(
    'dna',
    refgenome=REFGENOME,
    roi=ROI_TRAIN_FILE,
    binsize=200,
    order=args.order,
    cache=True,
)

LABELS = Cover.create_from_bed(
    'peaks',
    roi=ROI_TRAIN_FILE,
    bedfiles=PEAK_FILE,
    binsize=200,
    resolution=200,
    cache=True,
    storage='sparse',
)


# Test data over the test regions; no cache argument is passed here.
DNA_TEST = Bioseq.create_from_refgenome(
    'dna',
    refgenome=REFGENOME,
    roi=ROI_TEST_FILE,
    binsize=200,
    order=args.order,
)

LABELS_TEST = Cover.create_from_bed(
    'peaks',
    bedfiles=PEAK_FILE,
    roi=ROI_TEST_FILE,
    binsize=200,
    resolution=200,
    storage='sparse',
)