def test_bam_store_whole_genome_option():
    """Check that ``store_whole_genome`` does not alter the BAM coverage.

    Two ndarray-backed covers are built from the same BAM/BED pair, once with
    ``store_whole_genome=True`` and once with ``False``.  Their length, shape
    and contents are expected to be identical.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    # Everything except the dataset name and the store_whole_genome flag
    # is shared between the two covers.
    common = dict(bamfiles=bam_path, regions=bed_path,
                  binsize=200, stepsize=200, storage='ndarray')
    genome_cover = Cover.create_from_bam(
        'test', store_whole_genome=True, **common)
    roi_cover = Cover.create_from_bam(
        'test2', store_whole_genome=False, **common)

    assert len(genome_cover) == 100
    assert len(roi_cover) == len(genome_cover)

    assert genome_cover.shape == (100, 200, 2, 1)
    assert genome_cover.shape == roi_cover.shape

    np.testing.assert_equal(genome_cover[:], roi_cover[:])
    assert genome_cover[:].sum() == 29.
def test_cover_from_bam_sanity():
    """Exercise basic indexing and argument validation of ``Cover.create_from_bam``."""
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    roi_cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        roi=bed_path,
        binsize=200, stepsize=200,
        flank=0,
        storage='ndarray')

    # An integer index has to be accepted ...
    roi_cover[0]
    # ... whereas a float index is expected to raise IndexError.
    with pytest.raises(IndexError):
        roi_cover[1.2]

    genome_cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        storage='ndarray',
        store_whole_genome=True)

    assert len(roi_cover.gindexer) == len(roi_cover.garray.handle)
    assert len(genome_cover.garray.handle) != len(roi_cover.garray.handle)

    # Each entry is (dataset name, extra keyword arguments); every one of
    # these calls is expected to be rejected.
    rejected_calls = (
        # name must be a string
        (1.2, dict(binsize=1, stepsize=1)),
        # negative flank
        ('test', dict(binsize=1, stepsize=1, flank=-1)),
        # negative stepsize
        ('test', dict(binsize=1, stepsize=-1, flank=0)),
        # negative binsize
        ('test', dict(binsize=-1, stepsize=1, flank=0)),
    )
    for name, bad_kwargs in rejected_calls:
        with pytest.raises(Exception):
            Cover.create_from_bam(
                name,
                bamfiles=bam_path,
                roi=bed_path,
                storage='ndarray',
                **bad_kwargs)
def get_data(params):
    """Assemble the training inputs and labels selected by ``params``.

    Keys read from ``params``:

    * ``traincell``, ``trainrep``, ``cageflank`` -- always, for the labels.
    * ``inputs`` -- which input tracks to build: ``'dna_only'``,
      ``'epi_only'`` or ``'epi_dna'``.
    * ``dnaflank``, ``order`` -- only when DNA is requested.
    * ``dnaseflank`` -- only when the epigenetic tracks are requested.

    Returns:
        A ``(train_input, train_labels)`` tuple where ``train_input`` is a
        list of datasets and ``train_labels`` is the mean-reduced
        gene-expression coverage.

    Raises:
        ValueError: if ``params['inputs']`` selects no input at all.
    """
    # Labels: log-transformed, z-scored coverage collapsed with the mean.
    label_zscore = ZScore()
    label_cover = Cover.create_from_bam(
        'geneexpr',
        bamfiles=RNA.format(params['traincell'], params['trainrep']),
        roi=ROI_INPUT_TRAIN,
        flank=params['cageflank'],
        conditions=['GeneExpr'],
        resolution=None,
        store_whole_genome=False,
        storage='ndarray',
        normalizer=[LogTransform(), label_zscore],
        stranded=False,
        cache=True)
    train_labels = ReduceDim(label_cover, aggregator="mean")

    requested = params['inputs']
    train_input = []

    if requested in ('dna_only', 'epi_dna'):
        # DNA sequence input
        dna_train = Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=False)
        train_input.append(dna_train)

    if requested in ('epi_only', 'epi_dna'):
        # Each epigenetic track gets its own ZScore instance.
        dnase_zscore = ZScore()
        dnase_cover = Cover.create_from_bam(
            'dnase',
            bamfiles=DNASE.format(params['traincell']),
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            resolution=None,
            store_whole_genome=False,
            normalizer=[LogTransform(), dnase_zscore],
            cache=True)
        train_input.append(ReduceDim(dnase_cover, aggregator="mean"))

        h3k4_zscore = ZScore()
        h3k4_cover = Cover.create_from_bigwig(
            'h3k4',
            bigwigfiles=[H3K4me3.format(params['traincell'])],
            roi=ROI_INPUT_TRAIN,
            flank=params['dnaseflank'],
            store_whole_genome=False,
            normalizer=[LogTransform(), h3k4_zscore],
            cache=True)
        train_input.append(ReduceDim(h3k4_cover, aggregator="mean"))

    if not train_input:
        raise ValueError('no input')

    return train_input, train_labels
def test_load_bam_resolution10(tmpdir):
    """Check BAM coverage binned at ``resolution=10`` for each storage backend.

    With ``binsize=200`` and ``resolution=10`` every region is expected to
    hold 20 bins, giving a dataset shape of ``(100, 20, 2, 1)``.  Axis 1 is
    the position, axis 2 the strand channel.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    bed_path = os.path.join(resources, "sample.bed")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')

    # chromosome name -> chromosome length
    chrom_table = pandas.read_csv(sizes_path, sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = chrom_table.to_dict()['length']

    for backend in ('ndarray', 'hdf5', 'sparse'):
        cover = Cover.create_from_bam(
            "yeast_I_II_III.bam",
            bamfiles=bam_path,
            regions=bed_path,
            binsize=200, stepsize=200,
            genomesize=gsize,
            resolution=10,
            storage=backend,
            cache=True)

        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, 20, 2, 1))

        # Region 4: a single count; first hit at position 17, strand channel 1.
        np.testing.assert_equal(cover[4].sum(), 1.)
        hits = np.where(cover[4] == 1)
        np.testing.assert_equal(hits[1][0], 17)  # pos
        np.testing.assert_equal(hits[2][0], 1)  # strand

        # Region 13: two counts at positions 16 and 17, strand channel 0.
        np.testing.assert_equal(cover[13].sum(), 2.)
        hits = np.where(cover[13] == 1)
        np.testing.assert_equal(hits[1], np.asarray([16, 17]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([0, 0]))  # strand

        # Region 52: two counts at positions 0 and 8, strand channel 0.
        np.testing.assert_equal(cover[52].sum(), 2.)
        hits = np.where(cover[52] == 1)
        np.testing.assert_equal(hits[1], np.asarray([0, 8]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([0, 0]))  # strand

        # Region 96: a single count at position 2, strand channel 1.
        np.testing.assert_equal(cover[96].sum(), 1.)
        hits = np.where(cover[96] == 1)
        np.testing.assert_equal(hits[1], np.asarray([2]))  # pos
        np.testing.assert_equal(hits[2], np.asarray([1]))  # strand
def test_bam_inferred_binsize():
    """Build a BAM cover without passing ``binsize``/``stepsize``.

    The resulting dataset is expected to contain 25 regions with 200
    positions each.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    bed_path = os.path.join(resources, "positive.bed")

    cover = Cover.create_from_bam(
        'test',
        bamfiles=bam_path,
        regions=bed_path,
        flank=0,
        storage='ndarray')

    expected_regions = 25
    assert len(cover) == expected_regions
    assert cover.shape == (expected_regions, 200, 2, 1)
def test_load_bam_resolutionNone(tmpdir):
    """Compare ``resolution=None`` against ``resolution=1`` per storage backend.

    With ``resolution=None`` each region is expected to collapse to a single
    position (shape ``(100, 1, 2, 1)``), and its per-region totals must equal
    those of the base-pair-resolution cover summed over axis 1.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    bed_path = os.path.join(resources, "sample.bed")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')

    # chromosome name -> chromosome length
    chrom_table = pandas.read_csv(sizes_path, sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = chrom_table.to_dict()['length']

    for backend in ('ndarray', 'hdf5', 'sparse'):
        # Arguments common to both covers for this backend.
        common = dict(bamfiles=bam_path, roi=bed_path,
                      binsize=200, stepsize=200,
                      genomesize=gsize,
                      storage=backend, cache=True)

        per_base = Cover.create_from_bam(
            "yeast_I_II_III.bam", resolution=1, **common)
        collapsed = Cover.create_from_bam(
            "yeast_I_II_III.bam", resolution=None,
            datatags=['None'], **common)

        np.testing.assert_equal(len(collapsed), 100)
        np.testing.assert_equal(collapsed.shape, (100, 1, 2, 1))
        np.testing.assert_equal(per_base[:].sum(axis=1),
                                collapsed[:].sum(axis=1))
def test_bam_genomic_interval_access_part_genome():
    """Compare integer indexing with interval indexing (``store_whole_genome=False``).

    For resolutions 1 and 50 the test expects interval-based access to return
    the binned window repeated ``resolution`` times along axis 1.  When
    ``shift`` is non-zero the interval is moved by one bin and the overlapping
    part is compared after keeping the first entry of every bin.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    whole_genome = False
    for reso in (1, 50):
        for shift in (0, 1):
            cover = Cover.create_from_bam(
                'test',
                bamfiles=bam_path,
                roi=bed_path,
                flank=0,
                storage='ndarray',
                store_whole_genome=whole_genome,
                resolution=reso)

            for idx in range(len(cover)):
                interval = cover.gindexer[idx]
                print('storage :', whole_genome,
                      '/ resolution :', reso,
                      '/ shift :', shift)
                print(idx, interval)

                # Stretch the binned window to the length the interval
                # lookups are expected to have.
                stretched = np.repeat(cover[idx], cover.garray.resolution,
                                      axis=1)

                # Lookup by interval object ...
                np.testing.assert_equal(stretched, cover[interval])

                # ... and by explicit (chrom, start, end, strand) tuple.
                chrom = interval.chrom
                start = interval.start
                end = interval.end
                strand = interval.strand
                np.testing.assert_equal(stretched,
                                        cover[chrom, start, end, strand])

                if shift == 0:
                    continue

                # Move the query by `shift` bins and compare the overlap.
                offset = shift * reso
                start += offset
                end += offset
                moved = cover[chrom, start, end, strand]

                if strand != '-':
                    gicov = moved[:, :-offset, :, :]
                    overlap = cover[idx][:, shift:, :, :]
                else:
                    gicov = moved[:, offset:, :, :]
                    overlap = cover[idx][:, :-shift, :, :]

                # Group entries into bins of `reso` and keep the first
                # entry of each bin.
                per_bin = gicov.reshape(
                    (1, gicov.shape[1] // reso, reso, 2, 1))[:, :, 0, :, :]
                np.testing.assert_equal(overlap, per_bin)
def test_cover_bam_unstranded():
    """Check BAM coverage loaded with ``stranded=False``.

    The dataset is expected to have a single strand channel (shape
    ``(100, 200, 1, 1)``); a handful of regions are spot-checked for the
    number and position of their counts.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample.bam")
    bed_path = os.path.join(resources, "sample.bed")
    sizes_path = os.path.join(resources, 'sample.chrom.sizes')

    # chromosome name -> chromosome length
    chrom_table = pandas.read_csv(sizes_path, sep='\t',
                                  names=['chr', 'length'],
                                  index_col='chr')
    gsize = chrom_table.to_dict()['length']

    cover = Cover.create_from_bam(
        "yeast_I_II_III.bam",
        bamfiles=bam_path,
        regions=bed_path,
        binsize=200, stepsize=200,
        genomesize=gsize,
        stranded=False)

    np.testing.assert_equal(len(cover), 100)
    np.testing.assert_equal(cover.shape, (100, 200, 1, 1))

    # Region 4: a single count; first hit at position 179.
    np.testing.assert_equal(cover[4].sum(), 1.)
    hits = np.where(cover[4] == 1)
    np.testing.assert_equal(hits[1][0], 179)  # pos

    # Region 13: two counts at positions 162 and 178.
    np.testing.assert_equal(cover[13].sum(), 2.)
    hits = np.where(cover[13] == 1)
    np.testing.assert_equal(hits[1], np.asarray([162, 178]))  # pos

    # Region 52: two counts at positions 9 and 89.
    np.testing.assert_equal(cover[52].sum(), 2.)
    hits = np.where(cover[52] == 1)
    np.testing.assert_equal(hits[1], np.asarray([9, 89]))  # pos

    # Region 96: a single count at position 25.
    np.testing.assert_equal(cover[96].sum(), 1.)
    hits = np.where(cover[96] == 1)
    np.testing.assert_equal(hits[1], np.asarray([25]))  # pos
def test_bam_genomic_interval_access():
    """Compare integer indexing with interval indexing of a BAM cover.

    Runs over both ``store_whole_genome`` settings and resolutions 1 and 50.
    Each window fetched by index must equal the one fetched via its genomic
    interval; with a non-zero ``shift`` the interval is moved and only the
    overlapping part is compared.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "sample.bed")
    bam_path = os.path.join(resources, "sample.bam")

    for whole_genome in (True, False):
        for reso in (1, 50):
            for shift in (0, 1):
                cover = Cover.create_from_bam(
                    'test',
                    bamfiles=bam_path,
                    regions=bed_path,
                    flank=0,
                    storage='ndarray',
                    store_whole_genome=whole_genome,
                    resolution=reso)

                for idx in range(len(cover)):
                    interval = cover.gindexer[idx]
                    print('storage :', whole_genome,
                          '/ resolution :', reso,
                          '/ shift :', shift)
                    print(idx, interval)

                    # Lookup by interval object ...
                    np.testing.assert_equal(cover[idx], cover[interval])

                    # ... and by explicit (chrom, start, end, strand) tuple.
                    chrom = interval.chrom
                    start = interval.start
                    end = interval.end
                    strand = interval.strand
                    np.testing.assert_equal(
                        cover[idx], cover[chrom, start, end, strand])

                    if shift == 0:
                        continue

                    # Move the query and compare only the overlapping part;
                    # which side is trimmed depends on the strand.
                    start += shift * reso
                    end += shift * reso
                    if strand != '-':
                        by_index = cover[idx][:, shift:, :, :]
                        by_interval = cover[
                            chrom, start, end, strand][:, :-shift, :, :]
                    else:
                        by_index = cover[idx][:, :-shift, :, :]
                        by_interval = cover[
                            chrom, start, end, strand][:, shift:, :, :]
                    np.testing.assert_equal(by_index, by_interval)
def test_cover_bam_paired_midpoint():
    """Check paired-end coverage counted with ``pairedend='midpoint'``.

    With ``min_mapq=30`` exactly two counts are expected on the ``'ref'``
    array of the whole-genome cover.
    """
    # Per the original note, sample2.bam contains paired-end examples,
    # unmapped examples, an unmapped mate and a low-quality example.
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bam_path = os.path.join(resources, "sample2.bam")

    cover = Cover.create_from_bam(
        "yeast_I_II_III.bam",
        bamfiles=bam_path,
        stranded=False,
        pairedend='midpoint',
        min_mapq=30,
        store_whole_genome=True)

    ref_counts = cover.garray.handle['ref']
    assert ref_counts.sum() == 2, ref_counts
    print(ref_counts)

    # One read starts at index 6 with tlen 39, so its count is expected
    # at 6 + 39 // 2.
    assert ref_counts[6 + 39 // 2, 0, 0] == 1

    # The other read maps to index 34.
    assert ref_counts[34, 0, 0] == 1
def get_data(params):
    """Build the train/validation/test datasets for the JunD example.

    Keys read from ``params``:

    * ``binsize`` -- always.
    * ``type`` -- ``'dna_only'``, ``'dnase_bam_only'`` or ``'dnase_dna'``.
    * ``dnaflank``, ``order`` -- when DNA input is requested.
    * ``dnaseflank``, ``normalize`` -- when DNase input is requested.
    * ``augment`` -- optional; ``'orient'``, ``'scale'`` or ``'both'``
      wraps the DNase *training* set in the corresponding augmentation.

    Returns:
        Three ``(inputs, labels)`` pairs for training, validation and test.
        ``inputs`` is the bare DNA dataset for ``'dna_only'`` and a list of
        datasets otherwise.  For any other ``params['type']`` the function
        falls through and returns ``None``.
    """
    binsize = params['binsize']
    model_type = params['type']
    uses_dna = model_type in ('dna_only', 'dnase_dna')
    uses_dnase = model_type in ('dnase_bam_only', 'dnase_dna')

    # PEAKS
    labels = ReduceDim(
        Cover.create_from_bed('peaks',
                              bedfiles=PEAKS,
                              roi=ROI,
                              binsize=binsize,
                              conditions=['JunD'],
                              resolution=binsize,
                              store_whole_genome=True,
                              storage='sparse',
                              cache=True),
        aggregator='max')

    # Per the original note: training on chr1, validation on chr2,
    # test on chr3 with swapped Dnase samples.
    labels, labels_test = split_train_test(labels, 'chr3')
    labels_train, labels_val = split_train_test(labels, 'chr2')

    if uses_dna:
        # DNA
        dna = Bioseq.create_from_refgenome('dna',
                                           refgenome=REFGENOME,
                                           roi=ROI,
                                           binsize=binsize,
                                           flank=params['dnaflank'],
                                           order=params['order'],
                                           cache=True,
                                           store_whole_genome=True)
        dna, dna_test = split_train_test(dna, 'chr3')
        dna_train, dna_val = split_train_test(dna, 'chr2')

    if uses_dnase:
        dnaseflank = params['dnaseflank']

        # ACCESSIBILITY
        # The test cover lists the two BAM files (and their condition
        # names) in the opposite order to the train/validation cover.
        access_test = Cover.create_from_bam(
            'dnase',
            bamfiles=[DNASE_STAM_ENCODE, DNASE_STAM_ROADMAP],
            roi=ROI,
            binsize=binsize,
            conditions=['Encode', 'Roadmap'],
            flank=dnaseflank,
            resolution=50,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)
        access = Cover.create_from_bam(
            'dnase',
            roi=ROI,
            bamfiles=[DNASE_STAM_ROADMAP, DNASE_STAM_ENCODE],
            binsize=binsize,
            conditions=['Roadmap', 'Encode'],
            resolution=50,
            flank=dnaseflank,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)

        _, access_test = split_train_test(access_test, 'chr3')
        access, _ = split_train_test(access, 'chr3')
        access_train, access_val = split_train_test(access, 'chr2')

        # Augmentation applies to every model type that has a DNase input.
        # The original guard tested 'dna_dnase', a spelling used nowhere
        # else in this function, so augmentation was silently skipped for
        # 'dnase_dna'.  The key is read with .get() so that existing
        # 'dnase_dna' configurations without 'augment' keep working.
        augment = params.get('augment')
        if augment == 'orient':
            access_train = RandomOrientation(access_train)
        elif augment == 'scale':
            access_train = RandomSignalScale(access_train, 0.1)
        elif augment == 'both':
            access_train = RandomSignalScale(
                RandomOrientation(access_train), 0.1)

    if model_type == 'dna_only':
        return (dna_train, labels_train), \
               (dna_val, labels_val), \
               (dna_test, labels_test)
    elif model_type == 'dnase_dna':
        return ([dna_train, access_train], labels_train), \
               ([dna_val, access_val], labels_val), \
               ([dna_test, access_test], labels_test)
    elif model_type in ['dnase_bam_only']:
        return ([access_train], labels_train), \
               ([access_val], labels_val), \
               ([access_test], labels_test)