def test_bed_unsync_roi_targets():
    """Load ``positive_shift.bed`` against the regions of ``positive.bed``.

    The same bed/ROI pair is loaded with several ``resolution`` and
    ``store_whole_genome`` settings. For every setting the number of
    regions, the array shape and the total signal are checked.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, "positive.bed")
    shifted_path = os.path.join(resources, "positive_shift.bed")

    # (extra keyword arguments, expected bins per region, expected total)
    scenarios = [
        ({'resolution': None}, 1, 25),
        ({'resolution': 50}, 4, 25 * 4),
        ({'resolution': 50, 'store_whole_genome': True}, 4, 25 * 4),
        ({'resolution': 1, 'store_whole_genome': False},
         200, 25 * 200 - 2),
        ({'resolution': 1, 'store_whole_genome': True},
         200, 25 * 200 - 2),
    ]

    for extra, nbins, total in scenarios:
        cover = Cover.create_from_bed(
            'test',
            bedfiles=shifted_path,
            roi=roi_path,
            storage='ndarray',
            **extra)
        assert len(cover) == 25
        assert cover.shape == (25, nbins, 1, 1)
        assert cover[:].sum() == total
def test_load_cover_bed_binary(tmpdir):
    """Load ``scored_sample.bed`` with ``mode='binary'`` for each storage.

    Three variants are created per storage backend (resolution 200,
    resolution 50, and ``dimmode='first'`` without an explicit
    resolution). Each is checked for length, shape, an empty first region
    and the signal of region 4.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    regions_path = pkg_resources.resource_filename(
        'janggu', 'resources/sample.bed')
    scores_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    def check(cover, nbins):
        # Region 0 carries no signal; region 4 carries one hit per bin.
        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, nbins, 1, 1))
        np.testing.assert_equal(cover[0].sum(), 0)
        np.testing.assert_equal(cover[4].sum(), nbins * 1)

    for backend in ('ndarray', 'hdf5', 'sparse'):
        print('store', backend)
        shared = {
            'bedfiles': scores_path,
            'regions': regions_path,
            'binsize': 200,
            'stepsize': 200,
            'storage': backend,
            'mode': 'binary',
            'cache': True,
        }

        check(Cover.create_from_bed("cov", resolution=200, **shared), 1)
        check(Cover.create_from_bed("cov50", resolution=50, **shared), 4)
        # No explicit resolution is passed together with dimmode='first'.
        check(Cover.create_from_bed("cov50_firstdim", dimmode='first',
                                    **shared), 1)
def test_load_cover_bed_categorical():
    """Load ``scored_sample.bed`` with ``mode='categorical'``.

    For the 'ndarray' and 'sparse' storages three variants are created
    (resolution 200, resolution 50, ``dimmode='first'`` without explicit
    resolution). The last dimension of the resulting shape is expected to
    be 6 in every case.
    """
    regions_path = pkg_resources.resource_filename(
        'janggu', 'resources/sample.bed')
    scores_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    def check(cover, nbins):
        np.testing.assert_equal(len(cover), 100)
        np.testing.assert_equal(cover.shape, (100, nbins, 1, 6))
        np.testing.assert_equal(cover[0].sum(), 0)
        np.testing.assert_equal(cover[4].sum(), nbins * 1)

    for backend in ('ndarray', 'sparse'):
        shared = {
            'bedfiles': scores_path,
            'regions': regions_path,
            'binsize': 200,
            'stepsize': 200,
            'storage': backend,
            'mode': 'categorical',
        }

        check(Cover.create_from_bed("cov", resolution=200, **shared), 1)
        check(Cover.create_from_bed("cov50", resolution=50, **shared), 4)
        # No explicit resolution is passed together with dimmode='first'.
        check(Cover.create_from_bed("cov50", dimmode='first', **shared), 1)
def test_load_cover_bed_scored():
    """Load ``scored_sample.bed`` with ``mode='score'``.

    For the 'ndarray' and 'sparse' storages the file is loaded with
    resolution 200, resolution 50, and ``resolution=None`` combined with
    ``collapser='max'``. Region 0 must be empty and region 4 must carry
    a score of 5 per bin.
    """
    regions_path = pkg_resources.resource_filename(
        'janggu', 'resources/sample.bed')
    scores_path = pkg_resources.resource_filename(
        'janggu', 'resources/scored_sample.bed')

    # (dataset name, extra keyword arguments, expected bins per region)
    variants = (
        ("cov", {'resolution': 200}, 1),
        ("cov50", {'resolution': 50}, 4),
        ("cov50", {'resolution': None, 'collapser': 'max'}, 1),
    )

    for backend in ('ndarray', 'sparse'):
        for name, extra, nbins in variants:
            cover = Cover.create_from_bed(
                name,
                bedfiles=scores_path,
                roi=regions_path,
                binsize=200,
                stepsize=200,
                storage=backend,
                mode='score',
                **extra)
            np.testing.assert_equal(len(cover), 100)
            np.testing.assert_equal(cover.shape, (100, nbins, 1, 1))
            np.testing.assert_equal(cover[0].sum(), 0)
            np.testing.assert_equal(cover[4].sum(), nbins * 5)
def get_data(params):
    """Build the training, validation and test (sequence, labels) pairs.

    Labels and sequences are each loaded once for ``train_roi`` with
    ``store_whole_genome=True``; the validation and test datasets are
    obtained from them via ``view`` on ``val_roi`` / ``test_roi``.

    Parameters
    ----------
    params : dict
        Must provide ``'order'`` and ``'flank'``, which are forwarded to
        ``Bioseq.create_from_refgenome``.

    Returns
    -------
    tuple
        ``((train_seq, train_labels), (val_seq, val_labels),
        (test_seq, test_labels))`` where every labels entry is wrapped
        in ``ReduceDim``.
    """
    # Relies on the names bedfiles, refgenome, train_roi, val_roi and
    # test_roi being defined outside this function.
    labels = Cover.create_from_bed(
        'labels',
        bedfiles=bedfiles,
        roi=train_roi,
        resolution=200,
        store_whole_genome=True,
        storage='sparse',
        cache=True,
        dtype='int8',
        minoverlap=.5,
        verbose=True)
    labels_test = view(labels, test_roi)
    labels_val = view(labels, val_roi)

    sequences = Bioseq.create_from_refgenome(
        'dna',
        refgenome=refgenome,
        roi=train_roi,
        store_whole_genome=True,
        storage='ndarray',
        cache=True,
        order=params['order'],
        flank=params['flank'],
        verbose=True)
    sequences_test = view(sequences, test_roi)
    sequences_val = view(sequences, val_roi)

    training = (sequences, ReduceDim(labels))
    validation = (sequences_val, ReduceDim(labels_val))
    testing = (sequences_test, ReduceDim(labels_test))
    return (training, validation, testing)
def test_create_from_array_whole_genome_true(tmpdir):
    """Round-trip a Cover through ``create_from_array`` (whole genome).

    A label Cover is loaded from ``scores.bed`` (listed five times as
    ``bedfiles``) with ``store_whole_genome=True``. Its array is fed back
    into ``Cover.create_from_array`` for every storage backend, and the
    result must match the original in content and shape.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # ROI contains regions spanning positive and negative examples
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    # The peak file only contains positive examples
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=[peak_path] * 5,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=True)
    pred = labels[:]

    for backend in ('ndarray', 'sparse', 'hdf5'):
        print(backend)
        rebuilt = Cover.create_from_array(
            'BindingProba',
            pred,
            labels.gindexer,
            cache=True,
            storage=backend,
            store_whole_genome=True)
        np.testing.assert_equal(rebuilt[:], labels[:])
        np.testing.assert_equal(rebuilt.shape, labels.shape)
def test_create_from_array_whole_genome_false(tmpdir):
    """Convert model predictions to a Cover with store_whole_genome=False.

    DNA and labels are loaded for ``roi_train.bed``, an (untrained) model
    is compiled, and its predictions are turned into a Cover via
    ``Cover.create_from_array``. The Cover must reproduce the predictions
    and have one indexer entry and one handle entry per prediction.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    # The pseudo genome represents just a concatenation of all sequences
    # in sample.fa and sample2.fa.
    genome_path = resource_filename('janggu', 'resources/pseudo_genome.fa')
    # ROI contains regions spanning positive and negative examples
    roi_path = resource_filename('janggu', 'resources/roi_train.bed')
    # The peak file only contains positive examples
    peak_path = resource_filename('janggu', 'resources/scores.bed')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        roi=roi_path,
        binsize=200,
        stepsize=200,
        order=1,
        store_whole_genome=False,
        datatags=['ref'])

    labels = Cover.create_from_bed(
        'peaks',
        roi=roi_path,
        bedfiles=peak_path,
        binsize=200,
        stepsize=200,
        resolution=200,
        store_whole_genome=False,
        datatags=['train'])

    @inputlayer
    @outputconv('sigmoid')
    def double_stranded_model_dnaconv(inputs, inp, oup, params):
        # params = (number of filters, filter length, activation)
        with inputs.use('dna') as layer:
            conv = Conv2D(params[0], (params[1], 1), activation=params[2])
            layer = DnaConv2D(conv)(layer)
        # Pool over the full length of the second axis of the conv output.
        pooling = LocalAveragePooling2D(
            window_size=layer.shape.as_list()[1], name='motif')
        output = pooling(layer)
        return inputs, output

    K.clear_session()

    # create a new model object
    model = Janggu.create(
        template=double_stranded_model_dnaconv,
        modelparams=(30, 21, 'relu'),
        inputs=dna,
        outputs=labels)
    model.compile(optimizer='adadelta',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    pred = model.predict(dna)
    cov_out = Cover.create_from_array(
        'BindingProba', pred, labels.gindexer, store_whole_genome=False)

    assert pred.shape == cov_out.shape
    np.testing.assert_equal(pred, cov_out[:])
    n_pred = len(pred)
    assert len(cov_out.gindexer) == n_pred
    assert len(cov_out.garray.handle) == n_pred
def test_bed_store_whole_genome_option():
    """Compare ``store_whole_genome=True`` against ``False``.

    ``positive.bed`` is used both as signal and as region file, so both
    covers are expected to have the same length and shape and to consist
    of ones only.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")

    whole = Cover.create_from_bed(
        'test', bedfiles=bed_path, regions=bed_path,
        store_whole_genome=True, storage='ndarray')
    partial = Cover.create_from_bed(
        'test2', bedfiles=bed_path, regions=bed_path,
        store_whole_genome=False, storage='ndarray')

    assert len(whole) == 25
    assert len(partial) == len(whole)
    assert whole.shape == (25, 200, 1, 1)
    assert whole.shape == partial.shape

    for cover in (whole, partial):
        np.testing.assert_equal(cover[:], np.ones(whole.shape))
def test_bed_inferred_binsize():
    """Load ``positive.bed`` without passing ``binsize``.

    No ``binsize`` argument is supplied; the resulting cover is expected
    to hold 25 regions of 200 positions each at resolution 1.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")

    cover = Cover.create_from_bed(
        'test',
        bedfiles=bed_path,
        regions=bed_path,
        resolution=1,
        storage='ndarray')

    expected_shape = (25, 200, 1, 1)
    assert len(cover) == 25
    assert cover.shape == expected_shape
def test_bed_genomic_interval_access_part_genome():
    """Check interval-based indexing with ``store_whole_genome=False``.

    For resolutions 1 and 50 the test verifies, for every region, that

    * ``cover[i]`` repeated ``cover.garray.resolution`` times along axis 1
      equals ``cover[interval]`` and ``cover[chrom, start, end, strand]``;
    * querying a window shifted by ``shift * reso`` yields the same
      values as the correspondingly trimmed ``cover[i]``, with the
      trimming side depending on the strand.
    """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    # The same bed file serves as signal and as region of interest.
    bed_file = os.path.join(data_path, "sample.bed")
    store_whole_genome = False

    for reso in [1, 50]:
        # The cover does not depend on `shift`, so build it once per
        # resolution instead of once per (resolution, shift) pair.
        cover = Cover.create_from_bed(
            'test',
            bedfiles=bed_file,
            roi=bed_file,
            flank=0,
            storage='ndarray',
            store_whole_genome=store_whole_genome,
            resolution=reso)

        for shift in [0, 1]:
            for i in range(len(cover)):
                interval = cover.gindexer[i]
                print('storage :', store_whole_genome,
                      '/ resolution :', reso, '/ shift :', shift)
                print(i, interval)

                # Interval access returns one value per base pair, so
                # the binned array is expanded before comparing.
                expanded = np.repeat(cover[i], cover.garray.resolution,
                                     axis=1)
                np.testing.assert_equal(expanded, cover[interval])

                chrom, start, end, strand = (interval.chrom,
                                             interval.start,
                                             interval.end,
                                             interval.strand)
                np.testing.assert_equal(
                    expanded, cover[chrom, start, end, strand])

                if shift == 0:
                    continue

                start += shift * reso
                end += shift * reso
                shifted = cover[chrom, start, end, strand]
                if strand != '-':
                    gicov = shifted[:, :(-shift * reso), :, :]
                    expected = cover[i][:, shift:, :, :]
                else:
                    gicov = shifted[:, (shift * reso):, :, :]
                    expected = cover[i][:, :-shift, :, :]

                # Collapse each run of `reso` base pairs back to one bin
                # by taking the first element of the run.
                binned = gicov.reshape(
                    (1, gicov.shape[1] // reso, reso, 1, 1))[:, :, 0, :, :]
                np.testing.assert_equal(expected, binned)
def test_janggu_chr2_validation(tmpdir):
    """Fit a small model passing ``validation_data=['chr2']``.

    The test only exercises the fitting call; the value returned by
    ``fit`` is not inspected.
    """
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    resources = pkg_resources.resource_filename('janggu', 'resources/')
    roi_path = os.path.join(resources, 'sample.bed')
    positives_path = os.path.join(resources, 'scored_sample.bed')
    genome_path = os.path.join(resources, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=genome_path,
        binsize=200,
        stepsize=50,
        roi=roi_path,
        order=1)

    ctcf = Cover.create_from_bed(
        "positives",
        bedfiles=positives_path,
        roi=roi_path,
        binsize=200,
        stepsize=50,
        resolution=None,
        flank=0,
        collapser='max',
        storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model1(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            conv = DnaConv2D(Conv2D(5, (3, 1), name='fconv1'),
                             merge_mode='max', name='bothstrands')
            layer = conv(inlayer)
            layer = MaxPooling2D((198, 1))(layer)
        return inputs, layer

    bwm1 = Janggu.create(
        _cnn_model1,
        modelparams=(2, ),
        inputs=dna,
        outputs=ctcf,
        name='dna_ctcf_HepG2-cnn1')
    bwm1.compile(optimizer='adadelta', loss='binary_crossentropy')

    fit_result = bwm1.fit(dna, ctcf, validation_data=['chr2'])
def test_bed_overreaching_ends_part_genome():
    """Load ``bed_test.bed`` with a flank of 20 and binsize 2.

    Run for the 'ndarray' and 'sparse' storages with
    ``store_whole_genome=False``; each of the 9 regions is expected to
    have ``2 + 2 * 20`` positions and a total signal of 18.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "bed_test.bed")

    for backend in ('ndarray', 'sparse'):
        print(backend)
        cover = Cover.create_from_bed(
            'test',
            bedfiles=bed_path,
            roi=bed_path,
            binsize=2,
            flank=20,
            resolution=1,
            store_whole_genome=False,
            storage=backend)

        assert len(cover) == 9
        assert cover.shape == (9, 2 + 2 * 20, 1, 1)
        np.testing.assert_equal(cover[0].sum(), 18)
        np.testing.assert_equal(cover[:].sum(), 9 * 18)
def test_bed_genomic_interval_access():
    """Check that index access and genomic-interval access agree.

    For both ``store_whole_genome`` settings and resolutions 1 and 50 the
    test verifies, for every region, that ``cover[i]`` equals
    ``cover[interval]`` and ``cover[chrom, start, end, strand]``, and
    that a window shifted by ``shift * reso`` matches the correspondingly
    trimmed ``cover[i]`` (trimming side depends on the strand).
    """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    # The same bed file serves as signal and as region file.
    bed_file = os.path.join(data_path, "sample.bed")

    for store_whole_genome in [True, False]:
        for reso in [1, 50]:
            # The cover does not depend on `shift`; build it once per
            # (store_whole_genome, resolution) combination.
            cover = Cover.create_from_bed(
                'test',
                bedfiles=bed_file,
                regions=bed_file,
                flank=0,
                storage='ndarray',
                store_whole_genome=store_whole_genome,
                resolution=reso)

            for shift in [0, 1]:
                for i in range(len(cover)):
                    interval = cover.gindexer[i]
                    print('storage :', store_whole_genome,
                          '/ resolution :', reso, '/ shift :', shift)
                    print(i, interval)

                    np.testing.assert_equal(cover[i], cover[interval])

                    chrom, start, end, strand = (interval.chrom,
                                                 interval.start,
                                                 interval.end,
                                                 interval.strand)
                    np.testing.assert_equal(
                        cover[i], cover[chrom, start, end, strand])

                    if shift == 0:
                        continue

                    start += shift * reso
                    end += shift * reso
                    shifted = cover[chrom, start, end, strand]
                    if strand != '-':
                        np.testing.assert_equal(
                            cover[i][:, shift:, :, :],
                            shifted[:, :-shift, :, :])
                    else:
                        np.testing.assert_equal(
                            cover[i][:, :-shift, :, :],
                            shifted[:, shift:, :, :])
def test_bed_overreaching_ends():
    """Load ``positive.bed`` with a flank of 2000 and whole-genome storage.

    The first stored position of 'chr1' is set to 1 by hand. The first
    region must then be zero for its first 550 positions, hold that 1 at
    position 550, and reproduce the stored 'chr1' array from there on.
    """
    resources = pkg_resources.resource_filename('janggu', 'resources/')
    bed_path = os.path.join(resources, "positive.bed")
    flank = 2000

    cover = Cover.create_from_bed(
        'test',
        bedfiles=bed_path,
        regions=bed_path,
        flank=flank,
        resolution=1,
        store_whole_genome=True,
        storage='ndarray')

    # Mark the first stored position of chr1 so it can be located below.
    cover.garray.handle['chr1'][0] = 1

    assert len(cover) == 25
    assert cover.shape == (25, 200 + 2 * flank, 1, 1)

    offset = 550
    np.testing.assert_equal(cover[0][0, :offset, 0, 0].sum(), 0)
    np.testing.assert_equal(cover[0][0, offset, 0, 0], 1.)

    chr1_length = len(cover.garray.handle['chr1'])
    np.testing.assert_equal(
        cover[0][0, offset:(offset + chr1_length), :, :],
        cover.garray.handle['chr1'])
# to enforce synchronized datasets. DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME, roi=ROI_TRAIN_FILE, binsize=200, order=args.order, storage='hdf5', cache=True, store_whole_genome=False, random_state=43) LABELS = Cover.create_from_bed('peaks', roi=ROI_TRAIN_FILE, bedfiles=PEAK_FILE, binsize=200, resolution=200, storage='sparse', cache=True, store_whole_genome=True, random_state=43) DNA_TEST = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME, roi=ROI_TEST_FILE, binsize=200, order=args.order) LABELS_TEST = Cover.create_from_bed('peaks', bedfiles=PEAK_FILE, roi=ROI_TEST_FILE, binsize=200,
def test_janggu_instance_conv(tmpdir):
    """Test Janggu creation by shape and name.

    A model built from DNA input and bed-derived output is compiled,
    saved and summarized; the storage path must exist afterwards and the
    model must be re-creatable via ``Janggu.create_by_name``.
    """
    # The docstring has to be the first statement of the function;
    # placed after the assignment below it would be a no-op expression.
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath

    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    posfile = os.path.join(data_path, 'scored_sample.bed')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    dna = Bioseq.create_from_refgenome(
        'dna',
        refgenome=refgenome,
        storage='ndarray',
        roi=bed_file,
        order=1,
        binsize=200,
        stepsize=50)

    # This cover is created but never used afterwards.
    # NOTE(review): presumably kept to exercise store_whole_genome=False
    # as well - confirm before removing the call.
    Cover.create_from_bed(
        "positives",
        bedfiles=posfile,
        roi=bed_file,
        binsize=200,
        stepsize=50,
        resolution=50,
        store_whole_genome=False,
        flank=0,
        collapser=None,
        storage='ndarray')

    ctcf = Cover.create_from_bed(
        "positives",
        bedfiles=posfile,
        roi=bed_file,
        binsize=200,
        stepsize=50,
        resolution=50,
        store_whole_genome=True,
        flank=0,
        collapser=None,
        storage='ndarray')

    @inputlayer
    @outputconv('sigmoid')
    def _cnn_model(inputs, inp, oup, params):
        with inputs.use('dna') as inlayer:
            layer = inlayer
            layer = Complement()(layer)
            layer = Reverse()(layer)
        return inputs, layer

    bwm = Janggu.create(
        _cnn_model,
        modelparams=(2, ),
        inputs=dna,
        outputs=ctcf,
        name='dna_ctcf_HepG2-cnn')
    bwm.compile(optimizer='adadelta', loss='binary_crossentropy')

    storage = bwm._storage_path(bwm.name, outputdir=tmpdir.strpath)

    bwm.save()
    bwm.summary()

    assert os.path.exists(storage)

    Janggu.create_by_name('dna_ctcf_HepG2-cnn')
def get_data(params):
    """Load labels and the inputs selected by ``params['type']``.

    Every dataset is split with ``split_train_test``: 'chr3' is held out
    as test set, 'chr2' as validation set, the remainder is the training
    set. For the accessibility data, the test set is taken from a
    separately loaded Cover whose bam files/conditions are listed in
    swapped order.

    Parameters
    ----------
    params : dict
        ``'binsize'`` and ``'type'`` are always read. ``'type'`` is one
        of ``'dna_only'``, ``'dnase_bam_only'`` or ``'dnase_dna'``.
        DNA-based types additionally read ``'dnaflank'`` and ``'order'``;
        accessibility-based types read ``'dnaseflank'``, ``'normalize'``
        and, optionally, ``'augment'`` (``'orient'``, ``'scale'`` or
        ``'both'``).

    Returns
    -------
    tuple or None
        ``(train, val, test)`` where each entry is an ``(inputs, labels)``
        pair. ``inputs`` is a single dataset for ``'dna_only'`` and a
        list of datasets otherwise. ``None`` is returned for any other
        ``params['type']``.
    """
    binsize = params['binsize']
    data_type = params['type']

    # PEAKS
    labels = ReduceDim(
        Cover.create_from_bed(
            'peaks',
            bedfiles=PEAKS,
            roi=ROI,
            binsize=binsize,
            conditions=['JunD'],
            resolution=binsize,
            store_whole_genome=True,
            storage='sparse',
            cache=True),
        aggregator='max')

    # training on chr1, validation on chr2, test on chr3 with swapped
    # Dnase samples
    labels, labels_test = split_train_test(labels, 'chr3')
    labels_train, labels_val = split_train_test(labels, 'chr2')

    if data_type in ['dna_only', 'dnase_dna']:
        # DNA
        dna = Bioseq.create_from_refgenome(
            'dna',
            refgenome=REFGENOME,
            roi=ROI,
            binsize=binsize,
            flank=params['dnaflank'],
            order=params['order'],
            cache=True,
            store_whole_genome=True)
        dna, dna_test = split_train_test(dna, 'chr3')
        dna_train, dna_val = split_train_test(dna, 'chr2')

    if data_type in ['dnase_bam_only', 'dnase_dna']:
        dnaseflank = params['dnaseflank']

        # ACCESSIBILITY
        access_test = Cover.create_from_bam(
            'dnase',
            bamfiles=[DNASE_STAM_ENCODE, DNASE_STAM_ROADMAP],
            roi=ROI,
            binsize=binsize,
            conditions=['Encode', 'Roadmap'],
            flank=dnaseflank,
            resolution=50,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)

        access = Cover.create_from_bam(
            'dnase',
            roi=ROI,
            bamfiles=[DNASE_STAM_ROADMAP, DNASE_STAM_ENCODE],
            binsize=binsize,
            conditions=['Roadmap', 'Encode'],
            resolution=50,
            flank=dnaseflank,
            normalizer=params['normalize'],
            store_whole_genome=True,
            cache=True)

        _, access_test = split_train_test(access_test, 'chr3')
        access, _ = split_train_test(access, 'chr3')
        access_train, access_val = split_train_test(access, 'chr2')

        # Augmentation of the accessibility training data.
        # The original guard tested for 'dna_dnase', a spelling no other
        # branch uses, so augmentation never ran for 'dnase_dna'. It is
        # now applied wherever the accessibility data exists.
        # `.get` keeps parameter sets without an 'augment' entry working.
        augment = params.get('augment')
        if augment == 'orient':
            access_train = RandomOrientation(access_train)
        elif augment == 'scale':
            access_train = RandomSignalScale(access_train, 0.1)
        elif augment == 'both':
            access_train = RandomSignalScale(
                RandomOrientation(access_train), 0.1)

    if data_type == 'dna_only':
        return (dna_train, labels_train), \
               (dna_val, labels_val), \
               (dna_test, labels_test)
    elif data_type == 'dnase_dna':
        return ([dna_train, access_train], labels_train), \
               ([dna_val, access_val], labels_val), \
               ([dna_test, access_test], labels_test)
    elif data_type in ['dnase_bam_only']:
        return ([access_train], labels_train), \
               ([access_val], labels_val), \
               ([access_test], labels_test)
# ROI contains regions spanning positive and negative examples ROI_FILE = resource_filename('janggu', 'resources/roi_train.bed') # PEAK_FILE only contains positive examples PEAK_FILE = resource_filename('janggu', 'resources/scores.bed') # Training input and labels are purely defined genomic coordinates DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME, roi=ROI_FILE, binsize=200, order=args.order, datatags=['ref']) LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE, bedfiles=PEAK_FILE, binsize=200, resolution=200, datatags=['train']) # evaluation metrics from sklearn.metrics def wrap_roc(y_true, y_pred): fpr, tpr, _ = roc_curve(y_true, y_pred) aux = str('({:.2%})'.format(roc_auc_score(y_true, y_pred))) print('roc', aux) return fpr, tpr, aux def wrap_prc(y_true, y_pred): precision, recall, _ = precision_recall_curve(y_true, y_pred) aux = str('({:.2%})'.format(average_precision_score(y_true, y_pred)))
# Region files used to carve training and test views out of the
# whole-genome datasets loaded below.
ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')
# PEAK_FILE only contains positive examples
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')

# Load sequence and labels once for ROI_FILE with store_whole_genome=True.
# NOTE(review): ROI_FILE, REFGENOME and args are not defined in this
# excerpt - presumably set up earlier in the script; confirm.
DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                   roi=ROI_FILE,
                                   order=args.order,
                                   binsize=200,
                                   store_whole_genome=True)

LABELS = Cover.create_from_bed('peaks', roi=ROI_FILE,
                               bedfiles=PEAK_FILE,
                               binsize=200,
                               resolution=200,
                               storage='sparse',
                               store_whole_genome=True)

# in case the dataset has been loaded with store_whole_genome=True,
# it is possible to reuse the same dataset by subsetting on different
# regions of the genome.
DNA_TRAIN = view(DNA, ROI_TRAIN_FILE)
LABELS_TRAIN = view(LABELS, ROI_TRAIN_FILE)
DNA_TEST = view(DNA, ROI_TEST_FILE)
LABELS_TEST = view(LABELS, ROI_TEST_FILE)

# Define the model templates
# identically to the models obtained from classify_fasta.py. REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa') # ROI contains regions spanning positive and negative examples ROI_TRAIN = resource_filename('janggu', 'resources/roi_train.bed') ROI_TEST = resource_filename('janggu', 'resources/roi_test.bed') # PEAK_FILE only contains positive examples PEAK_FILE = resource_filename('janggu', 'resources/scores.bed') DNA_TEST = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME, roi=ROI_TEST, binsize=200) LABELS_TEST = Cover.create_from_bed('peaks', bedfiles=PEAK_FILE, roi=ROI_TEST, binsize=200, resolution=None) # Training input and labels are purely defined genomic coordinates DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME, roi=ROI_TRAIN, binsize=200) LABELS = Cover.create_from_bed('peaks', roi=ROI_TRAIN, bedfiles=PEAK_FILE, binsize=200, resolution=None)
def test_cover_from_bed_sanity():
    """Check argument validation of ``Cover.create_from_bed``.

    Two valid calls (with and without an explicit ``flank``) must
    succeed, and the first element of the resulting cover must be
    accessible. A series of invalid calls must each raise an exception.
    """
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_file = os.path.join(data_path, 'sample.bed')
    scored_bed = os.path.join(data_path, "scored_sample.bed")
    # Computed outside of any pytest.raises block so that a failure here
    # cannot be mistaken for the expected exception.
    csvfile = os.path.join(data_path, 'ctcf_sample.csv')

    def assert_rejected(name, **kwargs):
        # Every invalid call shares the region file and the storage.
        with pytest.raises(Exception):
            Cover.create_from_bed(name, regions=bed_file,
                                  storage='ndarray', **kwargs)

    cover = Cover.create_from_bed('test', bedfiles=scored_bed,
                                  regions=bed_file,
                                  binsize=200, stepsize=50,
                                  resolution=50,
                                  flank=0,
                                  storage='ndarray')
    cover[0]  # element access on a valid cover must not raise

    Cover.create_from_bed('test', bedfiles=scored_bed,
                          regions=bed_file,
                          binsize=200, stepsize=50,
                          resolution=50,
                          storage='ndarray')

    # name must be a string
    assert_rejected(1.2, bedfiles=scored_bed, binsize=1, stepsize=1)

    # negative flank
    assert_rejected('test', bedfiles=scored_bed, binsize=1, stepsize=1,
                    flank=-1)

    # negative stepsize
    assert_rejected('test', bedfiles=scored_bed, binsize=1, stepsize=-1,
                    flank=0)

    # negative binsize
    assert_rejected('test', bedfiles=scored_bed, binsize=-1, stepsize=1,
                    flank=0)

    # resolution=300 is larger than both binsize=200 and stepsize=50.
    # NOTE(review): the original comment read "resolution must be greater
    # than stepsize", which contradicts this call - presumably the
    # resolution must not exceed the stepsize; confirm against
    # Cover.create_from_bed.
    assert_rejected('test', bedfiles=scored_bed, binsize=200, stepsize=50,
                    resolution=300, flank=0)

    # must be a bed file
    assert_rejected('test', bedfiles=csvfile, binsize=1, stepsize=1)
# Region files for the training and the test set.
ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')
# PEAK_FILE only contains positive examples
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')

# Training input and labels are purely defined genomic coordinates
# NOTE(review): REFGENOME and args are not defined in this excerpt -
# presumably set up earlier in the script; confirm.
DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                   roi=ROI_TRAIN_FILE,
                                   binsize=200,
                                   order=args.order,
                                   cache=True)

LABELS = Cover.create_from_bed('peaks', roi=ROI_TRAIN_FILE,
                               bedfiles=PEAK_FILE,
                               binsize=200,
                               resolution=200,
                               cache=True,
                               storage='sparse')

# Test input and labels for ROI_TEST_FILE; unlike the training
# datasets they are created without cache=True.
DNA_TEST = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                        roi=ROI_TEST_FILE,
                                        binsize=200,
                                        order=args.order)

LABELS_TEST = Cover.create_from_bed('peaks',
                                    bedfiles=PEAK_FILE,
                                    roi=ROI_TEST_FILE,
                                    binsize=200,
                                    resolution=200,
                                    storage='sparse')