def test_read_chromsizes():
    """Exercise ``bioframe.read_chromsizes`` on a bad and a good table.

    The first input lists names only (with ``chr2`` repeated) and is
    expected to raise ``ValueError``.  The second is a two-column,
    tab-separated table given in the order chr1, chr3, chr2; the result is
    expected to be a ``pd.Series`` named ``"length"`` whose index comes
    back as chr1, chr2, chr3 with the matching lengths.
    """
    names_only = """chr1\nchr2\nchr2"""
    with pytest.raises(ValueError):
        bioframe.read_chromsizes(StringIO(names_only))

    two_columns = """chr1\t1\nchr3\t2\nchr2\t3\n """
    sizes = bioframe.read_chromsizes(StringIO(two_columns))

    # Exact type check (not isinstance): a Series subclass would fail here.
    assert type(sizes) is pd.Series
    assert sizes.name == "length"

    observed_names = list(sizes.index)
    assert observed_names == ["chr1", "chr2", "chr3"]

    observed_lengths = list(sizes.values)
    assert observed_lengths == [1, 3, 2]
def binnify(chromsizes_path, binsize, all_names):
    """Bin the chromosomes of a chromsizes file and print the bins as TSV.

    Parameters
    ----------
    chromsizes_path
        Passed straight to ``bioframe.read_chromsizes``.
    binsize
        Passed straight to ``bioframe.binnify``.
    all_names
        When truthy, chromosome filtering is switched off
        (``filter_chroms=False``); when falsy it is switched on.

    The table is written to standard output, tab-separated and without
    the index column.  Nothing is returned.
    """
    import bioframe

    sizes = bioframe.read_chromsizes(
        chromsizes_path,
        filter_chroms=not all_names,
    )
    table = bioframe.binnify(sizes, binsize)
    tsv_text = table.to_csv(sep="\t", index=False)
    print(tsv_text)
def digest(chromsizes_path, fasta_path, enzyme_name):
    """Digest FASTA records with an enzyme and print the fragments as TSV.

    Parameters
    ----------
    chromsizes_path
        Chromsizes file, read with ``all_names=True``.
    fasta_path
        FASTA file, loaded through the ``pyfaidx`` engine as raw records.
    enzyme_name
        Forwarded to ``bioframe.tools.digest``.

    Raises
    ------
    ValueError
        If any name in the chromsizes index is not found among the loaded
        FASTA records.

    The fragment table is written to standard output, tab-separated and
    without the index column.  Nothing is returned.
    """
    import bioframe

    chromsizes = bioframe.read_chromsizes(chromsizes_path, all_names=True)
    fasta_records = bioframe.load_fasta(
        fasta_path,
        engine='pyfaidx',
        as_raw=True,
    )

    # Guard clause: every chromosome in the sizes table needs a sequence.
    absent = ~chromsizes.index.isin(fasta_records)
    if absent.any():
        message = (
            "Some chromosomes mentioned in {}"
            " are not found in {}"
        ).format(chromsizes_path, fasta_path)
        raise ValueError(message)

    fragments = bioframe.tools.digest(fasta_records, enzyme_name)
    tsv_text = fragments.to_csv(sep='\t', index=False)
    print(tsv_text)
def test_binnify():
    """Check the number of bins ``bioframe.binnify`` yields for three bin sizes.

    Uses the two-chromosome ``test.chrom.sizes`` fixture, read with
    chromosome filtering disabled.
    """
    sizes = bioframe.read_chromsizes(
        testdir + "/test_data/test.chrom.sizes",
        filter_chroms=False,
    )
    n_chroms = len(sizes)
    assert n_chroms == 2

    # Bin size equal to the longest chromosome: one bin per chromosome.
    longest = int(np.max(sizes.values))
    assert len(bioframe.binnify(sizes, longest)) == n_chroms

    # Bin size equal to the shortest chromosome: one extra bin is expected.
    shortest = int(np.min(sizes.values))
    assert len(bioframe.binnify(sizes, shortest)) == (n_chroms + 1)

    # Bin size 1: as many bins as there are base pairs in total.
    total_bp = np.sum(sizes.values)
    assert len(bioframe.binnify(sizes, 1)) == total_bp
def test_frac_gc():
    """Compare ``bioframe.frac_gc`` with expected values on the test FASTA.

    Both ``mapped_only`` settings are covered at several bin sizes.  The
    test is skipped when ``pysam`` cannot be imported.
    """
    pytest.importorskip("pysam")

    sizes = bioframe.read_chromsizes(
        testdir + "/test_data/test.chrom.sizes",
        filter_chroms=False,
    )
    records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def gc_values(binsize, mapped_only):
        # GC fraction per bin as a bare array, for the given bin size.
        return bioframe.frac_gc(
            bioframe.binnify(sizes, binsize),
            records,
            return_input=False,
            mapped_only=mapped_only,
        ).values

    # Boolean mask of single-base bins whose mapped fraction is exactly zero.
    nothing_mapped = 0 == bioframe.frac_mapped(
        bioframe.binnify(sizes, 1),
        records,
        return_input=False,
    ).values
    assert np.isnan(gc_values(1, True)[nothing_mapped]).all()

    # mapped_only=True should ignore N, or return np.nan if the interval
    # contains nothing but N.
    np.testing.assert_equal(
        np.array([0.5, 0.5, np.nan]),
        gc_values(5, True),
    )
    assert (np.array([0.5, 0.5]) == gc_values(7, True)).all()

    # mapped_only=False should count N as zero.
    assert (np.array([0.4, 0.4, 0]) == gc_values(5, False)).all()
    assert (np.array([0.4, 2 / 7]) == gc_values(7, False)).all()
def test_frac_mapped():
    """Compare ``bioframe.frac_mapped`` with expected values on the test FASTA.

    Bin sizes 1, 5 and 7 are checked against hard-coded per-bin fractions.
    The test is skipped when ``pysam`` cannot be imported.
    """
    pytest.importorskip("pysam")

    sizes = bioframe.read_chromsizes(
        testdir + "/test_data/test.chrom.sizes",
        filter_chroms=False,
    )
    records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    # (bin size, expected mapped fraction of each bin)
    cases = (
        (1, [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]),
        (5, [0.8, 0.8, 0]),
        (7, [0.8, 4 / 7]),
    )
    for binsize, fractions in cases:
        expected = np.array(fractions)
        observed = bioframe.frac_mapped(
            bioframe.binnify(sizes, binsize),
            records,
            return_input=False,
        ).values
        assert (expected == observed).all()
def binnify(chromsizes_path, binsize):
    """Print the bins of a chromsizes file as tab-separated text.

    ``chromsizes_path`` is read with ``bioframe.read_chromsizes`` using its
    default options, and the result is binned by
    ``bioframe.tools.binnify`` with the given ``binsize``.  The table goes
    to standard output without the index column.  Nothing is returned.
    """
    import bioframe

    sizes = bioframe.read_chromsizes(chromsizes_path)
    print(
        bioframe.tools.binnify(sizes, binsize).to_csv(sep='\t', index=False)
    )
### Test API:

# Parameters shared by the tests in this module.
ignore_diags = 2
clr_weight_name = "weight"
bad_bins = None
chunksize = 10_000  # deliberately small so that chunked code paths are hit

# Names of the two per-pixel weight columns used by the "balanced" transform.
weight1 = f"{clr_weight_name}1"
weight2 = f"{clr_weight_name}2"
transforms = {
    "balanced": lambda p: p["count"] * p[weight1] * p[weight2],
}

assumed_binsize = 1_000_000

# Chromosome sizes come from a reduced mm9 table stored next to this file.
chromsizes_file = op.join(
    op.dirname(op.realpath(__file__)),
    "data/mm9.chrom.sizes.reduced",
)
chromsizes = bioframe.read_chromsizes(chromsizes_file)
chromosomes = list(chromsizes.index)

# One whole-chromosome interval per chromosome.
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]

# Regions for the most common scenario (balancing applied, no bad bins, ...):
# each of the first four chromosomes is cut into two pieces.
common_regions = []
for i in range(4):
    chrom = chromosomes[i]
    # Take the midpoint and snap it to the nearest multiple of the assumed
    # bin size, so that the cut is "bin-aligned".
    halfway_chrom = (
        round(int(chromsizes[chrom] / 2) / assumed_binsize) * assumed_binsize
    )
    reg1 = (chrom, 0, halfway_chrom)
    reg2 = (chrom, halfway_chrom, chromsizes[chrom])
    common_regions.extend((reg1, reg2))