def digest(chromsizes_path, fasta_path, enzyme_name):
    """Digest a genome with a restriction enzyme and print the fragments.

    The fragment table produced by ``bioframe.tools.digest`` is written to
    stdout as tab-separated text without the index column.

    Parameters
    ----------
    chromsizes_path
        Passed to ``bioframe.read_chromsizes`` (with ``all_names=True``).
    fasta_path
        Passed to ``bioframe.load_fasta`` (pyfaidx engine, raw records).
    enzyme_name
        Passed through to ``bioframe.tools.digest``.

    Raises
    ------
    ValueError
        If not every label in the chromsizes index is found among the
        FASTA records.
    """
    import bioframe

    sizes = bioframe.read_chromsizes(chromsizes_path, all_names=True)
    records = bioframe.load_fasta(fasta_path, engine='pyfaidx', as_raw=True)

    # Every chromosome listed in the sizes file must have a sequence.
    all_present = sizes.index.isin(records).all()
    if not all_present:
        raise ValueError(
            "Some chromosomes mentioned in {}"
            " are not found in {}".format(chromsizes_path, fasta_path)
        )

    fragments = bioframe.tools.digest(records, enzyme_name)
    tsv_text = fragments.to_csv(sep='\t', index=False)
    print(tsv_text)
def test_digest():
    """Check fragment counts and boundaries from ``bioframe.digest``.

    Skipped when the ``Bio`` package cannot be imported.
    """
    pytest.importorskip("Bio")

    records = bioframe.load_fasta(testdir + "/test_data/test.fa")
    assert len(records) == 2

    # test.fa contains no HindIII site, so the row count stays at one
    # fragment per record.
    hindiii_frags = bioframe.digest(records, "HindIII")
    assert hindiii_frags.shape == (2, 3)

    # A single DpnII site on chrTEST2 adds exactly one row.
    dpnii_frags = bioframe.digest(records, "DpnII")
    assert dpnii_frags.shape == (3, 3)

    # That DpnII site sits at chrTEST2 position 3, so the first chrTEST2
    # interval must end there.
    first_chrtest2_frag = bioframe.digest(records, "DpnII").iloc[1]
    assert first_chrtest2_frag.end == 3
def gc(bins_path, fasta_path, mapped_only):
    """Add a ``GC`` column to a bin table and print it as TSV.

    Parameters
    ----------
    bins_path
        Tab-separated bin table read with ``pandas.read_table``; it must
        have a ``chrom`` column.
    fasta_path
        Passed to ``bioframe.load_fasta`` (pyfaidx engine, raw records).
    mapped_only
        Forwarded as the third positional argument of
        ``bioframe.tools.frac_gc``.

    Raises
    ------
    ValueError
        If a chromosome named in the bin table is not a key of the
        FASTA records.
    """
    import bioframe
    import pandas as pd

    bin_table = pd.read_table(bins_path)
    bin_chroms = bin_table['chrom'].unique()
    records = bioframe.load_fasta(fasta_path, engine='pyfaidx', as_raw=True)

    # Fail on the first chromosome that has no sequence record.
    for chrom in bin_chroms:
        if chrom not in records.keys():
            raise ValueError(
                "Some chromosomes mentioned in {}"
                " are not found in {}".format(bins_path, fasta_path)
            )

    bin_table['GC'] = bioframe.tools.frac_gc(bin_table, records, mapped_only)
    tsv_text = bin_table.to_csv(sep='\t', index=False)
    print(tsv_text)
def gene_content(genome, binsize, gc=True, fasta_path=None):
    """Compute per-bin gene coverage, optionally with GC fraction.

    Parameters
    ----------
    genome
        Assembly name; passed to ``bioframe.fetch_chromsizes`` and to
        ``frac_gene_coverage``.
    binsize
        Bin size passed to ``binnify``.
    gc
        When true, a ``frac_gc`` column is added to the result.
    fasta_path
        Optional path to the genome FASTA used for the GC computation.
        When ``None`` (the default) the original lab-specific location
        ``/net/levsha/share/lab/genomes/{genome}/{genome}.fa`` is used, so
        existing callers are unaffected.

    Returns
    -------
    The object returned by ``frac_gene_coverage``, with an extra
    ``frac_gc`` entry when ``gc`` is true.
    """
    chrom_sizes = bioframe.fetch_chromsizes(genome)
    chrom_table = binnify(chrom_sizes, binsize)
    gene_count = frac_gene_coverage(chrom_table, genome)

    if gc:
        if fasta_path is None:
            # Historical default: FASTA files on the lab's shared filesystem.
            fasta_path = f'/net/levsha/share/lab/genomes/{genome}/{genome}.fa'
        fasta_records = load_fasta(fasta_path)
        gene_count['frac_gc'] = frac_gc(chrom_table, fasta_records)

    return gene_count
def test_frac_gc():
    """Check ``bioframe.frac_gc`` on the test genome at bin sizes 1, 5 and 7.

    Covers both ``mapped_only=True`` and ``mapped_only=False``.
    Skipped when ``pysam`` cannot be imported.
    """
    pytest.importorskip("pysam")

    chromsizes = bioframe.read_chromsizes(
        testdir + "/test_data/test.chrom.sizes", filter_chroms=False
    )
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def gc_values(binsize, mapped_only):
        # GC fraction per bin as a plain array, for bins of the given size.
        bins = bioframe.binnify(chromsizes, binsize)
        result = bioframe.frac_gc(
            bins,
            fasta_records,
            return_input=False,
            mapped_only=mapped_only,
        )
        return result.values

    # Mask of 1-bp bins whose mapped fraction is zero.
    mapped_per_bp = bioframe.frac_mapped(
        bioframe.binnify(chromsizes, 1), fasta_records, return_input=False
    ).values
    unmapped_bp = mapped_per_bp == 0

    # Those bins must yield NaN when only mapped bases are counted.
    assert np.isnan(gc_values(1, True)[unmapped_bp]).all()

    # mapped_only=True should ignore N, or return np.nan if the interval
    # contains nothing but N.
    np.testing.assert_equal(
        np.array([0.5, 0.5, np.nan]),
        gc_values(5, True),
    )
    assert (np.array([0.5, 0.5]) == gc_values(7, True)).all()

    # mapped_only=False should count N as zero.
    assert (np.array([0.4, 0.4, 0]) == gc_values(5, False)).all()
    assert (np.array([0.4, 2 / 7]) == gc_values(7, False)).all()
def gc(bins_path, fasta_path, mapped_only):
    """Compute GC content for a bin table and print the result as TSV.

    Parameters
    ----------
    bins_path
        Path to a tab-separated bin table with a ``chrom`` column, or
        ``"-"`` to read the table from standard input.
    fasta_path
        Passed to ``bioframe.load_fasta`` (pyfaidx engine, raw records).
    mapped_only
        Forwarded as the third positional argument of
        ``bioframe.frac_gc``.

    Raises
    ------
    ValueError
        If a chromosome named in the bin table is not a key of the
        FASTA records.
    """
    # Imported locally, like the other dependencies of this command, so the
    # function does not rely on a module-level ``import sys``.
    import sys

    import bioframe
    import pandas as pd

    # Keep ``bins_path`` as given so the error message below names what the
    # user passed ("-") instead of the repr of the stdin stream object.
    bins_source = sys.stdin if bins_path == "-" else bins_path
    bins = pd.read_table(bins_source)
    chromosomes = bins["chrom"].unique()

    fasta_records = bioframe.load_fasta(fasta_path, engine="pyfaidx", as_raw=True)
    if any(chrom not in fasta_records.keys() for chrom in chromosomes):
        raise ValueError(
            "Some chromosomes mentioned in {}"
            " are not found in {}".format(bins_path, fasta_path)
        )

    # The value returned by frac_gc replaces the bin table and is what
    # gets printed.
    bins = bioframe.frac_gc(bins, fasta_records, mapped_only)
    print(bins.to_csv(sep="\t", index=False))
def test_frac_mapped():
    """Check ``bioframe.frac_mapped`` on the test genome at bin sizes 1, 5, 7.

    Skipped when ``pysam`` cannot be imported.
    """
    pytest.importorskip("pysam")

    chromsizes = bioframe.read_chromsizes(
        testdir + "/test_data/test.chrom.sizes", filter_chroms=False
    )
    fasta_records = bioframe.load_fasta(testdir + "/test_data/test.fa")

    def mapped_values(binsize):
        # Mapped fraction per bin as a plain array, for the given bin size.
        bins = bioframe.binnify(chromsizes, binsize)
        return bioframe.frac_mapped(
            bins, fasta_records, return_input=False
        ).values

    # Per-base resolution: 1.0 where the test expects a mapped base,
    # 0.0 otherwise.
    expected_1bp = np.array(
        [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]
    )
    assert (expected_1bp == mapped_values(1)).all()

    expected_5bp = np.array([0.8, 0.8, 0])
    assert (expected_5bp == mapped_values(5)).all()

    expected_7bp = np.array([0.8, 4 / 7])
    assert (expected_7bp == mapped_values(7)).all()
import fnmatch
import os

import multiprocess as mp
import numpy as np
import pandas as pd

import bioframe
import cooltools
import cooler
from cooltools.eigdecomp import cooler_cis_eig

# Fetch the mm10 chromosome sizes once; ``chromsizes`` is an independent copy
# so both names stay available without a second fetch.
mm10 = bioframe.fetch_chromsizes('mm10')
chromsizes = mm10.copy()
chromosomes = list(chromsizes.index)

# Fixed-size genomic bins, annotated with GC content; the GC column is used
# below as the phasing track.
binsize = 10000
bins = cooler.binnify(mm10, binsize)

fasta_records = bioframe.load_fasta('/data05/genomes/mm10_20chr.fa')
bins['GC'] = bioframe.tools.frac_gc(bins, fasta_records)
bins.head()  # preview; has no effect outside an interactive session

# Results per condition: {cond: (lam, eigs)}. Without this, each iteration
# would overwrite ``lam``/``eigs`` and only the last cooler's output would
# survive the loop.
eig_results = {}

# Sorted so the processing order (and the final ``lam``/``eigs``) does not
# depend on the arbitrary order returned by os.listdir.
for file in sorted(os.listdir('.')):
    if fnmatch.fnmatch(file, '*_10kb.cool'):
        clr = cooler.Cooler(file)
        # Condition label: the file name up to its first dot.
        cond = file.split('.')[0]
        lam, eigs = cooler_cis_eig(
            clr,
            bins,
            n_eigs=3,
            phasing_track_col='GC',
            sort_metric='var_explained',
        )
        eig_results[cond] = (lam, eigs)