def gcdepth(args):
    """
    %prog gcdepth sample_name tag

    Plot GC content vs depth vs genomic bins. Inputs are mosdepth output:
    - NA12878_S1.mosdepth.global.dist.txt
    - NA12878_S1.mosdepth.region.dist.txt
    - NA12878_S1.regions.bed.gz
    - NA12878_S1.regions.bed.gz.csi
    - NA12878_S1.regions.gc.bed.gz

    A sample mosdepth.sh script might look like:
    ```
    #!/bin/bash
    LD_LIBRARY_PATH=mosdepth/htslib/ mosdepth/mosdepth $1 \\
        bams/$1.bam -t 4 -c chr1 -n --by 1000

    bedtools nuc -fi GRCh38/WholeGenomeFasta/genome.fa \\
        -bed $1.regions.bed.gz \\
        | pigz -c > $1.regions.gc.bed.gz
    ```
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept user-oriented; implementation notes live in comments.
    import hashlib
    from jcvi.algorithms.formula import MAD_interval as confidence_interval
    from jcvi.graphics.base import latex, plt, savefig, set2

    p = OptionParser(gcdepth.__doc__)
    opts, args = p.parse_args(args)

    # Exactly two positional arguments are required: sample_name and tag
    if len(args) != 2:
        sys.exit(not p.print_help())

    sample_name, tag = args
    # The tag is used to add to title, also provide a random (hashed) color.
    # hashlib only accepts bytes-like input, so a text tag is encoded first;
    # passing the str directly raises TypeError on Python 3.
    tag_bytes = tag if isinstance(tag, bytes) else tag.encode("utf-8")
    coloridx = int(hashlib.sha256(tag_bytes).hexdigest(), 16) % len(set2)
    color = set2[coloridx]

    # mosdepth outputs a table that we can use to plot relationship
    gcbedgz = sample_name + ".regions.gc.bed.gz"
    df = pd.read_csv(gcbedgz, delimiter="\t")
    # Keep only the per-bin depth and GC fraction columns, under short names
    mf = df.loc[:, ["4_usercol", "6_pct_gc"]]
    mf.columns = ["depth", "gc"]

    # We discard any bins that are gaps: a bin is kept when either its depth
    # or its GC fraction is above 0.001, i.e. dropped only when both are ~0
    mf = mf[(mf["depth"] > 0.001) | (mf["gc"] > 0.001)]

    # Create GC bins: group depths by GC content rounded to a whole percent.
    # Walking the two columns in lockstep avoids building a Series per row.
    gcbins = defaultdict(list)
    for gc, depth in zip(mf["gc"], mf["depth"]):
        gcbins[int(round(gc * 100))].append(depth)

    # One (gc_fraction, interval) pair per bin, ordered by GC content.
    # confidence_interval presumably returns (median, lo, hi), judging by the
    # unpacking and the legend label below -- confirm against MAD_interval.
    gcd = sorted((k * 0.01, confidence_interval(v)) for (k, v) in gcbins.items())
    gcd_x, gcd_y = zip(*gcd)
    m, lo, hi = zip(*gcd_y)

    # Plot: raw bins as a faint scatter in the background
    plt.plot(
        mf["gc"],
        mf["depth"],
        ".",
        color="lightslategray",
        ms=2,
        mec="lightslategray",
        alpha=0.1,
    )
    # Shaded lo..hi band per GC bin, drawn above the scatter
    patch = plt.fill_between(
        gcd_x,
        lo,
        hi,
        facecolor=color,
        alpha=0.25,
        zorder=10,
        linewidth=0.0,
        label="Median +/- MAD band",
    )
    # Central line on top of the band
    plt.plot(gcd_x, m, "-", color=color, lw=2, zorder=20)

    ax = plt.gca()
    ax.legend(handles=[patch], loc="best")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 100)
    ax.set_title("{} \n({})".format(latex(sample_name), tag))
    ax.set_xlabel("GC content")
    ax.set_ylabel("Depth")
    savefig(sample_name + ".gcdepth.png")
def gcdepth(args):
    """
    %prog gcdepth sample_name tag

    Plot GC content vs depth vs genomic bins. Inputs are mosdepth output:
    - NA12878_S1.mosdepth.global.dist.txt
    - NA12878_S1.mosdepth.region.dist.txt
    - NA12878_S1.regions.bed.gz
    - NA12878_S1.regions.bed.gz.csi
    - NA12878_S1.regions.gc.bed.gz

    A sample mosdepth.sh script might look like:
    ```
    #!/bin/bash
    LD_LIBRARY_PATH=mosdepth/htslib/ mosdepth/mosdepth $1 \\
        bams/$1.bam -t 4 -c chr1 -n --by 1000

    bedtools nuc -fi GRCh38/WholeGenomeFasta/genome.fa \\
        -bed $1.regions.bed.gz \\
        | pigz -c > $1.regions.gc.bed.gz
    ```
    """
    # NOTE: the docstring above is also the usage text given to OptionParser,
    # which is why implementation details are documented in comments instead.
    import hashlib
    from jcvi.algorithms.formula import MAD_interval as confidence_interval
    from jcvi.graphics.base import latex, plt, savefig, set2

    p = OptionParser(gcdepth.__doc__)
    opts, args = p.parse_args(args)

    # Both sample_name and tag must be supplied; otherwise show usage and exit
    if len(args) != 2:
        sys.exit(not p.print_help())

    sample_name, tag = args
    # The tag is used to add to title, also provide a random (hashed) color.
    # A text tag must be encoded before hashing: hashlib rejects str with a
    # TypeError on Python 3. SHA-1 serves only as a stable, non-security
    # hash that maps the tag to a palette index.
    if isinstance(tag, bytes):
        digest = hashlib.sha1(tag).hexdigest()
    else:
        digest = hashlib.sha1(tag.encode("utf-8")).hexdigest()
    coloridx = int(digest, 16) % len(set2)
    color = set2[coloridx]

    # mosdepth outputs a table that we can use to plot relationship
    gcbedgz = sample_name + ".regions.gc.bed.gz"
    df = pd.read_csv(gcbedgz, delimiter="\t")
    # Select the depth and GC-fraction columns and give them short names
    mf = df.loc[:, ["4_usercol", "6_pct_gc"]]
    mf.columns = ["depth", "gc"]

    # We discard any bins that are gaps; only bins where both depth and GC
    # are at or below 0.001 are removed
    mf = mf[(mf["depth"] > 0.001) | (mf["gc"] > 0.001)]

    # Create GC bins keyed by GC content rounded to an integer percentage.
    # Iterating the columns directly is far cheaper than DataFrame.iterrows().
    gcbins = defaultdict(list)
    for gc, depth in zip(mf["gc"], mf["depth"]):
        gcp = int(round(gc * 100))
        gcbins[gcp].append(depth)

    # Summarise every bin, sorted by GC fraction. The three-way unpacking
    # below suggests confidence_interval yields (median, lo, hi) -- TODO
    # confirm against jcvi.algorithms.formula.MAD_interval.
    gcd = sorted((k * 0.01, confidence_interval(v)) for (k, v) in gcbins.items())
    gcd_x, gcd_y = zip(*gcd)
    m, lo, hi = zip(*gcd_y)

    # Plot the individual bins as a light background scatter
    plt.plot(
        mf["gc"],
        mf["depth"],
        ".",
        color="lightslategray",
        ms=2,
        mec="lightslategray",
        alpha=0.1,
    )
    # Band between lo and hi for each GC bin
    patch = plt.fill_between(
        gcd_x,
        lo,
        hi,
        facecolor=color,
        alpha=0.25,
        zorder=10,
        linewidth=0.0,
        label="Median +/- MAD band",
    )
    # Central trend line, drawn above the band
    plt.plot(gcd_x, m, "-", color=color, lw=2, zorder=20)

    ax = plt.gca()
    ax.legend(handles=[patch], loc="best")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 100)
    ax.set_title("{} \n({})".format(latex(sample_name), tag))
    ax.set_xlabel("GC content")
    ax.set_ylabel("Depth")
    savefig(sample_name + ".gcdepth.png")