def plot_cumulative_histogram(data_folder, adaID, fragment, insert_sizes,
                              title=None, ax=None,
                              show=False, savefig=False, **kwargs):
    '''Plot cumulative histogram of insert sizes.

    Parameters:
       data_folder, adaID, fragment: forwarded to the filename helpers that
         locate the figure folder and the output file (only used if savefig).
       insert_sizes: sequence plotted on the x axis against evenly spaced
         fractions between 0 and 1. No sorting is done here, so it is
         presumably expected to be sorted already -- TODO confirm with callers.
       title: optional title for the axes.
       ax: matplotlib axes to draw on. If None, a new figure with a single
         axes is created.
       show: if true, switch pyplot to interactive mode and show the figure.
       savefig: if true, save the figure to its standard location.
       **kwargs: passed on to ax.plot.
    '''
    import matplotlib.pyplot as plt

    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # Recover the figure owning a caller-supplied axes: without this,
        # `fig` is unbound and saving fails with a NameError.
        fig = ax.figure

    ax.plot(insert_sizes, np.linspace(0, 1, len(insert_sizes)), **kwargs)
    ax.set_xlabel('Insert size')
    ax.set_ylabel('Cumulative fraction')
    ax.set_xlim(-1, 1000)
    ax.set_ylim(-0.02, 1.02)
    if title is not None:
        ax.set_title(title)

    # Lay out the figure we actually drew on, which is not necessarily the
    # current pyplot figure when the axes came from the caller.
    fig.tight_layout()

    if show:
        plt.ion()
        plt.show()

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder
        output_filename = get_insert_size_distribution_cumulative_filename(
            data_folder, adaID, fragment)
        mkdirs(get_figure_folder(data_folder, adaID))
        fig.savefig(output_filename)
def plot_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    """Plot reversed cumulative quality curves, one panel per read.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers when saving.
       title: figure-wide title.
       quality: iterable paired with the two axes. Each element is iterated
         in turn and every item is drawn as one curve against evenly spaced
         fractions from 1 down to 0 (presumably the sorted Phred scores at one
         position of the read -- not checked here).
       VERBOSE: not used in this function.
       savefig: if true, save the figure to its standard location; otherwise
         show it interactively.
    """
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(16, 9))
    for read_number, (ax, qual_read) in enumerate(izip(axs, quality), 1):
        for ipos, quals_at_pos in enumerate(qual_read):
            frac_above = np.linspace(0, 1, len(quals_at_pos))[::-1]
            # Colour encodes the index along the read (blue first, red last)
            curve_color = cm.jet(int(255.0 * ipos / len(qual_read)))
            ax.plot(quals_at_pos, frac_above, color=curve_color, alpha=0.5, lw=2)

        ax.set_xlabel("Phred quality", fontsize=14)
        ax.set_ylabel("Fraction of bases above quality x", fontsize=14)
        ax.set_title("Read" + str(read_number), fontsize=16)
        legend_text = "blue to red: 0 to " + str(len(qual_read)) + " base"
        ax.text(2, 0.03, legend_text, fontsize=18)

    fig.suptitle(title, fontsize=20)

    if not savefig:
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder, get_quality_along_reads_filename
    fig_folder = get_figure_folder(data_folder, adaID)
    fig_filename = get_quality_along_reads_filename(data_folder, adaID)
    mkdirs(fig_folder)
    fig.savefig(fig_filename)
def plot_histogram(data_folder, adaID, fragment, h,
                   title=None, ax=None,
                   show=False, savefig=False, **kwargs):
    '''Plot histogram of insert sizes.

    Parameters:
       data_folder, adaID, fragment: forwarded to the filename helpers that
         locate the figure folder and the output file (only used if savefig).
       h: pair (values, bin edges), e.g. in the layout returned by
         numpy.histogram. The edges must support slicing and elementwise
         arithmetic, since the values are plotted at the bin midpoints.
       title: optional title for the axes.
       ax: matplotlib axes to draw on. If None, a new figure with a single
         axes is created.
       show: if true, switch pyplot to interactive mode and show the figure.
       savefig: if true, save the figure to its standard location.
       **kwargs: passed on to ax.plot.
    '''
    import matplotlib.pyplot as plt

    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # Work on the figure owning the caller-supplied axes rather than on
        # whatever figure pyplot currently considers active.
        fig = ax.figure

    if title is not None:
        ax.set_title(title)

    values, edges = h[0], h[1]
    # Plot each bin at its midpoint
    midpoints = 0.5 * (edges[1:] + edges[:-1])
    ax.plot(midpoints, values, **kwargs)
    ax.set_xlabel('Insert size')
    ax.set_ylabel('Density')

    fig.tight_layout()

    if show:
        plt.ion()
        plt.show()

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder
        output_filename = get_insert_size_distribution_filename(
            data_folder, adaID, fragment)
        mkdirs(get_figure_folder(data_folder, adaID))
        # Save the figure that holds `ax`, not the current pyplot figure
        fig.savefig(output_filename)
def plot_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    '''Draw the quality score distributions along the reads, one panel per read.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers when saving.
       title: title placed above both panels.
       quality: iterable matched one-to-one with the two panels. Every item of
         each element becomes one curve, plotted against fractions running
         evenly from 1 down to 0 (presumably sorted quality scores for one
         position in the read -- this function does not sort them).
       VERBOSE: not used in this function.
       savefig: if true, the figure is written to its standard location;
         if false, it is shown interactively instead.
    '''
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(16, 9))
    panels = izip(axs, quality)
    for iread, (panel, read_quals) in enumerate(panels):
        for ipos, scores in enumerate(read_quals):
            # Colour runs through the jet colormap with the curve index
            shade = int(255.0 * ipos / len(read_quals))
            fractions = np.linspace(0, 1, len(scores))[::-1]
            panel.plot(scores, fractions,
                       color=cm.jet(shade),
                       alpha=0.5,
                       lw=2)

        panel.set_xlabel('Phred quality', fontsize=14)
        panel.set_ylabel('Fraction of bases above quality x', fontsize=14)
        panel.set_title('Read' + str(iread + 1), fontsize=16)
        panel.text(2, 0.03,
                   'blue to red: 0 to ' + str(len(read_quals)) + ' base',
                   fontsize=18)

    fig.suptitle(title, fontsize=20)

    if not savefig:
        plt.tight_layout()
        plt.ion()
        plt.show()
    else:
        from hivwholeseq.utils.generic import mkdirs
        from hivwholeseq.sequencing.filenames import get_figure_folder, \
                get_quality_along_reads_filename
        fig_folder = get_figure_folder(data_folder, adaID)
        fig_filename = get_quality_along_reads_filename(data_folder, adaID)
        mkdirs(fig_folder)
        fig.savefig(fig_filename)
def plot_cuts_quality_along_reads(data_folder, adaID, quality, title='',
                                  VERBOSE=0, savefig=False):
    '''Plot some cuts of the quality along the read.

    For each read and each of a few fixed quality thresholds, plot along the
    read 100 minus the percentile rank of the threshold among the quality
    scores at that position.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers when savefig is
         True.
       quality: iterable paired with the two axes. Each element is a sequence
         with one collection of quality scores per position (assumed to be an
         ordinary sequence, as the sibling plotting function iterates it
         directly).
       title: figure-wide title; skipped if empty.
       VERBOSE: not used in this function.
       savefig: False to show the figure interactively, True to save it to
         its standard location, or a string with the output filename.

    Raises:
       ValueError: if savefig is truthy but neither True nor a string.
    '''
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    qthreshs = [10, 20, 30, 35]
    for i, (ax, qual) in enumerate(izip(axs, quality)):
        # The positions are the same for all thresholds
        x = np.arange(len(qual))
        for j, qthresh in enumerate(qthreshs):
            y = np.array([100 - pof(qpos, qthresh) for qpos in qual])
            ax.plot(x, y,
                    color=cm.jet(int(255.0 * j / len(qthreshs))),
                    alpha=0.8,
                    lw=2,
                    label='Q = ' + str(qthresh))
        ax.set_xlabel('Position [bp]', fontsize=14)
        ax.set_ylabel('Percentage of bases above quality x', fontsize=14)
        ax.set_title('Read' + str(i + 1), fontsize=16)
        ax.set_ylim(-1, 101)
        ax.set_xlim(-1, len(qual) + 1)
        ax.legend(loc='best')

    if title:
        fig.suptitle(title, fontsize=20)

    if savefig:
        from hivwholeseq.utils.generic import mkdirs
        # NOTE: equality (not identity) is kept on purpose, so that values
        # comparing equal to True are still accepted as before.
        if savefig == True:
            from hivwholeseq.sequencing.filenames import get_figure_folder, \
                    get_quality_along_reads_filename
            fig_folder = get_figure_folder(data_folder, adaID)
            fig_filename = get_quality_along_reads_filename(data_folder, adaID,
                                                            simple=True)
        elif isinstance(savefig, basestring):
            import os
            fig_filename = savefig
            fig_folder = os.path.dirname(fig_filename)
        else:
            raise ValueError(
                'savefig must be a bool or a figure filename (string)')

        # A bare filename has an empty dirname: the figure goes into the
        # current directory and there is no folder to create.
        if fig_folder:
            mkdirs(fig_folder)
        fig.savefig(fig_filename)

    else:
        plt.tight_layout()
        plt.ion()
        plt.show()
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Plot the rough coverage of the premapped reads on the reference.

    Insertions are ignored: only CIGAR blocks of type 0 (match) and 2
    (deletion) add to the coverage and advance along the reference.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers that locate the
         reference, the premapped BAM file, the figure and the summary file.
       VERBOSE: not used in this function.
       summary: if true, append the read counts and the figure filename to
         the premap summary file.
    '''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')
    ref_len = len(refseq)
    coverage = np.zeros(ref_len, int)

    # Scan the BAM file, counting usable and discarded reads
    n_mapped = 0
    n_unmapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            usable = ((not read.is_unmapped) and
                      read.is_proper_pair and
                      len(read.cigar))
            if not usable:
                n_unmapped += 1
                continue

            # Walk the CIGAR blocks; deletions count as covered
            start = read.pos
            for (block_type, block_len) in read.cigar:
                if block_type in (0, 2):
                    end = start + block_len
                    coverage[start:end] += 1
                    start = end
            n_mapped += 1

    # Plot and save the coverage
    import matplotlib.pyplot as plt
    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder, \
            get_coverage_figure_filename

    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    # The +1 offset presumably keeps uncovered positions drawable on the
    # logarithmic y axis
    ax.plot(np.arange(ref_len), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, ref_len + 20)
    plt.tight_layout()

    mkdirs(get_figure_folder(data_folder, adaID))
    fig_filename = get_coverage_figure_filename(data_folder, adaID, 'premapped')
    plt.savefig(fig_filename)
    plt.close(fig)

    if not summary:
        return

    with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
        f.write(''.join(['\nPremapping results: ',
                         str(n_mapped),
                         ' read pairs mapped, ',
                         str(n_unmapped),
                         ' unmapped\n']))
        f.write('\nCoverage plotted: ' + fig_filename + '\n')
def plot_cuts_quality_along_reads(data_folder, adaID, quality, title="", VERBOSE=0, savefig=False):
    """Plot some cuts of the quality along the read.

    For each read, one curve is drawn per fixed quality threshold: at every
    position it shows 100 minus the percentile rank of the threshold among
    the quality scores found at that position.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers when savefig is
         True.
       quality: iterable paired with the two axes. Each element is a sequence
         holding one collection of quality scores per position (assumed to be
         an ordinary sequence, as the sibling plotting function iterates it
         directly).
       title: figure-wide title; skipped if empty.
       VERBOSE: not used in this function.
       savefig: False to show the figure interactively, True to save it to
         its standard location, or a string with the output filename.

    Raises:
       ValueError: if savefig is truthy but neither True nor a string.
    """
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm

    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    qthreshs = [10, 20, 30, 35]
    for i, (ax, qual) in enumerate(izip(axs, quality)):
        # Positions do not depend on the threshold: compute them once per read
        x = np.arange(len(qual))
        for j, qthresh in enumerate(qthreshs):
            y = np.array([100 - pof(qpos, qthresh) for qpos in qual])
            ax.plot(x, y,
                    color=cm.jet(int(255.0 * j / len(qthreshs))),
                    alpha=0.8,
                    lw=2,
                    label="Q = " + str(qthresh))
        ax.set_xlabel("Position [bp]", fontsize=14)
        ax.set_ylabel("Percentage of bases above quality x", fontsize=14)
        ax.set_title("Read" + str(i + 1), fontsize=16)
        ax.set_ylim(-1, 101)
        ax.set_xlim(-1, len(qual) + 1)
        ax.legend(loc="best")

    if title:
        fig.suptitle(title, fontsize=20)

    if not savefig:
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    # NOTE: equality (not identity) is kept on purpose, so that values
    # comparing equal to True are still accepted as before.
    if savefig == True:
        from hivwholeseq.sequencing.filenames import get_figure_folder, get_quality_along_reads_filename
        fig_folder = get_figure_folder(data_folder, adaID)
        fig_filename = get_quality_along_reads_filename(data_folder, adaID, simple=True)
    elif isinstance(savefig, basestring):
        import os
        fig_filename = savefig
        fig_folder = os.path.dirname(fig_filename)
    else:
        raise ValueError("savefig must be a bool or a figure filename (string)")

    # os.path.dirname gives an empty string for a bare filename; in that case
    # the figure lands in the current directory and no folder must be made.
    if fig_folder:
        mkdirs(fig_folder)
    fig.savefig(fig_filename)
def plot_cuts_quality_along_reads(data_folder, adaID, title, quality, VERBOSE=0, savefig=False):
    '''Plot quality threshold cuts along the reads, one panel per read.

    For every fixed threshold, the curve shows at each position 100 minus the
    percentile rank of the threshold among the quality scores at that position.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers when saving.
       title: figure-wide title.
       quality: iterable paired with the two axes; each element is indexed by
         position and yields the quality scores found there.
       VERBOSE: not used in this function.
       savefig: if true, save the figure to its standard location; otherwise
         show it interactively.
    '''
    from scipy.stats import percentileofscore as pof
    import matplotlib.pyplot as plt
    from matplotlib import cm

    thresholds = [10, 20, 30, 35]
    fig, axs = plt.subplots(1, 2, figsize=(14, 8))
    for iread, (panel, read_quals) in enumerate(izip(axs, quality)):
        n_pos = len(read_quals)
        positions = np.arange(n_pos)
        for ithr, threshold in enumerate(thresholds):
            ranks = np.array([pof(read_quals[k], threshold)
                              for k in xrange(n_pos)])
            panel.plot(positions, 100 - ranks,
                       color=cm.jet(int(255.0 * ithr / len(thresholds))),
                       alpha=0.8,
                       lw=2,
                       label='Q = ' + str(threshold))

        panel.set_xlabel('Position [bp]', fontsize=14)
        panel.set_ylabel('Percentage of bases above quality x', fontsize=14)
        panel.set_title('Read' + str(iread + 1), fontsize=16)
        panel.set_ylim(-1, 101)
        panel.set_xlim(-1, n_pos + 1)
        panel.legend(loc='best')

    fig.suptitle(title, fontsize=20)

    if not savefig:
        plt.tight_layout()
        plt.ion()
        plt.show()
        return

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder, \
            get_quality_along_reads_filename
    fig_folder = get_figure_folder(data_folder, adaID)
    fig_filename = get_quality_along_reads_filename(data_folder, adaID,
                                                    simple=True)
    mkdirs(fig_folder)
    fig.savefig(fig_filename)