def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram together with a fitted normal curve.

    :param int fontsize: font size of the x-axis label.
    :param ec: edge colour of the bars ("k" adds a black contour).
    :param bins: a number of bins, or an array of bin edges (any object
        providing a ``copy()`` method).

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    data = self.get_gc_content()
    try:
        # bins is a count: spread that many points over the 0-100 range
        X = np.linspace(0, 100, bins)
    except TypeError:
        # bins is not an integer: treat it as an array of edges
        X = bins.copy()

    # 'normed' was removed from matplotlib's hist; 'density' replaces it
    pylab.hist(data, X, density=True, ec=ec)
    pylab.grid(True)

    mu = np.mean(data)
    sigma = np.std(data)
    X = np.linspace(X.min(), X.max(), 100)
    # Normal pdf computed with numpy since pylab.normpdf no longer exists
    pdf = np.exp(-0.5 * ((X - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
    pylab.plot(X, pdf, lw=2, color="r", ls="--")
    pylab.xlabel("GC content", fontsize=fontsize)
def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
    """Histogram of the read lengths of the consensus isoforms.

    A second y-axis shows, for each bin, the number of reads not yet
    accounted for by the cumulative sum of the histogram.

    :param str mode: "all" (lq + hq reads) or "lq"; any other value selects
        the hq reads.
    :param bins: forwarded to :func:`pylab.hist`.
    :param float rwidth: forwarded to :func:`pylab.hist`.
    :param str align: forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the axis labels.
    :param edgecolor: edge colour of the bars.
    :param kwargs: any other argument accepted by :func:`pylab.hist`.
    """
    pylab.clf()

    lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
    hq_lengths = [len(read['sequence']) for read in self.hq_sequence]

    lengths = hq_lengths
    if mode == "all":
        lengths = lq_lengths + hq_lengths
    elif mode == "lq":
        lengths = lq_lengths

    counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                  align=align, ec=edgecolor, **kwargs)
    main_axes = pylab.gca()
    main_axes.set_ylim(bottom=0)
    main_axes.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # Right-hand axis: remaining number of reads after each bin
    ax_twin = main_axes.twinx()
    half_bin = (edges[1] - edges[0]) / 2
    remaining = len(lengths) - pylab.cumsum(counts)
    ax_twin.plot(edges[0:-1] - half_bin, remaining, "k")
    ax_twin.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def scatter_plot(self, filename=None, hold=False):
    """Scatter plot of the score versus length of each ortholog.

    One series is drawn per status (Complete, Fragmented, Missing,
    Duplicated), each with its own colour and marker; statuses without
    any entry are skipped.

    :param str filename: if provided, the figure is saved to this file.
    :param bool hold: if False (default), the current figure is cleared
        first.

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    """
    if hold is False:
        pylab.clf()

    statuses = ["Complete", "Fragmented", "Missing", "Duplicated"]
    colors = ["green", "orange", "red", "blue"]
    markers = ['o', 's', 'x', 'o']

    for status, color, marker in zip(statuses, colors, markers):
        # select the rows of the current status (not always "Complete")
        mask = self.df.Status == status
        if mask.sum() > 0:
            # draw on the current axes; otherwise pandas opens a new figure
            # for each status
            self.df[mask].plot(x="Length", y="Score", kind="scatter",
                               color=color, marker=marker, label=status,
                               ax=pylab.gca())

    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def hist_concordance(self, bins=100, fontsize=16):
    """Histogram of the concordance, with its mean and median.

    formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

    For BWA and BLASR, the get_cigar_stats are different !!!
    BWA for instance has no X stored while Pacbio forbids the use of the M
    (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

    Subread Accuracy: The post-mapping accuracy of the basecalls.
    Formula: [1 - (errors/subread length)], where errors = number of
    deletions + insertions + substitutions.

    :param bins: forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the x-axis label.
    """
    try:
        concordance = self._concordance
    except AttributeError:
        # not computed yet: compute and cache it
        self._set_concordance()
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    mu = np.mean(concordance)
    median = np.median(concordance)
    # solid line: mean; dashed line: median
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def plot_count_per_sample(self, fontsize=12, sample_list=None):
    """Bar plot of the total read count (in millions) for each sample.

    Each bar is coloured according to the condition of its sample.

    :param int fontsize: font size of the axis labels and title.
    :param sample_list: not used by the current implementation.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    """
    names = self.sample_names
    nsamples = len(names)
    totals = self.df[names].sum()

    pylab.clf()

    # one colour per sample, derived from the sample's condition
    bar_colors = [self.colors[self.get_cond_from_sample(sample)]
                  for sample in self.sample_names]

    positions = range(nsamples)
    pylab.bar(positions, (totals / 1000000).values, color=bar_colors,
              alpha=1, zorder=10, lw=1, ec="k", width=0.9)
    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("Total read count (millions)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(range(nsamples), self.sample_names)
def plot_volcano(self):
    """Volcano plot: -log10(adjusted p-value) versus log2 fold change.

    Entries with padj > 0.05 are drawn in red, those with padj <= 0.05 in
    black. A horizontal dashed line marks the 0.05 threshold. The x-axis
    is symmetric around zero.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_volcano()

    """
    d1 = self.df.query("padj>0.05")
    d2 = self.df.query("padj<=0.05")

    pylab.figure()
    pylab.plot(d1.log2FoldChange, -np.log10(d1.padj), marker="o",
               alpha=0.5, color="r", lw=0)
    pylab.plot(d2.log2FoldChange, -np.log10(d2.padj), marker="o",
               alpha=0.5, color="k", lw=0)
    pylab.grid(True)
    # labels match what is actually plotted (log2 FC and -log10 padj)
    pylab.xlabel("log2 fold change")
    pylab.ylabel("-log10 adjusted p-value")

    # Largest absolute fold change. The pandas reduction skips NaN whereas
    # the builtin min/max give order-dependent results when NaN is present.
    limit = self.df.log2FoldChange.abs().max()
    pylab.xlim([-limit, limit])

    _, ymax = pylab.ylim()
    pylab.ylim([0, ymax])
    pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r",
                  label="pvalue threshold (0.05)")
def plot_hist_normalized_coverage(self, filename=None, binwidth=0.1, max_z=4):
    """Histogram of the normalised coverage with the gaussian mixture fit.

    :param str filename: if provided, the figure is saved to this file.
    :param float binwidth: bin width passed to ``self._set_bins``.
    :param max_z: upper limit of the x-axis (also passed as ``Xmax`` to the
        mixture plot).
    """
    pylab.clf()
    # if there are a NaN -> can't set up binning
    d = self.df["scale"][self.range[0]:self.range[1]].dropna()
    # remove outlier -> plot crash if range between min and max is too high
    d = d[np.abs(d - d.mean()) <= (4 * d.std())]
    bins = self._set_bins(d, binwidth)
    self.mixture_fitting.data = d
    try:
        self.mixture_fitting.plot(self.gaussians_params, bins=bins,
                                  Xmin=0, Xmax=max_z)
    except ZeroDivisionError:
        # best effort: the labels and limits below are still applied
        pass
    pylab.grid(True)
    pylab.xlim([0, max_z])
    pylab.xlabel("Normalised per-base coverage")
    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; never let it break the plot
        # (but do not swallow KeyboardInterrupt/SystemExit)
        pass
    if filename:
        pylab.savefig(filename)
def plot_unknown_barcodes(self, N=20):
    """Horizontal bar plot of the most frequent unknown barcodes per lane.

    :param int N: number of barcodes to keep (those with the largest total
        count over all lanes).
    :return: the dataframe that was plotted (one row per barcode, one
        column per lane, most frequent barcode last).
    """
    ub = self.data['UnknownBarcodes']
    df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})
    # if data is made of undetermined only, the dataframe is just made of
    # N lanes with one entry : unknown
    if "unknown" in df.index and len(df) == 1:
        df.loc['known'] = [0 for i in df.columns]

    # keep the N barcodes with the largest total, then reverse the order
    # so that the largest ends up at the top of the horizontal bar plot
    S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
    data = df.loc[S][::-1]
    data.columns = ["Lane {}".format(x) for x in data.columns]

    # Draw the grid below the bars. rc_context restores the previous value
    # of the global setting on exit (even on error) instead of forcing it
    # to False.
    from matplotlib import rc_context
    with rc_context({'axes.axisbelow': True}):
        pylab.figure(figsize=(10, 8))
        ax = pylab.gca()
        data.plot(kind="barh", width=1, ec="k", ax=ax)

    pylab.xlabel("Number of reads", fontsize=12)
    pylab.ylabel("")
    pylab.grid(True)
    # use the real lane names: lanes are not necessarily numbered 1..n
    pylab.legend(list(data.columns), loc="lower right")
    try:
        pylab.tight_layout()
    except Exception as err:
        print(err)
    return data
def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
    """Scatter plot of the enriched gene sets.

    Each gene set is one point: x is -log10 of its adjusted p-value, y is
    its rank in the dataframe; marker size and colour follow the 'size'
    column. A vertical dashed line is drawn at 1.3 (about -log10(0.05)).

    :param enrich: object whose ``results`` attribute is passed to
        ``self._get_final_df``.
    :param float cutoff: passed to ``self._get_final_df``.
    :param int nmax: passed to ``self._get_final_df``.
    :param gene_set_size: not used by the current implementation.
    :return: the dataframe returned by ``self._get_final_df``.
    """
    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

    pylab.clf()
    pylab.scatter(-pylab.log10(df['Adjusted P-value']), range(len(df)),
                  s=10 * df['size'], c=df['size'])

    # the x axis is the transformed adjusted p-value, not an odds ratio
    pylab.xlabel("-log10 (adjusted p-value)")
    pylab.ylabel("Gene sets")
    pylab.yticks(range(len(df)), df.name)
    _, xmax = pylab.xlim()
    pylab.xlim([0, xmax])
    pylab.grid(True)
    ax = pylab.gca()

    # legend with three reference marker sizes
    M = max(df['size'])
    if M > 100:
        l1, l2, l3 = "10", "100", str(M)
    else:
        l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

    handles = [
        pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls=""),
    ]
    ax.legend(handles=handles, loc="upper left", title="gene-set size")

    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.tight_layout()
    pylab.colorbar(pylab.gci())
    return df
def get_max_gc_correlation(self, reference):
    """Scan GC window sizes and plot the coverage/GC correlation.

    The GC content is computed with a moving window whose size affects the
    correlation between coverage and GC. The window size is explored with
    :func:`scipy.optimize.fmin`, starting from 100; every evaluated window
    size and its correlation are plotted.

    NOTE(review): ``fmin`` minimises the value returned by the objective,
    which here is the correlation itself -- confirm this is the intended
    optimum.

    :param reference: passed to ``self.bed.compute_gc_content``.
    :return: the window size found by the optimiser (first element of the
        ``fmin`` result).
    """
    pylab.clf()

    correlations = []
    window_sizes = []

    def objective(params):
        window = int(round(params[0]))
        # windows below 10 are not evaluated and score 0
        if window < 10:
            return 0
        self.bed.compute_gc_content(reference, window)
        value = self.get_gc_correlation()
        correlations.append(value)
        window_sizes.append(window)
        return value

    from scipy.optimize import fmin
    # initial guess is a window of 100
    result = fmin(objective, 100, xtol=1, disp=False)

    pylab.plot(window_sizes, correlations, "o")
    pylab.xlabel("GC window size")
    pylab.ylabel("Correlation")
    pylab.grid()
    return result[0]
def histogram_sequence_lengths(self, logy=True):
    """Histogram of the sequence lengths.

    :param bool logy: if True, the bar heights are the log10 of the counts.
    :raises ValueError: if there is no sequence.

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_sequence_lengths()

    """
    data = [len(x) for x in self.sequences]
    if not data:
        raise ValueError("No sequences found: cannot plot their lengths")

    # One bar per observed length. Counting exact values avoids the
    # off-by-one of integer bin edges ending at max(data), where the last
    # bin is closed and merges the lengths max-1 and max. Lengths that do
    # not occur are not returned, so no zero enters log10.
    lengths, counts = np.unique(data, return_counts=True)

    if logy:
        pylab.bar(lengths, np.log10(counts))
        ylabel = "Count (log scale)"
    else:
        pylab.bar(lengths, counts)
        ylabel = "Count"

    pylab.xlim([1, max(data) + 1])
    pylab.grid(True)
    pylab.xlabel("position (bp)", fontsize=self.fontsize)
    pylab.ylabel(ylabel, fontsize=self.fontsize)
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram together with a fitted normal curve.

    :param int fontsize: font size of the x-axis label.
    :param ec: edge colour of the bars ("k" adds a black contour).
    :param bins: a number of bins, or an array of bin edges (any object
        providing a ``copy()`` method).

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    data = self.get_gc_content()
    try:
        # bins is a count: spread that many points over the 0-100 range
        X = np.linspace(0, 100, bins)
    except TypeError:
        # bins is not an integer: treat it as an array of edges
        X = bins.copy()

    pylab.hist(data, X, density=True, ec=ec)
    pylab.grid(True)

    mu = pylab.mean(data)
    sigma = pylab.std(data)

    # overlay the normal distribution having the same mean and deviation
    X = pylab.linspace(X.min(), X.max(), 100)
    from sequana.misc import normpdf
    pylab.plot(X, normpdf(X, mu, sigma), lw=2, color="r", ls="--")
    # honour the fontsize argument (it used to be ignored)
    pylab.xlabel("GC content", fontsize=fontsize)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#", title="",
             clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), clear the current figure first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param str title:
    :param clip_upper_SNR: SNR values above this limit are clipped to it

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # largest SNR over the four letters, capped at clip_upper_SNR
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m

    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    for letter in "ACGT":
        # Series.clip_upper was removed from pandas; clip(upper=...) is the
        # equivalent call
        values = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
        pylab.hist(values, alpha=alpha, label=letter, bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plotter(filename, key):
    """Plot the histogram stored under *key* and save it to *filename*.

    ``histograms`` and ``count`` are taken from the enclosing scope, which
    is not visible from here.

    :param filename: output file of the figure.
    :param str key: entry of ``histograms`` to plot; spaces are replaced
        by underscores in the title.
    """
    safe_name = key.replace(" ", "_")
    # interactive mode off while the figure is produced and saved
    pylab.ioff()
    selected = histograms[key]
    selected.plot(logy=False, lw=2, marker="o")
    heading = safe_name + "(%s)" % count
    pylab.title(heading)
    pylab.grid(True)
    pylab.savefig(filename)
    # need to close the figure otherwise warnings
    pylab.close()
def plot_padj_hist(self, bins=60, fontsize=16):
    """Histogram of the adjusted p-values (missing values are dropped).

    :param bins: forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the axis labels.
    """
    pylab.hist(self.df.padj.dropna(), bins=bins, ec="k")
    pylab.grid(True)
    pylab.xlabel("Adjusted p-value", fontsize=fontsize)
    pylab.ylabel("Occurences", fontsize=fontsize)
    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; never let it break the plot
        # (but do not swallow KeyboardInterrupt/SystemExit)
        pass
def plot_pvalue_hist(self, bins=60, fontsize=16, rotation=0):
    """Histogram of the raw p-values (missing values are dropped).

    :param bins: forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the axis labels.
    :param rotation: not used by the current implementation.
    """
    pylab.hist(self.df.pvalue.dropna(), bins=bins, ec="k")
    pylab.grid(True)
    pylab.xlabel("raw p-value", fontsize=fontsize)
    pylab.ylabel("Occurences", fontsize=fontsize)
    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; never let it break the plot
        # (but do not swallow KeyboardInterrupt/SystemExit)
        pass
def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
    """Horizontal bar plot of the number of reads per sample.

    Reads named 'Undetermined' are drawn in red on their own bar; all the
    other names are drawn as the 'samples' series. When the total exceeds
    one million, counts are shown in millions.

    :param float alpha: transparency of the bars.
    :param float width: width of the bars.
    :param str filename: if provided, the figure is saved to this file
        (dpi=200).
    """
    df = self.get_data_reads()

    # this is ugly but will do the job for now
    under = df.query("name=='Undetermined'")
    others = df.query("name!='Undetermined'")

    under = under.groupby("name").sum().reset_index()
    others = others.groupby("name").sum().reset_index()

    under = under[["name", "count"]].set_index("name")
    others = others[["name", "count"]].set_index("name")

    all_data = others.sort_index(ascending=False)
    all_data.columns = ["samples"]

    # appended at the end
    all_data.loc['undetermined'] = 0

    # revert back
    all_data = all_data.loc[::-1]

    # just for legend
    under.columns = ['undetermined']

    # switch to millions of reads when the numbers are large
    if all_data.sum().min() > 1e6:
        all_data /= 1e6
        under /= 1e6
        M = True
    else:
        M = False

    all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')
    under.plot(kind="barh", alpha=alpha, color="red", ax=pylab.gca(),
               zorder=1, width=width, ec='k')
    pylab.ylim([-0.5, len(all_data) + 0.5])
    # only label every bar when there are few enough to be readable
    if len(all_data) < 100:
        pylab.yticks(range(len(all_data)), all_data.index)

    pylab.legend()
    pylab.grid(True, zorder=-1)
    if M:
        pylab.xlabel("Number of reads (M)")
    else:
        pylab.xlabel("Number of reads")
    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; never let it break the plot
        # (but do not swallow KeyboardInterrupt/SystemExit)
        pass
    if filename:
        pylab.savefig(filename, dpi=200)
def plot_volcano_differences(self, mode="all"):
    """Volcano plot comparing the selected genes of two experiments.

    Genes selected in both experiments are drawn as circles and joined by
    a black segment; genes selected in a single experiment are drawn as
    stars and joined (in red) to their position in the other experiment.

    :param str mode: key of the ``gene_lists`` of ``self.r1`` and
        ``self.r2`` used to select the genes.
    :return: tuple ``(Aonly, Bonly, Acommon, Bcommon)`` of dataframes.
    """
    A = self.r1.df.loc[self.r1.gene_lists[mode]]
    B = self.r2.df.loc[self.r2.gene_lists[mode]]

    # Split with boolean masks rather than indexing .loc with a set, which
    # recent pandas versions reject (and which gave an arbitrary row order)
    a_in_b = A.index.isin(B.index)
    b_in_a = B.index.isin(A.index)
    Acommon = A[a_in_b]
    Aonly = A[~a_in_b]
    Bonly = B[~b_in_a]
    # same genes, same order as Acommon
    Bcommon = B.loc[Acommon.index]

    pylab.clf()
    pylab.plot(Acommon.log2FoldChange, -np.log10(Acommon.padj), marker="o",
               alpha=0.5, color="r", lw=0, label="Common in experiment 1",
               pickradius=4, picker=True)
    pylab.plot(Bcommon.log2FoldChange, -np.log10(Bcommon.padj), marker="o",
               alpha=0.5, color="orange", lw=0,
               label="Common in experiment 2", pickradius=4, picker=True)

    # join the two positions of each common gene
    common_pairs = zip(Acommon.log2FoldChange, -np.log10(Acommon.padj),
                       Bcommon.log2FoldChange, -np.log10(Bcommon.padj))
    for a_l, a_p, b_l, b_p in common_pairs:
        pylab.plot([a_l, b_l], [a_p, b_p], 'k', alpha=0.5)

    pylab.plot(Bonly.log2FoldChange, -np.log10(Bonly.padj), marker="*",
               alpha=0.5, color="blue", lw=0, label="In experiment 2 only",
               pickradius=4, picker=True)
    pylab.plot(Aonly.log2FoldChange, -np.log10(Aonly.padj), marker="*",
               alpha=0.5, color="cyan", lw=0, label="In experiment 1 only",
               pickradius=4, picker=True)

    # genes selected in experiment 2 only: link to their place in exp. 1
    for name, x in Bonly.iterrows():
        x1 = x.log2FoldChange
        y1 = -np.log10(x.padj)
        x2 = self.r1.df.loc[name].log2FoldChange
        y2 = -np.log10(self.r1.df.loc[name].padj)
        pylab.plot([x1, x2], [y1, y2], ls="--", color='r')

    # genes selected in experiment 1 only: link to their place in exp. 2
    for name, x in Aonly.iterrows():
        x1 = x.log2FoldChange
        y1 = -np.log10(x.padj)
        x2 = self.r2.df.loc[name].log2FoldChange
        y2 = -np.log10(self.r2.df.loc[name].padj)
        pylab.plot([x1, x2], [y1, y2], ls="-", color='r')

    pylab.axhline(1.33, alpha=0.5, ls="--", color="r")

    pylab.xlabel("log2 fold Change")
    pylab.ylabel("log10 adjusted p-values")
    pylab.legend()
    pylab.grid(True)

    return Aonly, Bonly, Acommon, Bcommon
def plot(self, fontsize=16):
    """Draw the per-base quality as a function of the base position.

    The curve is added to the current axes and labelled with the offset;
    the y-range is then padded by one unit on each side, the lower bound
    never going below zero.

    :param int fontsize: font size of the axis labels.
    """
    curve_label = "offset: %s" % self.offset
    pylab.plot(self.quality, label=curve_label)
    pylab.xlabel('base position', fontsize=fontsize)
    pylab.ylabel('Quality per base', fontsize=fontsize)
    pylab.grid(True)

    # Setting ylim switches autoscale off, so when this function is called
    # several times autoscale must be re-enabled before reading the limits.
    pylab.autoscale()
    ymin, ymax = pylab.ylim()
    lower = max(0, ymin - 1)
    upper = ymax + 1
    pylab.ylim(lower, upper)
def plot(self, fontsize=16):
    """Draw the per-base quality as a function of the base position.

    The curve is added to the current axes and labelled with the offset;
    the y-range is then padded by one unit on each side, the lower bound
    never going below zero.

    :param int fontsize: font size of the axis labels.
    """
    curve_label = "offset: %s" % self.offset
    pylab.plot(self.quality, label=curve_label)
    pylab.xlabel('base position', fontsize=fontsize)
    pylab.ylabel('Quality per base', fontsize=fontsize)
    pylab.grid(True)

    # Setting ylim switches autoscale off, so when this function is called
    # several times autoscale must be re-enabled before reading the limits.
    pylab.autoscale()
    ymin, ymax = pylab.ylim()
    lower = max(0, ymin - 1)
    upper = ymax + 1
    pylab.ylim(lower, upper)
def boxplot_mapq_concordance(self):
    """Box plot of the concordance for each mapping quality from 1 to 60.

    Only available when ``self.method`` is "bwa" (checked by an assert).
    """
    # method can only be bwa for now
    assert self.method == "bwa"

    records = self._get_data()
    table = pd.DataFrame(records, columns=["mapq", "length", "concordance"])

    # one box per mapq value in 1..60
    per_mapq = []
    for mapq in range(1, 61):
        per_mapq.append(table[table.mapq == mapq]['concordance'])

    pylab.clf()
    pylab.boxplot(per_mapq)
    pylab.xlabel("mapq")
    pylab.ylabel("concordance")
    pylab.grid()

    # only label every tenth mapq value
    ticks = [10, 20, 30, 40, 50, 60]
    pylab.xticks(ticks, ticks)
def plot_genesets_hist(self, bins=20):
    """Histogram of the gene set sizes.

    The title reports the number of gene sets and the x-axis starts at 0.

    :param bins: forwarded to :func:`pylab.hist`.
    """
    n_sets = len(self.gene_sets.keys())

    pylab.clf()
    sizes = [len(members) for _, members in self.gene_sets.items()]
    pylab.hist(sizes, bins=bins, lw=1, ec="k")
    pylab.title("{} gene sets".format(n_sets))
    pylab.xlabel("Gene set sizes")
    pylab.grid(True)

    # keep the automatic upper limit but anchor the axis at zero
    upper = pylab.xlim()[1]
    pylab.xlim([0, upper])
def plot_indel_dist(self, fontsize=16):
    """Plot indel count (+ ratio)

    :param int fontsize: font size of the axis labels.
    :Return: list of insertions, deletions and ratio insertion/deletion
        for different length starting at 1
    :raises ValueError: if no insertion or no deletion was found.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_indel_dist()

    What you see on this figure is the presence of 10 insertions of length
    1, 1 insertion of length 2 and 3 deletions of length 1


    # Note that in samtools, several insertions or deletions in a single
    alignment are ignored and only the first one seems to be reported. For
    instance 10M1I10M1I stored only 1 insertion in its report; Same comment
    for deletions.

    .. todo:: speed up and handle long reads cases more efficiently by
        storing INDELS as histograms rather than lists
    """
    try:
        self.insertions
    except AttributeError:
        # indels not computed yet
        self._set_indels()

    if len(self.insertions) == 0 or len(self.deletions) == 0:
        raise ValueError("No deletions or insertions found")

    # Count each length once instead of calling list.count() for every
    # length, which was quadratic in the number of indels.
    del_counts = Counter(self.deletions)
    ins_counts = Counter(self.insertions)

    N = max(max(del_counts), max(ins_counts)) + 1
    D = [del_counts[i] for i in range(N)]
    I = [ins_counts[i] for i in range(N)]
    # ratio is set to 0 where there is no deletion
    R = [i / d if d != 0 else 0 for i, d in zip(I, D)]

    fig, ax = pylab.subplots()
    ax.plot(range(N), I, marker="x", label="Insertions")
    ax.plot(range(N), D, marker="x", label="Deletions")
    ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
    ax.set_yscale("symlog")
    pylab.ylim([1, pylab.ylim()[1]])
    pylab.legend()
    pylab.grid()

    # indel lengths are integers: force integer ticks
    from matplotlib.ticker import MaxNLocator
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    pylab.xlabel("Indel length", fontsize=fontsize)
    pylab.ylabel("Indel count", fontsize=fontsize)
    return I, D, R
def hist_coverage(self, bins=100):
    """Histogram of the coverage.

    :param bins: forwarded to :func:`pylab.hist`.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.hist_coverage()

    """
    try:
        self.coverage
    except AttributeError:
        # coverage not computed yet
        self._set_coverage()

    pylab.hist(self.coverage, bins=bins)
    pylab.xlabel("Coverage")
    pylab.ylabel("Number of mapped bases")
    pylab.grid()
def hist_length_repeats(self, bins=20, alpha=0.5, hold=False,
        fontsize=12, grid=True, title="Repeat length",
        xlabel="Repeat length", ylabel="#", logy=True):
    """Draw the histogram of the repeat lengths.

    :param bins: forwarded to :func:`pylab.hist`.
    :param float alpha: transparency of the bars.
    :param bool hold: if False (default), clear the current figure first.
    :param int fontsize: font size of the axis labels.
    :param bool grid: add a grid or not.
    :param str title: title of the figure.
    :param str xlabel:
    :param str ylabel:
    :param bool logy: if set, use a log scale on the y-axis.
    """
    clear_first = hold is False
    if clear_first:
        pylab.clf()

    lengths = self.list_len_repeats
    pylab.hist(lengths, alpha=alpha, bins=bins)

    pylab.title(title)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)

    if grid is True:
        pylab.grid(True)
    if logy:
        pylab.semilogy()
def plot_read_length(self):
    """Plot the occurrences of the aligned read lengths.

    The legend reports the minimum and maximum read length.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("test.bam"))
        b.plot_read_length()

    """
    lengths, occurrences = self._get_read_length()
    shortest = min(lengths)
    longest = max(lengths)
    legend_text = "min length:{}; max length:{}".format(shortest, longest)

    pylab.plot(lengths, occurrences, label=legend_text)
    pylab.grid()
    pylab.xlabel("Read length", fontsize=16)
    pylab.legend()
def histogram_gc_content(self):
    """Plot histogram of GC content

    One bin per percent between 0 and 100.

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_gc_content()

    """
    # Edges 0, 1, ..., 100: with range(0, 100) the last edge was 99 and
    # values above 99 (e.g. a 100% GC sequence) were silently dropped.
    pylab.hist(self.gc_list, bins=range(0, 101))
    pylab.grid()
    pylab.title("GC content distribution (per sequence)")
    pylab.xlabel(r"Mean GC content (%)", fontsize=self.fontsize)
    pylab.xlim([0, 100])
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
            grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), clear the current figure first
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title reporting the mean GC is used

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])

    # set title if needed
    if title is None:
        title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.loc[:, 'GC_content'], bins=bins, alpha=alpha,
               label=label + ", mean : " + str(round(mean_GC, 2))
               + ", N : " + str(len(self)))
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])
    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; never let it break the plot
        # (but do not swallow KeyboardInterrupt/SystemExit)
        pass
def plot_acgt_content(self, stacked=False):
    """Plot the content returned by ``get_actg_content`` per position.

    :param bool stacked: if True, draw stacked bars instead of lines.

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.plot_acgt_content()

    """
    content = self.get_actg_content()

    if stacked is not True:
        content.plot()
    else:
        content.plot.bar(stacked=True)

    pylab.grid(True)
    pylab.xlabel("position (bp)", fontsize=self.fontsize)
    pylab.ylabel("percent", fontsize=self.fontsize)
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                   grid=True, xlabel="Number of ZMW passes", logy=True,
                   ylabel="#", label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: forwarded to :func:`pylab.hist` (None lets matplotlib
        choose its default binning)
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), clear the current figure first
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()
    # Always defined: it used to be set only when bins was None, so any
    # user-provided bins raised a NameError further down.
    k = range(1, max_nb_pass + 1)

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
               label=label, log=logy, width=1)
    # with very few distinct values, force a readable 0..5 tick range
    if len(k) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot(self, bins=80, rwidth=0.8, **kwargs):
    """Histogram of ``self.data`` using the labels, grid and title stored
    on the instance.

    :param bins: forwarded to :func:`pylab.hist`.
    :param float rwidth: forwarded to :func:`pylab.hist`.
    :param kwargs: any other argument accepted by :func:`pylab.hist`.
    """
    pylab.clf()
    pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)
    pylab.xlabel(self.xlabel, fontsize=self.fontsize)
    pylab.ylabel(self.ylabel, fontsize=self.fontsize)

    # NOTE: a twin-axis cumulative curve ("CDF") used to be drawn here; it
    # was already disabled (kept as an inert string) and has been removed.

    pylab.grid(self.grid)
    pylab.title(self.title)
    try:
        pylab.tight_layout()
    except Exception:
        # tight_layout is cosmetic only; never let it break the plot
        # (but do not swallow KeyboardInterrupt/SystemExit)
        pass
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
        grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """2D histogram of the GC content versus the read length.

    :param bool hold: if False (default), clear the current figure first.
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid or not.
    :param str xlabel: not used; the x label is always "Read length".
    :param str ylabel: not used; the y label is always "GC %".
    :param cmap: colormap passed to the 2D histogram.

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    # the means are reported in the title
    mean_len = np.mean(self.df.loc[:, 'read_length'])
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()

    # rows with a missing length or GC value cannot be binned
    pairs = self.df.loc[:, ['read_length', 'GC_content']].dropna()
    hist2d = biokit.viz.hist2d.Hist2D(pairs)
    hist2d.plot(bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    full_title = "GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" % (
        mean_len, mean_GC)
    pylab.title(full_title, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)