def plot_idr_vs_peaks(self, filename=None, savefig=False):
    """Plot IDR values against the cumulative number of peaks.

    Peaks with ``idr`` strictly below ``self.threshold`` are drawn in red.
    Peaks with ``idr`` at or above the threshold are drawn in black, with
    their counts shifted by the number of peaks below the threshold so the
    two curves join. A horizontal dashed line is drawn at IDR = 0.05 and a
    vertical dashed line at ``self.N_significant_peaks``.

    :param str filename: output file passed to ``pylab.savefig``; only used
        when *savefig* is True.
    :param bool savefig: if True, save the figure to *filename*.
    """
    # NOTE(review): the original code carried the remark "global_idr is
    # actually -log10(idr)"; this method only reads the ``idr`` column and
    # sweeps it between 0 and 1 -- confirm the column is on that scale.
    pylab.clf()

    threshold = self.threshold
    idr = self.df["idr"]

    # Split the peaks on the threshold with boolean masks. The two subsets
    # are complementary: strictly below vs. at-or-above the threshold.
    below = idr[idr < threshold]
    above = idr[idr >= threshold]

    X1 = pylab.linspace(0, threshold, 100)
    X2 = pylab.linspace(threshold, 1, 100)

    # Vectorised counts: number of peaks whose idr is below each level.
    pylab.plot([(below < x).sum() for x in X1], X1, '-', color='r', lw=2)

    # Offset the second curve by the peaks already counted below threshold.
    shift = len(below)
    pylab.plot([shift + (above < x).sum() for x in X2], X2, "-",
               color='k', lw=2)

    pylab.xlabel('Number of significant peaks')
    pylab.ylabel('IDR')
    pylab.axhline(0.05, color='b', ls='--')
    pylab.axvline(self.N_significant_peaks, color='b', ls='--')

    if savefig:
        pylab.savefig(filename)
def get_percentage_genes_covered_at_this_fraction(self, this):
    """Return the percentage of rows whose coverage exceeds a fraction.

    The column named by ``self.coverage_column`` is compared (strictly
    greater than) against a regular grid of 101 fractions between 0 and 1;
    the resulting percentages are then linearly interpolated at *this*.

    :param float this: fraction in the closed interval [0, 1].
    :return: percentage (0-100) of rows of ``self.df`` whose coverage value
        is greater than *this*.
    :raises AssertionError: if *this* lies outside [0, 1].
    :raises ZeroDivisionError: if ``self.df`` has no rows.
    """
    assert this <= 1 and this >= 0, "'this' must be in the range [0, 1]"

    values = self.df[self.coverage_column]
    total = float(len(self.df))
    if total == 0:
        # Same exception type the plain division raised on an empty table,
        # but with an explicit message instead of a silent NaN.
        raise ZeroDivisionError(
            "cannot compute a percentage on an empty table")

    X = np.linspace(0, 1, 101)
    # One vectorised comparison per grid point instead of a Python-level
    # sum() over every row.
    counts = np.array([(values > x).sum() for x in X])
    Y = counts / total * 100

    return np.interp(this, X, Y)
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram with a fitted normal curve.

    The histogram is normalised to a density and overlaid with the normal
    distribution whose mean and standard deviation are those of the data.

    :param int fontsize: font size of the x-axis label.
    :param ec: edge colour of the histogram bars (black by default).
    :param bins: either a number of points used to build a regular grid of
        bin edges between 0 and 100, or an array-like of explicit bin edges.

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    data = self.get_gc_content()

    # Explicit dispatch on the shape of *bins* rather than catching whatever
    # np.linspace raises when it is handed an array.
    if np.ndim(bins) == 0:
        edges = np.linspace(0, 100, bins)
    else:
        edges = np.array(bins)  # copy, so the caller's edges are untouched

    # 'density' is the current name of the former 'normed' option.
    pylab.hist(data, edges, density=True, ec=ec)
    pylab.grid(True)

    mu = np.mean(data)
    sigma = np.std(data)

    # Normal probability density evaluated on a fine grid spanning the bins
    # (computed directly; pylab no longer ships a normpdf helper).
    grid = np.linspace(edges.min(), edges.max(), 100)
    pdf = np.exp(-0.5 * ((grid - mu) / sigma) ** 2) / (
        sigma * np.sqrt(2 * np.pi))
    pylab.plot(grid, pdf, lw=2, color="r", ls="--")

    pylab.xlabel("GC content", fontsize=fontsize)
def plot_gc_content(self, fontsize=16, ec="k", bins=100):
    """Plot the GC content histogram with a fitted normal curve.

    The histogram is normalised to a density and overlaid with the normal
    distribution whose mean and standard deviation are those of the data.

    :param int fontsize: font size of the x-axis label.
    :param ec: edge colour of the histogram bars (black by default).
    :param bins: either a number of points used to build a regular grid of
        bin edges between 0 and 100, or an array-like of explicit bin edges.

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam'))
        b.plot_gc_content()

    """
    # Imported locally, as in the original implementation.
    from sequana.misc import normpdf

    data = self.get_gc_content()

    # Explicit dispatch on the shape of *bins* rather than a bare except
    # around np.linspace, which would also hide unrelated errors.
    if np.ndim(bins) == 0:
        edges = np.linspace(0, 100, bins)
    else:
        edges = np.array(bins)  # copy, so the caller's edges are untouched

    pylab.hist(data, edges, density=True, ec=ec)
    pylab.grid(True)

    mu = pylab.mean(data)
    sigma = pylab.std(data)

    # Evaluate the fitted normal density on a fine grid spanning the bins.
    grid = pylab.linspace(edges.min(), edges.max(), 100)
    pylab.plot(grid, normpdf(grid, mu, sigma), lw=2, color="r", ls="--")

    pylab.xlabel("GC content", fontsize=fontsize)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#", title="",
             clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR (or at the largest observed SNR if
        that is smaller)
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the axis labels and title
    :param bool grid: if True, a grid is added to the plot
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str title: title of the plot
    :param clip_upper_SNR: SNR values above this limit are clipped to it

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot: show a placeholder image instead
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    columns = ["snr_{}".format(letter) for letter in "ACGT"]

    # Largest SNR over the four channels (never below 0), capped by the
    # requested upper limit.
    maxSNR = 0
    for column in columns:
        m = self._df[column].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    edges = pylab.linspace(0, maxSNR, bins)

    # Series.clip(upper=...) replaces Series.clip_upper, which no longer
    # exists in current pandas releases.
    for letter, column in zip("ACGT", columns):
        pylab.hist(self._df[column].clip(upper=maxSNR), alpha=alpha,
                   label=letter, bins=edges)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot(self, normed=True, N=1000, Xmin=None, Xmax=None, bins=50,
         color='red', lw=2,
         hist_kw={'color': '#5F9EA0', "edgecolor": "k"}, ax=None):
    """Plot the data histogram, the fitted model and its components.

    :param bool normed: if True, the histogram is normalised to a density.
    :param int N: number of points used to draw the curves.
    :param Xmin: lower bound of the curves; defaults to the data minimum.
    :param Xmax: upper bound of the curves; defaults to the data maximum.
    :param bins: forwarded to the histogram call.
    :param color: colour of the full-model curve.
    :param lw: line width of the full-model curve.
    :param dict hist_kw: extra keyword arguments forwarded to the histogram
        call. The default dictionary is only read, never modified.
    :param ax: object providing ``hist`` and ``plot`` (e.g. a matplotlib
        axes) to draw on; when not provided, pylab is used.
    """
    import scipy.stats as ss

    # Both a matplotlib axes and pylab expose hist()/plot(), so pick the
    # drawing target once instead of branching at every call.
    target = ax if ax else pylab

    # 'density' is the current name of the former 'normed' histogram
    # option; it is now used for both drawing targets.
    target.hist(self.data, density=normed, bins=bins, **hist_kw)

    if Xmin is None:
        Xmin = self.data.min()
    if Xmax is None:
        Xmax = self.data.max()
    X = pylab.linspace(Xmin, Xmax, N)

    # Full model, evaluated point by point with the fitted parameters.
    target.plot(X, [self.model.pdf(x, self.results.x) for x in X],
                color=color, lw=lw)

    # Individual gaussian components, each scaled by its weight.
    # The PIs must be normalised
    for i in range(self.k):
        mu = self.results.mus[i]
        sigma = self.results.sigmas[i]
        pi_ = self.results.pis[i]
        target.plot(X, pi_ * ss.norm.pdf(X, mu, sigma), 'k--',
                    alpha=0.7, lw=2)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#", title="",
             clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR (or at the largest observed SNR if
        that is smaller)
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the axis labels and title
    :param bool grid: if True, a grid is added to the plot
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str title: title of the plot
    :param clip_upper_SNR: SNR values above this limit are clipped to it

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot: show a placeholder image instead
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # Upper end of the histogram range: the largest SNR seen on any of the
    # four channels (never below 0), capped by clip_upper_SNR.
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    edges = pylab.linspace(0, maxSNR, bins)

    # One histogram per nucleotide. Series.clip(upper=...) is used because
    # Series.clip_upper is not available in current pandas releases.
    for letter in "ACGT":
        snr = self._df.loc[:, "snr_{}".format(letter)]
        pylab.hist(snr.clip(upper=maxSNR), alpha=alpha, label=letter,
                   bins=edges)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_jaccard_distance(self, mode, padjs=[0.0001,0.001,0.01,0.05,0.1],
                          Nfc=50, smooth=False, window=5):
    """Plot the Jaccard similarity between two result sets.

    For each adjusted p-value cutoff in *padjs* and each log2 fold-change
    cutoff on a regular grid, the entries of ``self.r1.df`` and
    ``self.r2.df`` passing both cutoffs are selected, and the similarity
    ``|A & B| / |A | B|`` (in percent) of the two index sets is plotted on
    the left axis. The size of the intersection is plotted on a second
    (right) axis as dashed orange lines.

    :param str mode: ``'down'`` (log2FoldChange <= cutoff, grid from 0 to
        the smallest value), ``'up'`` (log2FoldChange >= cutoff, grid from
        0 to the largest value) or ``'all'`` (absolute log2FoldChange >=
        cutoff).
    :param padjs: adjusted p-value cutoffs; one curve per value. The
        default list is only read, never modified.
    :param int Nfc: number of fold-change cutoffs on the grid.
    :param bool smooth: if True, each similarity curve is replaced by its
        rolling median.
    :param int window: window size of the rolling median.
    :raises AssertionError: if *mode* is not one of the accepted values.
    """
    assert mode in ['down', 'up', 'all']
    pylab.clf()

    # Extract the columns once; they are reused for every cutoff pair.
    lfc1 = self.r1.df["log2FoldChange"]
    lfc2 = self.r2.df["log2FoldChange"]
    padj1 = self.r1.df["padj"]
    padj2 = self.r2.df["padj"]

    # Fold-change grid: from 0 to the most extreme value of both tables.
    if mode == "down":
        limit = min(lfc1.min(), lfc2.min())
    elif mode == "up":
        limit = max(lfc1.max(), lfc2.max())
    else:
        limit = max(lfc1.abs().max(), lfc2.abs().max())
    X = pylab.linspace(0, limit, Nfc)

    def selected(lfc, padj_values, x, padj):
        """Return the set of index labels passing both cutoffs."""
        if mode == "down":
            # less than a given fold change that is negative
            keep = lfc <= x
        elif mode == "up":
            # greater than a given fold change that is positive
            keep = lfc >= x
        else:
            keep = (lfc >= x) | (lfc <= -x)
        keep = keep & (padj_values < padj)
        return set(lfc.index[keep.values])

    common = {}
    for padj in padjs:
        similarity = []
        common[padj] = []
        for x in X:
            A = selected(lfc1, padj1, x, padj)
            B = selected(lfc2, padj2, x, padj)
            shared = len(A & B)

            if len(A) == 0 or len(B) == 0:
                # At least one selection is empty: the curve is pinned to
                # 100 by convention (kept from the original behaviour).
                similarity.append(100)
            else:
                union = len(A) + len(B) - shared
                similarity.append(shared / union * 100)
            common[padj].append(shared)

        if smooth:
            try:
                similarity = pd.Series(similarity).rolling(
                    window).median().values
            except (ValueError, TypeError) as err:
                # Smoothing is best effort: fall back to the raw curve, but
                # tell the user instead of failing silently.
                import warnings
                warnings.warn(
                    "smoothing skipped, raw values plotted: {}".format(err))

        pylab.plot(X, similarity, 'o-', label=str(padj))

    ax = pylab.gca()
    ax.set_ylabel("Jaccard similarity (intersection/union)")
    ax.set_xlabel("Fold change (log2)")

    # Second y-axis: number of entries shared by both selections.
    ax2 = ax.twinx()
    for padj in padjs:
        ax2.plot(X, common[padj], color='orange', ls='--')
    ax2.set_ylabel("Cardinality of the intersection")

    ax.legend()
    ax.set_ylim([0, 100])

    # Visual marker at a four-fold change (|log2FC| = 2).
    if mode == "down":
        ax.axvline(-2, ls='--', color='r')
    else:
        ax.axvline(2, ls='--', color='r')