def plot_percentage_null_read_counts(self):
    """Bar plot of the percentage of null counts found in each sample.

    One bar is drawn per entry of ``self.sample_names``; its height is the
    percentage of rows of ``self.df`` whose count is zero for that sample.
    The dashed horizontal line is the percentage of rows whose counts sum
    to zero across all samples.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()

    """
    nb_samples = len(self.sample_names)

    # percentage of zero entries, sample by sample
    null_per_sample = (self.df[self.sample_names] == 0).sum()
    percent_null = null_per_sample / len(self.df) * 100

    # number of rows that are null in every sample
    nb_all_null = (self.df[self.sample_names].sum(axis=1) == 0).sum()

    positions = range(nb_samples)
    pylab.clf()
    pylab.bar(positions, percent_null)
    pylab.axhline(nb_all_null / len(self.df) * 100, lw=2, ls="--", color="k")
    pylab.xticks(positions, self.sample_names)
    pylab.xlabel("Sample")
def plot_count_per_sample(self, fontsize=12, sample_list=None):
    """Number of mapped reads per sample. Each color for each replicate

    The total count of each sample (column sum of ``self.df``) is shown in
    millions of reads. Bars are colored using ``self.colors``, indexed by
    the value returned by ``self.get_cond_from_sample`` for the sample.

    :param int fontsize: font size of the axis labels and of the title
    :param sample_list: names of the samples to plot, in plotting order.
        Defaults to None, in which case all of ``self.sample_names`` are
        plotted.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    """
    # sample_list used to be accepted but silently ignored; honour it so
    # that a subset of the samples can be plotted.
    if sample_list is None:
        sample_names = self.sample_names
    else:
        sample_names = list(sample_list)

    N = len(sample_names)
    totals = self.df[sample_names].sum()

    # colors follow the very same selection/order as the bars
    colors = [self.colors[self.get_cond_from_sample(sample)] for sample in sample_names]

    pylab.clf()
    pylab.bar(
        range(N),
        (totals / 1000000).values,
        color=colors,
        alpha=1,
        zorder=10,
        lw=1,
        ec="k",
        width=0.9,
    )
    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("Total read count (millions)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(range(N), sample_names)
def plot_corr(self):
    """Show the correlation matrix of the spikes table as an image.

    A ``lengths`` column, looked up from
    ``self.SIRV_data.SIRV.get_lengths_as_dict()`` using the table index, is
    added to the object returned by ``self.spikes_found()`` before the
    correlation is computed. The color scale is limited to [0, 1].
    """
    length_by_name = self.SIRV_data.SIRV.get_lengths_as_dict()
    spikes = self.spikes_found()

    # the lengths column is stored on the returned table itself
    spikes["lengths"] = [length_by_name[name] for name in spikes.index]

    pylab.imshow(spikes.corr())

    labels = spikes.columns
    positions = range(len(labels))
    pylab.xticks(positions, labels, rotation=90)
    pylab.yticks(positions, labels)
    pylab.clim(0, 1)
    pylab.colorbar()
def plot_corr(self):
    """Image of the pairwise correlation between the spikes columns.

    The spikes table comes from ``self.spikes_found()``; it is extended
    with a ``lengths`` column (one value per index entry, read from
    ``self.SIRV_data.SIRV.get_lengths_as_dict()``) before calling
    ``corr()``. Tick labels are the column names and the color limits are
    set to 0 and 1.
    """
    lengths = self.SIRV_data.SIRV.get_lengths_as_dict()
    spikes = self.spikes_found()

    per_spike_length = []
    for identifier in spikes.index:
        per_spike_length.append(lengths[identifier])
    spikes["lengths"] = per_spike_length

    correlation = spikes.corr()
    pylab.imshow(correlation)

    ticks = range(len(spikes.columns))
    pylab.xticks(ticks, spikes.columns, rotation=90)
    pylab.yticks(ticks, spikes.columns)
    pylab.clim(0, 1)
    pylab.colorbar()
def boxplot_mapq_concordance(self):
    """Boxplots of the concordance for each mapq value from 1 to 60.

    The data returned by ``self._get_data()`` are interpreted as three
    columns (mapq, length, concordance). One box is drawn per integer mapq
    value in [1, 60] and x-ticks are shown every 10 units.

    :raises AssertionError: if ``self.method`` is not ``"bwa"``
    """
    # method can only be bwa for now
    assert self.method == "bwa"

    records = self._get_data()
    df = pd.DataFrame(records, columns=["mapq", "length", "concordance"])

    # one series of concordance values per mapq value
    concordances = []
    for mapq in range(1, 61):
        concordances.append(df.loc[df.mapq == mapq, "concordance"])

    pylab.clf()
    pylab.boxplot(concordances)
    pylab.xlabel("mapq")
    pylab.ylabel("concordance")
    pylab.grid()

    ticks = list(range(10, 61, 10))
    pylab.xticks(ticks, ticks)
def boxplot_mapq_concordance(self, method):
    """Boxplots of the concordance for each mapq value from 1 to 60.

    :param str method: passed to ``self._get_data``; only ``"bwa"`` is
        accepted.
    :raises AssertionError: if *method* is not ``"bwa"``

    The data returned by ``self._get_data(method)`` are interpreted as
    three columns (mapq, length, concordance). One box is drawn per integer
    mapq value in [1, 60] and x-ticks are shown every 10 units.
    """
    # method can only be bwa for now
    assert method == "bwa"

    records = self._get_data(method)
    df = pd.DataFrame(records, columns=["mapq", "length", "concordance"])

    # one series of concordance values per mapq value
    concordances = []
    for mapq in range(1, 61):
        concordances.append(df.loc[df.mapq == mapq, "concordance"])

    pylab.clf()
    pylab.boxplot(concordances)
    pylab.xlabel("mapq")
    pylab.ylabel("concordance")
    pylab.grid()

    ticks = list(range(10, 61, 10))
    pylab.xticks(ticks, ticks)
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                   grid=True, xlabel="Number of ZMW passes", logy=True,
                   ylabel="#", label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: forwarded as is to :func:`pylab.hist` (None lets
        matplotlib choose the binning)
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting
    :param int fontsize: font size of the axis labels and of the title
    :param bool grid: add a grid (default to True)
    :param str xlabel: label of the x axis
    :param str ylabel: label of the y axis
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data

        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()

    # The range of observed passes is needed for the x-ticks adjustment
    # below whatever the value of ``bins``. It used to be defined only when
    # ``bins`` was None, raising a NameError for user-provided bins.
    k = range(1, max_nb_pass + 1)

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
               label=label, log=logy, width=1)

    # with very few distinct values, force a small fixed set of ticks
    if len(k) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_ZMW_subreads(self, alpha=0.5, hold=False, fontsize=12,
                      grid=True, xlabel="Number of ZMW passes", logy=True,
                      ylabel="#", label="", title="Number of ZMW passes"):
    """Bar plot of the number of reads per ZMW (number of passes)

    The values are read from ``self._nb_pass``, which is filled by calling
    ``self._get_ZMW_passes()`` when it is still None.

    :param float alpha: transparency of the bars
    :param bool hold: if False (default), the current figure is cleared
        before plotting
    :param int fontsize: font size of the axis labels and of the title
    :param bool grid: add a grid (default to True)
    :param str xlabel: label of the x axis
    :param str ylabel: label of the y axis
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the bars (for the legend)
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data

        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_ZMW_subreads()

    """
    if self._nb_pass is None:
        self._get_ZMW_passes()

    # every number of passes from 1 up to the largest key
    highest = max(self._nb_pass.keys())
    passes = range(1, highest + 1)
    occurrences = [self._nb_pass[nb_pass] for nb_pass in passes]

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.bar(passes, occurrences, alpha=alpha, label=label, log=logy)

    if len(passes) < 5:
        small_ticks = range(6)
        pylab.xticks(small_ticks, small_ticks)

    for set_text, text in ((pylab.xlabel, xlabel), (pylab.ylabel, ylabel), (pylab.title, title)):
        set_text(text, fontsize=fontsize)

    if grid is True:
        pylab.grid(True)
def plot_boxplot_normeddata(self, fliersize=2, linewidth=2, rotation=0, **kwargs):
    """Boxplot of the normalised counts (``self.counts_norm``) on a log scale.

    Counts are clipped to a minimum of 1 before plotting (the y axis is
    logarithmic). Boxes are colored with ``self.design_df.group_color``.

    :param fliersize: forwarded to :func:`seaborn.boxplot`
    :param linewidth: forwarded to :func:`seaborn.boxplot`
    :param rotation: rotation of the x-tick labels
    :param kwargs: any other keyword argument is forwarded to
        :func:`seaborn.boxplot`
    """
    import seaborn as sbn

    ax = sbn.boxplot(
        data=self.counts_norm.clip(1),
        linewidth=linewidth,
        fliersize=fliersize,
        palette=self.design_df.group_color,
        **kwargs,
    )
    pos, labs = pylab.xticks()
    pylab.xticks(pos, labs, rotation=rotation)
    ax.set(yscale="log")

    # _format_plot re-applies a rotation on the x-tick labels (0 by
    # default); forward ours, otherwise the user's choice is reset.
    self._format_plot(ylabel="Normalised count distribution", rotation=rotation)
    pylab.tight_layout()
def plot(self, interpolation='None', aspect='auto', cmap='hot', tight_layout=True,
         colorbar=True, fontsize_x=None, fontsize_y=None, rotation_x=90,
         xticks_on=True, yticks_on=True, **kargs):
    """wrapper around imshow to plot a dataframe

    The dataframe is ``self.df``; its index and columns are used as tick
    labels on the y and x axis, respectively. The current figure is
    cleared first.

    :param interpolation: forwarded to :func:`pylab.imshow`
        (default 'None')
    :param aspect: forwarded to :func:`pylab.imshow` (default 'auto')
    :param cmap: colormap to be used.
    :param tight_layout: call :func:`pylab.tight_layout` at the end
    :param colorbar: add a colorbar (default to True)
    :param fontsize_x: fontsize on xlabels (16 if None)
    :param fontsize_y: fontsize on ylabels (16 if None)
    :param rotation_x: rotate labels on xaxis
    :param xticks_on: switch off the xticks and labels
    :param yticks_on: switch off the yticks and labels
    :param kargs: any other keyword argument is forwarded to
        :func:`pylab.imshow`

    """
    data = self.df

    pylab.clf()
    pylab.imshow(data, interpolation=interpolation, aspect=aspect, cmap=cmap, **kargs)

    # FIXME use default values
    if fontsize_x is None:
        fontsize_x = 16
    if fontsize_y is None:
        fontsize_y = 16

    if yticks_on is True:
        pylab.yticks(range(len(data.index)), data.index, fontsize=fontsize_y)
    else:
        pylab.yticks([])

    if xticks_on is True:
        pylab.xticks(range(len(data.columns)), data.columns,
                     fontsize=fontsize_x, rotation=rotation_x)
    else:
        pylab.xticks([])

    if colorbar is True:
        pylab.colorbar()

    if tight_layout:
        pylab.tight_layout()
def plot_boxplot_rawdata(self, fliersize=2, linewidth=2, rotation=0, **kwargs):
    """Boxplot of the raw counts (``self.counts_raw``) on a log scale.

    Counts are clipped to a minimum of 1 before plotting (the y axis is
    logarithmic). Boxes are colored with ``self.design_df.group_color``.

    :param fliersize: forwarded to :func:`seaborn.boxplot`
    :param linewidth: forwarded to :func:`seaborn.boxplot`
    :param rotation: rotation of the x-tick labels
    :param kwargs: any other keyword argument is forwarded to
        :func:`seaborn.boxplot`
    """
    import seaborn as sbn

    ax = sbn.boxplot(
        data=self.counts_raw.clip(1),
        linewidth=linewidth,
        fliersize=fliersize,
        palette=self.design_df.group_color,
        **kwargs,
    )
    pos, labs = pylab.xticks()
    pylab.xticks(pos, labs, rotation=rotation)
    ax.set_yscale("log")

    # The y label is set by _format_plot only: a former ax.set_ylabel call
    # placed here was always overwritten by it.
    # _format_plot also re-applies a rotation on the x-tick labels (0 by
    # default); forward ours, otherwise the user's choice is reset.
    self._format_plot(ylabel="Raw count distribution", rotation=rotation)
    pylab.tight_layout()
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                   grid=True, xlabel="Number of ZMW passes", logy=True,
                   ylabel="#", label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: forwarded as is to :func:`pylab.hist` (None lets
        matplotlib choose the binning)
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting
    :param int fontsize: font size of the axis labels and of the title
    :param bool grid: add a grid (default to True)
    :param str xlabel: label of the x axis
    :param str ylabel: label of the y axis
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data

        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()

    # Always defined: the x-ticks adjustment below relies on it even when
    # the caller provides ``bins`` (it used to raise a NameError then).
    k = range(1, max_nb_pass + 1)

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
               label=label, log=logy, width=1)

    # with very few distinct values, force a small fixed set of ticks
    if len(k) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def barplot(self, filename="lane{}_status.png", lanes=None):
    """Save one bar plot of the read counts per lane.

    For each lane, one blue bar is drawn per entry whose name is not
    'Undetermined', followed by a single bar for the undetermined reads.
    The latter is green when undetermined reads amount to less than 20% of
    the other reads, orange below 50% and red otherwise (or when there are
    no other reads at all).

    :param str filename: pattern of the output image; ``{}`` is replaced
        by the lane identifier
    :param lanes: lanes to plot. Defaults to all the lanes found in the
        data returned by ``self.get_data_reads()``
    """
    df = self.get_data_reads()
    if lanes is None:
        lanes = df.lane.unique()

    for lane in lanes:
        pylab.clf()

        # NOTE: the queries refer to the local variable ``lane`` (@lane)
        counts = df.query("lane==@lane and name!='Undetermined'")["count"]
        total = counts.sum()
        L = len(counts)
        under = df.query("lane==@lane and name=='Undetermined'")["count"].sum()

        if total > 0:
            pylab.bar(range(L), counts, color="b", label="reads")

        if total == 0:
            color = "red"
        else:
            percent_under = 100 * under / total
            if percent_under < 20:
                color = "green"
            elif percent_under < 50:
                color = "orange"
            else:
                color = "red"

        pylab.bar(range(L, L + 1), under, color=color, label="undetermined")
        pylab.xticks([])
        pylab.ylabel("Number of reads")

        # The legend is a best-effort decoration: a failure must not
        # prevent the image from being saved. Only regular errors are
        # ignored (not KeyboardInterrupt/SystemExit).
        try:
            pylab.legend(loc="lower left")
        except Exception:
            pass

        pylab.title("Lane {}".format(lane))
        pylab.savefig(filename.format(lane), dpi=200)
def plot_percentage_null_read_counts(self):
    """Bar plot of the percentage of null counts found in each sample.

    One bar is drawn per entry of ``self.sample_names``; its height is the
    percentage of rows of ``self.df`` whose count is zero for that sample.
    Bars are colored using ``self.colors``, indexed by the value returned
    by ``self.get_cond_from_sample`` for the sample. The dashed horizontal
    line is the percentage of rows whose counts sum to zero across all
    samples.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()

    """
    nb_samples = len(self.sample_names)

    # percentage of zero entries, sample by sample
    null_per_sample = (self.df[self.sample_names] == 0).sum()
    percent_null = null_per_sample / len(self.df) * 100

    # number of rows that are null in every sample
    nb_all_null = (self.df[self.sample_names].sum(axis=1) == 0).sum()

    bar_colors = [
        self.colors[self.get_cond_from_sample(name)] for name in self.sample_names
    ]

    positions = range(nb_samples)
    pylab.clf()
    pylab.bar(
        positions,
        percent_null,
        color=bar_colors,
        alpha=1,
        zorder=10,
        lw=1,
        ec="k",
        width=0.9,
    )
    pylab.axhline(
        nb_all_null / len(self.df) * 100, lw=2, ls="--", color="k", zorder=20
    )
    pylab.xticks(positions, self.sample_names)
    pylab.xlabel("Sample")
    pylab.ylabel("Proportion of null counts (%)")
    pylab.grid(True, zorder=0)
def plot_count_per_sample(self, fontsize=12, rotation=45):
    """Number of mapped and annotated reads (i.e. counts) per sample. Each color
    for each replicate

    The column sums of ``self.counts_raw`` are joined to ``self.design_df``
    and shown in millions of reads; bars are colored with the
    ``group_color`` column.

    :param int fontsize: font size of the axis labels and of the title
    :param rotation: rotation of the x-tick labels

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    """
    pylab.clf()

    df = self.counts_raw.sum().rename("total_counts")
    df = pd.concat([self.design_df, df], axis=1)

    pylab.bar(
        df.index,
        df.total_counts / 1000000,
        color=df.group_color,
        lw=1,
        zorder=10,
        ec="k",
        width=0.9,
    )

    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("reads (M)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(rotation=rotation, ha="right")

    # tight_layout is cosmetic only: ignore regular failures, but do not
    # swallow KeyboardInterrupt/SystemExit as a bare except would.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_percentage_null_read_counts(self):
    """Bars represent the percentage of null counts in each samples.  The dashed
    horizontal line represents the percentage of feature counts being equal
    to zero across all samples

    Percentages are computed with respect to the number of rows of
    ``self.counts_raw``. Bars are colored with the ``group_color`` column
    of ``self.design_df``.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()

    """
    pylab.clf()

    nb_features = self.counts_raw.shape[0]
    is_null = self.counts_raw == 0

    # how many null counts ?
    df = is_null.sum() / nb_features * 100
    df = df.rename("percent_null")
    df = pd.concat([self.design_df, df], axis=1)

    pylab.bar(df.index, df.percent_null, color=df.group_color, ec="k", lw=1, zorder=10)

    # Features null in every sample, expressed in percent like the bars
    # (the factor 100 was missing, so the line was drawn at a 0-1 fraction
    # on a percent axis).
    all_null = is_null.all(axis=1).sum() / nb_features * 100
    pylab.axhline(all_null, ls="--", color="black", alpha=0.5)

    pylab.xticks(rotation=45, ha="right")
    pylab.ylabel("Proportion of null counts (%)")
    pylab.grid(True, zorder=0)
    pylab.tight_layout()
def _format_plot(self, title="", xlabel="", ylabel="", rotation=0):
    """Set the title, axis labels and x-tick label rotation of the current plot.

    :param str title: title of the figure
    :param str xlabel: label of the x axis
    :param str ylabel: label of the y axis
    :param rotation: rotation applied to the x-tick labels, which are also
        right-aligned (``ha="right"``)
    """
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.title(title)
    pylab.xticks(rotation=rotation, ha="right")
def plot(self,
         kargs_scatter=None,
         kargs_grids=None,
         kargs_histx=None,
         kargs_histy=None,
         scatter_position='bottom left',
         width=.5,
         height=.5,
         offset_x=.10,
         offset_y=.10,
         gap=0.06,
         facecolor='lightgrey',
         grid=True,
         show_labels=True,
         **kargs):
    """Scatter plot of set of 2 vectors and their histograms.

    The data are read from ``self.df`` (columns **x** and **y**). If
    **size** and/or **color** are found in the columns of the dataframe,
    those columns will be used in the scatter plot; kargs_scatter keys
    **s** and **c** will then be ignored.

    :param kargs_scatter: a dictionary with pairs of key/value accepted by
        matplotlib.scatter function. Examples is a list of colors or a list
        of sizes as shown in the examples below. Defaults to
        ``{'s': 20, 'c': 'b'}``.
    :param kargs_grids: a dictionary with pairs of key/value accepted by
        the maplotlib.grid (applied on histogram and axis at the same time)
    :param kargs_histx: a dictionary with pairs of key/value accepted by the
        matplotlib.histogram
    :param kargs_histy: a dictionary with pairs of key/value accepted by the
        matplotlib.histogram. The **orientation** key is always set to
        'horizontal'.
    :param scatter_position: can be 'bottom right/bottom left/top left/top right'
    :param width: width of the scatter plot (value between 0 and 1)
    :param height: height of the scatter plot (value between 0 and 1)
    :param offset_x: horizontal margin used to place the axes
    :param offset_y: vertical margin used to place the axes
    :param gap: gap between the scatter and histogram plots.
    :param facecolor: background color of the three axes
    :param grid: defaults to True
    :param show_labels: label the scatter axes with ``self.xy_names``
    :param kargs: the only optional parameter used is **hold**; unless it
        is set, the current figure is cleared first.
    :return: the scatter, histogram1 and histogram2 axes.
    :raises ValueError: if **scatter_position** is not one of the four
        supported values

    The dictionaries provided by the caller are never modified.

    .. plot::
        :include-source:
        :width: 80%

        import pylab
        import pandas as pd
        X = pylab.randn(1000)
        Y = pylab.randn(1000)
        df = pd.DataFrame({'X':X, 'Y':Y})

        from sequana.viz import ScatterHist
        ScatterHist(df).plot()

    .. plot::
        :include-source:
        :width: 80%

        from sequana.viz import ScatterHist
        ScatterHist(x=[1,2,3,4], y=[3,5,6,4]).plot(
            kargs_scatter={
                's':[200,400,600,800],
                'c': ['red', 'green', 'blue', 'yellow'],
                'alpha':0.5},
            kargs_histx={'color': 'red'},
            kargs_histy={'color': 'green'})

    .. seealso:: `notebook <http://nbviewer.ipython.org/github/sequana/sequana/blob/master/notebooks/viz/scatter.ipynb>`__

    """
    if scatter_position not in ('bottom left', 'bottom right', 'top left', 'top right'):
        raise ValueError(
            "scatter_position must be 'top left', 'top right', 'bottom left', 'bottom right'"
        )

    df = self.df

    # Work on private copies: the options are updated below, and shared
    # (default) or caller-owned dictionaries must not keep those updates
    # from one call to the next.
    scatter_options = {'s': 20, 'c': 'b'} if kargs_scatter is None else dict(kargs_scatter)
    grid_options = {} if kargs_grids is None else dict(kargs_grids)
    histx_options = {} if kargs_histx is None else dict(kargs_histx)
    histy_options = {} if kargs_histy is None else dict(kargs_histy)

    # optional size/color columns take precedence over the s/c options
    for option, column in (('s', 'size'), ('c', 'color')):
        try:
            scatter_options[option] = df[column]
        except KeyError:
            pass

    if kargs.get("hold", False) is False:
        pylab.clf()

    W = width
    H = height
    # room left for the histograms once scatter, margins and gap are placed
    Wh = 1 - offset_x * 2 - W - gap
    Hh = 1 - offset_y * 2 - H - gap

    vertical, horizontal = scatter_position.split()
    if horizontal == 'left':
        X0 = offset_x
        Xoff = offset_x + W + gap
    else:
        X0 = offset_x + Wh + gap
        Xoff = offset_x
    if vertical == 'bottom':
        Y0 = offset_y
        Yoff = offset_y + H + gap
    else:
        Y0 = offset_y + Hh + gap
        Yoff = offset_y

    # ``facecolor`` is a named parameter: it can never be found in kargs,
    # so use it directly (it used to be silently ignored).
    ax_scatter = pylab.axes(
        (X0, Y0, W, H), facecolor=facecolor, xscale='linear', yscale='linear')
    if show_labels:
        ax_scatter.set_xlabel(self.xy_names[0])
        ax_scatter.set_ylabel(self.xy_names[1])

    ax_hist_x = pylab.axes(
        (X0, Yoff, W, Hh), facecolor=facecolor, xscale='linear', yscale='linear')
    ax_hist_y = pylab.axes(
        (Xoff, Y0, Wh, H), facecolor=facecolor, xscale='linear', yscale='linear')

    # move ticks on axis if needed
    ax_hist_x.xaxis.set_ticks_position('top')
    if scatter_position == 'bottom left':
        ax_scatter.yaxis.set_ticks_position('left')
        ax_hist_x.yaxis.set_ticks_position('right')
    elif scatter_position == 'bottom right':
        ax_hist_y.yaxis.set_ticks_position('left')
    elif scatter_position == 'top right':
        ax_scatter.xaxis.set_ticks_position('top')
        ax_scatter.yaxis.set_ticks_position('right')
        ax_hist_y.yaxis.set_ticks_position('left')
        ax_hist_x.xaxis.set_ticks_position('bottom')
    else:  # 'top left'
        ax_scatter.xaxis.set_ticks_position('top')
        ax_hist_y.yaxis.set_ticks_position('right')
        ax_hist_x.xaxis.set_ticks_position('bottom')

    ax_scatter.scatter(df.x, df.y, **scatter_options)
    ax_hist_x.hist(df.x, **histx_options)

    # fixme: user may not want that ?
    histy_options['orientation'] = 'horizontal'
    ax_hist_y.hist(df.y, **histy_options)

    # I tried c.set_xticks but rotation could not be found
    pylab.xticks(ax_hist_y.get_xticks(), rotation=90)

    # grid. The on/off switch is given positionally: its keyword name is
    # not the same across matplotlib versions (b, then visible).
    if grid is True:
        for ax in (ax_scatter, ax_hist_x, ax_hist_y):
            ax.grid(True, which='major', axis='both', **grid_options)

    return (ax_scatter, ax_hist_x, ax_hist_y)