def hist_concordance(self, method, bins=100, fontsize=16):
    """Plot a histogram of the concordance values with mean/median markers.

    The values are read from ``self._concordance``. When that attribute does
    not exist yet, ``self._set_concordance(method)`` is called first and the
    attribute is read again.

    Notes from the original author::

        formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

    For BWA and BLASR, the get_cigar_stats are different !!!
    BWA for instance has no X stored while Pacbio forbids the use of the M
    (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

    Subread Accuracy: The post-mapping accuracy of the basecalls.
    Formula: [1 - (errors/subread length)], where errors = number of
    deletions + insertions + substitutions.

    :param method: forwarded to ``_set_concordance`` when the concordance
        has to be computed.
    :param bins: number of bins given to ``pylab.hist``.
    :param fontsize: font size of the x-axis label.
    """
    try:
        concordance = self._concordance
    except AttributeError:
        # Only a missing cache triggers the computation; any other error
        # is left to propagate instead of being silently swallowed.
        self._set_concordance(method)
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    mu = np.mean(concordance)
    median = np.median(concordance)
    # Solid red line: mean. Dashed red line: median.
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def plot_idr_vs_peaks(self, filename=None, savefig=False):
    """Plot IDR cut-offs against the cumulative number of peaks below them.

    The ``idr`` column of ``self.df`` is split at ``self.threshold``:

    * peaks with ``idr < threshold`` give the red curve, evaluated on 100
      cut-offs between 0 and the threshold;
    * peaks with ``idr >= threshold`` give the black curve, evaluated on 100
      cut-offs between the threshold and 1, shifted by the number of peaks
      of the first group so that both curves are contiguous.

    A horizontal dashed line is drawn at IDR=0.05 and a vertical dashed line
    at ``self.N_significant_peaks``.

    :param filename: output file given to ``pylab.savefig`` when *savefig*
        is True.
    :param savefig: if True, save the current figure into *filename*.
    """
    pylab.clf()

    threshold = self.threshold
    X1 = pylab.linspace(0, threshold, 100)
    X2 = pylab.linspace(threshold, 1, 100)

    # Boolean masks rather than query strings: the two groups are
    # complementary (strictly below / at-or-above the threshold).
    idr = self.df['idr']
    below = idr[idr < threshold]
    above = idr[idr >= threshold]

    pylab.plot([int((below < x).sum()) for x in X1], X1, '-', color='r', lw=2)

    # The second curve starts where the first one ends.
    shift = len(below)
    pylab.plot([shift + int((above < x).sum()) for x in X2], X2, "-",
               color='k', lw=2)

    pylab.xlabel('Number of significant peaks')
    pylab.ylabel('IDR')
    pylab.axhline(0.05, color='b', ls='--')
    pylab.axvline(self.N_significant_peaks, color='b', ls='--')

    if savefig:
        pylab.savefig(filename)
def hist_concordance(self, bins=100, fontsize=16):
    """Plot a histogram of the concordance values with mean/median markers.

    The values are read from ``self._concordance``. When that attribute does
    not exist yet, ``self._set_concordance()`` is called first and the
    attribute is read again.

    Notes from the original author::

        formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

    For BWA and BLASR, the get_cigar_stats are different !!!
    BWA for instance has no X stored while Pacbio forbids the use of the M
    (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

    Subread Accuracy: The post-mapping accuracy of the basecalls.
    Formula: [1 - (errors/subread length)], where errors = number of
    deletions + insertions + substitutions.

    :param bins: number of bins given to ``pylab.hist``.
    :param fontsize: font size of the x-axis label.
    """
    try:
        concordance = self._concordance
    except AttributeError:
        # Only a missing cache triggers the computation; any other error
        # is left to propagate instead of being silently swallowed.
        self._set_concordance()
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    mu = np.mean(concordance)
    median = np.median(concordance)
    # Solid red line: mean. Dashed red line: median.
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=None):
    """Scatter plot of the enriched gene sets against their adjusted p-value.

    The table is obtained from
    ``self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)``. Each
    row is drawn as one point whose x position is ``-log10`` of the
    ``Adjusted P-value`` column, whose y position is the row rank (labelled
    with the ``name`` column), and whose area and colour both follow the
    ``size`` column.

    :param enrich: object exposing a ``results`` attribute that is handed to
        ``_get_final_df``.
    :param cutoff: forwarded to ``_get_final_df``.
    :param nmax: forwarded to ``_get_final_df``.
    :param gene_set_size: currently unused; kept for backward compatibility.
    :return: the dataframe returned by ``_get_final_df``. When it is empty,
        the figure is cleared and nothing is drawn.
    """
    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

    pylab.clf()
    if len(df) == 0:
        # Nothing to draw; max() below would fail on an empty column.
        return df

    positions = range(len(df))
    pylab.scatter(-pylab.log10(df['Adjusted P-value']), positions,
                  s=10 * df['size'], c=df['size'])

    # The x axis shows -log10(adjusted p-value), as in the dashed
    # significance line drawn below.
    pylab.xlabel("Adjusted p-value (-log10)")
    pylab.ylabel("Gene sets")
    pylab.yticks(positions, df.name)

    # Anchor the x axis at zero, keeping the automatic upper bound.
    _, upper = pylab.xlim()
    pylab.xlim([0, upper])
    pylab.grid(True)

    ax = pylab.gca()

    # Three reference entries for the size legend. The marker sizes of the
    # legend are fixed (5, 10, 15); only the labels depend on the data.
    M = max(df['size'])
    if M > 100:
        l1, l2, l3 = "10", "100", str(M)
    else:
        l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)

    handles = [
        pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls=""),
    ]
    ax.legend(handles=handles, loc="upper left", title="gene-set size")

    # 1.3 ~ -log10(0.05)
    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.tight_layout()
    pylab.colorbar(pylab.gci())
    return df
def scatter_length_cov_gc(self, min_length=200, min_cov=10):
    """Scatter plot of contig coverage versus contig length, coloured by GC.

    The ``length``, ``cov`` and ``GC`` columns of ``self.df`` are used. Both
    axes are logarithmic and two dashed red guide lines mark the thresholds.

    :param min_length: x position of the vertical guide line.
    :param min_cov: y position of the horizontal guide line.
    """
    contigs = self.df
    guide_style = {"lw": 2, "c": "r", "ls": '--'}

    pylab.clf()
    pylab.scatter(contigs["length"], contigs["cov"], c=contigs["GC"])
    pylab.loglog()

    # Threshold markers.
    pylab.axvline(min_length, **guide_style)
    pylab.axhline(min_cov, **guide_style)

    pylab.xlabel("contig length")
    pylab.ylabel("contig coverage")
    pylab.colorbar(label="GC")
    pylab.grid(True)
def barplot(self, enrich, cutoff=0.05, nmax=10):
    """Horizontal bar plot of ``-log10(Adjusted P-value)`` for each gene set.

    The table comes from
    ``self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)``; one bar
    is drawn per row and labelled with the ``name`` column.

    :param enrich: object exposing a ``results`` attribute that is handed to
        ``_get_final_df``.
    :param cutoff: forwarded to ``_get_final_df``.
    :param nmax: forwarded to ``_get_final_df``.
    :return: the dataframe returned by ``_get_final_df``.
    """
    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)
    ranks = range(len(df))

    pylab.clf()
    bar_lengths = -pylab.log10(df['Adjusted P-value'])
    pylab.barh(ranks, bar_lengths)
    pylab.yticks(ranks, df.name)

    # Dashed red reference line at 1.3 (~ -log10(0.05)).
    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.grid(True)
    pylab.xlabel("Adjusted p-value (log10)")
    pylab.ylabel("Gene sets")

    # Start the x axis at zero while keeping the automatic upper limit.
    upper = pylab.xlim()[1]
    pylab.xlim([0, upper])
    pylab.tight_layout()
    return df
def plot(self, X=[0, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8, .9, .95, .99, .999, 1],
         fontsize=16, label=None):
    """Plot, for each threshold in *X*, the percentage of rows above it.

    For every value ``x`` in *X*, the y value is the percentage of rows of
    ``self.df`` whose ``self.coverage_column`` entry is strictly greater
    than ``x``. Thresholds are shown on the x axis multiplied by 100.
    Dashed red guide lines are added at 25, 50 and 75 on both axes.

    :param X: thresholds to evaluate.
    :param fontsize: font size of both axis labels.
    :param label: optional legend label for the curve.
    """
    column = self.coverage_column
    total = float(len(self.df))
    X = np.array(X)

    percentages = []
    for cutoff in X:
        percentages.append(sum(self.df[column] > cutoff) / total * 100)
    Y = np.array(percentages)

    # Only pass a label to matplotlib when the caller provided one.
    extra = {} if label is None else {"label": label}
    pylab.plot(X * 100, Y, "o-", **extra)

    pylab.xlabel("Gene coverage (%)", fontsize=fontsize)
    pylab.ylabel("Percentage of genes covered", fontsize=fontsize)

    for level in (25, 50, 75):
        pylab.axhline(level, color="r", alpha=0.5, ls="--")
        pylab.axvline(level, color="r", alpha=0.5, ls="--")
def plot_common_major_counts(self, mode, labels=None,
                             switch_up_down_cond2=False, add_venn=True,
                             xmax=None, title="", fontsize=12,
                             sortby="log2FoldChange"):
    """Plot the percentage of features shared by the top-ranked genes of
    the two analyses ``self.r1`` and ``self.r2``.

    For each analysis, the genes listed in ``gene_lists[mode]`` are selected
    from its ``df`` and sorted by *sortby* (ascending for ``"down"``,
    descending otherwise). For each rank ``i``, the percentage of features
    common to the first ``i`` entries of both rankings is plotted. The
    values of *sortby* for the first analysis are overlaid on a secondary
    y axis.

    :param mode: down, up or all
    :param labels: two labels forwarded to the Venn diagram; defaults to
        ``['r1', 'r2']``.
    :param switch_up_down_cond2: currently unused.
    :param add_venn: if True, add an inset Venn diagram for *mode*.
    :param xmax: upper limit of the x axis; defaults to the size of the
        larger gene list.
    :param title: title of the figure.
    :param fontsize: font size of axis labels and title.
    :param sortby: column of the result tables used for ranking.

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana.compare import RNADiffCompare

        c = RNADiffCompare(
            sequana_data("rnadiff/rnadiff_onecond_1"),
            sequana_data("rnadiff/rnadiff_onecond_2"))
        c.plot_common_major_counts("down")
    """
    if labels is None:
        labels = ['r1', 'r2']

    # "down" features have negative values, so the strongest ones come
    # first in ascending order; for other modes the largest come first.
    ascending = mode == "down"

    # De-duplicate while producing lists: .loc does not accept sets as
    # indexers in recent pandas versions.
    genes1 = list(dict.fromkeys(self.r1.gene_lists[mode]))
    genes2 = list(dict.fromkeys(self.r2.gene_lists[mode]))

    # Each analysis is ranked on its own gene list.
    A = self.r1.df.loc[genes1].sort_values(by=sortby, ascending=ascending)
    B = self.r2.df.loc[genes2].sort_values(by=sortby, ascending=ascending)

    largest = max(len(A), len(B))

    # Percentage of common features among the first i entries of each
    # ranking.
    # NOTE(review): rank i is stored at position i - 1 and the last rank
    # (i == largest) is not evaluated; kept as in the original layout.
    N = []
    for i in range(1, largest):
        top_a = set(A.iloc[0:i].index)
        top_b = set(B.iloc[0:i].index)
        N.append(len(top_b.intersection(top_a)) / i * 100)

    max_common = len(set(A.index).intersection(set(B.index)))

    pylab.clf()
    if len(A) > len(B):
        pylab.axhline(max_common / len(A) * 100, color="r", ls='--',
                      label="min set intersection")
        pylab.axvline(len(B), ls="--", color="k", label="rank of minor set")
    else:
        # len(B) is zero only when both lists are empty.
        common_pct = max_common / len(B) * 100 if len(B) else 0
        pylab.axhline(common_pct, color='r', ls='--',
                      label="min set intersect")
        pylab.axvline(len(A), ls="--", color="k", label="rank of minor set")

    pylab.plot(N)
    pylab.xlabel('rank', fontsize=fontsize)
    pylab.ylabel('% common features', fontsize=fontsize)
    pylab.grid(True)
    pylab.ylim([0, 100])
    if xmax:
        pylab.xlim([0, xmax])
    else:
        pylab.xlim([0, largest])
    pylab.title(title, fontsize=fontsize)

    # Secondary axis: values of the ranking column for the first analysis.
    ax = pylab.gca()
    ax2 = ax.twinx()
    ax2.plot(A[sortby].values, "orange", label=sortby)
    ax2.set_ylabel(sortby)
    pylab.legend(loc="lower left")
    ax.legend(loc="lower right")

    if add_venn:
        f = pylab.gcf()
        ax = f.add_axes([0.5, 0.5, 0.35, 0.35], facecolor="grey")
        if mode == "down":
            self.plot_venn_down(ax=ax, title=None, labels=labels,
                                mode="two_only")
        elif mode == "up":
            self.plot_venn_up(ax=ax, title=None, labels=labels,
                              mode="two_only")
        elif mode == "all":
            self.plot_venn_all(ax=ax, title=None, labels=labels,
                               mode="two_only")
def plot_go_terms(self, ontologies, max_features=50, log=False, fontsize=8,
                  minimum_genes=0, pvalue=0.05, cmap="summer_r",
                  sort_by="fold_enrichment", show_pvalues=False,
                  include_negative_enrichment=False, fdr_threshold=0.05,
                  compute_levels=True, progress=True):
    """Scatter plot of the enriched GO terms.

    The table is obtained from ``self.get_data`` and filtered on *pvalue*
    and *minimum_genes*; the ids listed in ``self.ontologies`` are removed.
    The remaining terms are sorted according to *sort_by* and the last
    *max_features* are drawn, one per row: x is the fold enrichment
    (optionally in log2), colour is the FDR and marker area follows
    ``number_in_list``.

    :param ontologies: forwarded to ``self.get_data``.
    :param max_features: maximum number of GO terms shown.
    :param log: if True, plot ``log2`` of the fold enrichment.
    :param fontsize: font size of the y tick labels.
    :param minimum_genes: keep terms with ``number_in_list`` strictly
        greater than this value.
    :param pvalue: keep terms with ``pValue`` lower or equal to this value.
    :param cmap: colormap used for the FDR.
    :param sort_by: one of ``pValue``, ``fold_enrichment``, ``fdr``.
    :param show_pvalues: if True, overlay ``-log10(pValue)`` on a twin
        x axis.
    :param include_negative_enrichment: forwarded to ``self.get_data``; also
        makes the x range symmetric around zero.
    :param fdr_threshold: forwarded to ``self.get_data`` as ``fdr`` and used
        as the upper bound of the colour scale.
    :param compute_levels: if True, call ``self.get_graph`` to annotate each
        term with its level.
    :param progress: forwarded to ``self.get_graph``.
    :return: the filtered dataframe. When terms are plotted, it is sorted
        and also stored in ``self.df``, and the plotted subset is stored in
        ``self.subdf``. When no term is left to plot, the dataframe is
        returned early and those attributes are not updated.
    """
    assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

    # FIXME: pvalue and fold_enrichment not sorted in same order
    pylab.clf()
    df = self.get_data(
        ontologies,
        include_negative_enrichment=include_negative_enrichment,
        fdr=fdr_threshold)

    if len(df) == 0:
        return df

    df = df.query("pValue<=@pvalue")
    logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
        pvalue, len(df)))
    df = df.reset_index(drop=True)

    # Select a subset of the data to keep the best max_features in terms of
    # pValue
    subdf = df.query("number_in_list>@minimum_genes").copy()
    logger.info(
        "Filtering out GO terms with less than {} genes: Kept {} GO terms".
        format(minimum_genes, len(subdf)))

    logger.info("Filtering out the 3 parent terms")
    subdf = subdf.query("id not in @self.ontologies")

    if len(subdf) == 0:
        # Nothing left to draw: the marker-size computation below divides
        # by the number of terms and would fail.
        logger.warning("No GO term left after filtering; nothing to plot")
        return df

    # Keeping only a part of the data. The rows to plot are the *last*
    # max_features of the sorted subset.
    if sort_by == "pValue":
        subdf = subdf.sort_values(by="pValue",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="pValue", ascending=False)
    elif sort_by == "fold_enrichment":
        subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                  ascending=True).iloc[-max_features:]
        df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
    elif sort_by == "fdr":
        subdf = subdf.sort_values(by="fdr",
                                  ascending=False).iloc[-max_features:]
        df = df.sort_values(by="fdr", ascending=False)

    subdf = subdf.reset_index(drop=True)

    # We get all levels for each go id.
    # They are stored by MF, CC or BP
    if compute_levels:
        paths = self.get_graph(list(subdf['id'].values), progress=progress)
        # Merge the per-ontology mappings into a fresh dictionary so that
        # the object returned by get_graph is not modified.
        goid_levels = {}
        for mapping in paths.values():
            goid_levels.update(mapping)
        subdf["level"] = [goid_levels[ID] for ID in subdf['id'].values]
    else:
        subdf['level'] = ""

    N = len(subdf)

    # Marker areas: proportional to number_in_list, scaled by the number of
    # rows, with a lower bound of 20% of the largest gene count.
    size_factor = 12000 / N
    max_size = subdf.number_in_list.max()
    min_size = subdf.number_in_list.min()
    sizes = [
        max(max_size * 0.2, x) for x in size_factor *
        subdf.number_in_list.values / subdf.number_in_list.max()
    ]

    # Three reference areas reused for the size legend.
    m1 = min(sizes)
    m3 = max(sizes)
    m2 = m1 + (m3 - m1) / 2

    if log:
        xvalues = pylab.log2(subdf.fold_enrichment)
    else:
        xvalues = subdf.fold_enrichment
    pylab.scatter(xvalues, range(N), c=subdf.fdr, s=sizes, cmap=cmap,
                  alpha=0.8, ec="k", vmin=0, vmax=fdr_threshold, zorder=10)

    pylab.grid(zorder=-10)
    ax2 = pylab.colorbar(shrink=0.5)
    ax2.ax.set_ylabel('FDR')

    # Truncate long labels to 50 characters.
    labels = [
        x if len(x) < 50 else x[0:47] + "..." for x in list(subdf.label)
    ]
    ticks = [
        "{} ({}) {}".format(ID, level, "; " + label.title())
        for level, ID, label in zip(subdf['level'], subdf.id, labels)
    ]

    pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

    # Labels are left-aligned, so the tick padding must make room for them.
    yax = pylab.gca().get_yaxis()
    try:
        pad = [x.label.get_window_extent().width for x in yax.majorTicks]
        yax.set_tick_params(pad=max(pad))
    except Exception:
        # Best effort: fall back on a padding derived from the font size.
        yax.set_tick_params(pad=60 * fontsize * 0.7)
    # NOTE(review): this unconditional call overrides the padding set just
    # above; kept as in the original layout, confirm whether it is intended.
    yax.set_tick_params(pad=60 * fontsize * 0.6)

    fc_max = subdf.fold_enrichment.max(skipna=True)
    fc_min = subdf.fold_enrichment.min(skipna=True)
    # go into log2 space
    fc_max = pylab.log2(fc_max)
    fc_min = pylab.log2(fc_min)
    abs_max = max(fc_max, abs(fc_min), 1)

    pylab.axvline(0, color="k", lw=2)
    if log:
        fc_max = abs_max * 1.5
        pylab.xlabel("Fold Enrichment (log2)")
    else:
        fc_max = 2**abs_max * 1.2
        pylab.xlabel("Fold Enrichment")

    if include_negative_enrichment:
        pylab.xlim([-fc_max, fc_max])
    else:
        pylab.xlim([0, fc_max])
    pylab.tight_layout()

    # The pvalue:
    if show_pvalues:
        ax = pylab.gca().twiny()
        ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
        ax.set_xlabel("p-values (log10)", fontsize=12)
        ax.plot(-pylab.log10(subdf.pValue), range(N), label="pvalue",
                lw=2, color="k")
        ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
        pylab.tight_layout()
        pylab.legend(loc="lower right")

    # Empty scatters used only as legend handles for the marker sizes.
    s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
    s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
    s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

    # Legend spacing depends on the number of rows.
    if N < 10:
        labelspacing = 1.5 * 4
        borderpad = 4
        handletextpad = 2
    elif N < 20:
        labelspacing = 1.5 * 2
        borderpad = 1
        handletextpad = 2
    else:
        labelspacing = 1.5
        borderpad = 2
        handletextpad = 2

    if N >= 3:
        leg = pylab.legend(
            (s1, s2, s3),
            (str(int(min_size)),
             str(int(min_size + (max_size - min_size) / 2)),
             str(int(max_size))),
            scatterpoints=1, loc='lower right', ncol=1, frameon=True,
            title="gene-set size", labelspacing=labelspacing,
            borderpad=borderpad, handletextpad=handletextpad, fontsize=8)
    else:
        leg = pylab.legend(
            (s1, ), (str(int(min_size)), ),
            scatterpoints=1, loc='lower right', ncol=1, frameon=True,
            title="gene-set size", labelspacing=labelspacing,
            borderpad=borderpad, handletextpad=handletextpad, fontsize=8)

    frame = leg.get_frame()
    frame.set_facecolor('#b4aeae')
    frame.set_edgecolor('black')
    frame.set_alpha(1)

    self.subdf = subdf
    self.df = df
    return df