def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
    """Number Of Polymerase Reads Per Barcode

    Plot the "Polymerase Reads" counts of :attr:`df_barcoded` sorted in
    decreasing order, together with a horizontal line at their mean. The
    title reports the sum of these counts.

    :param int fontsize: font size of the x and y labels
    :param bool unbarcoded: if True, also draw a dashed horizontal line at
        the first "Polymerase Reads" value of :attr:`df_not_barcoded`.
        This is best-effort: if that value cannot be retrieved the line is
        simply omitted.
    """
    PR = self.df_barcoded["Polymerase Reads"].sum()

    data = self.df_barcoded['Polymerase Reads'].sort_values(
        ascending=False).values
    pylab.plot(list(range(1, len(data) + 1)), data, label="barcodes")
    pylab.axhline(data.mean(), color="r", label="average")

    if unbarcoded is True:
        try:
            unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
            pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
        except Exception:
            # best effort only; unlike a bare except this does not hide
            # KeyboardInterrupt / SystemExit
            pass

    pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
    pylab.ylabel("Counts of Reads", fontsize=fontsize)
    pylab.title("Total Polymerase count: {}".format(PR))
    pylab.legend()
    pylab.ylim(ymin=0)
    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; keep the figure if it fails
        pass
def plot_volcano(self):
    """Volcano plot: -log10(adjusted p-value) versus log2 fold change

    Rows of :attr:`df` with ``padj > 0.05`` are drawn in red and rows with
    ``padj <= 0.05`` in black. The x-axis is made symmetric around zero
    and a dashed horizontal line marks the 0.05 threshold.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_volcano()

    """
    d1 = self.df.query("padj>0.05")
    d2 = self.df.query("padj<=0.05")

    pylab.figure()
    pylab.plot(d1.log2FoldChange, -np.log10(d1.padj), marker="o",
               alpha=0.5, color="r", lw=0)
    pylab.plot(d2.log2FoldChange, -np.log10(d2.padj), marker="o",
               alpha=0.5, color="k", lw=0)

    pylab.grid(True)
    pylab.xlabel("fold change")
    pylab.ylabel("log10 adjusted p-value")

    # Symmetric x-range. Series.abs().max() skips NaN, whereas the builtin
    # min()/max() give order-dependent results when NaN are present.
    limit = self.df.log2FoldChange.abs().max()
    pylab.xlim([-limit, limit])

    # keep the automatic upper bound but start the y-axis at zero
    _, y2 = pylab.ylim()
    pylab.ylim([0, y2])

    pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r",
                  label="pvalue threshold (0.05)")
def plot_contig_length_vs_GC(self, alpha=0.5):
    """Scatter plot of the 'GC' column against the 'length' column of :attr:`df`

    :param float alpha: transparency of the markers
    """
    lengths = self.df["length"]
    gc_content = self.df['GC']

    pylab.plot(lengths, gc_content, "o", alpha=alpha)
    pylab.xlabel("contig length (bp)")
    pylab.ylabel("GC (%)")
    pylab.grid(True)

    # GC is a percentage; leave a small margin after the longest contig
    pylab.ylim([0, 100])
    longest = max(lengths)
    pylab.xlim(0, longest + 10)
def plot(self, color_line='r', bgcolor='grey', color='yellow', lw=4,
         hold=False, ax=None):
    """Plot the mean of :attr:`df` with a +/- one standard deviation band

    Three coloured background bands are drawn first (0-20 red, 20-30
    orange, 30-41 green), then the band and the mean curve on top of them.

    :param color_line: colour of the mean curve
    :param bgcolor: not used by this method
    :param color: colour of the mean +/- std band
    :param lw: line width of the mean curve
    :param hold: not used by this method
    :param ax: if set, it is made the current axes before plotting
    """
    xmax = self.xmax + 1
    if ax:
        pylab.sca(ax)

    for low, high, band_color in ((0, 20, 'red'), (20, 30, 'orange'),
                                  (30, 41, 'green')):
        pylab.fill_between([0, xmax], [low, low], [high, high],
                           color=band_color, alpha=0.3)

    # X was only defined when self.X is None, which raised a NameError as
    # soon as self.X was set; fall back on the stored positions instead.
    if self.X is None:
        X = range(1, self.xmax + 1)
    else:
        X = self.X

    mean = self.df.mean()
    std = self.df.std()
    pylab.fill_between(X, mean + std, mean - std, color=color,
                       interpolate=False)
    pylab.plot(X, mean, color=color_line, lw=lw)

    pylab.ylim([0, 41])
    pylab.xlim([0, self.xmax + 1])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def plot_gc_vs_coverage(self, filename=None, bins=None, Nlevels=6,
                        fontsize=20, norm="log", ymin=0, ymax=100,
                        contour=True, **kwargs):
    """Plot a 2D histogram of the GC content (%) versus the coverage

    The 'cov' and 'gc' columns of :attr:`df` are used; 'gc' is multiplied
    by 100 and rows with missing values are dropped.

    :param str filename: if set, the figure is saved into this file
    :param bins: binning given to the 2D histogram. If None, the x
        binning is 100 capped by the maximum coverage (and at least 10),
        and the y binning is derived from the GC range and from
        ``self.bed.gc_window_size``.
    :param int Nlevels: number of contour levels; None or 0 disables the
        contours
    :param int fontsize: font size given to the histogram plot
    :param norm: normalisation given to the histogram plot
    :param ymin: lower limit of the y-axis
    :param ymax: upper limit of the y-axis
    :param bool contour: draw contours on top of the histogram
    :param kwargs: any other argument is given to ``Hist2D.plot``
    """
    if Nlevels is None or Nlevels == 0:
        contour = False

    data = self.df[['cov', 'gc']].copy()
    data['gc'] *= 100
    data = data.dropna()

    if bins is None:
        bins = [100, min(int(data['gc'].max() - data['gc'].min() + 1),
                         max(5, self.bed.gc_window_size - 4))]
        bins[0] = max(10, min(bins[0], self.df['cov'].max()))

    from biokit import Hist2D
    h2 = Hist2D(data)

    def draw(with_contour):
        # single place for the plotting call shared by both attempts
        h2.plot(bins=bins, xlabel="Per-base coverage",
                ylabel=r'GC content (%)', Nlevels=Nlevels,
                contour=with_contour, norm=norm, fontsize=fontsize,
                **kwargs)

    try:
        draw(contour)
    except Exception:
        # second attempt, identical except that contours are switched off
        draw(False)

    pylab.ylim([ymin, ymax])
    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; keep the figure if it fails
        pass

    if filename:
        pylab.savefig(filename)
def plot_alignment(self, bamfile, motif, window=200, global_th=10,
                   title=None, legend=True, legend_fontsize=11,
                   valid_rnames=[], valid_flags=[]):
    """Plot the motif count in a sliding window for each selected alignment

    For every alignment kept, the number of occurrences of *motif* in the
    window starting at each base of the query sequence is plotted against
    the reference position.

    :param bamfile: input file given to ``BAM``
    :param str motif: motif to count
    :param int window: length of the sliding window
    :param global_th: not used by this method
    :param str title: optional title of the figure
    :param bool legend: show the legend (only if fewer than 15 curves)
    :param int legend_fontsize: font size of the legend
    :param valid_rnames: if not empty, keep only alignments whose
        ``rname`` is in this container
    :param valid_flags: if not empty, keep only alignments whose ``flag``
        is in this container
    """
    bam = BAM(bamfile)
    print("Found {} hits".format(len(bam)))
    pylab.clf()

    count = 0
    for aln in bam:
        keep = ((not valid_rnames or aln.rname in valid_rnames)
                and (not valid_flags or aln.flag in valid_flags))
        if not keep:
            continue

        seq = aln.query_sequence
        if not seq:
            continue

        count += 1
        occurrences = [seq[start:start + window].count(motif)
                       for start in range(len(seq))]
        positions = range(aln.reference_start,
                          aln.reference_start + len(seq))
        pylab.plot(positions, occurrences, label=aln.query_name)

    print("Showing {} entries after filtering".format(count))

    # upper bound of the y-axis: 1.2 times window / motif length
    max_theo = int(1.2 * window / len(motif))
    pylab.ylim([0, max_theo])

    if legend and count < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)
def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
    """Horizontal bar plot of the number of reads per sample

    The table returned by :meth:`get_data_reads` is split into the rows
    named 'Undetermined' and all other rows. Counts are summed per name.
    The samples are plotted first, then the undetermined counts are
    plotted in red on the same axes.

    :param float alpha: transparency of the bars
    :param float width: width of the bars
    :param str filename: if set, the figure is saved into this file
        (dpi=200)
    """
    df = self.get_data_reads()

    # this is ugly but will do the job for now
    under = df.query("name=='Undetermined'")
    others = df.query("name!='Undetermined'")

    under = under.groupby("name").sum().reset_index()
    others = others.groupby("name").sum().reset_index()

    under = under[["name", "count"]].set_index("name")
    others = others[["name", "count"]].set_index("name")

    all_data = others.sort_index(ascending=False)
    all_data.columns = ["samples"]

    # appended at the end
    all_data.loc['undetermined'] = 0

    # revert back
    all_data = all_data.loc[::-1]

    # just for legend
    under.columns = ['undetermined']

    # switch to millions of reads when the total is large
    if all_data.sum().min() > 1e6:
        all_data /= 1e6
        under /= 1e6
        M = True
    else:
        M = False

    all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')
    under.plot(kind="barh", alpha=alpha, color="red", ax=pylab.gca(),
               zorder=1, width=width, ec='k')

    pylab.ylim([-0.5, len(all_data) + 0.5])
    if len(all_data) < 100:
        pylab.yticks(range(len(all_data)), all_data.index)

    pylab.legend()
    pylab.grid(True, zorder=-1)
    if M:
        pylab.xlabel("Number of reads (M)")
    else:
        pylab.xlabel("Number of reads")

    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; keep the figure if it fails
        pass

    if filename:
        pylab.savefig(filename, dpi=200)
def plot(self, fontsize=16):
    """Plot the quality as a function of the base position

    :param int fontsize: font size of the x and y labels
    """
    curve_label = "offset: %s" % self.offset
    pylab.plot(self.quality, label=curve_label)

    pylab.xlabel('base position', fontsize=fontsize)
    pylab.ylabel('Quality per base', fontsize=fontsize)
    pylab.grid(True)

    # Setting ylim switches autoscale off. So that this method can be
    # called several times on the same axes, autoscale is switched back on
    # before the limits are read and widened by one unit on each side.
    pylab.autoscale()
    lower, upper = pylab.ylim()
    pylab.ylim(max(0, lower - 1), upper + 1)
def plot(self, fontsize=16):
    """Plot :attr:`quality` versus the base position

    The curve is labelled with the current :attr:`offset`.

    :param int fontsize: font size of the axis labels
    """
    pylab.plot(self.quality, label="offset: %s" % self.offset)
    pylab.xlabel('base position', fontsize=fontsize)
    pylab.ylabel('Quality per base', fontsize=fontsize)
    pylab.grid(True)

    # A call to ylim disables autoscale; re-enable it first so that
    # repeated calls recompute the limits from all the curves.
    pylab.autoscale()
    ymin, ymax = pylab.ylim()
    ymin = max(0, ymin - 1)
    ymax = ymax + 1
    pylab.ylim(ymin, ymax)
def plot(self, clf=True):
    """Plot the 'shustring_length' column of :attr:`df_shustring`

    Dashed grey horizontal lines are added every 1000 units, from 0 up to
    the maximum length. The maximum length is also printed.

    :param bool clf: clear the current figure first
    """
    if clf:
        pylab.clf()

    lengths = self.df_shustring.shustring_length
    longest = lengths.max()
    print(longest)

    # one guide line per started thousand
    n_lines = int(longest / 1000) + 1
    for level in range(0, n_lines * 1000, 1000):
        pylab.axhline(level, ls='--', color='grey')

    pylab.plot(lengths)
    pylab.xlabel('position (bp)')
    pylab.ylabel('Length of repeats')
    pylab.ylim(bottom=0)
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[60, 60],
                     grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot GC content versus read length

    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid to the figure
    :param str xlabel: not used by this method
    :param str ylabel: not used by this method
    :param cmap: colormap given to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    # the dataframe is built on first use
    if self._df is None:
        self._get_df()

    mean_len = np.mean(self._df.loc[:, 'read_length'])
    mean_GC = np.mean(self._df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()

    data = self._df.loc[:, ['read_length', 'GC_content']].dropna()
    hist = biokit.viz.hist2d.Hist2D(data)
    hist.plot(bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)

    title = ("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
             (mean_len, mean_GC))
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    pylab.ylim([0, 100])

    if grid is True:
        pylab.grid(True)
def plot_indel_dist(self, fontsize=16):
    """Plot indel count (+ ratio)

    :param int fontsize: font size of the x and y labels
    :Return: three lists: number of insertions, number of deletions and
        ratio insertions/deletions. Index *i* of each list corresponds to
        indels of length *i* (0 when there is no deletion of that length
        for the ratio).
    :raises ValueError: if there is no insertion or no deletion

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_indel_dist()

    What you see on this figure is the presence of 10 insertions of length
    1, 1 insertion of length 2 and 3 deletions of length 1


    # Note that in samtools, several insertions or deletions in a single
    alignment are ignored and only the first one seems to be reported. For
    instance 10M1I10M1I stored only 1 insertion in its report; Same comment
    for deletions.

    .. todo:: speed up and handle long reads cases more efficiently by
        storing INDELS as histograms rather than lists
    """
    # indels are computed on first use only
    try:
        self.insertions
    except AttributeError:
        self._set_indels()

    if len(self.insertions) == 0 or len(self.deletions) == 0:
        raise ValueError("No deletions or insertions found")

    # Count each length once instead of calling list.count() for every
    # possible length, which is quadratic on large inputs.
    deletion_counts = Counter(self.deletions)
    insertion_counts = Counter(self.insertions)

    N = max(max(deletion_counts), max(insertion_counts)) + 1
    D = [deletion_counts[i] for i in range(N)]
    I = [insertion_counts[i] for i in range(N)]
    R = [i / d if d != 0 else 0 for i, d in zip(I, D)]

    fig, ax = pylab.subplots()
    ax.plot(range(N), I, marker="x", label="Insertions")
    ax.plot(range(N), D, marker="x", label="Deletions")
    ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
    ax.set_yscale("symlog")
    pylab.ylim([1, pylab.ylim()[1]])
    pylab.legend()
    pylab.grid()

    # indel lengths are integers: force integer ticks on the x-axis
    from matplotlib.ticker import MaxNLocator
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))

    pylab.xlabel("Indel length", fontsize=fontsize)
    pylab.ylabel("Indel count", fontsize=fontsize)
    return I, D, R
def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow',
                    lw=4, hold=False, ax=None):
    """Plot the mean quality per position with a +/- one std band

    The columns "0" to "41" of :attr:`df` are read for the rows 1 to 150,
    and normalised by ``self.metadata['ReadNum']``. As a side effect,
    :attr:`xmax` is set to 150 and the lengths of the three computed
    lists are printed.

    :param color_line: colour of the mean curve
    :param bgcolor: not used by this method
    :param color: colour of the mean +/- std band
    :param lw: line width of the mean curve
    :param hold: not used by this method
    :param ax: if set, it is made the current axes before plotting
    """
    # not sure why we have phred score from 0 to 41
    score_columns = [str(score) for score in range(42)]
    quality = self.df[score_columns]
    N = self.metadata['ReadNum']
    proba = quality / N

    self.xmax = 150
    xmax = self.xmax + 1
    if ax:
        pylab.sca(ax)  # pragma no cover

    for low, high, band_color in ((0, 20, 'red'), (20, 30, 'orange'),
                                  (30, 41, 'green')):
        pylab.fill_between([0, xmax], [low, low], [high, high],
                           color=band_color, alpha=0.3)

    X, Q, S = [], [], []
    for pos in range(1, 151):
        row = quality.loc[pos]

        # NOTE(review): the mean weights each count by (score + 1) whereas
        # the deviation below uses the score itself -- to be confirmed
        weighted = [(int(k) + 1) * v for k, v in row.items()]
        mean_quality = sum(weighted) / N
        X.append(pos)
        Q.append(mean_quality)

        proba = row / N
        variance = sum([(x - mean_quality) ** 2 * y
                        for x, y in zip(range(42), proba)])
        S.append(pylab.sqrt(variance))

    for values in (X, Q, S):
        print(len(values))

    Q = np.array(Q)
    X = np.array(X)
    S = np.array(S)
    pylab.fill_between(X, Q + S, Q - S, color=color, interpolate=False)
    pylab.plot(X, Q, color=color_line, lw=lw)

    pylab.ylim([0, 41])
    pylab.xlim([0, self.xmax + 1])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def plot_alignment(self, motif, window=200, global_th=10, title=None,
                   legend=True, legend_fontsize=11):
    """Plot the motif count in a sliding window for the matching alignments

    The alignments are selected with :meth:`_get_aligments`. For each
    alignment of :attr:`bamfile` whose query name is in that selection,
    the number of occurrences of *motif* in the window starting at each
    base is plotted against the reference position.

    :param str motif: motif to count
    :param int window: length of the sliding window
    :param global_th: threshold given to :meth:`_get_aligments`
    :param str title: optional title of the figure
    :param bool legend: show the legend (only if fewer than 15 curves)
    :param int legend_fontsize: font size of the legend
    :return: the dataframe returned by :meth:`_get_aligments`
    """
    df = self._get_aligments(motif=motif, window=window,
                             global_th=global_th)
    print("Found {} hits".format(len(df)))

    bam = BAM(self.bamfile)
    pylab.clf()

    count = 0
    for aln in bam:
        if aln.query_name not in df.query_name.values:
            continue
        seq = aln.query_sequence
        if not seq:
            continue

        count += 1
        occurrences = [seq[start:start + window].count(motif)
                       for start in range(len(seq))]
        positions = range(aln.reference_start,
                          aln.reference_start + len(seq))
        pylab.plot(positions, occurrences, label=aln.query_name)

    # upper bound of the y-axis: 1.2 times window / motif length
    max_theo = int(1.2 * window / len(motif))
    pylab.ylim([0, max_theo])

    if legend and count < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)

    return df
def plot(self):
    """Plot the 'in_peaks' column against the 'FRiP' column, one series per label

    If a design is available, the 'label' column of :attr:`df` is first
    (re)built as "type/condition" from the design. A dashed line joins the
    origin to the point (max FRiP, max in_peaks).
    """
    if self.design:
        design = self.design.df
        self.df['label'] = design['type'] + "/" + design['condition']

    pylab.clf()
    frip_max = self.df.FRiP.max()
    peaks_max = self.df['in_peaks'].max()
    pylab.plot([0, frip_max], [0, peaks_max], ls='--', color='b', alpha=0.5)

    # the loop variable must be named 'label': it is referenced from the
    # query string below
    for label in self.df['label'].unique():
        subset = self.df.query('label==@label')
        subset.plot(x='FRiP', y='in_peaks', marker="o", lw=0, label=label,
                    ax=pylab.gca())

    pylab.ylabel('Reads in peaks')
    pylab.xlabel('FRiP')

    # both axes start at zero; the automatic upper bounds are kept
    x_upper = pylab.xlim()[1]
    pylab.xlim(0, x_upper)
    y_upper = pylab.ylim()[1]
    pylab.ylim(0, y_upper)
    pylab.grid()
def plot_sequence_quality(self, max_score=40, ax=None):
    """Plot the mean quality per base position for every sample

    For each sample of :attr:`fastqc_data` that has a
    'per_base_sequence_quality' entry, the 'mean' values are plotted
    against the positions obtained from ``_avg_bp_from_range`` applied to
    the 'base' values. Three coloured background bands are added (0-20
    red, 20-30 orange, 30 to the upper limit green).

    :param int max_score: the y-axis extends at least to max_score + 1
    :param ax: if set, it is made the current axes before anything is
        drawn
    """
    # Select the target axes first, so that the curves and the background
    # bands all end up on the same axes.
    if ax:
        pylab.sca(ax)

    ymax = max_score + 1
    xmax = 0
    for sample in self.fastqc_data.keys():
        sample_data = self.fastqc_data[sample]
        # test the key that is actually read below
        if "per_base_sequence_quality" not in sample_data:
            continue

        data = {
            self._avg_bp_from_range(d['base']): d['mean']
            for d in sample_data['per_base_sequence_quality']
        }
        df = pd.Series(data)
        df.plot(color="k", alpha=0.5)

        if df.max() > ymax:
            ymax = df.max()
        if df.index.max() > xmax:
            xmax = df.index.max()

    pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.4)
    pylab.fill_between([0, xmax], [20, 20], [30, 30], color='orange',
                       alpha=0.4)
    pylab.fill_between([0, xmax], [30, 30], [ymax, ymax], color='green',
                       alpha=0.4)

    pylab.ylim([0, ymax])
    # xmax stays at 0 when no sample was plotted
    if xmax != 0:
        pylab.xlim([0, xmax])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Phred Score", fontsize=12)
    pylab.grid(axis='x')
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                     grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot GC content versus read length

    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid to the figure
    :param str xlabel: not used by this method
    :param str ylabel: not used by this method
    :param cmap: colormap given to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    # the means reported in the title are computed before dropping NA
    mean_len = np.mean(self.df.loc[:, 'read_length'])
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()

    data = self.df.loc[:, ['read_length', 'GC_content']].dropna()
    hist = biokit.viz.hist2d.Hist2D(data)
    hist.plot(bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)

    title = ("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
             (mean_len, mean_GC))
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    pylab.ylim([0, 100])

    if grid is True:
        pylab.grid(True)
def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True):
    """Scatter plot of two components of *Xr*, with optional sample labels

    :param Xr: 2D array; columns *pc1* and *pc2* are plotted
    :param pca: if set, its ``explained_variance_ratio_`` is used to label
        the axes
    :param int pc1: index of the component shown on the x-axis
    :param int pc2: index of the component shown on the y-axis
    :param dict colors: mapping from label to colour. Labels without a
        colour are drawn in red (with a warning). If None,
        :attr:`colors` is used, or red for all points if the number of
        colours and points differ. The mapping given by the caller is not
        modified.
    :param bool show_labels: annotate the points with :attr:`labels`
        (at most 101 points are annotated)
    """
    if colors is None:
        colors = [self.colors[k] for k in self.labels]
        if len(colors) != len(Xr):
            colors = ["r"] * len(Xr[:, 0])
    else:
        # work on a copy: the defaults added below must not leak into the
        # dictionary owned by the caller
        colors = dict(colors)
        for k in self.labels:
            if k not in colors:
                logger.warning(
                    "No key color for this sample: {}. Set to red".format(k))
                colors[k] = "r"
        colors = [colors[k] for k in self.labels]

    pylab.scatter(Xr[:, pc1], Xr[:, pc2], c=colors)
    ax = pylab.gca()

    # NOTE(review): each bound is scaled by 5% of its own value, which
    # widens the range only when the lower bound is negative and the upper
    # one positive -- to be confirmed
    X1, X2 = pylab.xlim()
    dX = X2 - X1
    pylab.xlim([X1 + X1 * 0.05, X2 + X2 * 0.05])

    Y1, Y2 = pylab.ylim()
    dY = Y2 - Y1
    pylab.ylim([Y1 + Y1 * 0.05, Y2 + Y2 * 0.05])

    if show_labels:
        points = zip(Xr[:, pc1], Xr[:, pc2])
        for count, (x, y) in enumerate(points):
            # shift the text slightly away from the marker
            ax.annotate(self.labels[count], (x + dX / 40, y + dY / 40))
            if count >= 100:
                break

    if pca:
        pylab.xlabel("PC{} ({}%)".format(
            pc1 + 1, round(pca.explained_variance_ratio_[pc1] * 100, 2)))
        pylab.ylabel("PC{} ({}%)".format(
            pc2 + 1, round(pca.explained_variance_ratio_[pc2] * 100, 2)))
    pylab.grid(True)
def plot_scores(self, filename=None, savefig=False):
    """Plot log10 of 'rep2_signal' versus log10 of 'rep1_signal'

    Rows of :attr:`df` with a score above 540 are shown in black (legend
    '<0.05 IDR'), all other rows in red (legend '>=0.05 IDR'). A dashed
    diagonal is added.

    :param str filename: name of the output file, used if *savefig* is
        True
    :param bool savefig: save the figure into *filename*
    """
    from pylab import log10

    pylab.clf()

    # The two groups are complementary: rows with a score of exactly 540
    # belong to the second group instead of being silently dropped.
    below_idr = self.df.query('score>540')
    above_idr = self.df.query('score<=540')

    pylab.plot(log10(below_idr['rep1_signal']),
               log10(below_idr['rep2_signal']),
               'ko', alpha=0.5, label='<0.05 IDR')
    pylab.plot(log10(above_idr['rep1_signal']),
               log10(above_idr['rep2_signal']),
               'ro', alpha=0.5, label='>=0.05 IDR')

    # diagonal from the origin to the current upper y limit
    N = pylab.ylim()[1]
    pylab.plot([0, N], [0, N], color='blue', alpha=0.5, ls='--')

    pylab.xlabel("Rep1 log10 score")
    pylab.ylabel("Rep2 log10 score")
    pylab.legend(loc='lower right')
    if savefig:
        pylab.savefig(filename)
def plot_coverage(self, filename=None, fontsize=16, rm_lw=1,
                  rm_color="#0099cc", rm_label="Running median", th_lw=1,
                  th_color="r", th_ls="--", main_color="k", main_lw=1,
                  main_kwargs={}, sample=True, set_ylimits=True):
    """ Plot coverage as a function of base position.

    :param filename: if set, the figure is saved into this file
    :param fontsize: font size of the x and y labels
    :param rm_lw: line width of the running median (0 to hide it)
    :param rm_color: line color of the running median
    :param rm_label: label for the running median
    :param th_lw: line width of the thresholds (0 to hide them)
    :param th_color: line color of the thresholds
    :param th_ls: line style of the thresholds
    :param main_color: line color of the coverage
    :param main_lw: line width of the coverage
    :param main_kwargs: additional arguments given to the coverage plot
    :param sample: if there are more than 1 000 000 points, we
        use an integer step to skip data points. We can still plot
        all points at your own risk by setting this option to False
    :param set_ylimits: we want to focus on the "normal" coverage ignoring
        unusual excess. To do so, we set the yaxis range between 0 and a
        maximum value. This maximum value is set to the minimum between
        6 times the mean coverage and 1.5 times the maximum of the high
        coverage threshold curve. If you want to let the ylimits free,
        set this argument to False

    .. note:: if there are more than 1,000,000 points, only one point
        every N is shown, where N is the integer part of the number of
        points divided by 1,000,000. For instance for 5,000,000 points,
        one point out of 5 is plotted.

    In addition to the coverage, the running median and coverage
    confidence corresponding to the lower and upper zscore thresholds
    are shown.

    .. note:: uses the thresholds attribute.
    """
    # z = (X/rm - \mu ) / sigma
    high_zcov = (self.thresholds.high * self.best_gaussian["sigma"] +
                 self.best_gaussian["mu"]) * self.df["rm"]
    low_zcov = (self.thresholds.low * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]

    pylab.clf()
    ax = pylab.gca()
    ax.set_facecolor('#eeeeee')
    pylab.xlim(0, self.df["pos"].iloc[-1])
    axes = []
    labels = []

    # 1,000,000 points is a lot for matplotlib. Let us restrict ourself to
    # 1 million points for now.
    if len(self.df) > 1000000 and sample is True:
        NN = int(len(self.df) / 1000000)
    else:
        NN = 1

    # the main coverage plot
    p1, = pylab.plot(self.df["cov"][::NN], color=main_color,
                     label="Coverage", linewidth=main_lw, **main_kwargs)
    axes.append(p1)
    labels.append("Coverage")

    # The running median plot
    if rm_lw > 0:
        p2, = pylab.plot(self.df["rm"][::NN], color=rm_color,
                         linewidth=rm_lw, label=rm_label)
        axes.append(p2)
        labels.append(rm_label)

    # The threshold curves; only one of them appears in the legend
    if th_lw > 0:
        p3, = pylab.plot(high_zcov[::NN], linewidth=th_lw, color=th_color,
                         ls=th_ls, label="Thresholds")
        pylab.plot(low_zcov[::NN], linewidth=th_lw, color=th_color,
                   ls=th_ls, label="_nolegend_")
        axes.append(p3)
        labels.append("Thresholds")

    pylab.legend(axes, labels, loc="best")
    pylab.xlabel("Position", fontsize=fontsize)
    pylab.ylabel("Per-base coverage", fontsize=fontsize)
    pylab.grid(True)

    # sometimes there are large coverage value that squeeze the plot.
    # Let us restrict it
    if set_ylimits is True:
        pylab.ylim([0, min([high_zcov.max() * 1.5,
                            self.df["cov"].mean() * 6])])
    else:
        pylab.ylim([0, pylab.ylim()[1]])

    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; keep the figure if it fails
        pass

    if filename:
        pylab.savefig(filename)
# One precision-recall curve per analysis, all drawn on the same axes.
for i, analysis in enumerate(list_analysis):
    res = compute_table_performance(analysis, df_results)
    print("%s" % analysis)

    # res is [TP, FP, FN, TN]: the first two items are lists of scores,
    # the last two are counts, expanded here into lists of zero scores
    TP, FP = res[0], res[1]
    FN = [0] * res[2]
    TN = [0] * res[3]

    # true labels: TP and FN are positives, FP and TN are negatives
    positives = [1] * (len(TP) + len(FN))
    negatives = [0] * (len(FP) + len(TN))
    y_true = np.array(positives + negatives)
    y_scores = np.array(TP + FN + FP + TN)

    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pylab.plot(recall, precision, color=colors[i], label=analysis)

pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.ylim([0.0, 1.05])
pylab.xlim([0.0, 1.05])
pylab.title('Precision-Recall')

# the legend is placed outside of the axes, on the right-hand side
lgd = pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

if file_fig == "show":
    pylab.show()
else:
    pylab.savefig(file_fig, bbox_extra_artists=(lgd, ), bbox_inches='tight')
def plot_common_major_counts(self, mode, labels=None,
                             switch_up_down_cond2=False, add_venn=True,
                             xmax=None, title="", fontsize=12,
                             sortby="log2FoldChange"):
    """Plot the percentage of common features among the top-ranked ones

    For each rank *i*, the percentage of features shared by the *i* first
    rows of the two sorted tables is plotted. The values of the *sortby*
    column of the first table are shown on a second y-axis.

    :param mode: down, up or all
    :param labels: labels used in the venn diagram (default r1 and r2)
    :param switch_up_down_cond2: not used by this method
    :param add_venn: add a venn diagram as an inset
    :param xmax: upper limit of the x-axis (default is the size of the
        largest table)
    :param title: title of the figure
    :param fontsize: font size of the labels and title
    :param sortby: column used to rank the features

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana.compare import RNADiffCompare

        c = RNADiffCompare(
            sequana_data("rnadiff/rnadiff_onecond_1"),
            sequana_data("rnadiff/rnadiff_onecond_2"))
        c.plot_common_major_counts("down")
    """
    if labels is None:
        labels = ['r1', 'r2']

    # "down" features have negative values, hence the ascending order in
    # that case only
    ascending = mode == "down"

    # duplicates are removed with a set, but a list is given to .loc since
    # pandas >= 2.0 does not accept a set as an indexer
    gl1 = list(set(self.r1.gene_lists[mode]))

    # NOTE(review): both tables are indexed with the gene list of r1; the
    # gene list of r2 (self.r2.gene_lists[mode]) is not used -- confirm
    # that this is intended
    A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=ascending)
    B = self.r2.df.loc[gl1].sort_values(by=sortby, ascending=ascending)

    # sometimes, up and down may be inverted as compared to the other
    # conditions
    N = []
    for i in range(1, max(len(A), len(B))):
        a = A.iloc[0:i].index
        b = B.iloc[0:i].index
        n = len(set(b).intersection(set(a)))
        N.append(n / i * 100)

    max_common = len(set(A.index).intersection(set(B.index)))
    pylab.clf()
    if len(A) > len(B):
        pylab.axhline(max_common / len(A) * 100, color="r", ls='--',
                      label="min set intersection")
        pylab.axvline(len(B), ls="--", color="k",
                      label="rank of minor set")
    else:
        pylab.axhline(max_common / len(B) * 100, color='r', ls='--',
                      label="min set intersect")
        pylab.axvline(len(A), ls="--", color="k",
                      label="rank of minor set")

    pylab.plot(N)
    pylab.xlabel('rank', fontsize=fontsize)
    pylab.ylabel('% common features', fontsize=fontsize)
    pylab.grid(True)
    pylab.ylim([0, 100])
    if xmax:
        pylab.xlim([0, xmax])
    else:
        pylab.xlim([0, max(len(A), len(B))])
    pylab.title(title, fontsize=fontsize)

    # second y-axis showing the values used for the ranking
    ax = pylab.gca()
    ax2 = ax.twinx()
    ax2.plot(A[sortby].values, "orange", label=sortby)
    ax2.set_ylabel(sortby)
    pylab.legend(loc="lower left")
    ax.legend(loc="lower right")

    if add_venn:
        f = pylab.gcf()
        ax = f.add_axes([0.5, 0.5, 0.35, 0.35], facecolor="grey")
        if mode == "down":
            self.plot_venn_down(ax=ax, title=None, labels=labels,
                                mode="two_only")
        elif mode == "up":
            self.plot_venn_up(ax=ax, title=None, labels=labels,
                              mode="two_only")
        elif mode == "all":
            self.plot_venn_all(ax=ax, title=None, labels=labels,
                               mode="two_only")
def stats(self, results, df_avc, bw=1):
    """Compute, print and plot strand cross-correlation statistics

    The columns of *df_avc* are summed row-wise and the resulting
    cross-correlation curve is used to locate the main peak, the phantom
    peak (around the read length) and to derive the NSC and RSC
    coefficients and the quality tag.

    :param results: not used by this method
    :param df_avc: dataframe of cross-correlation values; its index is
        used as the strand shift
    :param int bw: number of rows used when comparing neighbouring values
        to detect local peaks
    :return: dictionary with the keys read_fragments, fragment_length,
        shift_max, corr_max, phantom_corr, phantom_location, NSC, RSC and
        Qtag. Qtag is None when RSC is not strictly positive (or is NaN).
    """
    stats = {}
    stats['read_fragments'] = len(self.df)
    stats['fragment_length'] = self.read_length

    # average cross correlation across all chromosomes
    print("Read {} fragments".format(stats['read_fragments']))
    print("ChIP data mean length: {}".format(self.read_length))
    df = df_avc.sum(axis=1)
    corr_max = df.max()
    shift_max = df.idxmax()

    # note that in phantomPeak, they use the last value as min... not the
    # actual min. Not very important.
    corr_min = df.min()
    shift_min = df.idxmin()
    print("Maximum cross-correlation value: {:.5f}".format(corr_max))
    print("Maximum cross-correlation shift: {}".format(shift_max))
    print("Minimum cross-correlation value: {:.5f}".format(corr_min))
    print("Minimum cross-correlation shift: {}".format(shift_min))
    stats['shift_max'] = int(shift_max)  # to make it json serialisable
    stats['corr_max'] = corr_max

    # The original phantomPeak code smooths the curve with a running mean
    # whose window is always equal to 1 with the default parameters:
    #     cc$y <- runmean(cc$y,sbw,alg="fast")
    # so no smoothing is applied here. A rolling mean could be used.

    # Compute cross-correlation peak
    #     bw <- ceiling(2/iparams$sep.range[2])
    # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks:
    # search for local peaks within bandwidth of bw
    peakidx = df.diff(periods=bw) > 0
    peakidx = peakidx.astype(int).diff(periods=bw) == -1

    # the final bw points are NA and filled with False
    peakidx = peakidx.shift(-bw).fillna(False)
    df_peaks = df[peakidx]

    # when searching for max, exclude peaks from the excluded region
    exclusion_range = [10, self.read_length + 10]
    mask = np.logical_or(df_peaks.index < exclusion_range[0],
                         df_peaks.index > exclusion_range[1])
    df_peaks = df_peaks[mask]

    max_peak = df_peaks.max()
    shift_peak = df_peaks.idxmax()

    # now, we select peaks that are at least 90% of main peak and with
    # shift higher than main shift. why higher ?
    mask = np.logical_and(df_peaks > max_peak * 0.9,
                          df_peaks.index >= shift_peak)
    best_df_peaks = df_peaks[mask]
    best = best_df_peaks.sort_values(ascending=False)[0:3]

    values = ",".join(["{:.5f}".format(x) for x in best.values])
    pos = ",".join([str(x) for x in best.index])
    print("Top 3 cross-correlation values: {}".format(values))
    print("Top 3 estimates for fragment length: {}".format(pos))

    # now the real window half size according to phantom peaks, not spp
    # min + (max-min)/3
    # (whs is computed for reference only; it is not used further down)
    threshold = (df_peaks.max() - corr_min) / 3 + corr_min
    whs = df[df > threshold].index.max()

    # coming back to real cross correlation, identify peak in window
    # readlength +- 2*binning !! not symmetric in phantompeak
    #   x >= ( chip.data$read.length - round(2*binning) &
    #   x <= ( chip.data$read.length + round(1.5*binning)
    binning = self.binning
    ph_min = self.read_length - round(2 * binning)
    ph_max = self.read_length + round(1.5 * binning)
    phantom = df[np.logical_and(df.index >= ph_min, df.index <= ph_max)]
    print("Phantom peak range detection:{}-{}".format(ph_min, ph_max))
    print("Phantom peak location:{}".format(phantom.idxmax()))
    print("Phantom peak Correlation: {:.5f}".format(phantom.max()))
    stats['phantom_corr'] = phantom.max()
    stats['phantom_location'] = int(phantom.idxmax())  # for json

    NSC = df_peaks.max() / phantom.max()

    # error in phantompeaks ?? is encoded as follows but no link with
    # phantom peak...
    # Another difference with phantom peak is that the min in phantom peak
    # is not the min but last value on the RHS so
    #     phantom_coeff = df_peaks.max() / df.min()
    # is
    #     phantom_coeff = df_peaks.max() / df.iloc[-1]
    NSC_spp = df_peaks.max() / df.iloc[-1]
    print(
        "Normalized Strand cross-correlation coefficient (NSC): {:.5f} [{:.5f}]"
        .format(NSC, NSC_spp))

    RSC = (df_peaks.max() - df.min()) / (phantom.max() - df.min())
    RSC_spp = (df_peaks.max() - df.iloc[-1]) / (phantom.max() - df.iloc[-1])
    print(
        "Relative Strand cross-correlation Coefficient (RSC): {:.5f} [{:.5f}]"
        .format(RSC, RSC_spp))

    # No tag can be assigned when RSC is not strictly positive (or NaN).
    # Initialising it prevents an UnboundLocalError further down.
    tag = None
    if RSC > 0 and RSC < 0.25:
        tag = -2
    elif RSC >= 0.25 and RSC < 0.5:
        tag = -1
    elif RSC >= 0.5 and RSC < 1:
        tag = 0
    elif RSC >= 1 and RSC < 1.5:
        tag = 1
    elif RSC >= 1.5:
        tag = 2
    print("Phantom Peak Quality Tag: {}".format(tag))

    pylab.clf()
    df.plot()

    # remember the automatic y limits: the vertical markers below start
    # from the lower one and the limits are restored afterwards
    ylim = pylab.ylim()
    Y0, Y1 = pylab.ylim()
    pylab.plot([phantom.idxmax(), phantom.idxmax()], [Y0, phantom.max()],
               ls='--', color='k', lw=1)
    pylab.plot([df.idxmax(), df.idxmax()], [Y0, df.max()],
               ls='--', color='r', lw=2)
    pylab.ylim(ylim)
    pylab.ylabel("Cross-correlation")
    pylab.xlabel(
        "strand-shift: {}bp\nNSC={:.5f}, RSC={:.5f}, Qtag={}".format(
            best.index[0], NSC, RSC, tag))
    pylab.xlim(self.start, self.stop)
    pylab.grid(True, zorder=-20)
    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; keep the figure if it fails
        pass

    stats['NSC'] = NSC
    stats['RSC'] = RSC
    stats['Qtag'] = tag
    return stats
def plot(self, interpolation='None', aspect='auto', cmap='hot',
         tight_layout=True, colorbar=True, fontsize_x=None,
         fontsize_y=None, rotation_x=90, xticks_on=True, yticks_on=True,
         **kargs):
    """wrapper around imshow to plot a dataframe

    :param interpolation: given to imshow (defaults to the string 'None')
    :param aspect: given to imshow (defaults to 'auto')
    :param cmap: colormap to be used.
    :param tight_layout: call tight_layout once the figure is drawn
    :param colorbar: add a colobar (default to True)
    :param fontsize_x: fontsize on xlabels (16 if not set)
    :param fontsize_y: fontsize on ylabels (16 if not set)
    :param rotation_x: rotate labels on xaxis
    :param xticks_on: switch off the xticks and labels
    :param yticks_on: switch off the yticks and labels
    :param kargs: any other argument is given to imshow
    """
    data = self.df

    pylab.clf()
    pylab.imshow(data, interpolation=interpolation, aspect=aspect,
                 cmap=cmap, **kargs)

    # FIXME use default values
    if fontsize_x is None:
        fontsize_x = 16
    if fontsize_y is None:
        fontsize_y = 16

    # rows are labelled with the index, columns with the column names
    if yticks_on is True:
        row_positions = range(0, len(data.index))
        pylab.yticks(row_positions, data.index, fontsize=fontsize_y)
    else:
        pylab.yticks([])

    if xticks_on is True:
        column_positions = range(0, len(data.columns[:]))
        pylab.xticks(column_positions, data.columns, fontsize=fontsize_x,
                     rotation=rotation_x)
    else:
        pylab.xticks([])

    if colorbar is True:
        pylab.colorbar()

    if tight_layout:
        pylab.tight_layout()

    # For some reasons, in newest version of python/mpl, this is required
    # for ylim, not for xlim
    lower, upper = pylab.ylim()
    pylab.ylim([lower + 0.5, upper - 0.5])