def plot_ranks(self, filename=None, savefig=False):
    """Scatter plot of the peak ranks of replicate 1 versus replicate 2.

    Rows of ``self.df`` are split on the *score* column (above 540 in
    black, 540 and below in red) and a dashed diagonal is added.

    :param str filename: file name given to ``pylab.savefig``
    :param bool savefig: if True, the figure is saved into *filename*
    """
    # The *score* column contains the scaled IDR value,
    # min(int(-125 * log2(IDR)), 1000): peaks with an IDR of 0 have a score
    # of 1000, an IDR of 0.05 gives int(-125 * log2(0.05)) = 540 and an IDR
    # of 1.0 gives a score of 0.
    above = self.df.query('score>540')
    below = self.df.query('score<=540')

    pylab.clf()
    groups = (
        (above, 'ko', '<0.05 IDR'),
        (below, 'ro', '>=0.05 IDR'),
    )
    for subset, fmt, label in groups:
        pylab.plot(subset.rep1_rank, subset.rep2_rank, fmt, alpha=0.5,
                   label=label)
    pylab.xlabel("Peak rank - replicate 1")
    pylab.ylabel("Peak rank - replicate 2")

    # dashed diagonal spanning the number of rows
    npeaks = len(self.df)
    pylab.plot([0, npeaks], [0, npeaks], color='blue', alpha=0.5, ls='--')
    pylab.legend(loc='lower right')

    if savefig:
        pylab.savefig(filename)
def plot_unknown_barcodes(self, N=20):
    """Horizontal bar plot of the most frequent unknown barcodes per lane.

    :param int N: number of barcodes to keep, ranked by their summed
        count over all lanes
    :return: the dataframe that was plotted (one column per lane)
    """
    from matplotlib import rcParams

    per_lane = {
        entry['Lane']: entry['Barcodes']
        for entry in self.data['UnknownBarcodes']
    }
    df = pd.DataFrame(per_lane)

    # if data is made of undetermined only, the dataframe is just made of
    # N lanes with one entry: unknown. A 'known' row of zeros is added.
    if len(df) == 1 and "unknown" in df.index:
        df.loc['known'] = [0] * len(df.columns)

    # keep the N largest totals; reverse so the largest is drawn on top
    top = df.sum(axis=1).sort_values(ascending=False).index[0:N]
    data = df.loc[top][::-1]
    data.columns = ["Lane {}".format(name) for name in data.columns]

    # axisbelow is switched on only while the bars are being created
    rcParams['axes.axisbelow'] = True
    pylab.figure(figsize=(10, 8))
    data.plot(kind="barh", width=1, ec="k", ax=pylab.gca())
    rcParams['axes.axisbelow'] = False

    pylab.xlabel("Number of reads", fontsize=12)
    pylab.ylabel("")
    pylab.grid(True)
    lane_labels = [
        "Lane {}".format(index) for index in range(1, len(df.columns) + 1)
    ]
    pylab.legend(lane_labels, loc="lower right")
    try:
        pylab.tight_layout()
    except Exception as err:
        print(err)
    return data
def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length",
                             ylabel="#"):
    """Overlay the histograms of ORF and CDS lengths (linear scale).

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None.

    :param float alpha: transparency of the histograms
    :param bins: binning passed to ``pylab.hist``
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # one histogram per column; missing lengths are dropped
    for column, tag in (("len_ORF", "ORF"), ("len_CDS", "CDS")):
        lengths = self._ORF_pos[column].dropna()
        pylab.hist(lengths, alpha=alpha, bins=bins,
                   label=tag + ", N = " + str(lengths.shape[0]))

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend()
    pylab.title("Length of ORF and CDS (after filter %s > %d)"
                % (self._type_filter, self._threshold))
def plot_dispersion(self):
    """Plot three dispersion columns of ``self.dds_stats`` versus baseMean.

    ``dispGeneEst``, ``dispersion`` and ``dispFit`` are drawn as small
    markers on log-log axes; title and axis labels are delegated to
    ``self._format_plot``.
    """
    stats = self.dds_stats
    series = (
        ("dispGeneEst", "ok", "Estimate"),
        ("dispersion", "ob", "final"),
        ("dispFit", "or", "Fit"),
    )
    for column, fmt, label in series:
        pylab.plot(stats.baseMean, getattr(stats, column), fmt, label=label,
                   ms=1)
    pylab.legend()

    axes = pylab.gca()
    axes.set(yscale="log")
    axes.set(xscale="log")
    self._format_plot(
        title="Dispersion estimation",
        xlabel="Mean of normalized counts",
        ylabel="Dispersion",
    )
def plot_alignment(self, bamfile, motif, window=200, global_th=10,
                   title=None, legend=True, legend_fontsize=11,
                   valid_rnames=[], valid_flags=[]):
    """Plot the sliding-window motif count along each alignment of a BAM.

    :param bamfile: input given to ``BAM``
    :param str motif: motif counted in each window
    :param int window: length of the sliding window
    :param global_th: not used by this method
    :param str title: optional title of the figure
    :param bool legend: show a legend (only if fewer than 15 curves)
    :param int legend_fontsize: font size of the legend
    :param list valid_rnames: if not empty, keep only alignments whose
        ``rname`` is in this list
    :param list valid_flags: if not empty, keep only alignments whose
        ``flag`` is in this list
    """
    bam = BAM(bamfile)
    print("Found {} hits".format(len(bam)))
    pylab.clf()

    shown = 0
    for aln in bam:
        # optional filters on reference name and flag
        if valid_rnames and aln.rname not in valid_rnames:
            continue
        if valid_flags and aln.flag not in valid_flags:
            continue
        seq = aln.query_sequence
        if not seq:
            continue
        shown += 1
        # number of motif occurrences in the window starting at each base
        occurrences = [
            seq[pos:pos + window].count(motif) for pos in range(len(seq))
        ]
        start = aln.reference_start
        pylab.plot(range(start, start + len(seq)), occurrences,
                   label=aln.query_name)
    print("Showing {} entries after filtering".format(shown))

    # upper y-limit: 1.2 times the number of motifs fitting in a window
    max_theo = int(1.2 * window / len(motif))
    pylab.ylim([0, max_theo])
    if legend and shown < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)
def plot_specific_alignment(self, bamfile, query_name, motif, clf=True,
                            show_figure=True, authorized_flags=[0, 16],
                            windows=[10, 50, 100, 150, 200, 250, 500, 1000],
                            local_threshold=5):
    """Sliding-window motif counts for one named read, for several windows.

    :param bamfile: input given to ``BAM``
    :param str query_name: name of the alignment to look for
    :param str motif: motif counted in each window
    :param bool clf: clear the current figure first
    :param bool show_figure: draw the curves, legend, label and title
    :param list authorized_flags: flags accepted for the alignment
    :param list windows: window lengths to scan
    :param int local_threshold: a position is counted when its motif
        count is strictly above this value
    :return: one value per window (number of counted positions minus the
        window length); empty list if the read is not found
    """
    bam = BAM(bamfile)
    # we may have several entries. let us pick up the first
    found = next(
        (aln for aln in bam
         if aln.query_name == query_name and aln.flag in authorized_flags),
        None,
    )

    sizes = []
    if not found:
        print("{} Not found in {} file".format(query_name, bamfile))
        return sizes

    # Detection
    seq = found.query_sequence
    if clf:
        pylab.clf()
    for window in windows:
        counts = [
            seq[pos:pos + window].count(motif) for pos in range(len(seq))
        ]
        if show_figure:
            pylab.plot(counts, label=window)
        above = sum(1 for value in counts if value > local_threshold)
        sizes.append(above - window)

    if show_figure:
        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)
    return sizes
def plot_stacked_hist(self, output_filename=None, dpi=200, kind="barh",
                      fontsize=10, edgecolor="k", lw=1, width=1,
                      ytick_fontsize=10):
    """Stacked bar plot of the transposed dataframe from ``self.get_df()``.

    :param str output_filename: if set, the figure is saved in that file
    :param int dpi: resolution used when saving
    :param str kind: pandas plot kind (default horizontal bars)
    :param int fontsize: font size of the axis labels
    :param str edgecolor: edge colour of the bars
    :param lw: line width of the bar edges
    :param width: width of the bars
    :param int ytick_fontsize: font size of the y tick labels
    """
    df = self.get_df()
    df.T.plot(kind=kind, stacked=True, edgecolor=edgecolor, lw=lw,
              width=width)
    pylab.xlabel("Percentage (%)", fontsize=fontsize)
    pylab.ylabel("Sample index/name", fontsize=fontsize)
    pylab.yticks(fontsize=ytick_fontsize)
    pylab.legend(title="kingdom")
    pylab.xlim([0, 100])
    if output_filename:
        pylab.savefig(output_filename, dpi=dpi)
def scatter_plot(self, filename=None, hold=False):
    """Scatter plot of the score versus length of each ortholog

    One colour/marker is used per value of the ``Status`` column.

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    :param str filename: if set, the figure is saved in that file
    :param bool hold: if False, the current figure is cleared first
    """
    if hold is False:
        pylab.clf()
    colors = ["green", "orange", "red", "blue"]
    markers = ['o', 's', 'x', 'o']
    statuses = ["Complete", "Fragmented", "Missing", "Duplicated"]
    for this, color, marker in zip(statuses, colors, markers):
        # select the rows of the current status (not always "Complete")
        mask = self.df.Status == this
        if mask.any():
            # draw on the current axes so all groups share one figure
            self.df[mask].plot(x="Length", y="Score", kind="scatter",
                               color=color, ax=pylab.gca(), marker=marker,
                               label=this)
    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def hist_average_quality(self, fontsize=16, bins=None):
    """Stacked histogram of the mean quality of HQ and LQ sequences.

    The quality string of each read is decoded and each character is
    converted with ``ord(char) - 33`` before averaging. A black curve on a
    twin axis shows the number of reads left after each bin.

    :param int fontsize: font size of the legend
    :param bins: bins given to ``np.histogram``; default is from 0 to 94
    """
    def _mean_qv(read):
        return pylab.mean([ord(X) - 33 for X in read['quality'].decode()])

    hq_qv = [_mean_qv(read) for read in self.hq_sequence]
    lq_qv = [_mean_qv(read) for read in self.lq_sequence]

    if bins is None:
        bins = range(0, 94)
    Y1, X = np.histogram(hq_qv, bins=bins)
    Y2, X = np.histogram(lq_qv, bins=bins)
    both = Y1 + Y2

    # LQ bars are stacked on top of the HQ bars
    pylab.bar(X[1:], Y1, width=1, label="HQ")
    pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
    pylab.xlim([0.5, 93.5])
    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)

    # total minus the cumulative sum, starting from the total
    twin = pylab.twinx()
    total = np.sum(both)
    twin.plot(X, [total] + list(total - np.cumsum(both)), "k")
def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
    """Number Of Polymerase Reads Per Barcode

    Plots the "Polymerase Reads" column of ``self.df_barcoded`` sorted in
    decreasing order, with its mean as a red horizontal line.

    :param int fontsize: font size of the axis labels
    :param bool unbarcoded: if True, also draw the first "Polymerase Reads"
        value of ``self.df_not_barcoded`` as a dashed line (best effort:
        skipped if it cannot be read)
    """
    PR = self.df_barcoded["Polymerase Reads"].sum()
    data = self.df_barcoded['Polymerase Reads'].sort_values(
        ascending=False).values

    pylab.plot(range(1, len(data) + 1), data, label="barcodes")
    pylab.axhline(data.mean(), color="r", label="average")

    if unbarcoded is True:
        # Best effort: kept non-fatal, but no longer swallows
        # KeyboardInterrupt/SystemExit like a bare except would.
        try:
            unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
            pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
        except Exception:
            pass

    pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
    pylab.ylabel("Counts of Reads", fontsize=fontsize)
    pylab.title("Total Polymerase count: {}".format(PR))
    pylab.legend()
    pylab.ylim(ymin=0)
    try:
        pylab.tight_layout()
    except Exception:
        pass
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40, xlabel="Frame",
                                   ylabel="#", bar_width=0.35):
    """Side-by-side bar plot of the number of ORF and CDS in each frame.

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None.

    :param float alpha: transparency of the bars
    :param bins: not used by this method
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param float bar_width: width of each bar
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    frames = [-3, -2, -1, 1, 2, 3]

    def _count_by_frame(column):
        # number of non-missing entries of *column* in each frame
        return [
            self._ORF_pos[self._ORF_pos["frame"] == fr][column]
            .dropna().shape[0]
            for fr in frames
        ]

    nb_res_ORF = _count_by_frame("len_ORF")
    nb_res_CDS = _count_by_frame("len_CDS")

    # the two series are shifted by half a bar width around each frame
    centers = np.array(frames)
    shift = bar_width / 2
    pylab.bar(centers - shift, nb_res_ORF, bar_width, alpha=alpha,
              label="ORF N = %d" % sum(nb_res_ORF))
    pylab.bar(centers + shift, nb_res_CDS, bar_width, alpha=alpha,
              label="CDS N = %d" % sum(nb_res_CDS))
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def plot_specific_alignment(self, query_name, motif, clf=True,
                            windows=[10, 50, 100, 200, 500, 1000]):
    """Sliding-window motif counts for one named read of ``self.bamfile``.

    For each window a curve is drawn and ``window, score / 3.`` is
    printed, where score is the number of positions whose count is above
    ``window / 6``.

    :param str query_name: name of the alignment to look for; if several
        alignments match, the last one is used
    :param str motif: motif counted in each window
    :param bool clf: clear the current figure first
    :param list windows: window lengths to scan
    """
    found = None
    for aln in BAM(self.bamfile):
        if aln.query_name == query_name:
            found = aln

    if not found:
        print("Not found")
        return

    # Detection
    seq = found.query_sequence
    if clf:
        pylab.clf()
    for window in windows:
        counts = [
            seq[pos:pos + window].count(motif) for pos in range(len(seq))
        ]
        pylab.plot(counts, label=window)
        score = sum(1 for value in counts if value > window / 6)
        print(window, score / 3.)

    pylab.legend()
    pylab.ylabel("# {} in a given sliding window".format(motif))
    pylab.title(query_name)
def scatter_plot(self, filename=None, hold=False):
    """Scatter plot of the score versus length of each ortholog

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    Missing entries are not shown since there is no information about
    the contig.

    :param str filename: if set, the figure is saved in that file
    :param bool hold: if False, the current figure is cleared first
    """
    if hold is False:
        pylab.clf()

    colors = ["green", "orange", "red", "blue"]
    markers = ['o', 's', 'x', 'o']
    statuses = ["Complete", "Fragmented", "Duplicated"]
    # zip stops after the three statuses; the extra colour/marker is unused
    for status, color, marker in zip(statuses, colors, markers):
        selection = self.df.Status == status
        if sum(selection) > 0:
            self.df[selection].plot(x="Length", y="Score", kind="scatter",
                                    color=color, ax=pylab.gca(),
                                    marker=marker, label=status)

    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def hist_length_repeats(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                        grid=True, label="Repeat length",
                        xlabel="Repeat length", ylabel="#"):
    """Plots histogram of the repeat lengths

    :param bins: bins given to ``pylab.hist``; by default one bin per
        integer from ``max(0, threshold - 1)`` to the longest repeat + 1
    :param float alpha: transparency of the histogram
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the axis labels
    :param bool grid: add a grid
    :param str label: legend label
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    """
    # compute the list of lengths on first use
    if self._list_len_repeats is None:
        self._get_list_len_repeats()

    if bins is None:
        lower = max(0, self.threshold - 1)
        upper = max(self._list_len_repeats) + 2
        bins = range(lower, upper)

    if hold is False:
        pylab.clf()
    pylab.hist(self._list_len_repeats, alpha=alpha, label=label, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def barplot_summary(self, filename=None, color=["green", "red"], alpha=0.8):
    """Stacked horizontal bars of determined/undetermined reads per lane.

    :param str filename: if set, the figure is saved in that file (dpi 200)
    :param list color: colours of the Determined and Undetermined bars
    :param float alpha: transparency of the bars
    :return: dataframe indexed by lane with the columns Determined and
        Undetermined (divided by 1e6 when the plot is in millions)
    """
    reads = self.get_data_reads()

    undetermined = reads.query("name=='Undetermined'")
    # all other entries are summed per lane under a single name
    determined = (
        reads.query("name!='Undetermined'")
        .groupby("lane").sum().reset_index()
    )
    determined["name"] = "Determined"

    df = pd.concat([undetermined, determined])
    df = df.pivot(index="lane", columns="name", values="count")
    df = df[["Determined", "Undetermined"]]

    # switch to millions when even the smaller column total exceeds 1e6
    if df.sum().min() > 1e6:
        df /= 1e6
        xlabel = "Number of reads (M)"
    else:
        xlabel = "Number of reads"
    df.plot.barh(stacked=True, color=color, alpha=alpha, ec='k')
    pylab.xlabel(xlabel)

    pylab.legend()
    if filename:
        pylab.savefig(filename, dpi=200)
    return df
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#", title="",
             clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str title: title of the figure
    :param clip_upper_SNR: SNR values above this are clipped to it

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # largest SNR over the four letters, capped at clip_upper_SNR
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    for letter in "ACGT":
        # Series.clip_upper was removed in pandas 1.0; clip(upper=...) is
        # the equivalent call.
        values = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
        pylab.hist(values, alpha=alpha, label=letter, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot(self, chr_name, x1=None, x2=None, Y=20):
    """Draw the deletions and duplications of one chromosome as segments.

    Rows of ``self.df`` whose ``name`` equals *chr_name* are drawn from
    ``start`` to ``end``: deletions in red at y=-1, every other type in
    blue at y=Y.

    :param chr_name: value matched against the ``name`` column
    :param x1: currently unused
    :param x2: currently unused
    :param Y: y position of the non-deletion segments
    """
    df = self.df.query("name == @chr_name")
    labelled = set()
    for _, item in df.iterrows():
        if item['type'] == "deletion":
            level, fmt, label = -1, "r-", "deletion"
        else:
            level, fmt, label = Y, "b-", "duplication"
        # Label only the first segment of each kind; otherwise the legend
        # gets one entry per row.
        shown_label = "_nolegend_" if label in labelled else label
        labelled.add(label)
        pylab.plot([item.start, item.end], [level, level], fmt,
                   label=shown_label)
    pylab.legend()
def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
    """Horizontal bar plot of the read counts per sample name.

    The 'Undetermined' entry is drawn in red on the same axes as the
    samples. Counts are shown in millions when the total exceeds 1e6.

    :param float alpha: transparency of the bars
    :param float width: width of the bars
    :param str filename: if set, the figure is saved in that file (dpi 200)
    """
    df = self.get_data_reads()

    # this is ugly but will do the job for now
    under = df.query("name=='Undetermined'")
    others = df.query("name!='Undetermined'")

    under = under.groupby("name").sum().reset_index()
    others = others.groupby("name").sum().reset_index()

    under = under[["name", "count"]].set_index("name")
    others = others[["name", "count"]].set_index("name")

    all_data = others.sort_index(ascending=False)
    all_data.columns = ["samples"]

    # appended at the end
    all_data.loc['undetermined'] = 0

    # revert back
    all_data = all_data.loc[::-1]

    # just for legend
    under.columns = ['undetermined']

    # scale both frames to millions when the sample total exceeds 1e6
    in_millions = all_data.sum().min() > 1e6
    if in_millions:
        all_data /= 1e6
        under /= 1e6

    all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')
    under.plot(kind="barh", alpha=alpha, color="red", ax=pylab.gca(),
               zorder=1, width=width, ec='k')
    pylab.ylim([-0.5, len(all_data) + 0.5])
    # tick labels are only set explicitly below 100 rows
    if len(all_data) < 100:
        pylab.yticks(range(len(all_data)), all_data.index)

    pylab.legend()
    pylab.grid(True, zorder=-1)
    if in_millions:
        pylab.xlabel("Number of reads (M)")
    else:
        pylab.xlabel("Number of reads")
    # Best-effort layout; unlike a bare except this does not swallow
    # KeyboardInterrupt/SystemExit.
    try:
        pylab.tight_layout()
    except Exception:
        pass
    if filename:
        pylab.savefig(filename, dpi=200)
def plot_volcano_differences(self, mode="all"):
    """Volcano plot comparing the gene lists of two experiments.

    Genes are taken from ``self.r1`` and ``self.r2`` (``gene_lists[mode]``).
    Genes present in both lists are joined by a black segment; genes
    present in one list only are joined by a red segment to their position
    in the full table (``df``) of the other experiment.

    :param str mode: key of ``gene_lists`` to use
    :return: tuple (Aonly, Bonly, Acommon, Bcommon) of dataframes
    """
    A = self.r1.df.loc[self.r1.gene_lists[mode]]
    B = self.r2.df.loc[self.r2.gene_lists[mode]]

    # Index with lists, not sets: pandas >= 2.0 rejects set indexers.
    # Lists also give a deterministic row order shared by both frames.
    a_labels = set(A.index)
    b_labels = set(B.index)
    common = [x for x in A.index.unique() if x in b_labels]
    a_only = [x for x in A.index.unique() if x not in b_labels]
    b_only = [x for x in B.index.unique() if x not in a_labels]

    Aonly = A.loc[a_only]
    Bonly = B.loc[b_only]
    Acommon = A.loc[common]
    Bcommon = B.loc[common]

    pylab.clf()
    pylab.plot(Acommon.log2FoldChange, -np.log10(Acommon.padj), marker="o",
               alpha=0.5, color="r", lw=0, label="Common in experiment 1",
               pickradius=4, picker=True)
    pylab.plot(Bcommon.log2FoldChange, -np.log10(Bcommon.padj), marker="o",
               alpha=0.5, color="orange", lw=0,
               label="Common in experiment 2", pickradius=4, picker=True)

    # link the two positions of each common gene
    for x in common:
        a_l = A.loc[x].log2FoldChange
        a_p = -np.log10(A.loc[x].padj)
        b_l = B.loc[x].log2FoldChange
        b_p = -np.log10(B.loc[x].padj)
        pylab.plot([a_l, b_l], [a_p, b_p], 'k', alpha=0.5)

    pylab.plot(Bonly.log2FoldChange, -np.log10(Bonly.padj), marker="*",
               alpha=0.5, color="blue", lw=0, label="In experiment 2 only",
               pickradius=4, picker=True)
    pylab.plot(Aonly.log2FoldChange, -np.log10(Aonly.padj), marker="*",
               alpha=0.5, color="cyan", lw=0, label="In experiment 1 only",
               pickradius=4, picker=True)

    # genes only in experiment 2: dashed link to their row in experiment 1
    for name, x in Bonly.iterrows():
        x1 = x.log2FoldChange
        y1 = -np.log10(x.padj)
        x2 = self.r1.df.loc[name].log2FoldChange
        y2 = -np.log10(self.r1.df.loc[name].padj)
        pylab.plot([x1, x2], [y1, y2], ls="--", color='r')

    # genes only in experiment 1: solid link to their row in experiment 2
    for name, x in Aonly.iterrows():
        x1 = x.log2FoldChange
        y1 = -np.log10(x.padj)
        x2 = self.r2.df.loc[name].log2FoldChange
        y2 = -np.log10(self.r2.df.loc[name].padj)
        pylab.plot([x1, x2], [y1, y2], ls="-", color='r')

    pylab.axhline(1.33, alpha=0.5, ls="--", color="r")

    pylab.xlabel("log2 fold Change")
    pylab.ylabel("log10 adjusted p-values")
    pylab.legend()
    pylab.grid(True)

    return Aonly, Bonly, Acommon, Bcommon
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#", title=""):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored: show a placeholder image
    no_snr = len(self._df['snr_A'].dropna()) == 0
    if no_snr:
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()
    for letter in "ACGT":
        pylab.hist(self._df.loc[:, 'snr_{}'.format(letter)], alpha=alpha,
                   label=letter, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
    """Overlay the length histograms of mapped and unmapped entries.

    An entry counts as unmapped when its ``reference_name`` equals -1.

    :param bins: bins given to ``pylab.hist``; by default steps of 100
        from 0 up to the largest ``reference_length``
    """
    df = self.df

    if bins is None:
        # max() is a scalar: the original len() call raised a TypeError
        bins = range(0, int(df.reference_length.max()), 100)
    mapped = df[df.reference_name != -1]
    unmapped = df[df.reference_name == -1]
    pylab.hist(mapped.reference_length, bins=bins, alpha=0.5,
               label="mapped {}".format(len(mapped)), density=False)
    # NOTE(review): the original read `unmapped.reference`; assumed to be
    # a typo for `reference_length` (same column as the mapped histogram)
    # -- confirm against the dataframe schema.
    pylab.hist(unmapped.reference_length, bins=bins, alpha=0.5,
               label="unmapped {}".format(len(unmapped)), density=False)
    pylab.xlabel("Isoform length")
    pylab.legend()
def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
    """Overlay the read-length histograms of mapped and unmapped entries.

    An entry counts as unmapped when its ``reference_name`` equals -1.
    Both histograms are normalised (``density=True``).

    :param bins: bins given to ``pylab.hist``; by default steps of 100
        from 0 up to the largest ``read_length``
    """
    df = self.df

    if bins is None:
        bins = range(0, df.read_length.max(), 100)
    mapped = df[df.reference_name != -1]
    unmapped = df[df.reference_name == -1]
    # `normed` was removed from matplotlib's hist; `density` replaces it
    pylab.hist(mapped.read_length, bins=bins, alpha=0.5,
               label="mapped {}".format(len(mapped)), density=True)
    pylab.hist(unmapped.read_length, bins=bins, alpha=0.5,
               label="unmapped {}".format(len(unmapped)), density=True)
    pylab.xlabel("Isoform length")
    pylab.legend()
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000, alpha=1,
        output_filename=None):
    """Metropolis-Hastings selection of simulated reads by read length.

    Each read length of ``self.bam_simul`` is a candidate, accepted with
    probability ``min(alpha, target(candidate) / target(current))``. The
    accept/reject flags are stored in ``self.tokeep``.

    :param bins: bins of the histograms (input reads and accepted values)
    :param xmin: start of the x range of the plotted target curve
    :param xmax: end of the x range of the plotted target curve
    :param step: step of the x range of the plotted target curve
    :param int burn: number of first accepted values left out of the
        histogram
    :param alpha: upper bound of the acceptance probability
    :param output_filename: if not None, given with ``self.tokeep`` to
        ``self.bam_simul.filter_bool``
    """
    # compute histogram of the input reads once for all to be used
    # in the target_distribution method
    # (`normed` was removed from np.histogram; `density` replaces it)
    self.bins = bins
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins,
                                  density=True)

    lengths = self.bam_simul.df.read_length.values
    self.tokeep = []
    vec = []
    # the chain starts at the mean read length of the input reads
    x = self.bam.df.read_length.mean()
    for i in range(self.bam_simul.df.shape[0]):
        can = lengths[i]
        # acceptance probability
        aprob = min([
            alpha,
            self.target_distribution(can) / self.target_distribution(x)
        ])
        u = pylab.uniform(0, 1)
        if u < aprob:
            x = can
            vec.append(x)
            self.tokeep.append(True)
        else:
            self.tokeep.append(False)

    # plotting the results:
    # theoretical curve
    x = pylab.arange(xmin, xmax, step)
    y = self.target_distribution(x)
    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(vec)
    pylab.subplot(212)

    # `normed` was removed from matplotlib's hist as well
    pylab.hist(vec[burn:], bins=bins, density=1)
    pylab.plot(x, y, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot_indel_dist(self, fontsize=16):
    """Plot indel count (+ ratio)

    :param int fontsize: font size of the axis labels
    :Return: list of insertions, deletions and ratio insertion/deletion
        for different length starting at 1
    :raises ValueError: if there are no insertions or no deletions

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_indel_dist()

    What you see on this figure is the presence of 10 insertions of length
    1, 1 insertion of length 2 and 3 deletions of length 1

    # Note that in samtools, several insertions or deletions in a single
    alignment are ignored and only the first one seems to be reported. For
    instance 10M1I10M1I stored only 1 insertion in its report; Same comment
    for deletions.

    .. todo:: speed up and handle long reads cases more efficiently by
        storing INDELS as histograms rather than lists
    """
    # indels are computed on first use
    if not hasattr(self, "insertions"):
        self._set_indels()

    if len(self.insertions) == 0 or len(self.deletions) == 0:
        raise ValueError("No deletions or insertions found")

    # Count each length once instead of calling list.count per length.
    deletion_counts = Counter(self.deletions)
    insertion_counts = Counter(self.insertions)

    N = max(max(deletion_counts), max(insertion_counts)) + 1
    D = [deletion_counts[i] for i in range(N)]
    I = [insertion_counts[i] for i in range(N)]
    # ratio is set to 0 where there is no deletion of that length
    R = [i / d if d != 0 else 0 for i, d in zip(I, D)]

    fig, ax = pylab.subplots()
    ax.plot(range(N), I, marker="x", label="Insertions")
    ax.plot(range(N), D, marker="x", label="Deletions")
    ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
    ax.set_yscale("symlog")
    pylab.ylim([1, pylab.ylim()[1]])
    pylab.legend()
    pylab.grid()
    from matplotlib.ticker import MaxNLocator
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    pylab.xlabel("Indel length", fontsize=fontsize)
    pylab.ylabel("Indel count", fontsize=fontsize)
    return I, D, R
def bar_plot_contigs_length(self):
    """Bar plot of the sorted contig lengths against the reference lengths.

    The reference sequences (``FastA(self.reference)``) are spread over
    the x range used by the contigs of ``self.fasta``.
    """
    # show length of N contigs as compare to length of the reference
    reference = FastA(self.reference)
    Nref = len(reference.sequences)
    N = len(self.fasta)

    pylab.clf()
    # one reference bar every ceil(N / Nref) contigs
    spacing = int(pylab.ceil(N / Nref))
    pylab.bar(range(0, N, spacing), sorted(reference.lengths),
              width=Nref / 1.1, label="Plasmodium chromosomes")
    pylab.bar(range(0, N), sorted(self.fasta.lengths), width=1,
              label="canu {} contigs".format(N))
    pylab.legend()
def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length",
                             ylabel="#"):
    """Histograms of the ORF and CDS lengths on a linear scale.

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None. Missing lengths are ignored.

    :param float alpha: transparency of the histograms
    :param bins: binning passed to ``pylab.hist``
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    orf_lengths = self._ORF_pos["len_ORF"].dropna()
    cds_lengths = self._ORF_pos["len_CDS"].dropna()
    orf_label = "ORF, N = " + str(orf_lengths.shape[0])
    cds_label = "CDS, N = " + str(cds_lengths.shape[0])

    # plot for all ORF and CDS
    pylab.hist(orf_lengths, alpha=alpha, label=orf_label, bins=bins)
    pylab.hist(cds_lengths, alpha=alpha, label=cds_label, bins=bins)
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend()
    title = "Length of ORF and CDS (after filter %s > %d)" % (
        self._type_filter, self._threshold)
    pylab.title(title)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#"):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the axis labels
    :param bool grid: add a grid
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    """
    if self._df is None:
        self._get_df()
    if hold is False:
        pylab.clf()

    # one histogram per nucleotide column
    columns = (('snr_A', "A"), ('snr_C', "C"), ('snr_G', "G"),
               ('snr_T', "T"))
    for column, letter in columns:
        pylab.hist(self._df.loc[:, column], alpha=alpha, label=letter,
                   bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_read_length(self):
    """Plot occurrences of aligned read lengths

    The legend reports the smallest and largest x values returned by
    ``self._get_read_length()``.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("test.bam"))
        b.plot_read_length()

    """
    lengths, occurrences = self._get_read_length()
    legend_label = "min length:{}; max length:{}".format(
        min(lengths), max(lengths))
    pylab.plot(lengths, occurrences, label=legend_label)
    pylab.grid()
    pylab.xlabel("Read length", fontsize=16)
    pylab.legend()
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40, xlabel="Frame",
                                   ylabel="#", bar_width=0.35):
    """Bar plot of the number of ORF and CDS found in each reading frame.

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None.

    :param float alpha: transparency of the bars
    :param bins: not used by this method
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param float bar_width: width of each bar
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # number of ORF and CDS found by frame
    frames = [-3, -2, -1, 1, 2, 3]
    counts = {"len_ORF": [], "len_CDS": []}
    for fr in frames:
        for column, values in counts.items():
            in_frame = self._ORF_pos[self._ORF_pos["frame"] == fr]
            values.append(in_frame[column].dropna().shape[0])
    nb_res_ORF = counts["len_ORF"]
    nb_res_CDS = counts["len_CDS"]

    # ORF bars sit left of the frame value, CDS bars right of it
    x_orf = np.array(frames) - (bar_width / 2)
    x_cds = np.array(frames) + (bar_width / 2)
    pylab.bar(x_orf, nb_res_ORF, bar_width, alpha=alpha,
              label="ORF N = %d" % sum(nb_res_ORF))
    pylab.bar(x_cds, nb_res_CDS, bar_width, alpha=alpha,
              label="CDS N = %d" % sum(nb_res_CDS))
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def hist_average_quality(self, fontsize=16):
    """Stacked histogram of the mean quality of HQ and LQ sequences.

    Each character of a read's decoded quality string is converted with
    ``phred.ascii_to_quality`` before averaging.

    :param int fontsize: font size of the legend
    """
    def _mean_qv(read):
        return mean([phred.ascii_to_quality(X)
                     for X in read['quality'].decode()])

    # Read the sequences from the instance: the original used a global
    # named `iso` and ignored `self`.
    hq_qv = [_mean_qv(read) for read in self.hq_sequence]
    lq_qv = [_mean_qv(read) for read in self.lq_sequence]

    Y1, X = numpy.histogram(hq_qv, bins=range(0, 94))
    Y2, X = numpy.histogram(lq_qv, bins=range(0, 94))
    # LQ bars are stacked on top of the HQ bars
    pylab.bar(X[1:], Y1, width=1, label="HQ")
    pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")

    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)
def plot_alignment(self, motif, window=200, global_th=10, title=None,
                   legend=True, legend_fontsize=11):
    """Plot alignments that match the motif.

    The hits come from ``self._get_aligments``; for each alignment of
    ``self.bamfile`` whose name is among the hits, the sliding-window motif
    count is drawn against the reference position.

    :param str motif: motif counted in each window
    :param int window: length of the sliding window
    :param global_th: threshold forwarded to ``self._get_aligments``
    :param str title: optional title of the figure
    :param bool legend: show a legend (only if fewer than 15 curves)
    :param int legend_fontsize: font size of the legend
    :return: the dataframe returned by ``self._get_aligments``
    """
    df = self._get_aligments(motif=motif, window=window,
                             global_th=global_th)
    print("Found {} hits".format(len(df)))
    # Build the set once: testing membership in the values array is a
    # linear scan for every alignment.
    hit_names = set(df.query_name.values)

    bam = BAM(self.bamfile)
    pylab.clf()
    count = 0
    for aln in bam:
        if aln.query_name not in hit_names:
            continue
        seq = aln.query_sequence
        if not seq:
            continue
        count += 1
        X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
        start = aln.reference_start
        pylab.plot(range(start, start + len(seq)), X1,
                   label=aln.query_name)

    # upper y-limit: 1.2 times the number of motifs fitting in a window
    max_theo = int(1.2 * window / len(motif))
    pylab.ylim([0, max_theo])
    if legend and count < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)

    return df
def plot_scores(self, filename=None, savefig=False):
    """Scatter plot of the log10 signal of replicate 1 versus replicate 2.

    Rows of ``self.df`` are split on the *score* column: above 540 in
    black, 540 and below in red.

    :param str filename: file name given to ``pylab.savefig``
    :param bool savefig: if True, the figure is saved into *filename*
    """
    # scores
    from pylab import log10

    # `<=540` (not `<540`): rows with a score of exactly 540 were dropped
    # from both groups before.
    above = self.df.query('score>540')
    below = self.df.query('score<=540')

    pylab.clf()
    pylab.plot(log10(above['rep1_signal']), log10(above['rep2_signal']),
               'ko', alpha=0.5, label='<0.05 IDR')
    pylab.plot(log10(below['rep1_signal']), log10(below['rep2_signal']),
               'ro', alpha=0.5, label='>=0.05 IDR')

    # dashed diagonal up to the current upper y-limit
    N = pylab.ylim()[1]
    pylab.plot([0, N], [0, N], color='blue', alpha=0.5, ls='--')
    pylab.xlabel("Rep1 log10 score")
    pylab.ylabel("Rep2 log10 score")
    pylab.legend(loc='lower right')
    if savefig:
        pylab.savefig(filename)
def barplot(self, filename="lane{}_status.png", lanes=None):
    """Save one bar plot of read counts per lane.

    For each lane, the non-'Undetermined' entries are drawn in blue and the
    summed 'Undetermined' count is added as a last bar. That bar is green,
    orange or red when undetermined/determined is below 20%, below 50%, or
    higher (red too when there are no determined reads).

    :param str filename: output pattern, formatted with the lane
    :param lanes: lanes to process; all lanes found in the data by default
    """
    df = self.get_data_reads()
    if lanes is None:
        lanes = df.lane.unique()

    for lane in lanes:
        pylab.clf()
        # `lane` is referenced from the query strings via @lane
        counts = df.query("lane==@lane and name!='Undetermined'")['count']
        total = counts.sum()
        L = len(counts)
        under = df.query(
            "lane==@lane and name=='Undetermined'")['count'].sum()

        if total > 0:
            pylab.bar(range(L), counts, color="b", label="reads")

        if total == 0:
            color = "red"
        else:
            percent = 100 * under / total
            if percent < 20:
                color = "green"
            elif percent < 50:
                color = "orange"
            else:
                color = "red"

        pylab.bar(range(L, L + 1), under, color=color,
                  label="undetermined")
        pylab.xticks([])
        pylab.ylabel("Number of reads")
        # Best-effort legend; unlike a bare except this does not swallow
        # KeyboardInterrupt/SystemExit.
        try:
            pylab.legend(loc="lower left")
        except Exception:
            pass
        pylab.title("Lane {}".format(lane))
        pylab.savefig(filename.format(lane), dpi=200)
def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
    """Grouped bar plot of ``self.sirv``, one column per label.

    :param normalise: if True, each column is divided by its size relative
        to the largest size (``N / max(N)``)
    :param ncol: columns in the legend
    :param N: sizes used for the normalisation; by default the length of
        each item of ``self.rawdata``
    :return: the dataframe that was plotted
    """
    if N is None:
        N = np.array([len(x) for x in self.rawdata])
    else:
        N = np.array(N)

    dd = pd.DataFrame(self.sirv).T
    if normalise:
        scaling = N / max(N)
        dd = dd / scaling
    dd.columns = self.labels

    dd.plot(kind="bar")
    pylab.xlabel("")
    pylab.legend(self.labels, ncol=ncol)
    pylab.tight_layout()
    return dd
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000, alpha=1,
        output_filename=None):
    """Metropolis-Hastings selection of simulated reads by read length.

    Each read length of ``self.bam_simul`` is a candidate, accepted with
    probability ``min(alpha, target(candidate) / target(current))``. The
    accept/reject flags are stored in ``self.tokeep``.

    :param bins: bins of the histograms (input reads and accepted values)
    :param xmin: start of the x range of the plotted target curve
    :param xmax: end of the x range of the plotted target curve
    :param step: step of the x range of the plotted target curve
    :param int burn: number of first accepted values left out of the
        histogram
    :param alpha: upper bound of the acceptance probability
    :param output_filename: if not None, given with ``self.tokeep`` to
        ``self.bam_simul.filter_bool``
    """
    # compute histogram of the input reads once for all to be used
    # in the target_distribution method
    self.bins = bins
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins,
                                  density=True)

    candidates = self.bam_simul.df.read_length.values
    self.tokeep = []
    accepted_values = []
    # the chain starts at the mean read length of the input reads
    current = self.bam.df.read_length.mean()
    for index in range(self.bam_simul.df.shape[0]):
        candidate = candidates[index]
        ratio = (self.target_distribution(candidate)
                 / self.target_distribution(current))
        # acceptance probability
        aprob = min([alpha, ratio])
        accepted = pylab.uniform(0, 1) < aprob
        if accepted:
            current = candidate
            accepted_values.append(current)
        self.tokeep.append(accepted)

    # plotting the results:
    # theoretical curve
    grid_x = pylab.arange(xmin, xmax, step)
    grid_y = self.target_distribution(grid_x)

    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(accepted_values)

    pylab.subplot(212)
    pylab.hist(accepted_values[burn:], bins=bins, density=1)
    pylab.plot(grid_x, grid_y, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
    """Bar plot of the transposed ``self.sirv`` table, grouped by label.

    :param normalise: if True, divide each column by ``N / max(N)``
    :param ncol: columns in the legend
    :param N: sizes used for the normalisation; by default the length of
        each item of ``self.rawdata``
    :return: the dataframe that was plotted
    """
    sizes = np.array(
        N if N is not None else [len(item) for item in self.rawdata]
    )

    table = pd.DataFrame(self.sirv).T
    if normalise:
        table = table / (sizes / max(sizes))
    table.columns = self.labels
    table.plot(kind="bar")

    pylab.xlabel("")
    pylab.legend(self.labels, ncol=ncol)
    pylab.tight_layout()
    return table
def _do_legend(self, figure, color_dict, bbox_to_anchor):
    """Add a "Sample groups" legend built from a label-to-colour mapping.

    Nothing is done when *color_dict* is empty or None.

    :param figure: object on which ``add_artist`` is called with the legend
    :param dict color_dict: label -> colour mapping; one patch per entry
    :param bbox_to_anchor: forwarded to ``pylab.legend``
    """
    if not color_dict:
        return

    handles = []
    for label, color in color_dict.items():
        handles.append(mpatches.Patch(color=color, label=label))

    group_legend = pylab.legend(
        loc="upper center",
        handles=handles,
        bbox_to_anchor=bbox_to_anchor,
        frameon=True,
        title="Sample groups",
    )
    figure.add_artist(group_legend)