def plot(self, color_line='r', bgcolor='grey', color='yellow', lw=4,
         hold=False, ax=None):
    """Plot the mean quality per position with a mean +/- std band.

    Three translucent background bands are drawn first (red for 0-20,
    orange for 20-30 and green for 30-41), followed by the band
    ``mean +/- std`` of ``self.df`` and the mean curve itself.

    :param color_line: colour of the mean curve.
    :param bgcolor: not used by this method; kept for compatibility.
    :param color: fill colour of the ``mean +/- std`` band.
    :param lw: line width of the mean curve.
    :param hold: not used by this method; kept for compatibility.
    :param ax: optional matplotlib axes made current before plotting.
    """
    xmax = self.xmax + 1
    if ax is not None:
        pylab.sca(ax)

    # Background bands: (lower bound, upper bound, colour).
    for low, high, band_color in ((0, 20, 'red'),
                                  (20, 30, 'orange'),
                                  (30, 41, 'green')):
        pylab.fill_between([0, xmax], [low, low], [high, high],
                           color=band_color, alpha=0.3)

    # Bug fix: X used to be bound only when self.X was None, so a
    # user-provided self.X led to an UnboundLocalError below.
    # NOTE(review): assumes self.X, when set, holds one x value per column
    # of self.df -- confirm against the code that sets it.
    X = range(1, xmax) if self.X is None else self.X

    mean = self.df.mean()
    std = self.df.std()
    pylab.fill_between(X, mean + std, mean - std, color=color,
                       interpolate=False)
    pylab.plot(X, mean, color=color_line, lw=lw)

    pylab.ylim([0, 41])
    pylab.xlim([0, xmax])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def plot_contig_length_vs_GC(self, alpha=0.5):
    """Plot the ``GC`` column of ``self.df`` against its ``length`` column.

    :param alpha: transparency of the markers.
    """
    contig_lengths = self.df["length"]
    gc_percent = self.df['GC']

    pylab.plot(contig_lengths, gc_percent, "o", alpha=alpha)
    pylab.xlabel("contig length (bp)")
    pylab.ylabel("GC (%)")
    pylab.grid(True)
    # y axis is fixed to 0-100; x axis ends 10 units past the largest length.
    pylab.ylim([0, 100])
    pylab.xlim(0, max(contig_lengths) + 10)
def hist_length_repeats(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                        grid=True, label="Repeat length",
                        xlabel="Repeat length", ylabel="#"):
    """Plot a histogram of the repeat lengths.

    :param bins: histogram bins; when None, unit-width bins are built from
        ``max(0, self.threshold - 1)`` up to the longest repeat plus one.
    :param alpha: transparency of the bars.
    :param hold: when False the current figure is cleared first.
    :param fontsize: font size of the axis labels.
    :param grid: when True a grid is added.
    :param label: legend label of the histogram.
    :param xlabel: x axis label.
    :param ylabel: y axis label.
    """
    # The repeat lengths are computed on first use.
    if self._list_len_repeats is None:
        self._get_list_len_repeats()
    repeat_lengths = self._list_len_repeats

    if bins is None:
        first_edge = max(0, self.threshold - 1)
        last_edge = max(repeat_lengths) + 2
        bins = range(first_edge, last_edge)

    if hold is False:
        pylab.clf()

    pylab.hist(repeat_lengths, alpha=alpha, label=label, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def imshow_qualities(self):
    """Show the mean quality per position for each tile as an image.

    The qualities are grouped by tile, averaged column-wise, stored in
    ``self.data_imqual`` (one row per tile, tiles sorted) and displayed.

    ::

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.imshow_qualities()
        from pylab import tight_layout; tight_layout()

    """
    tiles = self._get_tile_info()

    # Group the quality entries by the tile they were read from.
    per_tile = defaultdict(list)
    for tile, quality in zip(tiles['tiles'], self.qualities):
        per_tile[tile].append(quality)

    rows = []
    for tile in sorted(per_tile.keys()):
        rows.append(pd.DataFrame(per_tile[tile]).mean().values)
    self.data_imqual = rows

    from biokit.viz import Imshow
    image = Imshow(self.data_imqual)
    image.plot(xticks_on=False, yticks_on=False, origin='lower')

    pylab.title("Quality per tile", fontsize=self.fontsize)
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("tile number")
def plot_specific_alignment(self, query_name, motif, clf=True,
                            windows=[10, 50, 100, 200, 500, 1000]):
    """Plot motif counts in sliding windows along one alignment.

    The BAM file ``self.bamfile`` is scanned entirely; when several
    alignments carry ``query_name`` the last one is used. For every window
    size, the number of occurrences of ``motif`` in each window is plotted
    and a score is printed.

    :param query_name: name of the alignment to look for.
    :param motif: motif counted inside each window.
    :param clf: when True the current figure is cleared first.
    :param windows: window sizes to use.
    """
    match = None
    for alignment in BAM(self.bamfile):
        if alignment.query_name == query_name:
            match = alignment

    if not match:
        print("Not found")
        return

    sequence = match.query_sequence
    if clf:
        pylab.clf()

    for window in windows:
        counts = [sequence[start:start + window].count(motif)
                  for start in range(len(sequence))]
        pylab.plot(counts, label=window)
        # Number of positions whose count exceeds a sixth of the window.
        score = sum(count > window / 6 for count in counts)
        print(window, score / 3.)

    pylab.legend()
    pylab.ylabel("# {} in a given sliding window".format(motif))
    pylab.title(query_name)
def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
    """Plot a bar chart of the flags contained in the BAM

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_flags()

    .. seealso:: :class:`SAMFlags` for meaning of each flag

    :param logy: log scale on the y axis (only when exactly ``True``).
    :param fontsize: font size of the axis labels.
    :param filename: when set, the figure is saved under that name.
    :return: the object returned by the pandas ``plot`` call.
    """
    flag_totals = self.get_flags_as_df().sum()
    pylab.clf()

    plot_options = {"kind": 'bar', "grid": True}
    if logy is True:
        plot_options["logy"] = logy
    barplot = flag_totals.plot(**plot_options)

    pylab.xlabel("flags", fontsize=fontsize)
    pylab.ylabel("count", fontsize=fontsize)
    pylab.tight_layout()
    if filename:
        pylab.savefig(filename)
    return barplot
def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length",
                             ylabel="#"):
    """Overlay the histograms of the ORF and CDS lengths (linear scale).

    :param alpha: transparency of the bars.
    :param bins: histogram bins.
    :param xlabel: x axis label.
    :param ylabel: y axis label.
    """
    # ORF/CDS positions are computed on first use.
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # One histogram per kind; the legend reports the number of non-missing
    # lengths of that kind.
    for kind in ("ORF", "CDS"):
        lengths = self._ORF_pos["len_" + kind].dropna()
        pylab.hist(lengths, alpha=alpha,
                   label=kind + ", N = " + str(lengths.shape[0]), bins=bins)

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend()
    pylab.title("Length of ORF and CDS (after filter %s > %d)"
                % (self._type_filter, self._threshold))
def hist_average_quality(self, fontsize=16, bins=None):
    """Stacked bars of the average quality value of HQ and LQ reads.

    A second y axis shows the number of reads remaining above each value.

    :param fontsize: font size of the legend.
    :param bins: histogram bins; bins is from 0 to 94 by default.
    """
    def average_qv(read):
        # Each quality character is converted with an offset of 33.
        return pylab.mean([ord(char) - 33
                           for char in read['quality'].decode()])

    hq_qv = [average_qv(read) for read in self.hq_sequence]
    lq_qv = [average_qv(read) for read in self.lq_sequence]

    if bins is None:
        bins = range(0, 94)
    hq_counts, edges = np.histogram(hq_qv, bins=bins)
    lq_counts, edges = np.histogram(lq_qv, bins=bins)

    # LQ bars are stacked on top of the HQ ones.
    pylab.bar(edges[1:], hq_counts, width=1, label="HQ")
    pylab.bar(edges[1:], lq_counts, bottom=hq_counts, width=1, label="LQ")
    pylab.xlim([0.5, 93.5])
    pylab.xlabel("Isoform average QV")
    pylab.ylabel("# Isoform")
    pylab.legend(fontsize=fontsize)

    ax = pylab.twinx()
    combined = hq_counts + lq_counts
    N = np.sum(combined)
    ax.plot(edges, [N] + list(N - np.cumsum(combined)), "k")
def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
                                       align="left", fontsize=16,
                                       edgecolor="k", **kwargs):
    """Histogram of the read lengths with the remaining-read curve.

    :param mode: "all" uses LQ and HQ reads, "lq" the LQ reads only and any
        other value the HQ reads only.
    :param bins: histogram bins.
    :param rwidth: relative width of the bars.
    :param align: alignment of the bars.
    :param fontsize: font size of the axis labels.
    :param edgecolor: edge colour of the bars.
    :param kwargs: forwarded to ``pylab.hist``.
    """
    pylab.clf()

    lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
    hq_lengths = [len(read['sequence']) for read in self.hq_sequence]
    if mode == "all":
        lengths = lq_lengths + hq_lengths
    elif mode == "lq":
        lengths = lq_lengths
    else:
        lengths = hq_lengths

    counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                  align=align, ec=edgecolor, **kwargs)
    axes = pylab.gca()
    axes.set_ylim(bottom=0)
    axes.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # Second axis: number of reads not yet accumulated at each bin.
    ax_twin = axes.twinx()
    half_bin = (edges[1] - edges[0]) / 2
    ax_twin.plot(edges[0:-1] - half_bin, len(lengths) - pylab.cumsum(counts),
                 "k")
    ax_twin.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def plot_bar_mapq(self, fontsize=16, filename=None):
    """Plots bar plots of the MAPQ (quality) of alignments

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_mapq()

    :param fontsize: font size of the axis labels.
    :param filename: when set, the figure is saved under that name.
    """
    df = self.get_mapq_as_df()
    # One bin per MAPQ value, from 0 up to the largest value found.
    df.plot(kind='hist', bins=range(0, df.max().values[0] + 1),
            legend=False, grid=True, logy=True)
    pylab.xlabel("MAPQ", fontsize=fontsize)
    pylab.ylabel("Count", fontsize=fontsize)
    try:
        # This may raise issue on MAC platforms; the layout adjustment is
        # best effort only.
        pylab.tight_layout()
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
    if filename:
        pylab.savefig(filename)
def plot_ranks(self, filename=None, savefig=False):
    """Scatter the peak ranks of replicate 2 against those of replicate 1.

    Peaks with a score above 540 are drawn in black, the others in red, and
    a dashed diagonal is added.

    :param filename: name of the file to save (used when ``savefig``).
    :param savefig: when True the figure is saved to ``filename``.
    """
    # ranks
    # the *score* columns contains the scaled IDR value,
    # min(int(log2(-125IDR), 1000).
    # e.g. peaks with an IDR of 0 have a score of 1000, idr 0.05 have a
    # score of int(-125log2(0.05)) = 540, and idr 1.0 has a score of 0.
    above = self.df.query('score>540')
    below = self.df.query('score<=540')

    pylab.clf()
    for subset, fmt, label in ((above, 'ko', '<0.05 IDR'),
                               (below, 'ro', '>=0.05 IDR')):
        pylab.plot(subset.rep1_rank, subset.rep2_rank, fmt, alpha=0.5,
                   label=label)
    pylab.xlabel("Peak rank - replicate 1")
    pylab.ylabel("Peak rank - replicate 2")

    n_peaks = len(self.df)
    pylab.plot([0, n_peaks], [0, n_peaks], color='blue', alpha=0.5, ls='--')
    pylab.legend(loc='lower right')
    if savefig:
        pylab.savefig(filename)
def hist_length_repeats(self, bins=20, alpha=0.5, hold=False, fontsize=12,
                        grid=True, title="Repeat length",
                        xlabel="Repeat length", ylabel="#"):
    """Plot a histogram of the repeat lengths.

    :param bins: histogram bins.
    :param alpha: transparency of the bars.
    :param hold: when False the current figure is cleared first.
    :param fontsize: font size of the axis labels.
    :param grid: when True a grid is added.
    :param title: title of the figure.
    :param xlabel: x axis label.
    :param ylabel: y axis label.
    """
    # The repeat lengths are computed on first use.
    lengths_missing = self._list_len_repeats is None
    if lengths_missing:
        self._get_list_len_repeats()

    if hold is False:
        pylab.clf()

    pylab.hist(self._list_len_repeats, alpha=alpha, bins=bins)
    pylab.title(title)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
    """Scatter plot of the gene sets selected from an enrichment result.

    Each gene set is one row; marker size and colour follow the ``size``
    column.

    :param enrich: object whose ``results`` attribute is passed to
        ``self._get_final_df``.
    :param cutoff: forwarded to ``self._get_final_df``.
    :param nmax: forwarded to ``self._get_final_df``.
    :param gene_set_size: not used by this method.
    :return: the dataframe that was plotted.
    """
    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

    pylab.clf()
    rows = range(len(df))
    set_sizes = df['size']
    # NOTE(review): the x values are -log10(adjusted p-value) whereas the
    # axis label below says "Odd ratio" -- confirm which one is intended.
    pylab.scatter(-pylab.log10(df['Adjusted P-value']), rows,
                  s=10 * set_sizes, c=set_sizes)
    pylab.xlabel("Odd ratio")
    pylab.ylabel("Gene sets")
    pylab.yticks(rows, df.name)
    _, right = pylab.xlim()
    pylab.xlim([0, right])
    pylab.grid(True)
    ax = pylab.gca()

    # Three legend entries (small, medium, large marker) labelled from the
    # largest gene-set size.
    largest = max(set_sizes)
    if largest > 100:
        labels = ("10", "100", str(largest))
    else:
        labels = (str(round(largest / 3)), str(round(largest * 2 / 3)),
                  str(largest))
    handles = [
        pylab.Line2D([0], [0], marker="o", markersize=markersize,
                     label=text, ls="")
        for markersize, text in zip((5, 10, 15), labels)
    ]
    ax.legend(handles=handles, loc="upper left", title="gene-set size")

    # 1.3 is approximately -log10(0.05).
    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.tight_layout()
    ax = pylab.colorbar(pylab.gci())
    return df
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40, xlabel="Frame",
                                   ylabel="#", bar_width=0.35):
    """Side-by-side bars of the number of ORF and CDS found in each frame.

    :param alpha: transparency of the bars.
    :param bins: not used by this method.
    :param xlabel: x axis label.
    :param ylabel: y axis label.
    :param bar_width: width of each bar.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    frames = [-3, -2, -1, 1, 2, 3]

    def count_by_frame(column):
        # Number of non-missing values of *column* for each frame.
        return [
            self._ORF_pos[self._ORF_pos["frame"] == frame][column]
            .dropna().shape[0]
            for frame in frames
        ]

    nb_res_ORF = count_by_frame("len_ORF")
    nb_res_CDS = count_by_frame("len_CDS")

    # ORF bars sit left of the frame position, CDS bars right of it.
    centers = np.array(frames)
    half_width = bar_width / 2
    pylab.bar(centers - half_width, nb_res_ORF, bar_width, alpha=alpha,
              label="ORF N = %d" % sum(nb_res_ORF))
    pylab.bar(centers + half_width, nb_res_CDS, bar_width, alpha=alpha,
              label="CDS N = %d" % sum(nb_res_CDS))
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def get_max_gc_correlation(self, reference):
    """Plot correlation between coverage and GC content by varying the GC window

    The GC content uses a moving window of size W. This parameter affects
    the correlation between coverage and GC. This function searches for the
    *optimal* window length with :func:`scipy.optimize.fmin` and plots every
    (window size, correlation) pair that was evaluated.

    :param reference: forwarded to ``self.bed.compute_gc_content``.
    :return: first element of the value returned by ``fmin``.
    """
    pylab.clf()
    tested_windows = []
    correlations = []

    def objective(params):
        window = int(round(params[0]))
        # Windows below 10 are not evaluated.
        if window < 10:
            return 0
        self.bed.compute_gc_content(reference, window)
        correlation = self.get_gc_correlation()
        correlations.append(correlation)
        tested_windows.append(window)
        return correlation

    from scipy.optimize import fmin
    # Initial guess is a window of 100.
    # NOTE(review): fmin minimises its objective -- confirm that the sign of
    # get_gc_correlation() makes the minimum the desired optimum.
    best = fmin(objective, 100, xtol=1, disp=False)

    pylab.plot(tested_windows, correlations, "o")
    pylab.xlabel("GC window size")
    pylab.ylabel("Correlation")
    pylab.grid()
    return best[0]
def hist_entropy(self, bins=50):
    """Histogram of the entropy of all found repeats

    :param bins: histogram bins.
    """
    entropy = self.df.entropy
    entropy.hist(bins=bins)
    pylab.xlabel("Entropy")
    pylab.ylabel("#")
def plot_stacked_hist(self, output_filename=None, dpi=200, kind="barh",
                      fontsize=10, edgecolor="k", lw=1, width=1,
                      ytick_fontsize=10):
    """Stacked bar plot of the transposed dataframe from ``self.get_df()``.

    :param output_filename: when set, the figure is saved under that name.
    :param dpi: resolution used when saving.
    :param kind: pandas plot kind.
    :param fontsize: font size of the axis labels.
    :param edgecolor: edge colour of the bars.
    :param lw: line width of the bar edges.
    :param width: width of the bars.
    :param ytick_fontsize: font size of the y tick labels.
    """
    transposed = self.get_df().T
    transposed.plot(kind=kind, stacked=True, edgecolor=edgecolor, lw=lw,
                    width=width)

    # Both values are fetched but not used afterwards.
    ax = pylab.gca()
    positions = pylab.yticks()

    pylab.xlabel("Percentage (%)", fontsize=fontsize)
    pylab.ylabel("Sample index/name", fontsize=fontsize)
    pylab.yticks(fontsize=ytick_fontsize)
    pylab.legend(title="kingdom")
    pylab.xlim([0, 100])

    if output_filename:
        pylab.savefig(output_filename, dpi=dpi)
def plot_unknown_barcodes(self, N=20):
    """Horizontal bar plot of the most frequent unknown barcodes per lane.

    :param N: number of barcodes to show (those with the largest total
        over all lanes).
    :return: the dataframe that was plotted (one column per lane).
    """
    ub = self.data['UnknownBarcodes']
    df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})
    if "unknown" in df.index and len(df) == 1:
        # if data is made of undetermined only, the dataframe is just made
        # of N lanes with one entry : unknown. A zero-filled 'known' row is
        # added in that case.
        df.loc['known'] = [0 for i in df.columns]

    S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
    # Reversed so that the largest total ends up at the top of the plot.
    data = df.loc[S][::-1]
    data.columns = ["Lane {}".format(x) for x in data.columns]

    from matplotlib import rcParams
    # Fix: the global setting used to be forced to False afterwards; it is
    # now restored to whatever value it had before, even on error.
    previous_axisbelow = rcParams['axes.axisbelow']
    rcParams['axes.axisbelow'] = True
    try:
        pylab.figure(figsize=(10, 8))
        ax = pylab.gca()
        data.plot(kind="barh", width=1, ec="k", ax=ax)
    finally:
        rcParams['axes.axisbelow'] = previous_axisbelow

    pylab.xlabel("Number of reads", fontsize=12)
    pylab.ylabel("")
    pylab.grid(True)
    pylab.legend(
        ["Lane {}".format(x) for x in range(1, len(df.columns) + 1)],
        loc="lower right")
    try:
        pylab.tight_layout()
    except Exception as err:
        print(err)
    return data
def plot_specific_alignment(self, bamfile, query_name, motif,clf=True,
                            show_figure=True, authorized_flags=[0,16],
                            windows=[10, 50, 100, 150,200, 250,500, 1000],
                            local_threshold=5):
    """Count a motif in sliding windows along one alignment of a BAM file.

    The first alignment named ``query_name`` whose flag is in
    ``authorized_flags`` is used.

    :param bamfile: BAM file to scan.
    :param query_name: name of the alignment to look for.
    :param motif: motif counted inside each window.
    :param clf: when True the current figure is cleared first.
    :param show_figure: when True the counts are plotted.
    :param authorized_flags: flags an alignment must have to be selected.
    :param windows: window sizes to use.
    :param local_threshold: a position is scored when its count exceeds
        this value.
    :return: for each window, the number of scored positions minus the
        window size (empty list when the alignment is not found).
    """
    selected = None
    for alignment in BAM(bamfile):
        # we may have several entries. let us pick up the first
        if (alignment.query_name == query_name
                and alignment.flag in authorized_flags):
            selected = alignment
            break

    sizes = []
    if not selected:
        print("{} Not found in {} file".format(query_name, bamfile))
        return sizes

    sequence = selected.query_sequence
    if clf:
        pylab.clf()

    for window in windows:
        counts = [sequence[start:start + window].count(motif)
                  for start in range(len(sequence))]
        if show_figure:
            pylab.plot(counts, label=window)
        score = sum(count > local_threshold for count in counts)
        sizes.append(score - window)

    if show_figure:
        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)
    return sizes
def histogram_sequence_lengths(self, logy=True):
    """Histogram sequence lengths

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_sequence_lengths()

    :param logy: when True the bar heights are the log10 of the counts.
    """
    data = [len(x) for x in self.sequences]
    longest = max(data)

    # Fix: the edges used to stop at ``longest``. Because the last bin of
    # numpy.histogram is closed on both sides, reads of the maximal length
    # were merged into the bin starting at ``longest - 1``. One extra edge
    # gives every length its own bin.
    bary, barx = np.histogram(data, bins=range(longest + 2))

    # get rid of zeros to avoid warnings (log10 of 0)
    non_empty = [(x, y) for x, y in zip(barx, bary) if y != 0]
    bx = [x for x, _ in non_empty]
    by = [y for _, y in non_empty]

    if logy:
        pylab.bar(bx, pylab.log10(by))
    else:
        pylab.bar(bx, by)

    pylab.xlim([1, longest + 1])
    pylab.grid(True)
    pylab.xlabel("position (bp)", fontsize=self.fontsize)
    # Fix: the label no longer claims a log scale when logy is False.
    ylabel = "Count (log scale)" if logy else "Count"
    pylab.ylabel(ylabel, fontsize=self.fontsize)
def plot_count_per_sample(self, fontsize=12, sample_list=None):
    """Number of mapped reads per sample. Each color for each replicate

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    :param fontsize: font size of the labels and title.
    :param sample_list: not used by this method.
    """
    sample_names = self.sample_names
    N = len(sample_names)
    # Column sums of the samples, expressed in millions below.
    totals = self.df[sample_names].sum()

    pylab.clf()
    # One colour per sample, looked up through the sample's condition.
    colors = [self.colors[self.get_cond_from_sample(sample)]
              for sample in self.sample_names]
    pylab.bar(range(N), (totals / 1000000).values, color=colors, alpha=1,
              zorder=10, lw=1, ec="k", width=0.9)
    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("Total read count (millions)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(range(N), self.sample_names)
def hist_contig_length(self, bins=30, fontsize=16):
    """Histogram of the ``length`` column of ``self.df``.

    :param bins: histogram bins.
    :param fontsize: font size of the axis labels.
    """
    pylab.clf()
    contig_lengths = self.df.length
    pylab.hist(contig_lengths, lw=1, ec="k", bins=bins)
    pylab.grid()
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("#", fontsize=fontsize)
    n_contigs = len(self.df)
    pylab.title("Distribution {} contigs".format(n_contigs))
def plot_volcano(self):
    """Volcano plot: -log10(adjusted p-value) against log2 fold change.

    Entries with ``padj > 0.05`` are drawn in red, the others in black, and
    a dashed line marks the 0.05 threshold.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_volcano()

    """
    above_threshold = self.df.query("padj>0.05")
    below_threshold = self.df.query("padj<=0.05")

    fig = pylab.figure()
    for subset, point_color in ((above_threshold, "r"),
                                (below_threshold, "k")):
        pylab.plot(subset.log2FoldChange, -np.log10(subset.padj),
                   marker="o", alpha=0.5, color=point_color, lw=0)

    pylab.grid(True)
    pylab.xlabel("fold change")
    pylab.ylabel("log10 adjusted p-value")

    # Symmetric x range around zero.
    fold_changes = self.df.log2FoldChange
    limit = max(abs(min(fold_changes)), max(fold_changes))
    pylab.xlim([-limit, limit])

    _, top = pylab.ylim()
    pylab.ylim([0, top])
    pylab.axhline(-np.log10(0.05), lw=2, ls="--", color="r",
                  label="pvalue threshold (0.05)")
def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
    """Number Of Polymerase Reads Per Barcode

    The barcoded counts are plotted in decreasing order together with their
    average and, optionally, the count of the not-barcoded entry.

    :param fontsize: font size of the axis labels.
    :param unbarcoded: when True, a dashed line shows the first
        "Polymerase Reads" value of ``self.df_not_barcoded`` if available.
    """
    PR = self.df_barcoded["Polymerase Reads"].sum()
    data = self.df_barcoded['Polymerase Reads'].sort_values(
        ascending=False).values

    pylab.plot(list(range(1, len(data) + 1)), data, label="barcodes")
    pylab.axhline(data.mean(), color="r", label="average")

    if unbarcoded is True:
        try:
            unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
            pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
        except Exception:
            # Best effort: the not-barcoded data may be missing or empty.
            # Narrowed from a bare except.
            pass

    pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
    pylab.ylabel("Counts of Reads", fontsize=fontsize)
    pylab.title("Total Polymerase count: {}".format(PR))
    pylab.legend()
    # Fix: the ``ymin`` keyword was removed from matplotlib; ``bottom`` is
    # the supported name.
    pylab.ylim(bottom=0)
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_idr_vs_peaks(self, filename=None, savefig=False):
    """Plot the IDR against the cumulative number of peaks.

    Peaks with an IDR below ``self.threshold`` are drawn in red, the
    remaining ones in black, continuing from the red curve.

    :param filename: name of the file to save (used when ``savefig``).
    :param savefig: when True the figure is saved to ``filename``.
    """
    # global_idr is actually -log10(idr)
    pylab.clf()

    X1 = pylab.linspace(0, self.threshold, 100)
    X2 = pylab.linspace(self.threshold, 1, 100)

    # Fix: the second query string was garbled ("idr>[email protected]") and
    # could not be parsed; it must be the complement of the first one.
    df1 = self.df.query("idr<@self.threshold")
    df2 = self.df.query("idr>=@self.threshold")

    pylab.plot([sum(df1['idr'] < x) for x in X1], X1, '-', color='r', lw=2)
    # The second curve starts where the first one ends.
    shift = len(df1)
    pylab.plot([shift + sum(df2['idr'] < x) for x in X2], X2, "-",
               color='k', lw=2)

    pylab.xlabel('Number of significant peaks')
    pylab.ylabel('IDR')
    pylab.axhline(0.05, color='b', ls='--')
    pylab.axvline(self.N_significant_peaks, color='b', ls='--')
    if savefig:
        pylab.savefig(filename)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR
    :param float alpha: transparency of the histograms
    :param bool hold: when False the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str title:
    :param clip_upper_SNR: SNR values above this limit are clipped to it

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot: show a placeholder image instead
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # Largest SNR over the four letters, capped at clip_upper_SNR.
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    for letter in "ACGT":
        # Fix: Series.clip_upper was removed from pandas;
        # clip(upper=...) is the equivalent call.
        values = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
        pylab.hist(values, alpha=alpha, label=letter, bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_plot_contig_length(self, bins=40, fontsize=16):
    """Plot distribution of contig lengths

    :param bins: histogram bins.
    :param fontsize: font size of the axis labels.
    """
    n_contigs = len(self.fasta.sequences)
    contig_lengths = self.fasta.lengths

    pylab.hist(contig_lengths, lw=1, ec="k", bins=bins)
    pylab.grid()
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("#", fontsize=fontsize)
    pylab.title("Distribution {} contigs".format(n_contigs))
def plot_subreads_histogram(self, bins=10, fontsize=12):
    """Histogram of the ``Subreads`` column of ``self.df_barcoded``.

    :param bins: histogram bins.
    :param fontsize: font size of the axis labels.
    """
    self.df_barcoded['Subreads'].hist(bins=bins, ec="k", rwidth=0.8)
    pylab.xlabel("Number of subreads", fontsize=fontsize)
    pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)
    try:
        # Layout adjustment is best effort only.
        pylab.tight_layout()
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
def plot_pvalue_hist(self, bins=60, fontsize=16, rotation=0):
    """Histogram of the non-missing values of ``self.df.pvalue``.

    :param bins: histogram bins.
    :param fontsize: font size of the axis labels.
    :param rotation: not used by this method.
    """
    pylab.hist(self.df.pvalue.dropna(), bins=bins, ec="k")
    pylab.grid(True)
    pylab.xlabel("raw p-value", fontsize=fontsize)
    pylab.ylabel("Occurences", fontsize=fontsize)
    try:
        # Layout adjustment is best effort only.
        pylab.tight_layout()
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
def plot_padj_hist(self, bins=60, fontsize=16):
    """Histogram of the non-missing values of ``self.df.padj``.

    :param bins: histogram bins.
    :param fontsize: font size of the axis labels.
    """
    pylab.hist(self.df.padj.dropna(), bins=bins, ec="k")
    pylab.grid(True)
    pylab.xlabel("Adjusted p-value", fontsize=fontsize)
    pylab.ylabel("Occurences", fontsize=fontsize)
    try:
        # Layout adjustment is best effort only.
        pylab.tight_layout()
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
def scatter_length_cov_gc(self, min_length=200, min_cov=10):
    """Log-log scatter of contig coverage against length, coloured by GC.

    :param min_length: position of the dashed vertical line.
    :param min_cov: position of the dashed horizontal line.
    """
    pylab.clf()
    contigs = self.df
    pylab.scatter(contigs.length, contigs['cov'], c=contigs.GC)
    pylab.loglog()

    # Dashed red lines at the two given thresholds.
    threshold_style = dict(lw=2, c="r", ls='--')
    pylab.axvline(min_length, **threshold_style)
    pylab.axhline(min_cov, **threshold_style)

    pylab.xlabel("contig length")
    pylab.ylabel("contig coverage")
    pylab.colorbar(label="GC")
    pylab.grid(True)
def plot(self, fontsize=16):
    """plot quality versus base position

    :param fontsize: font size of the axis labels.
    """
    curve_label = "offset: %s" % self.offset
    pylab.plot(self.quality, label=curve_label)
    pylab.xlabel('base position', fontsize=fontsize)
    pylab.ylabel('Quality per base', fontsize=fontsize)
    pylab.grid(True)

    # ylim set autoscale to off so if we want to call this function several
    # times, we must reset autoscale to on before calling ylim
    pylab.autoscale()
    limits = pylab.ylim()
    lower = max(0, limits[0] - 1)
    upper = limits[1] + 1
    pylab.ylim(lower, upper)
def boxplot_mapq_concordance(self):
    """Box plots of the concordance for each mapq value from 1 to 60."""
    # method can only be bwa for now
    assert self.method == "bwa"

    records = self._get_data()
    df = pd.DataFrame(records, columns=["mapq", "length", "concordance"])

    pylab.clf()
    per_mapq = []
    for mapq in range(1, 61):
        per_mapq.append(df[df.mapq == mapq]['concordance'])
    pylab.boxplot(per_mapq)

    pylab.xlabel("mapq")
    pylab.ylabel("concordance")
    pylab.grid()
    ticks = [10, 20, 30, 40, 50, 60]
    pylab.xticks(ticks, ticks)
def plot_indel_dist(self, fontsize=16):
    """Plot indel count (+ ratio)

    :param fontsize: font size of the axis labels.
    :Return: list of insertions, deletions and ratio insertion/deletion for
        different length starting at 1
    :raises ValueError: when no insertion or no deletion was found.

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_indel_dist()

    What you see on this figure is the presence of 10 insertions of length
    1, 1 insertion of length 2 and 3 deletions of length 1

    Note that in samtools, several insertions or deletions in a single
    alignment are ignored and only the first one seems to be reported. For
    instance 10M1I10M1I stored only 1 insertion in its report; Same comment
    for deletions.

    .. todo:: speed up and handle long reads cases more efficiently by
        storing INDELS as histograms rather than lists

    """
    try:
        self.insertions
    except AttributeError:
        # The indels have not been computed yet. Narrowed from a bare
        # except, which also hid unrelated errors.
        self._set_indels()

    if len(self.insertions) == 0 or len(self.deletions) == 0:
        raise ValueError("No deletions or insertions found")

    # Count each length once instead of calling list.count() for every
    # possible length.
    deletion_counts = Counter(self.deletions)
    insertion_counts = Counter(self.insertions)

    N = max(max(deletion_counts), max(insertion_counts)) + 1
    D = [deletion_counts[i] for i in range(N)]
    I = [insertion_counts[i] for i in range(N)]
    # The ratio is set to 0 where there is no deletion of that length.
    R = [i / d if d != 0 else 0 for i, d in zip(I, D)]

    fig, ax = pylab.subplots()
    ax.plot(range(N), I, marker="x", label="Insertions")
    ax.plot(range(N), D, marker="x", label="Deletions")
    ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
    ax.set_yscale("symlog")
    pylab.ylim([1, pylab.ylim()[1]])
    pylab.legend()
    pylab.grid()

    from matplotlib.ticker import MaxNLocator
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    pylab.xlabel("Indel length", fontsize=fontsize)
    pylab.ylabel("Indel count", fontsize=fontsize)
    return I, D, R
def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length",
                             ylabel="#"):
    """Histograms of the ORF lengths and of the CDS lengths on one figure.

    :param alpha: transparency of the bars.
    :param bins: histogram bins.
    :param xlabel: x axis label.
    :param ylabel: y axis label.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # Missing lengths are discarded before counting and plotting.
    orf_lengths = self._ORF_pos["len_ORF"].dropna()
    cds_lengths = self._ORF_pos["len_CDS"].dropna()
    n_ORF = orf_lengths.shape[0]
    n_CDS = cds_lengths.shape[0]

    pylab.hist(orf_lengths, alpha=alpha, label="ORF, N = " + str(n_ORF),
               bins=bins)
    pylab.hist(cds_lengths, alpha=alpha, label="CDS, N = " + str(n_CDS),
               bins=bins)
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend()

    filter_info = (self._type_filter, self._threshold)
    pylab.title("Length of ORF and CDS (after filter %s > %d)" % filter_info)
def hist_coverage(self, bins=100):
    """Histogram of ``self.coverage``

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.hist_coverage()

    :param bins: histogram bins.
    """
    try:
        self.coverage
    except AttributeError:
        # The coverage has not been computed yet. Narrowed from a bare
        # except, which also hid unrelated errors.
        self._set_coverage()

    pylab.hist(self.coverage, bins=bins)
    pylab.xlabel("Coverage")
    pylab.ylabel("Number of mapped bases")
    pylab.grid()
def hist_length_repeats(self, bins=20, alpha=0.5, hold=False, fontsize=12,
                        grid=True, title="Repeat length",
                        xlabel="Repeat length", ylabel="#", logy=True):
    """Plot a histogram of ``self.list_len_repeats``.

    :param bins: histogram bins.
    :param alpha: transparency of the bars.
    :param hold: when False the current figure is cleared first.
    :param fontsize: font size of the axis labels.
    :param grid: when True a grid is added.
    :param title: title of the figure.
    :param xlabel: x axis label.
    :param ylabel: y axis label.
    :param logy: when true the y axis uses a log scale.
    """
    if hold is False:
        pylab.clf()

    repeat_lengths = self.list_len_repeats
    pylab.hist(repeat_lengths, alpha=alpha, bins=bins)

    pylab.title(title)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)

    if grid is True:
        pylab.grid(True)
    if logy:
        pylab.semilogy()
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40, xlabel="Frame",
                                   ylabel="#", bar_width=0.35):
    """Bar plot of the number of ORF and CDS found in each reading frame.

    :param alpha: transparency of the bars.
    :param bins: not used by this method.
    :param xlabel: x axis label.
    :param ylabel: y axis label.
    :param bar_width: width of each bar.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # number of ORF and CDS found by frame
    frames = [-3, -2, -1, 1, 2, 3]
    nb_res_ORF, nb_res_CDS = [], []
    for frame in frames:
        in_frame = self._ORF_pos[self._ORF_pos["frame"] == frame]
        nb_res_ORF.append(in_frame["len_ORF"].dropna().shape[0])
        nb_res_CDS.append(in_frame["len_CDS"].dropna().shape[0])

    # The two series are shifted by half a bar on each side of the frame.
    positions = np.array(frames)
    offset = bar_width / 2
    orf_label = "ORF N = %d" % sum(nb_res_ORF)
    cds_label = "CDS N = %d" % sum(nb_res_CDS)
    pylab.bar(positions - offset, nb_res_ORF, bar_width, alpha=alpha,
              label=orf_label)
    pylab.bar(positions + offset, nb_res_CDS, bar_width, alpha=alpha,
              label=cds_label)

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def plot_bar_mapq(self, fontsize=16, filename=None):
    """Plots bar plots of the MAPQ (quality) of alignments

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_mapq()

    :param fontsize: font size of the axis labels.
    :param filename: when set, the figure is saved under that name.
    """
    mapq = self.get_mapq_as_df()
    # One bin per MAPQ value, from 0 up to the largest value found.
    largest = mapq.max().values[0]
    mapq.plot(kind='hist', bins=range(0, largest + 1), legend=False,
              grid=True, logy=True)

    pylab.xlabel("MAPQ", fontsize=fontsize)
    pylab.ylabel("Count", fontsize=fontsize)
    pylab.tight_layout()
    if filename:
        pylab.savefig(filename)
def pie_plot(self, filename=None, hold=False):
    """Plot PIE plot of the status (complete / fragment / missed)

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.pie_plot()

    :param filename: when set, the figure is saved under that name.
    :param hold: when False the current figure is cleared first.
    """
    if hold is False:
        pylab.clf()

    # Number of entries for each value of the Status column.
    status_counts = self.df.groupby('Status').count()['# Busco id']
    status_counts.plot(kind="pie")
    pylab.ylabel("")

    if filename:
        pylab.savefig(filename)
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
            grid=True, xlabel="GC %", ylabel="#", label="",title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: when False the current figure is cleared first
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: defaults to a title reporting the mean GC

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    gc_content = self.df.loc[:, 'GC_content']
    mean_GC = np.mean(gc_content)

    # set title if needed
    if title is None:
        title = "GC %% \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(gc_content, bins=bins, alpha=alpha,
               label=label + ", mean : " + str(round(mean_GC, 2))
               + ", N : " + str(len(self)))
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])
    try:
        # Layout adjustment is best effort only.
        pylab.tight_layout()
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
def plot_acgt_content(self, stacked=False):
    """Plot the content returned by ``get_actg_content`` per position

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.plot_acgt_content()

    :param stacked: when exactly ``True`` a stacked bar plot is drawn,
        otherwise a line plot with a grid.
    """
    content = self.get_actg_content()
    if stacked is not True:
        content.plot()
        pylab.grid(True)
    else:
        content.plot.bar(stacked=True)
    pylab.xlabel("position (bp)", fontsize=self.fontsize)
    pylab.ylabel("percent", fontsize=self.fontsize)
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                   grid=True, xlabel="Number of ZMW passes", logy=True,
                   ylabel="#", label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: binning for the histogram (forwarded to ``pylab.hist``)
    :param float alpha: transparency of the histograms
    :param bool hold: when False the current figure is cleared first
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()
    # Fix: k used to be defined only when bins was None, so passing bins
    # explicitly raised an UnboundLocalError on the len(k) test below.
    k = range(1, max_nb_pass + 1)

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
               label=label, log=logy, width=1)

    # With fewer than 5 distinct pass counts, use fixed ticks from 0 to 5.
    if len(k) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
    """Select simulated reads with an accept/reject loop and plot the result.

    Each read length of ``self.bam_simul`` is taken in turn as a candidate
    and accepted with a probability derived from
    ``self.target_distribution``. The decisions are stored in
    ``self.tokeep`` (one boolean per simulated read).

    :param bins: binning of the input read-length histogram and of the plot.
    :param xmin: start of the x range of the plotted target curve.
    :param xmax: end of the x range of the plotted target curve.
    :param step: step of the x range of the plotted target curve.
    :param burn: number of leading accepted values left out of the histogram.
    :param alpha: upper bound of the acceptance probability.
    :param output_filename: when not None, passed with ``self.tokeep`` to
        ``self.bam_simul.filter_bool``.
    """
    # compute histogram of the input reads once for all to be used
    # in the target_distribution method
    self.bins = bins
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins,
                                  density=True)

    candidates = self.bam_simul.df.read_length.values
    n_candidates = self.bam_simul.df.shape[0]

    self.tokeep = []
    accepted = []
    current = self.bam.df.read_length.mean()
    for index in range(n_candidates):
        candidate = candidates[index]
        ratio = (self.target_distribution(candidate)
                 / self.target_distribution(current))
        # acceptance probability
        acceptance = min([alpha, ratio])
        draw = pylab.uniform(0, 1)
        if draw < acceptance:
            current = candidate
            accepted.append(current)
            self.tokeep.append(True)
        else:
            self.tokeep.append(False)

    # plotting the results:
    # theoretical curve
    grid_x = pylab.arange(xmin, xmax, step)
    grid_y = self.target_distribution(grid_x)

    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(accepted)

    pylab.subplot(212)
    pylab.hist(accepted[burn:], bins=bins, density=1)
    pylab.plot(grid_x, grid_y, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot(self, bins=80, rwidth=0.8, **kwargs):
    """Histogram of ``self.data`` with the instance's labels and title.

    :param bins: histogram bins.
    :param rwidth: relative width of the bars.
    :param kwargs: forwarded to ``pylab.hist``.
    """
    pylab.clf()
    # The returned counts and edges were only used by a disabled code block
    # (kept as a bare string literal), which has been removed.
    pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)
    pylab.xlabel(self.xlabel, fontsize=self.fontsize)
    pylab.ylabel(self.ylabel, fontsize=self.fontsize)
    pylab.grid(self.grid)
    pylab.title(self.title)
    try:
        # Layout adjustment is best effort only.
        pylab.tight_layout()
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                     grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot GC content versus read length

    :param bool hold: when False the current figure is cleared first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add grid or not
    :param str xlabel: not used by this method
    :param str ylabel: not used by this method
    :param cmap: colormap forwarded to the 2D histogram plot

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    read_lengths = self.df.loc[:, 'read_length']
    gc_content = self.df.loc[:, 'GC_content']
    mean_len = np.mean(read_lengths)
    mean_GC = np.mean(gc_content)

    if hold is False:
        pylab.clf()

    # Rows with a missing length or GC value are left out of the histogram.
    pairs = self.df.loc[:, ['read_length', 'GC_content']].dropna()
    histogram = biokit.viz.hist2d.Hist2D(pairs)
    res = histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                         cmap=cmap)

    # NOTE(review): the axis labels are hard-coded; the xlabel and ylabel
    # arguments are ignored -- confirm whether this is intended.
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    title_values = (mean_len, mean_GC)
    pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
                % title_values, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
         textcolor="red", **kargs):
    """A simple non-interactive plot of taxons

    :param kind: "pie" or "barh".
    :param cmap: colormap of the pie chart.
    :param threshold: taxons whose percentage is below this value are
        summed into a single "others" entry; must be in ]0, 100[.
    :param radius: radius of the pie chart.
    :param textcolor: colour of the texts of the pie chart.
    :param kargs: forwarded to the pandas ``plot`` call.
    :return: None if no taxon were found and a dataframe otherwise

    A Krona Javascript output is also available in :meth:`kraken_to_krona`

    .. plot::
        :include-source:

        from sequana import KrakenResults, sequana_data
        test_file = sequana_data("test_kraken.out", "testing")
        k = KrakenResults(test_file)
        df = k.plot(kind='pie')

    .. seealso:: to generate the data see :class:`KrakenPipeline`
        or the standalone application **sequana_taxonomy**.
    """
    if len(self._df) == 0:
        return

    if self._data_created == False:
        status = self.kraken_to_krona()

    if kind not in ['barh', 'pie']:
        logger.error('kind parameter: Only barh and pie are supported')
        return

    # This may have already been called but maybe not. This is not time
    # consuming, so we call it again here
    if len(self.taxons.index) == 0:
        return None

    # Fix: the removed pandas ``.ix`` indexer is replaced by ``.loc``.
    # NOTE(review): assumes the taxon identifiers used as index are
    # integers, so that ``.ix[-1]`` was label-based like ``.loc[-1]`` --
    # confirm.
    df = self.get_taxonomy_biokit(list(self.taxons.index))
    df.loc[-1] = ["Unclassified"] * 8
    data = self.taxons.copy()
    data.loc[-1] = self.unclassified

    # Convert to percentages.
    data = data / data.sum() * 100
    assert threshold > 0 and threshold < 100
    others = data[data < threshold].sum()
    # Fix: values exactly equal to the threshold used to be dropped from
    # both groups; they are now kept.
    data = data[data >= threshold]

    names = df.loc[data.index]['name']
    data.index = names.values
    data.loc['others'] = others
    try:
        data.sort_values(inplace=True)
    except AttributeError:
        # Fallback for pandas versions without sort_values.
        data.sort(inplace=True)

    # text may be long so, let us increase the figsize a little bit
    pylab.figure(figsize=(10, 8))
    pylab.clf()
    if kind == "pie":
        ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                       radius=radius, **kargs)
        pylab.ylabel(" ")
        for text in ax.texts:
            # large, x-small, small, None, x-large, medium, xx-small,
            # smaller, xx-large, larger
            text.set_size("small")
            text.set_color(textcolor)
        for wedge in ax.patches:
            wedge.set_linewidth(1)
            wedge.set_edgecolor("k")
        self.ax = ax
    elif kind == "barh":
        ax = data.plot(kind=kind, **kargs)
        pylab.xlabel(" percentage ")

    return data
def hist_passes(self, maxp=50, fontsize=16):
    """Histogram of the number of passes, clipped at ``maxp``.

    :param maxp: values above this limit are set to it; also used as the
        number of bins and as the upper x limit.
    :param fontsize: font size of the axis labels.
    """
    passes = self.df.nb_passes.copy()
    # Fix: Series.clip_upper was removed from pandas; clip(upper=...) is the
    # equivalent call.
    passes.clip(upper=maxp).hist(bins=maxp)
    pylab.xlim([0, maxp])
    pylab.ylabel("# count", fontsize=fontsize)
    pylab.xlabel("Passes (max {})".format(maxp), fontsize=fontsize)