def plot_idr_vs_peaks(self, filename=None, savefig=False):
    """Plot the IDR value against the cumulative number of peaks.

    Peaks whose ``idr`` is below ``self.threshold`` are drawn in red; the
    remaining peaks are drawn in black, shifted by the number of peaks
    found below the threshold. A horizontal dashed line is drawn at 0.05
    and a vertical dashed line at ``self.N_significant_peaks``.

    :param filename: output image path, used only when *savefig* is True.
    :param savefig: if True, save the current figure into *filename*.
    """
    # global_idr is actually -log10(idr)
    pylab.clf()

    # Bind the threshold to a local name so that the pandas queries can
    # reference it unambiguously as ``@threshold``.
    threshold = self.threshold
    X1 = pylab.linspace(0, threshold, 100)
    X2 = pylab.linspace(threshold, 1, 100)

    # convert global idr to proba
    df1 = self.df.query("idr<@threshold")
    df2 = self.df.query("idr>=@threshold")

    pylab.plot([(df1['idr'] < x).sum() for x in X1], X1, '-', color='r', lw=2)
    # the second curve starts where the first one ends
    shift = len(df1)
    pylab.plot([shift + (df2['idr'] < x).sum() for x in X2], X2, "-",
               color='k', lw=2)

    pylab.xlabel('Number of significant peaks')
    pylab.ylabel('IDR')
    pylab.axhline(0.05, color='b', ls='--')
    pylab.axvline(self.N_significant_peaks, color='b', ls='--')
    if savefig:
        pylab.savefig(filename)
def plot_bar_mapq(self, fontsize=16, filename=None):
    """Plots bar plots of the MAPQ (quality) of alignments

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_mapq()

    :param fontsize: font size of the axis labels.
    :param filename: if set, the figure is saved into this file.
    """
    df = self.get_mapq_as_df()
    # one bin per MAPQ value, from 0 up to the largest value observed
    df.plot(kind='hist', bins=range(0, df.max().values[0] + 1),
            legend=False, grid=True, logy=True)
    pylab.xlabel("MAPQ", fontsize=fontsize)
    pylab.ylabel("Count", fontsize=fontsize)
    try:
        # This may raise issue on MAC platforms
        pylab.tight_layout()
    except Exception:
        # Best effort only: a layout failure must not prevent saving, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        pass
    if filename:
        pylab.savefig(filename)
def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
    """Bar plot of the summed flag columns found in the BAM

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_flags()

    .. seealso:: :class:`SAMFlags` for meaning of each flag

    :param logy: log scale on the Y axis; only applied when it is
        exactly ``True``.
    :param fontsize: font size of the axis labels.
    :param filename: if set, the figure is saved into this file.
    :return: the object returned by the pandas ``plot`` call.
    """
    flag_counts = self.get_flags_as_df().sum()
    pylab.clf()

    plot_options = {"kind": "bar", "grid": True}
    if logy is True:
        plot_options["logy"] = logy
    barplot = flag_counts.plot(**plot_options)

    pylab.xlabel("flags", fontsize=fontsize)
    pylab.ylabel("count", fontsize=fontsize)
    pylab.tight_layout()
    if filename:
        pylab.savefig(filename)
    return barplot
def plot_gc_vs_coverage(self, filename=None, bins=None, Nlevels=6,
                        fontsize=20, norm="log", ymin=0, ymax=100,
                        contour=True, **kwargs):
    """Plot a 2D histogram of the GC content (%) versus the coverage.

    :param filename: if set, the figure is saved into this file.
    :param bins: bins forwarded to ``Hist2D.plot``. When None, a pair is
        derived from the GC range, ``self.bed.gc_window_size`` and the
        maximum coverage.
    :param Nlevels: number of contour levels; None or 0 disables contours.
    :param fontsize: font size forwarded to ``Hist2D.plot``.
    :param norm: normalisation forwarded to ``Hist2D.plot``.
    :param ymin: lower limit of the Y axis.
    :param ymax: upper limit of the Y axis.
    :param contour: whether contours are requested.
    :param kwargs: any other argument forwarded to ``Hist2D.plot``.
    """
    if Nlevels is None or Nlevels == 0:
        contour = False

    data = self.df[['cov', 'gc']].copy()
    data['gc'] *= 100
    data = data.dropna()

    if bins is None:
        bins = [100, min(int(data['gc'].max() - data['gc'].min() + 1),
                         max(5, self.bed.gc_window_size - 4))]
        bins[0] = max(10, min(bins[0], self.df['cov'].max()))

    from biokit import Hist2D
    h2 = Hist2D(data)

    # Arguments shared by the first attempt and by the fallback.
    plot_kwargs = dict(bins=bins, xlabel="Per-base coverage",
                       ylabel=r'GC content (%)', Nlevels=Nlevels,
                       norm=norm, fontsize=fontsize, **kwargs)
    try:
        h2.plot(contour=contour, **plot_kwargs)
    except Exception:
        # Second attempt without contours if the first call failed.
        h2.plot(contour=False, **plot_kwargs)

    pylab.ylim([ymin, ymax])
    try:
        pylab.tight_layout()
    except Exception:
        # Best effort only: a layout failure must not prevent saving.
        pass
    if filename:
        pylab.savefig(filename)
def scatter_plot(self, filename=None, hold=False):
    """Scatter plot of the score versus length of each ortholog

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    Each status is drawn on the same axes with its own colour, marker and
    legend entry. Rows without a ``Length`` or ``Score`` value are dropped
    before plotting, so a status with no usable row is not drawn.

    :param filename: if set, the figure is saved into this file.
    :param hold: if False, the current figure is cleared first.
    """
    if hold is False:
        pylab.clf()

    styles = [
        ("Complete", "green", 'o'),
        ("Fragmented", "orange", 's'),
        ("Missing", "red", 'x'),
        ("Duplicated", "blue", 'o'),
    ]
    for this, color, marker in styles:
        # Select the rows of the current status (previously "Complete" was
        # hard-coded, so the same points were drawn on every iteration).
        subset = self.df[self.df.Status == this]
        subset = subset.dropna(subset=["Length", "Score"])
        if len(subset) > 0:
            # ax= keeps every status on the same axes instead of letting
            # pandas open a new figure per call.
            subset.plot(x="Length", y="Score", kind="scatter", color=color,
                        ax=pylab.gca(), marker=marker, label=this)

    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def venn(self, compa_list, direction="all", prefix=""):
    """
    Plot a venn diagram comparing the list compa_list of dr gene lists.

    compa_list is a list of comparison names from Deseq2 results
    direction specifies either if up/down/all dr genes are considered
    prefix is a string to be added as prefix to the outfile name.

    compa_list can be a list of lists of comparisons to make.
    ie [["WT", "KO1"],["WT", "KO2"]]

    The figure is saved as
    ``<self.out_dir>/vennDiagrams/<prefix>vennDiagrams_<direction>.pdf``.
    """
    from sequana.viz.venn import plot_venn

    # If compa_list is a list of lists of comparison
    if all(isinstance(l, list) for l in compa_list):
        # One row per group of comparisons (the grid used to be fixed to
        # 6 rows, which failed with more than 6 groups). The height keeps
        # the former per-panel size (20 / 6 per row).
        n_rows = max(1, len(compa_list))
        _, axes = pylab.subplots(n_rows, 1, figsize=(6, 20 * n_rows / 6),
                                 squeeze=False)
        for ax, comparisons in zip(axes.flat, compa_list):
            plot_venn(
                [self.dr_gene_lists[x][direction] for x in comparisons],
                list(comparisons),
                ax=ax,
            )
    # If compa is only a list of comparisons
    else:
        plot_venn(
            [self.dr_gene_lists[x][direction] for x in compa_list],
            list(compa_list),
        )

    out_dir = os.path.join(self.out_dir, "vennDiagrams")
    os.makedirs(out_dir, exist_ok=True)
    outfile = os.path.join(out_dir, f"{prefix}vennDiagrams_{direction}.pdf")
    pylab.savefig(outfile, bbox_inches="tight")
def scatter_plot(self, filename=None, hold=False):
    """Score versus length scatter plot, one series per ortholog status

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    Missing are not shown since there is no information about contig.

    :param filename: if set, the figure is saved into this file.
    :param hold: if False, the current figure is cleared first.
    """
    if hold is False:
        pylab.clf()

    # (status, colour, marker) for each series drawn on the current axes
    series = (
        ("Complete", "green", 'o'),
        ("Fragmented", "orange", 's'),
        ("Duplicated", "red", 'x'),
    )
    for status, colour, marker in series:
        selected = self.df.Status == status
        if not selected.any():
            continue
        self.df[selected].plot(x="Length", y="Score", kind="scatter",
                               color=colour, ax=pylab.gca(), marker=marker,
                               label=status)

    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def find_motif(bamfile, motif="CAGCAG", window=200, savefig=False,
               local_th=5, global_th=10):
    """Scan the alignments of a BAM file for a repeated motif.

    For every alignment with a query sequence, the motif is counted in a
    window of *window* bases starting at each position. The alignment is a
    hit when more than *global_th* positions have a count greater than
    *local_th*.

    :param bamfile: input passed to ``BAM``.
    :param motif: the motif to count.
    :param window: length of the window starting at each position.
    :param savefig: if True, save one PNG per hit in the current directory.
    :param local_th: per-position count that must be exceeded.
    :param global_th: number of such positions that must be exceeded.
    :return: ``(alns, found, Ss)``: the alignments scanned, a boolean per
        alignment telling whether it is a hit, and the number of positions
        above *local_th* per alignment.
    """
    b1 = BAM(bamfile)

    # FIND motif and create pictures
    found = []
    Ss = []
    alns = []
    for a in b1:
        if a.query_sequence is None:
            continue
        seq = a.query_sequence
        X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
        S = sum(x > local_th for x in X1)
        Ss.append(S)
        # was ``als.append(a)``: an undefined name raising NameError
        alns.append(a)
        if S > global_th:
            found.append(True)
            off = a.query_alignment_start
            pylab.clf()
            pylab.plot(range(off + a.reference_start,
                             off + a.reference_start + len(seq)), X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    a.reference_name, S, a.query_name.replace("/", "_")))
        else:
            found.append(False)

    return alns, found, Ss
def plot_stacked_hist(self, output_filename=None, dpi=200, kind="barh",
                      fontsize=10, edgecolor="k", lw=1, width=1,
                      ytick_fontsize=10):
    """Stacked bar plot of the transposed table returned by ``get_df``.

    :param output_filename: if set, the figure is saved into this file.
    :param dpi: resolution used when saving.
    :param kind: pandas plot kind.
    :param fontsize: font size of the axis labels.
    :param edgecolor: edge colour of the bars.
    :param lw: line width of the bar edges.
    :param width: width of the bars.
    :param ytick_fontsize: font size of the Y tick labels.
    """
    df = self.get_df()
    df.T.plot(kind=kind, stacked=True, edgecolor=edgecolor, lw=lw,
              width=width)

    pylab.xlabel("Percentage (%)", fontsize=fontsize)
    pylab.ylabel("Sample index/name", fontsize=fontsize)
    pylab.yticks(fontsize=ytick_fontsize)
    pylab.legend(title="kingdom")
    pylab.xlim([0, 100])
    if output_filename:
        pylab.savefig(output_filename, dpi=dpi)
def plot_go_terms_up(filename, ontologies, case_name, df):
    """Record the 'up' enrichment table and save the current figure.

    A copy of *df* and the number of '+' and '-' entries of its
    ``plus_minus`` column are stored under *case_name*. The current figure
    is then written twice (Panther_up_<case_name>.png in
    ``config.output_dir`` and *filename*) and closed.

    NOTE(review): ``self`` and ``config`` are not parameters; they are
    presumably provided by the enclosing scope. *ontologies* is unused.
    """
    self._temp_df[case_name] = df.copy()
    signs = df.plus_minus
    self._plus[case_name] = sum(signs == '+')
    self._minus[case_name] = sum(signs == '-')

    for target in (f"{config.output_dir}/Panther_up_{case_name}.png",
                   filename):
        pylab.savefig(target)
    pylab.close()
def plot_hist_normalized_coverage(self, filename=None, binwidth=0.1,
                                  max_z=4):
    """Barplot of the normalized coverage with gaussian fitting

    :param filename: if set, the figure is saved into this file.
    :param binwidth: bin width passed to ``self._set_bins``.
    :param max_z: upper limit of the X axis (also passed as ``Xmax`` to
        the mixture plot).
    """
    pylab.clf()
    # if there are a NaN -> can't set up binning
    d = self.df["scale"][self.range[0]:self.range[1]].dropna()
    # remove outlier -> plot crash if range between min and max is too high
    d = d[np.abs(d - d.mean()) <= (4 * d.std())]
    bins = self._set_bins(d, binwidth)
    self.mixture_fitting.data = d
    try:
        self.mixture_fitting.plot(self.gaussians_params, bins=bins, Xmin=0,
                                  Xmax=max_z)
    except ZeroDivisionError:
        # deliberately ignored: the rest of the figure is still produced
        pass
    pylab.grid(True)
    pylab.xlim([0, max_z])
    pylab.xlabel("Normalised per-base coverage")
    try:
        pylab.tight_layout()
    except Exception:
        # Best effort only: a layout failure must not prevent saving.
        pass
    if filename:
        pylab.savefig(filename)
def plot_ranks(self, filename=None, savefig=False):
    """Plot the peak rank in replicate 2 versus the rank in replicate 1.

    Peaks with ``score > 540`` are drawn in black ('<0.05 IDR'), the
    others in red ('>=0.05 IDR'); a dashed diagonal is added.

    :param filename: output image path, used only when *savefig* is True.
    :param savefig: if True, save the current figure into *filename*.
    """
    # ranks
    # the *score* column contains the scaled IDR value,
    # min(int(log2(-125IDR), 1000). e.g. peaks with an IDR of 0 have a score
    # of 1000, idr 0.05 have a score of int(-125log2(0.05)) = 540, and
    # idr 1.0 has a score of 0.
    significant = self.df.query('score>540')
    remaining = self.df.query('score<=540')

    pylab.clf()
    groups = (
        (significant, 'ko', '<0.05 IDR'),
        (remaining, 'ro', '>=0.05 IDR'),
    )
    for subset, fmt, label in groups:
        pylab.plot(subset.rep1_rank, subset.rep2_rank, fmt, alpha=0.5,
                   label=label)
    pylab.xlabel("Peak rank - replicate 1")
    pylab.ylabel("Peak rank - replicate 2")

    # diagonal from the origin to the total number of peaks
    n_peaks = len(self.df)
    pylab.plot([0, n_peaks], [0, n_peaks], color='blue', alpha=0.5, ls='--')
    pylab.legend(loc='lower right')
    if savefig:
        pylab.savefig(filename)
def barplot_summary(self, filename=None, color=["green", "red"], alpha=0.8):
    """Stacked horizontal bars of determined/undetermined reads per lane.

    Counts are expressed in millions when the smallest column total
    exceeds 1e6.

    :param filename: if set, the figure is saved into this file (dpi=200).
    :param color: colours of the Determined and Undetermined bars.
    :param alpha: transparency of the bars.
    :return: the pivoted (and possibly rescaled) dataframe that was plotted.
    """
    reads = self.get_data_reads()

    undetermined = reads.query("name=='Undetermined'")
    determined = reads.query("name!='Undetermined'")
    determined = determined.groupby("lane").sum().reset_index()
    determined["name"] = "Determined"

    df = pd.concat([undetermined, determined])
    df = df.pivot(index="lane", columns="name", values="count")
    df = df[["Determined", "Undetermined"]]

    in_millions = df.sum().min() > 1e6
    if in_millions:
        df /= 1e6
    df.plot.barh(stacked=True, color=color, alpha=alpha, ec='k')
    if in_millions:
        pylab.xlabel("Number of reads (M)")
    else:
        pylab.xlabel("Number of reads")

    pylab.legend()
    if filename:
        pylab.savefig(filename, dpi=200)
    return df
def plotter(filename, key):
    """Plot the histogram stored under *key* and save it into *filename*.

    NOTE(review): ``histograms`` and ``count`` are not parameters; they
    are presumably provided by the enclosing scope.
    """
    pylab.ioff()
    histograms[key].plot(logy=False, lw=2, marker="o")

    # spaces of the key are replaced by underscores in the title
    title = key.replace(" ", "_") + "(%s)" % count
    pylab.title(title)
    pylab.grid(True)
    pylab.savefig(filename)
    # need to close the figure otherwise warnings
    pylab.close()
def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
    """Horizontal bar plot of the number of reads per sample.

    The undetermined reads are drawn in red on the same axes. Counts are
    expressed in millions when the total of the samples exceeds 1e6.

    :param alpha: transparency of the bars.
    :param width: width of the bars.
    :param filename: if set, the figure is saved into this file (dpi=200).
    """
    df = self.get_data_reads()

    # this is ugly but will do the job for now
    under = df.query("name=='Undetermined'")
    others = df.query("name!='Undetermined'")

    under = under.groupby("name").sum().reset_index()
    others = others.groupby("name").sum().reset_index()

    under = under[["name", "count"]].set_index("name")
    others = others[["name", "count"]].set_index("name")

    all_data = others.sort_index(ascending=False)
    all_data.columns = ["samples"]

    # appended at the end
    all_data.loc['undetermined'] = 0

    # revert back
    all_data = all_data.loc[::-1]

    # just for legend
    under.columns = ['undetermined']
    if all_data.sum().min() > 1e6:
        all_data /= 1e6
        under /= 1e6
        M = True
    else:
        M = False

    all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')
    under.plot(kind="barh", alpha=alpha, color="red", ax=pylab.gca(),
               zorder=1, width=width, ec='k')
    pylab.ylim([-0.5, len(all_data) + 0.5])
    if len(all_data) < 100:
        pylab.yticks(range(len(all_data)), all_data.index)

    pylab.legend()
    pylab.grid(True, zorder=-1)
    if M:
        pylab.xlabel("Number of reads (M)")
    else:
        pylab.xlabel("Number of reads")
    try:
        pylab.tight_layout()
    except Exception:
        # Best effort only: a layout failure must not prevent saving.
        pass
    if filename:
        pylab.savefig(filename, dpi=200)
def plot_go_terms_down(filename, ontologies, case_name, df):
    """Plot the 'down' GO terms, record the table and save the figure.

    The *df* argument is replaced by the value returned by
    ``self.pe.plot_go_terms``. A copy of it and the number of '+' and '-'
    entries of its ``plus_minus`` column are stored under *case_name*. The
    figure is written twice (Panther_down_<case_name>.png in
    ``config.output_dir`` and *filename*) and closed.

    NOTE(review): ``self`` and ``config`` are not parameters; they are
    presumably provided by the enclosing scope.
    """
    params = self.enrichment_params
    df = self.pe.plot_go_terms(
        "down",
        ontologies=ontologies,
        compute_levels=params['plot_compute_levels'],
        log=params['plot_logx'],
    )

    self._temp_df[case_name] = df.copy()
    signs = df.plus_minus
    self._plus[case_name] = sum(signs == '+')
    self._minus[case_name] = sum(signs == '-')

    for target in (f"{config.output_dir}/Panther_down_{case_name}.png",
                   filename):
        pylab.savefig(target)
    pylab.close()
def run(self, dbname="multiple", output_prefix="kraken_final"):
    """Run the hierarchical analysis

    This method does not return anything but creates a set of files:

    - kraken_final.out
    - krona_final.html
    - kraken.png  (pie plot of the classified/unclassified reads)

    .. note:: the databases are run in the order provided in the
        constructor. The iteration stops as soon as the last unclassified
        file is empty.
    """
    # list of all output to merge at the end
    self._list_kraken_output = []
    self._list_kraken_input = []

    # Iteration over the databases
    for iteration in range(len(self.databases)):
        self._run_one_analysis(iteration)
        newest_unclassified = self._list_kraken_input[-1]
        if os.stat(newest_unclassified).st_size == 0:
            break

    prefix = self.output_directory + os.sep

    # concatenate all kraken output files
    file_output_final = prefix + "%s.out" % output_prefix
    with open(file_output_final, 'w') as merged:
        for fname in self._list_kraken_output:
            with open(fname) as chunk:
                merged.writelines(chunk)

    # create html report
    logger.info("Analysing results")
    result = KrakenResults(file_output_final)

    # TODO: this looks similar to the code in KrakenPipeline. could be
    # factorised
    result.to_js("%s%s%s.html" % (self.output_directory, os.sep,
                                  output_prefix))
    result.plot(kind="pie")
    pylab.savefig(prefix + "kraken.png")
    result.kraken_to_json(prefix + "kraken.json", dbname)
    result.kraken_to_csv(prefix + "kraken.csv", dbname)

    # keep a copy of the last unclassified file when requested
    if self.unclassified_output:
        import shutil
        shutil.copy2(self._list_kraken_input[-1], self.unclassified_output)

    # remove kraken intermediate files (including unclassified files)
    if not self.keep_temp_files:
        for temporary in self._list_kraken_output:
            os.remove(temporary)
        for temporary in self._list_kraken_input:
            os.remove(temporary)
def plot_hist_zscore(self, fontsize=16, filename=None, max_z=6,
                     binwidth=0.5, **hist_kargs):
    """Barplot of the zscore values

    :param fontsize: font size of the X label.
    :param filename: if set, the figure is saved into this file.
    :param max_z: not used by this method.
    :param binwidth: bin width passed to ``self._set_bins``.
    :param hist_kargs: any other argument forwarded to the pandas ``hist``.
    """
    pylab.clf()
    # the same slice is used for the binning and for the histogram
    zscores = self.df["zscore"][self.range[0]:self.range[1]]
    bins = self._set_bins(zscores, binwidth)
    zscores.hist(grid=True, bins=bins, **hist_kargs)
    pylab.xlabel("Z-Score", fontsize=fontsize)
    try:
        pylab.tight_layout()
    except Exception:
        # Best effort only: a layout failure must not prevent saving.
        pass
    if filename:
        pylab.savefig(filename)
def run(self, dbname="multiple", output_prefix="kraken_final"):
    """Run the hierarchical analysis

    This method does not return anything but creates a set of files:

    - kraken_final.out
    - krona_final.html
    - kraken.png  (pie plot of the classified/unclassified reads)

    .. note:: the databases are run in the order provided in the
        constructor.
    """
    # list of all output to merge at the end
    self._list_kraken_output = []
    self._list_kraken_input = []

    # Iteration over the databases
    for iteration in range(len(self.databases)):
        self._run_one_analysis(iteration)

    out_prefix = self.output_directory + os.sep

    # concatenate all kraken output files
    file_output_final = out_prefix + "%s.out" % output_prefix
    with open(file_output_final, 'w') as merged:
        for part_name in self._list_kraken_output:
            with open(part_name) as part:
                merged.writelines(part)

    # create html report
    logger.info("Analysing results")
    result = KrakenResults(file_output_final)

    # TODO: this looks similar to the code in KrakenPipeline. could be
    # factorised
    html_name = "%s%s%s.html" % (self.output_directory, os.sep,
                                 output_prefix)
    result.to_js(html_name)
    result.plot(kind="pie")
    pylab.savefig(out_prefix + "kraken.png")
    result.kraken_to_json(out_prefix + "kraken.json", dbname)
    result.kraken_to_csv(out_prefix + "kraken.csv", dbname)

    # remove kraken intermediate files (including unclassified files)
    if self.keep_temp_files:
        return
    for leftover in self._list_kraken_output:
        os.remove(leftover)
    for leftover in self._list_kraken_input:
        os.remove(leftover)
def plot_bar_mapq(self, fontsize=16, filename=None, ):
    """Histogram (log scale on Y) of the MAPQ (quality) of alignments

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_mapq()

    :param fontsize: font size of the axis labels.
    :param filename: if set, the figure is saved into this file.
    """
    mapq = self.get_mapq_as_df()

    # one bin per MAPQ value, from 0 up to the largest value observed
    highest = mapq.max().values[0]
    mapq.plot(kind='hist', bins=range(0, highest + 1), legend=False,
              grid=True, logy=True)

    for set_label, text in ((pylab.xlabel, "MAPQ"), (pylab.ylabel, "Count")):
        set_label(text, fontsize=fontsize)
    pylab.tight_layout()
    if filename:
        pylab.savefig(filename)
def pie_plot(self, filename=None, hold=False):
    """Pie chart of the number of entries per status

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.pie_plot()

    :param filename: if set, the figure is saved into this file.
    :param hold: if False, the current figure is cleared first.
    """
    if hold is False:
        pylab.clf()

    # number of entries of the '# Busco id' column for each status
    per_status = self.df.groupby('Status').count()
    busco_counts = per_status['# Busco id']
    busco_counts.plot(kind="pie")

    # remove the default Y label set by pandas
    pylab.ylabel("")
    if filename:
        pylab.savefig(filename)
def plot_hist_coverage(self, logx=True, logy=True, fontsize=16, N=20,
                       fignum=1, hold=False, alpha=0.5, filename=None,
                       **kw_hist):
    """Histogram of the coverage.

    :param logx: if True, log-spaced bins and log scale on the X axis.
    :param logy: if True, log scale on the Y axis.
    :param fontsize: font size of the axis labels.
    :param N: number of bins (linear X) or of log-spaced bin edges (log X).
    :param fignum: figure number, used when *hold* is False.
    :param hold: if False, figure *fignum* is selected and cleared first.
    :param alpha: transparency of the bars.
    :param filename: if set, the figure is saved into this file.
    :param kw_hist: any other argument forwarded to ``pylab.hist``.
    """
    if hold is False:
        pylab.figure(fignum)
        pylab.clf()
    ax = pylab.gca()
    ax.set_facecolor('#eeeeee')

    data = self.df['cov'].dropna().values

    maxcov = data.max()
    if logx is True and logy is True:
        bins = pylab.logspace(0, pylab.log10(maxcov), N)
        pylab.hist(data, bins=bins, log=True, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.semilogx()
        pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
        pylab.ylabel("Count (log scale)", fontsize=fontsize)
    elif logx is False and logy is True:
        pylab.hist(data, bins=N, log=True, label=self.chrom_name,
                   alpha=alpha, **kw_hist)
        pylab.xlabel("Coverage", fontsize=fontsize)
        pylab.ylabel("Count (log scale)", fontsize=fontsize)
    elif logx is True and logy is False:
        bins = pylab.logspace(0, pylab.log10(maxcov), N)
        # use the log-spaced bins computed above (``bins=N`` used to be
        # passed here, leaving them unused)
        pylab.hist(data, bins=bins, label=self.chrom_name, alpha=alpha,
                   **kw_hist)
        pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        pylab.semilogx()
    else:
        pylab.hist(data, bins=N, label=self.chrom_name, alpha=alpha,
                   **kw_hist)
        pylab.xlabel("Coverage", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
    pylab.grid(True)
    if filename:
        pylab.savefig(filename)
def run_enrichment_kegg(self, organism, annot_col="Name", out_dir="enrichment"):  # pragma: no cover
    """Run the KEGG pathway enrichment for every comparison.

    For each comparison and each direction (up, down, all) the enrichment
    table is collected and a scatter plot is saved as PDF and PNG in
    ``<out_dir>/figures``. The concatenated table is stored in
    ``self.enrichment_kegg`` and exported to
    ``<out_dir>/enrichment_kegg.xlsx``.

    :param organism: organism passed to ``KeggPathwayEnrichment``.
    :param annot_col: annotation column passed to ``self.get_gene_lists``.
    :param out_dir: output directory (created if needed).
    """
    out_dir = Path(out_dir) / "figures"
    out_dir.mkdir(exist_ok=True, parents=True)

    gene_lists_dict = self.get_gene_lists(annot_col=annot_col, dropna=True)
    enrichment = {}

    for compa in self.comparisons:
        gene_lists = gene_lists_dict[compa]
        ke = KeggPathwayEnrichment(gene_lists, organism, progress=False)
        ke.compute_enrichment()

        for direction in ["up", "down", "all"]:
            enrichment[(compa, direction)] = ke._get_final_df(
                ke.enrichment[direction].results, nmax=10000)
            pylab.figure()
            ke.scatterplot(direction)
            pylab.tight_layout()
            pylab.savefig(out_dir / f"kegg_{compa}_{direction}.pdf")
            pylab.savefig(out_dir / f"kegg_{compa}_{direction}.png")

        logger.info(f"KEGG enrichment for {compa} DONE.")

    df = pd.concat(enrichment).sort_index()
    df.index.rename(["comparison", "direction", "index"], inplace=True)

    self.enrichment_kegg = df

    # Export results (should be moved to enrichment.py at some point I think)
    with pd.ExcelWriter(out_dir.parent / "enrichment_kegg.xlsx") as writer:
        df = self.enrichment_kegg.copy()
        df.reset_index(inplace=True)
        # sheet_name is passed by keyword: passing it positionally is
        # deprecated in recent pandas versions
        df.to_excel(writer, sheet_name="kegg", index=False)
        ws = writer.sheets["kegg"]
        try:
            ws.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
        except Exception:
            # The autofilter is optional; the worksheet object may not
            # provide this call with these arguments.
            logger.warning("Fixme")
def find_motif(self, motif, window=200, figure=False, savefig=False):
    """Count a motif along every alignment of ``self.bamfile``.

    For each alignment with a query sequence, the motif is counted in a
    window of *window* bases starting at each position; the hit value is
    the number of positions whose count reaches ``self.local_threshold``.
    The number of alignments with more than 5 hits is printed.

    :param motif: the motif to count.
    :param window: length of the window starting at each position.
    :param figure: if True, plot the counts of the alignments whose hit
        value reaches ``self.global_threshold``.
    :param savefig: if True (and *figure* is True), save a PNG per such
        alignment in the current directory.
    :return: dataframe with the columns query_name, hit, length, start
        and end.
    """
    records = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": []
    }

    for aln in BAM(self.bamfile):
        seq = aln.query_sequence
        if seq is None:
            continue

        counts = [seq[pos:pos + window].count(motif)
                  for pos in range(len(seq))]
        n_hits = sum([c >= self.local_threshold for c in counts])

        records['query_name'].append(aln.query_name)
        records['start'].append(aln.reference_start)
        records['end'].append(aln.reference_end)
        records['length'].append(aln.rlen)
        records['hit'].append(n_hits)

        if n_hits < self.global_threshold:
            continue

        off = aln.query_alignment_start
        if figure:
            first = off + aln.reference_start
            pylab.plot(range(first, first + len(seq)), counts)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    aln.reference_name, n_hits,
                    aln.query_name.replace("/", "_")))

    df = pd.DataFrame(records)
    L = len(df.query("hit>5"))
    print(L)
    return df
def plot_rank_vs_idr_score(self, filename=None, savefig=False):
    """Plot ``local_idr`` of the peaks sorted by rank, one panel per replicate.

    In each panel the peaks are sorted by decreasing rank of the replicate
    and a dashed vertical line is drawn at
    ``len(self.df) - self.N_significant_peaks``.

    :param filename: output image path, used only when *savefig* is True.
    :param savefig: if True, save the current figure into *filename*.
    """
    # rank versus IDR scores
    _, axes = pylab.subplots(2, 1)
    df = self.df

    # (rank column, line format, Y label) for each of the two panels
    panels = (
        ('rep1_rank', 'o', "log10 IDR for replicate 1"),
        ('rep2_rank', 'ro', "log10 IDR for replicate 2"),
    )
    for ax, (rank_column, fmt, ylabel) in zip(axes, panels):
        by_rank = df.sort_values(by=rank_column, ascending=False)
        ax.plot(range(len(df)), by_rank['local_idr'], fmt)
        ax.set_ylabel(ylabel)
        ax.axvline(len(self.df) - self.N_significant_peaks, color='b',
                   ls='--')

    if savefig:
        pylab.savefig(filename)
def plot_scores(self, filename=None, savefig=False):
    """Plot the log10 signal of replicate 2 versus replicate 1.

    Peaks with ``score > 540`` are drawn in black ('<0.05 IDR'), the
    others (``score <= 540``) in red ('>=0.05 IDR'); a dashed diagonal is
    added.

    :param filename: output image path, used only when *savefig* is True.
    :param savefig: if True, save the current figure into *filename*.
    """
    # scores
    from pylab import log10

    significant = self.df.query('score>540')
    # ``<=`` so that peaks with a score of exactly 540 are not dropped;
    # this matches the '>=0.05 IDR' label of the series.
    remaining = self.df.query('score<=540')

    pylab.clf()
    pylab.plot(log10(significant['rep1_signal']),
               log10(significant['rep2_signal']),
               'ko', alpha=0.5, label='<0.05 IDR')
    pylab.plot(log10(remaining['rep1_signal']),
               log10(remaining['rep2_signal']),
               'ro', alpha=0.5, label='>=0.05 IDR')
    # diagonal from the origin to the current upper limit of the Y axis
    N = pylab.ylim()[1]
    pylab.plot([0, N], [0, N], color='blue', alpha=0.5, ls='--')
    pylab.xlabel("Rep1 log10 score")
    pylab.ylabel("Rep2 log10 score")
    pylab.legend(loc='lower right')
    if savefig:
        pylab.savefig(filename)
def barplot(self, filename="lane{}_status.png", lanes=None):
    """Save one bar plot per lane with the reads of each entry.

    The undetermined reads are added as a last bar whose colour depends on
    their amount relative to the determined reads: green below 20%,
    orange below 50%, red otherwise (or when there are no determined
    reads).

    :param filename: output name; ``{}`` is replaced by the lane.
    :param lanes: lanes to plot; all lanes found in the data when None.
    """
    df = self.get_data_reads()
    if lanes is None:
        lanes = df.lane.unique()

    for lane in lanes:
        pylab.clf()
        query = "lane==@lane and name!='Undetermined'"
        counts = df.query(query)['count']
        total = counts.sum()
        L = len(counts)

        query = "lane==@lane and name=='Undetermined'"
        under = df.query(query)['count'].sum()
        if total > 0:
            pylab.bar(range(L), counts, color="b", label="reads")

        if total == 0:
            color = "red"
        else:
            percent_under = 100 * under / total
            if percent_under < 20:
                color = "green"
            elif percent_under < 50:
                color = "orange"
            else:
                color = "red"

        pylab.bar(range(L, L + 1), under, color=color, label="undetermined")
        pylab.xticks([])
        pylab.ylabel("Number of reads")
        try:
            pylab.legend(loc="lower left")
        except Exception:
            # The legend is optional; a failure must not prevent saving.
            pass
        pylab.title("Lane {}".format(lane))
        pylab.savefig(filename.format(lane), dpi=200)
def run(self, output_filename_classified=None,
        output_filename_unclassified=None, only_classified_output=False):
    """Run the analysis using Kraken and create the Krona output

    The kraken output, a pie chart (kraken.png), kraken.json, kraken.csv
    and kraken.html are written into ``self.output_directory``.

    .. todo:: reuse the KrakenResults code to simplify this method.
    """
    prefix = self.output_directory + os.sep

    # Run Kraken (KrakenAnalysis)
    kraken_results = prefix + "kraken.out"
    self.ka.run(output_filename=kraken_results,
                output_filename_unclassified=output_filename_unclassified,
                output_filename_classified=output_filename_classified,
                only_classified_output=only_classified_output)

    # Translate kraken output to a format understood by Krona and save png
    # image
    self.kr = KrakenResults(kraken_results)
    self.kr.plot(kind="pie")
    pylab.savefig(prefix + "kraken.png")
    self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
    self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

    # Transform to Krona HTML
    from snakemake import shell
    kraken_html = prefix + "kraken.html"
    summary = prefix + "kraken.out.summary"
    if self.kr.kraken_to_krona(output_filename=summary) is True:
        shell("ktImportText %s -o %s" % (summary, kraken_html))
    else:
        # no summary available: create an empty HTML file instead
        shell("touch {}".format(kraken_html))
def run(self, output_filename_classified=None,
        output_filename_unclassified=None, only_classified_output=False):
    """Run Kraken, then export the results (png, json, csv, Krona HTML)

    All files are written into ``self.output_directory``.

    .. todo:: reuse the KrakenResults code to simplify this method.
    """
    out_dir = self.output_directory + os.sep

    # Run Kraken (KrakenAnalysis)
    kraken_results = out_dir + "kraken.out"
    run_options = {
        "output_filename": kraken_results,
        "output_filename_unclassified": output_filename_unclassified,
        "output_filename_classified": output_filename_classified,
        "only_classified_output": only_classified_output,
    }
    self.ka.run(**run_options)

    # Translate kraken output to a format understood by Krona and save png
    # image
    self.kr = KrakenResults(kraken_results)
    self.kr.plot(kind="pie")
    pylab.savefig(out_dir + "kraken.png")
    self.kr.kraken_to_json(out_dir + "kraken.json", self.dbname)
    self.kr.kraken_to_csv(out_dir + "kraken.csv", self.dbname)

    # Transform to Krona HTML
    from snakemake import shell
    kraken_html = out_dir + "kraken.html"
    krona_input = out_dir + "kraken.out.summary"
    status = self.kr.kraken_to_krona(output_filename=krona_input)
    if status is not True:
        # no summary available: create an empty HTML file instead
        shell("touch {}".format(kraken_html))
    else:
        shell("ktImportText %s -o %s" % (krona_input, kraken_html))
def find_motif_bam(self, filename, motif, window=200, figure=False,
                   savefig=False, local_threshold=None,
                   global_threshold=None):
    """Count a motif along every alignment of a BAM file.

    For each alignment with a query sequence, the counts and the hit value
    are obtained from ``self.find_motif_from_sequence``. The number of
    alignments with more than 5 hits is printed.

    :param filename: input passed to ``BAM``.
    :param motif: the motif to search for.
    :param window: window length passed to the sequence search.
    :param figure: if True, plot the counts of the alignments whose hit
        value reaches the global threshold.
    :param savefig: if True (and *figure* is True), save a PNG per such
        alignment in the current directory.
    :param local_threshold: passed to the sequence search.
    :param global_threshold: minimum hit value required to plot an
        alignment; ``self.global_threshold`` is used when None.
    :return: dataframe with the columns query_name, hit, length, start
        and end.
    """
    from sequana import BAM

    # Honour the argument (it used to be ignored) and fall back on the
    # attribute when it is not provided.
    if global_threshold is None:
        global_threshold = self.global_threshold

    b1 = BAM(filename)
    df = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": []
    }

    for a in b1:
        if a.query_sequence is None:
            continue
        seq = a.query_sequence

        X1, S = self.find_motif_from_sequence(
            seq, motif, window=window, local_threshold=local_threshold)

        df['query_name'].append(a.query_name)
        df['start'].append(a.reference_start)
        df['end'].append(a.reference_end)
        df['length'].append(a.rlen)
        df['hit'].append(S)

        if S >= global_threshold:
            off = a.query_alignment_start
            if figure:
                pylab.plot(range(off + a.reference_start,
                                 off + a.reference_start + len(seq)), X1)
                if savefig:
                    pylab.savefig("{}_{}_{}.png".format(
                        a.reference_name, S,
                        a.query_name.replace("/", "_")))

    df = pd.DataFrame(df)
    L = len(df.query("hit>5"))
    print(L)
    return df
def plot_scatter_up(filename):
    """Draw the 'up' scatter plot and save the figure into *filename*.

    NOTE(review): ``ke`` is not a parameter; it is presumably provided by
    the enclosing scope.
    """
    direction = "up"
    ke.scatterplot(direction)
    pylab.savefig(filename)
def plot_barplot_up(filename):
    """Draw the 'up' bar plot and save the figure into *filename*.

    NOTE(review): ``ke`` is not a parameter; it is presumably provided by
    the enclosing scope.
    """
    direction = "up"
    ke.barplot(direction)
    pylab.savefig(filename)
def plot_scatter_down(filename):
    """Draw the 'down' scatter plot and save the figure into *filename*.

    NOTE(review): ``ke`` is not a parameter; it is presumably provided by
    the enclosing scope.
    """
    direction = "down"
    ke.scatterplot(direction)
    pylab.savefig(filename)
def plot_barplot_down(filename):
    """Draw the 'down' bar plot and save the figure into *filename*.

    NOTE(review): ``ke`` is not a parameter; it is presumably provided by
    the enclosing scope.
    """
    direction = "down"
    ke.barplot(direction)
    pylab.savefig(filename)