def venn(self, compa_list, direction="all", prefix=""):
    """Plot Venn diagrams comparing lists of differentially regulated (DR) genes.

    :param compa_list: list of comparison names from the DESeq2 results. It can
        also be a list of lists of comparisons, e.g.
        [["WT", "KO1"], ["WT", "KO2"]], in which case one Venn diagram is
        drawn per sub-list.
    :param direction: whether up, down or all DR genes are considered.
    :param prefix: string prepended to the output file name.
    """
    from sequana.viz.venn import plot_venn

    # If compa_list is a list of lists of comparisons, draw one diagram per sub-list
    if all(isinstance(l, list) for l in compa_list):
        fig, ax = pylab.subplots(len(compa_list), 1, figsize=(6, 20))
        ax = ax.flat
        for i, c in enumerate(compa_list):
            plot_venn(
                [self.dr_gene_lists[x][direction] for x in c],
                [compa_name for compa_name in c],
                ax=ax[i],
            )
    # If compa_list is only a flat list of comparisons, draw a single diagram
    else:
        plot_venn(
            [self.dr_gene_lists[x][direction] for x in compa_list],
            [compa_name for compa_name in compa_list],
        )
    out_dir = os.path.join(self.out_dir, "vennDiagrams")
    os.makedirs(out_dir, exist_ok=True)
    outfile = os.path.join(out_dir, f"{prefix}vennDiagrams_{direction}.pdf")
    pylab.savefig(outfile, bbox_inches="tight")
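# Usage sketch for the method above (illustrative, not part of the original
# code): ``rnadiff`` stands for any object exposing this ``venn`` method along
# with ``dr_gene_lists`` and ``out_dir``; the comparison names are made up.
#
#   rnadiff.venn(["WT_vs_KO1", "WT_vs_KO2"], direction="up", prefix="up_")
#   rnadiff.venn([["WT_vs_KO1", "WT_vs_KO2"], ["WT_vs_KO1", "WT_vs_KO3"]])
#
# The underlying ``plot_venn`` helper can also be called directly; a minimal
# sketch with dummy gene lists, assuming it accepts the same kind of
# collections as passed above (requires sequana):
from sequana.viz.venn import plot_venn

up_in_ko1 = ["geneA", "geneB", "geneC"]
up_in_ko2 = ["geneB", "geneC", "geneD"]
plot_venn([up_in_ko1, up_in_ko2], ["WT_vs_KO1", "WT_vs_KO2"])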
def plot_indel_dist(self, fontsize=16):
    """Plot indel counts (and their ratio) as a function of indel length

    :Return: lists of insertion counts, deletion counts and
        insertion/deletion ratios for the different lengths, starting at 1

    .. plot::
        :include-source:

        from sequana import sequana_data, BAM
        b = BAM(sequana_data("measles.fa.sorted.bam"))
        b.plot_indel_dist()

    What you see on this figure is the presence of 10 insertions of
    length 1, 1 insertion of length 2 and 3 deletions of length 1

    Note that in samtools, several insertions or deletions in a single
    alignment are ignored and only the first one seems to be reported. For
    instance 10M1I10M1I stores only 1 insertion in its report; the same
    applies to deletions.

    .. todo:: speed up and handle long-read cases more efficiently by
        storing INDELS as histograms rather than lists
    """
    try:
        self.insertions
    except AttributeError:
        self._set_indels()

    if len(self.insertions) == 0 or len(self.deletions) == 0:
        raise ValueError("No deletions or insertions found")

    N = max(max(Counter(self.deletions)), max(Counter(self.insertions))) + 1
    D = [self.deletions.count(i) for i in range(N)]
    I = [self.insertions.count(i) for i in range(N)]
    R = [i / d if d != 0 else 0 for i, d in zip(I, D)]

    fig, ax = pylab.subplots()
    ax.plot(range(N), I, marker="x", label="Insertions")
    ax.plot(range(N), D, marker="x", label="Deletions")
    ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
    ax.set_yscale("symlog")
    pylab.ylim([1, pylab.ylim()[1]])
    pylab.legend()
    pylab.grid()
    from matplotlib.ticker import MaxNLocator
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    pylab.xlabel("Indel length", fontsize=fontsize)
    pylab.ylabel("Indel count", fontsize=fontsize)
    return I, D, R
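# Sketch of the histogram-based storage mentioned in the ``.. todo::`` above
# (an assumption about a possible rewrite, not the current implementation):
# indel lengths are accumulated directly into Counters so that long-read
# alignments with many indels do not inflate memory. It relies on pysam's
# ``cigartuples`` attribute, where operation 1 is an insertion and 2 a deletion.
from collections import Counter

def indel_histograms(alignments):
    """Return (insertions, deletions) Counters keyed by indel length."""
    insertions, deletions = Counter(), Counter()
    for read in alignments:
        if read.cigartuples is None:  # unmapped reads carry no CIGAR
            continue
        for operation, length in read.cigartuples:
            if operation == 1:    # BAM_CINS
                insertions[length] += 1
            elif operation == 2:  # BAM_CDEL
                deletions[length] += 1
    return insertions, deletions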
def plot_rank_vs_idr_score(self, filename=None, savefig=False):
    """Plot the local IDR score versus the peak rank in each replicate.

    A vertical dashed line marks the boundary of the significant peaks.
    """
    f, axes = pylab.subplots(2, 1)
    df = self.df

    axes[0].plot(
        range(len(df)),
        df.sort_values(by='rep1_rank', ascending=False)['local_idr'],
        'o')
    axes[0].set_ylabel("log10 IDR for replicate 1")
    axes[0].axvline(len(self.df) - self.N_significant_peaks, color='b', ls='--')

    axes[1].plot(
        range(len(df)),
        df.sort_values(by='rep2_rank', ascending=False)['local_idr'],
        'ro')
    axes[1].set_ylabel("log10 IDR for replicate 2")
    axes[1].axvline(len(self.df) - self.N_significant_peaks, color='b', ls='--')

    if savefig:
        pylab.savefig(filename)
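# Self-contained toy version of the figure above (illustrative only; no
# sequana object involved): peaks ordered by rank on the x-axis, a fake
# IDR-like score on the y-axis, and a vertical line separating the
# significant peaks. All numbers below are made up.
import numpy as np
from matplotlib import pylab

scores = np.sort(np.random.uniform(0, 3, 1000))[::-1]  # fake, decreasing scores
n_significant = 200                                     # fake significance cutoff
pylab.plot(range(len(scores)), scores, "o", markersize=2)
pylab.axvline(len(scores) - n_significant, color="b", ls="--")
pylab.xlabel("peak rank")
pylab.ylabel("IDR-like score (toy data)")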
def plot_all_skews(self, figsize=(10, 12), fontsize=16, alpha=0.5):
    if self._window is None:
        raise AttributeError("Please set a valid window to compute skew")

    # create figure
    # fig, axarr = pylab.subplots(10,1, sharex=True, figsize=figsize)
    fig, axarr = pylab.subplots(9, 1, sharex=True, figsize=figsize)

    main_title = "Window size = %d (%.0f %% of genome)\n\
GC content = %.0f %%, AT content = %.0f %%, ignored = %.0f %%" \
        % (self._window, self._window*100/self.__len__(), self.gc_content()*100,
           (1-self.gc_content())*100, self._ignored_nuc*100)
    pylab.suptitle(main_title, fontsize=fontsize)

    # GC skew
    axarr[0].set_title("GC skew (blue) - Cumulative sum (red)")
    axarr[0].plot(list(self._GC_skew_slide[0]), 'b-', alpha=alpha)
    axarr[0].set_ylabel("(G - C) / (G + C)")
    axarr[1].plot(list(np.cumsum(self._GC_skew_slide[0])), 'r-', alpha=alpha)
    axarr[1].set_ylabel("(G - C) / (G + C)")

    # AT skew
    axarr[2].set_title("AT skew (blue) - Cumulative sum (red)")
    axarr[2].plot(list(self._AT_skew_slide[0]), 'b-', alpha=alpha)
    axarr[2].set_ylabel("(A - T) / (A + T)")
    axarr[3].plot(list(np.cumsum(self._AT_skew_slide[0])), 'r-', alpha=alpha)
    axarr[3].set_ylabel("(A - T) / (A + T)", rotation=0)

    # Xn
    axarr[4].set_title("Cumulative RY skew (Purine - Pyrimidine)")
    axarr[4].plot(self._Xn, 'g-', alpha=alpha)
    axarr[4].set_ylabel("(A + G) - (C + T)")

    # Yn
    axarr[5].set_title("Cumulative MK skew (Amino - Keto)")
    axarr[5].plot(self._Yn, 'g-', alpha=alpha)
    axarr[5].set_ylabel("(A + C) - (G + T)")

    # Zn
    axarr[6].set_title("Cumulative H-bond skew (Weak H-bond - Strong H-bond)")
    axarr[6].plot(self._Zn, 'g-', alpha=alpha)
    axarr[6].set_ylabel("(A + T) - (G + C)")

    # GC content
    axarr[7].set_title("GC content")
    axarr[7].plot(list(self._GC_content_slide[0]), 'k-', alpha=alpha)
    axarr[7].set_ylabel("GC")

    # AT content
    axarr[8].set_title("AT content")
    axarr[8].plot(list(self._AT_content_slide[0]), 'k-', alpha=alpha)
    axarr[8].set_ylabel("AT")

    # # FFT
    # axarr[9].set_title("FFT")
    # axarr[9].plot(list(self._c_fft),'g-',alpha=alpha)
    # axarr[9].set_ylabel("FFT")

    fig.tight_layout()
    fig.subplots_adjust(top=0.88)
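# Stand-alone sketch of the quantity shown in the first two panels above: the
# GC skew (G - C) / (G + C) computed on non-overlapping windows, and its
# cumulative sum. Illustrative only (numpy plus a plain nucleotide string);
# the class above computes the equivalent series internally
# (``_GC_skew_slide``) from its own window setting.
import numpy as np

def gc_skew(sequence, window=1000):
    """Return the per-window GC skew of a nucleotide string."""
    values = []
    for start in range(0, len(sequence) - window + 1, window):
        chunk = sequence[start:start + window]
        g, c = chunk.count("G"), chunk.count("C")
        values.append((g - c) / (g + c) if (g + c) else 0)
    return np.array(values)

# The cumulative skew is then simply np.cumsum(gc_skew(sequence)); its extrema
# are commonly used as hints for the replication origin and terminus in bacteria.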
shuffle(shuffle_col) colors = [cmap(i) for i in shuffle_col] pylab.plot(res_best["qLength"], res_best["score_norm"], "bo", alpha=0.5) pylab.xlabel("Length of contig") pylab.ylabel("Score blasr (normalised by length)") pylab.title(title_plot) if save_plot: pylab.savefig(file_plot.replace(".png", "_scores.png")) else: pylab.show() ##### Plot by reference ref_found = list(res_best["reference"].unique()) fig, axarr = pylab.subplots(2 * len(ref_found), figsize=(15, 8 * len(ref_found))) #fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10) for i in range(len(ref_found)): # keep only contigs aligned on this reference res_best_ref = res_best[res_best["reference"] == ref_found[i]] len_genome = df_genome_len.loc[df_genome_len["name"] == ref_found[i], "length"].values[0] # plot coverage found by blasr, with score ax = axarr[i * 2] list_contigs = plot_contigs(res_best_ref, ax, colors, mode="score") genome_not_covered = areas_not_covered(list_contigs, len_genome) # add grey on not covered areas for area in genome_not_covered: ax.axvspan(area[0], area[1], alpha=0.1, color='k')
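# Sketch of what ``areas_not_covered`` presumably computes (an assumption based
# on how its result is fed to ``ax.axvspan`` above): the intervals of the
# reference genome covered by no aligned contig. ``list_contigs`` is taken to
# be a list of (start, end) pairs on the reference; the helper name below is
# hypothetical.
def uncovered_intervals(list_contigs, len_genome):
    """Return [start, end] intervals of the genome covered by no contig."""
    gaps = []
    position = 0
    for start, end in sorted(list_contigs):
        if start > position:
            gaps.append([position, start])
        position = max(position, end)
    if position < len_genome:
        gaps.append([position, len_genome])
    return gaps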
list_input = [name.split('\n')[0] for name in f.readlines()]
f.close()

# list of labels
f = open(labels_input, 'r')
list_labels = [name.split('\n')[0] for name in f.readlines()]
f.close()

################################ EXECUTE ##############################################################################################

#pylab.figure(figsize=(5, 5))
fig1, ax1 = pylab.subplots(1, 1, figsize=(5, 5))
fig2, ax2 = pylab.subplots(1, 1, figsize=(5, 5))

res_PS = []
for i in range(len(list_input)):
    df = pd.read_csv(list_input[i])

    ##### Precision without unknown taxons
    good_class_rank = df["good_classification_at_level"].sum()
    tot_class_rank = df["good_classification_at_level"].sum() + df["wrong_classification_at_level"].sum()
    tot = df["total_N_reads"].sum()
    wrong_class_above = df["wrong_classification_above_level"].sum()

    ##### Precision without unknown taxons
    # some reads are classified but we don't find any information for them: they cannot be ignored
#colors = [cmap(i) for i in np.linspace(0,1,len(list_analysis))] # positions of genome gen_pos = [[i, i + step - 1] for i in range(0, len_genome, step)] y_pos = list(np.linspace(0, 1, len(analysis_names) + 2)) if custom_colors: y_col = [colors[i] for i in range(len(analysis_names))] else: y_col = [cmap(i) for i in np.linspace(0, 1, len(analysis_names))] pylab.close('all') # create figure fig, axarr = pylab.subplots(len(gen_pos), 1, figsize=(int(step / 20000), int(len(gen_pos)) * 1.1)) for i in range(len(gen_pos)): subplot_variant_position(df_result, i, gen_pos, axarr, analysis_names, y_pos, y_col, be_repeats_concat) # add grey at the end (no genome) ax = axarr[-1] ax.axvspan(len_genome, gen_pos[-1][1], alpha=0.5, color='k') #fig.subplots_adjust(bottom=0.2) #fig.tight_layout() pylab.subplots_adjust(hspace=hspace_subplots) pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) #pylab.legend(loc="lower left") if file_fig != "show":
cmap = pylab.cm.get_cmap(colormap)
# shuffle the colors: if two adjacent contigs end up with the same color, plotting again gives a new random assignment
shuffle_col = list(np.linspace(0, 1, res_best.shape[0]))
shuffle(shuffle_col)
colors = [cmap(i) for i in shuffle_col]

pylab.plot(res_best["qLength"], res_best["score_norm"], "bo", alpha=0.5)
pylab.xlabel("Length of contig")
pylab.ylabel("Score blasr (normalised by length)")
pylab.title(title_plot)
if save_plot:
    pylab.savefig(file_plot.replace(".png", "_scores.png"))
else:
    pylab.show()

fig, axarr = pylab.subplots(2, figsize=figsize, sharex=True)
fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10)

# plot coverage found by blasr, with score
ax = axarr[0]
list_contigs = plot_contigs(res_best, ax, mode="score")
genome_not_covered = areas_not_covered(list_contigs, len_genome)
# add grey on areas not covered by any contig
for area in genome_not_covered:
    ax.axvspan(area[0], area[1], alpha=0.1, color='k')
ax.set_ylabel("Score blasr (normalised by length)")

# plot coverage found by blasr, with random y positions (to reveal overlaps between contigs)
ax = axarr[1]
list_contigs = plot_contigs(res_best, ax, mode="random")
for area in genome_not_covered:
    ax.axvspan(area[0], area[1], alpha=0.05, color='k')
# plot score, not normalised
pylab.plot(res_blasr["qLength"], res_blasr["score"], "bo", alpha=0.5)
pylab.xlabel("Length of contig")
pylab.ylabel("Score blasr (not normalised)")
pylab.title(title_plot)
pylab.show()

# plot score normalised by length
pylab.plot(res_blasr["qLength"], res_blasr["score_norm"], "bo", alpha=0.5)
pylab.xlabel("Length of contig")
pylab.ylabel("Score blasr (normalised by length)")
pylab.title(title_plot)
pylab.show()

fig, axarr = pylab.subplots(2, figsize=(15, 8), sharex=True)
fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10)

# plot coverage found by blasr, with score
ax = axarr[0]
for i in range(res_blasr.shape[0]):
    res_to_plot = res_blasr.iloc[i, :]
    contig = res_to_plot["qName"]
    start = int(res_to_plot["tStart"])
    end = int(res_to_plot["tEnd"])
    score = float(res_to_plot["score_norm"])
    ax.plot([start, end], [score] * 2, ls='-', lw=5, color=colors[i],
            solid_capstyle="butt")
ax.set_ylabel("Score blasr (normalised by length)")
################################ IMPORT DATA ############################################################################################## list_files = read_fof(fof_BAM) description = pd.read_csv(file_description) list_BAM = [] labels = [] for f in list_files: short_name_bam = f.split("/")[-1] labels.append(description[description["Filename"] == short_name_bam] ["polymerase"].values[0]) list_BAM.append(pacbio.BAMPacbio(f)) # plot read length fig, ax = pylab.subplots(1, 1, figsize=figsize_read_len) for i in range(len(list_BAM)): bam = list_BAM[i] bam.hist_len(hold=True, grid=False, label=labels[i], title="") ax.legend() fig.tight_layout() if save_plots: pylab.savefig(filename_output.replace(".", "_read_len."), dpi=182) pylab.clf() else: pylab.show() # plot GC % fig, ax = pylab.subplots(1, 1, figsize=figsize_GC) for i in range(len(list_BAM)): bam = list_BAM[i]
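# ``read_fof`` (used at the top of the fragment above) is presumably a small
# helper returning the list of BAM paths written one per line in a
# file-of-files; its implementation is not shown here, so this is only a guess
# at its contract, under a hypothetical name.
def read_fof_sketch(filename):
    """Return the non-empty, stripped lines of a file-of-files."""
    with open(filename) as fin:
        return [line.strip() for line in fin if line.strip()]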