def barplot_summary(self, filename=None, color=None, alpha=0.8):
    """Plot determined vs undetermined read counts per lane as stacked bars.

    The reads returned by :meth:`get_data_reads` are split into the
    *Undetermined* rows and all the other rows; the latter are summed per
    lane and labelled *Determined*.

    :param str filename: if set, the figure is saved into this file
        (dpi=200).
    :param list color: colors of the *Determined* and *Undetermined* bars,
        in that order. Defaults to ``["green", "red"]``.
    :param float alpha: transparency of the bars.
    :return: dataframe indexed by lane with the *Determined* and
        *Undetermined* columns. Values are divided by 1e6 when the smallest
        column total exceeds one million.
    """
    # None sentinel instead of a mutable list shared between calls
    if color is None:
        color = ["green", "red"]

    reads = self.get_data_reads()
    under = reads.query("name=='Undetermined'")
    total = reads.query("name!='Undetermined'")
    total = total.groupby("lane").sum().reset_index()
    total["name"] = "Determined"

    df = pd.concat([under, total])
    df = df.pivot(index="lane", columns="name", values="count")
    df = df[["Determined", "Undetermined"]]

    # Switch to millions of reads only when every column is large enough
    xlabel = "Number of reads"
    if df.sum().min() > 1e6:
        df /= 1e6
        xlabel = "Number of reads (M)"

    df.plot.barh(stacked=True, color=color, alpha=alpha, ec='k')
    pylab.xlabel(xlabel)
    pylab.legend()
    if filename:
        pylab.savefig(filename, dpi=200)
    return df
def _parse_data(self):
    """Read the kraken output file and derive taxon and read statistics.

    Only columns 0, 2 and 3 of the tab-separated file are loaded (to save
    memory on very large files) and they are renamed ``status``, ``taxon``
    and ``length``.

    Sets the following attributes:

    * ``_df``: the dataframe with the three columns above,
    * ``_taxons``: number of reads per taxon, taxon 0 excluded, sorted in
      decreasing order,
    * ``classified`` / ``unclassified``: number of reads whose status is
      ``C`` / ``U`` (0 if the status is absent).

    :raises NotImplementedError: if pandas cannot parse the file.
    """
    logger.info("Reading kraken data")
    columns = ["status", "taxon", "length"]

    try:
        # A chunk size of 1M keeps memory reasonable while limiting the
        # number of chunks to assemble on files with millions of reads.
        reader = pd.read_csv(
            self.filename,
            sep="\t",
            header=None,
            usecols=[0, 2, 3],
            chunksize=1000000,
        )
    except pd.errors.ParserError as err:
        # pd.parser.CParserError does not exist anymore in pandas; the
        # public name of this exception is pd.errors.ParserError.
        raise NotImplementedError(
            "Could not parse the kraken file {}".format(self.filename)
        ) from err

    # Concatenate once instead of once per chunk: the cost of repeated
    # concatenation grows with the chunk position.
    self._df = pd.concat(list(reader))
    # usecols keeps the original labels (0, 2, 3); rename them here
    self._df.columns = columns

    count = int((self._df.taxon == 1).sum())
    if count:
        logger.warning("Found %s taxons with root ID (1)" % count)

    # Taxons as index and their number of reads as values. Taxon 0 may not
    # be there, hence errors="ignore".
    taxons = self._df.groupby("taxon").size()
    taxons = taxons.drop(0, errors="ignore")
    self._taxons = taxons.sort_values(ascending=False)

    category = self.df.groupby("status").size()
    self.classified = category.get("C", 0)
    self.unclassified = category.get("U", 0)
def summary(self):
    """Add the summary section: number of DGE per comparison, with and without filter.

    The summary of ``self.rnadiff`` is computed twice, before and after
    setting its ``log2_fc`` attribute to 1. Both are stacked in a single
    table whose comparison names link to the ``<name>_table_all`` and
    ``<name>_table_sign`` anchors. The table is appended to
    ``self.sections``.
    """
    Sdefault = self.rnadiff.summary()
    # NOTE(review): log2_fc stays at 1 on self.rnadiff after this call;
    # confirm that later sections expect it.
    self.rnadiff.log2_fc = 1
    S1 = self.rnadiff.summary()

    # set options
    options = {
        'scrollX': 'true',
        'pageLength': 20,
        'scrollCollapse': 'true',
        'dom': '',
        'buttons': []
    }

    S = pd.concat([Sdefault, S1])
    N = len(Sdefault)

    # First N rows: unfiltered results; next N rows: results with |FC| > 1
    links = [f"#{name}_table_all" for name in Sdefault.index] + \
            [f"#{name}_table_sign" for name in Sdefault.index]

    df = pd.DataFrame({
        'comparison': S.index.values,
        'Description': ['Number of DGE (any FC)'] * N +
                       ['Number of DGE (|FC| > 1)'] * N,
        'Down': S['down'].values,
        'Up': S['up'].values,
        'Total': S['all'].values,
        'comparison_link': links,
    })

    dt = DataTable(df, 'dge')
    dt.datatable.set_links_to_column('comparison_link',
                                     'comparison',
                                     new_page=False)
    dt.datatable.datatable_options = options
    js_all = dt.create_javascript_function()
    html = dt.create_datatable(float_format='%d')

    self.sections.append({
        'name': "Summary",
        'anchor': 'filters_option',
        'content': f"""<p>Here below is a summary of the final Differential Gene
Expression (DGE) analysis. You can find two entries per comparison. The first
one has no filter except for an adjusted p-value of 0.05. The second shows the
expressed genes with a filter of the log2 fold change of 1 (factor 2 in a normal
scale). Clicking on any of the links will lead you to the section of the
comparison.
{js_all} {html} </p>"""
    })
def merge(self, overlap=0.2):
    """Merge the peaks of ``self.df1`` and ``self.df2`` that overlap.

    Peaks are sorted by chromosome and start position, then compared
    pairwise with their successor. Two consecutive peaks on the same
    chromosome sharing at least one base are replaced by a single record
    (a copy of the second peak) spanning both, with ``category`` set to
    ``both`` and ``stop`` equal to ``end``. Other peaks are reported
    unchanged. A merged record is not compared with the peak that follows.

    :param float overlap: currently unused; any overlap of at least one
        base triggers a merge.
    :return: dataframe with one row per merged or original peak and a
        fresh integer index.
    """
    df = pd.concat([self.df1, self.df2]).sort_values(['chr', 'start'])

    merged = []
    # Peak waiting to be compared with its successor; None right after a
    # merge or before the first peak.
    pending = None

    for _, current in df.iterrows():
        if pending is None:
            pending = current
            continue

        # Intervals intersect if each one starts before the other ends.
        # Peaks located on different chromosomes never overlap.
        overlapping = (
            current['chr'] == pending['chr']
            and current['start'] <= pending['end']
            and current['end'] >= pending['start']
        )

        if overlapping:
            data = current.copy()
            data['start'] = min(current['start'], pending['start'])
            data['end'] = max(current['end'], pending['end'])
            data['stop'] = data['end']  # FIXME same as end. decide on one value
            data['category'] = 'both'
            merged.append(data)
            pending = None
        else:
            merged.append(pending)
            pending = current

    # The last peak has no successor: report it as is instead of losing it
    if pending is not None:
        merged.append(pending)

    df = pd.DataFrame(merged)
    df = df.reset_index(drop=True)
    return df
def _parse_data(self):
    """Read the kraken output file and derive taxon and read statistics.

    Only columns 0, 2 and 3 of the tab-separated file are loaded (to save
    memory on very large files) and they are renamed ``status``, ``taxon``
    and ``length``.

    Sets the following attributes:

    * ``_df``: the dataframe with the three columns above,
    * ``_taxons``: number of reads per taxon, taxon 0 excluded, sorted in
      decreasing order,
    * ``classified`` / ``unclassified``: number of reads whose status is
      ``C`` / ``U`` (0 if the status is absent).

    :raises NotImplementedError: if pandas cannot parse the file.
    """
    logger.info("Reading kraken data")
    columns = ["status", "taxon", "length"]

    try:
        # A chunk size of 1M keeps memory reasonable while limiting the
        # number of chunks to assemble on files with millions of reads.
        reader = pd.read_csv(
            self.filename,
            sep="\t",
            header=None,
            usecols=[0, 2, 3],
            chunksize=1000000,
        )
    except pd.errors.ParserError as err:
        # pd.parser.CParserError does not exist anymore in pandas; the
        # public name of this exception is pd.errors.ParserError.
        raise NotImplementedError(
            "Could not parse the kraken file {}".format(self.filename)
        ) from err

    # Concatenate once instead of once per chunk: the cost of repeated
    # concatenation grows with the chunk position.
    self._df = pd.concat(list(reader))
    # usecols keeps the original labels (0, 2, 3); rename them here
    self._df.columns = columns

    count = int((self._df.taxon == 1).sum())
    if count:
        logger.warning("Found %s taxons with root ID (1)" % count)

    # Taxons as index and their number of reads as values. Taxon 0 may not
    # be there, hence errors="ignore".
    taxons = self._df.groupby("taxon").size()
    taxons = taxons.drop(0, errors="ignore")
    self._taxons = taxons.sort_values(ascending=False)

    category = self.df.groupby("status").size()
    self.classified = category.get("C", 0)
    self.unclassified = category.get("U", 0)
def _get_total_df(self, filtered=False):
    """Concatenate all rnadiff results in a single dataframe.

    Columns are indexed by (comparison name, statistic). Two extra columns
    are added under the ``statistics`` level: the number and the set of
    comparisons in which a gene has an adjusted p-value below 0.05. The
    annotation is prepended when it is available.

    :param bool filtered: use the ``filt_df`` table of each comparison
        instead of ``df``.

    FIXME: Columns relative to significative comparisons are not using
    self.log2_fc and self.alpha
    """
    per_comparison = []
    for res in self.comparisons.values():
        table = res.filt_df if filtered else res.df
        table = table.transpose().reset_index()
        table["file"] = res.name
        per_comparison.append(table.set_index(["file", "index"]))

    df = pd.concat(per_comparison, sort=True).transpose()

    # True where the adjusted p-value of a (gene, comparison) is below 0.05
    is_significant = df.loc[:, (slice(None), "padj")] < 0.05

    # Add number of comparisons which are significative for a given gene
    df.loc[:, ("statistics", "num_of_significative_comparisons")] = is_significant.sum(axis=1)

    # Add list of comparisons which are significative for a given gene
    def _significant_names(row):
        # First level of the column name is the comparison name
        return {col_name[0] for sign, col_name in zip(row, row.index) if sign}

    df.loc[:, ("statistics", "significative_comparisons")] = is_significant.apply(
        _significant_names, axis=1
    )

    has_annotation = (
        self.annotation is not None and self.fc_attribute and self.fc_feature
    )
    if not has_annotation:
        logger.warning(
            "Missing any of gff, fc_attribute or fc_feature. No annotation will be added."
        )
        return df

    return pd.concat([self.annotation, df], axis=1)
def plot_corrplot_counts_normed(self, samples=None, log2=True, lower='pie', upper='text'):
    """Plot and return the correlation between normalised counts of ``r1`` and ``r2``.

    :param samples: samples (columns) to select; defaults to all the
        columns of ``self.r1.counts_raw``.
    :param bool log2: apply a log2 transformation before the correlation.
    :param str lower: passed to the corrplot ``plot`` method.
    :param str upper: passed to the corrplot ``plot`` method.
    :return: the correlation matrix of the (possibly log2) counts.
    """
    from sequana.viz import corrplot

    if samples is None:
        samples = self.r1.counts_raw.columns

    # Side-by-side normalised counts, labelled by their origin
    counts = pd.concat(
        [self.r1.counts_norm[samples], self.r2.counts_norm[samples]],
        keys=['r1', 'r2'],
        axis=1,
    )
    if log2:
        counts = pylab.log2(counts)

    corrplot.Corrplot(counts).plot(upper=upper, lower=lower)
    return counts.corr()
def plot_feature_most_present(self):
    """Plot, for each sample, the percentage of counts taken by its most expressed feature.

    One horizontal bar per sample is drawn, colored with the
    ``group_color`` column of ``self.design_df`` and annotated with the
    identifier of the feature having the highest raw count in that sample.
    """
    # Computed once: feature with the highest count and total per sample
    top_features = self.counts_raw.idxmax()
    totals = self.counts_raw.sum()

    records = []
    # Series.iteritems() was removed in pandas 2.0; items() is equivalent
    for sample, gene in top_features.items():
        most_exp_gene_count = self.counts_raw.loc[gene, sample]
        total_sample_count = totals.loc[sample]
        records.append({
            "label": sample,
            "gene_id": gene,
            "count": most_exp_gene_count,
            "total_sample_count": total_sample_count,
            "most_exp_percent": most_exp_gene_count / total_sample_count * 100,
        })

    df = pd.DataFrame(records).set_index("label")
    df = pd.concat([self.design_df, df], axis=1)

    pylab.clf()
    bars = pylab.barh(
        df.index,
        df.most_exp_percent,
        color=df.group_color,
        zorder=10,
        lw=1,
        ec="k",
        height=0.9,
    )

    # Write the feature identifier inside each bar, at a fixed x position
    for idx in range(len(bars)):
        pylab.text(
            2,
            idx,
            df.gene_id.iloc[idx],
            ha="center",
            va="center",
            rotation=0,
            zorder=20,
        )

    self._format_plot(
        xlabel="Percent of total reads",
    )
    pylab.tight_layout()
def running_median(self, n, circular=False):
    """Compute running median of genome coverage

    :param int n: window's size.
    :param bool circular: if a mapping is circular (e.g. bacteria
        whole genome sequencing), set to True

    Store the results in the :attr:`df` attribute (dataframe) with a
    column named *rm*. The window size and the circular flag are also
    stored in :attr:`bed`, and :attr:`range` is set to ``[mid, -mid]`` in
    the non-circular case (``[None, None]`` otherwise), where ``mid`` is
    half the window size.

    If the computation fails, the *rm* column is a copy of the coverage.

    .. versionchanged:: 0.1.21
        Use Pandas rolling function to speed up computation.

    """
    self.bed.window_size = n
    self.bed.circular = circular
    mid = int(n / 2)
    self.range = [None, None]
    try:
        cov = self.df['cov']
        size = len(cov)
        # Start of the last `mid` positions. Computed explicitly because
        # a [-mid:] slice selects everything when mid is 0.
        tail = max(size - mid, 0)
        if circular:
            # Pad both ends with the data found at the other end so that
            # the windows at the boundaries wrap around the genome
            padded = pd.concat([cov.iloc[tail:], cov, cov.iloc[:mid]])
            rm = padded.rolling(n, center=True).median()
            # Drop the padding; explicit end so that mid == 0 keeps all
            self.df["rm"] = rm.iloc[mid:len(rm) - mid]
        else:
            rm = cov.rolling(n, center=True).median()
            # Like in RunningMedian, we replace the NaN found at both
            # ends (incomplete windows) with the real data
            rm.iloc[:mid] = cov.iloc[:mid].to_numpy()
            rm.iloc[tail:] = cov.iloc[tail:].to_numpy()
            self.df["rm"] = rm
            # set up slice for gaussian prediction
            self.range = [mid, -mid]
    except Exception:
        # Best effort: fall back on the raw coverage
        self.df["rm"] = self.df["cov"]
def get_stats(self):
    """Return the read statistics stored in the JSON file(s) as a dataframe.

    The files are found with ``self._get_files("*.json")``. In paired-end
    mode (``pe``) the first two files are stacked and labelled *R1* and
    *R2*; otherwise only the first file is read and labelled *R1*. The
    counts of A, C, G, T and N are converted into percentages of the
    total number of bases.
    """
    import pandas as pd

    selection = ["A", "C", "G", "T", "N", "n_reads", "mean quality",
                 "GC content", "average read length", "total bases"]

    filenames, mode = self._get_files("*.json")
    if mode == "pe":
        first = pd.read_json(filenames[0])
        second = pd.read_json(filenames[1])
        # Files are assumed to be sorted so that R1 comes first
        stats = pd.concat([first, second])
        stats.index = ['R1', 'R2']
    else:
        stats = pd.read_json(filenames[0])
        stats.index = ['R1']

    stats = stats[selection]
    for base in "ACGTN":
        stats[base] = stats[base] / stats["total bases"] * 100
    return stats
def run_enrichment_kegg(self, organism, annot_col="Name", out_dir="enrichment"):  # pragma: no cover
    """Run the KEGG pathway enrichment on each comparison and export the results.

    For each comparison and each direction (up, down, all), a scatter plot
    is saved in ``<out_dir>/figures`` (PDF and PNG). All the results are
    concatenated in :attr:`enrichment_kegg` and exported in
    ``<out_dir>/enrichment_kegg.xlsx``.

    :param organism: organism passed to ``KeggPathwayEnrichment``.
    :param str annot_col: annotation column used to build the gene lists.
    :param str out_dir: output directory (created if needed).
    """
    out_dir = Path(out_dir) / "figures"
    out_dir.mkdir(exist_ok=True, parents=True)

    gene_lists_dict = self.get_gene_lists(annot_col=annot_col, dropna=True)
    enrichment = {}

    for compa in self.comparisons:
        gene_lists = gene_lists_dict[compa]
        ke = KeggPathwayEnrichment(gene_lists, organism, progress=False)
        ke.compute_enrichment()

        for direction in ["up", "down", "all"]:
            enrichment[(compa, direction)] = ke._get_final_df(
                ke.enrichment[direction].results, nmax=10000)
            pylab.figure()
            ke.scatterplot(direction)
            pylab.tight_layout()
            pylab.savefig(out_dir / f"kegg_{compa}_{direction}.pdf")
            pylab.savefig(out_dir / f"kegg_{compa}_{direction}.png")
            # Release the figure: one is created per comparison/direction
            pylab.close()

        logger.info(f"KEGG enrichment for {compa} DONE.")

    df = pd.concat(enrichment).sort_index()
    df.index.rename(["comparison", "direction", "index"], inplace=True)

    self.enrichment_kegg = df

    # Export results (should be moved to enrichment.py at some point I think)
    with pd.ExcelWriter(out_dir.parent / "enrichment_kegg.xlsx") as writer:
        df = self.enrichment_kegg.copy()
        df.reset_index(inplace=True)
        # sheet_name given by keyword: positional use is deprecated
        df.to_excel(writer, sheet_name="kegg", index=False)
        ws = writer.sheets["kegg"]
        try:
            ws.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
        except Exception as err:
            # Cosmetic only (depends on the Excel engine): do not fail
            logger.warning(f"Could not add autofilter to the kegg sheet: {err}")
def to_csv(self, output_filename, **kwargs):
    """ Write all data in a csv.

    The file starts with commented lines (``#``): one with the analysis
    parameters and one per chromosome with its gaussians. The
    concatenated dataframes of all chromosomes follow.

    :param str output_filename: csv output file name.
    :param **dict kwargs: parameters of :meth:`pandas.DataFrame.to_csv`.
    """
    # Concatenate all df
    frames = []
    for chrom in self.chr_list:
        frames.append(chrom.get_df())
    data = pd.concat(frames)

    header_parts = [
        f"# sequana_coverage thresholds:{self.thresholds.get_args()}"
        f" window_size:{self.window_size} circular:{self.circular}"
    ]
    # Optional fields are only reported when they are set
    if self.genbank_filename:
        header_parts.append(' genbank:' + self.genbank_filename)
    if self.gc_window_size:
        header_parts.append(f' gc_window_size:{self.gc_window_size}')
    header = "".join(header_parts)

    with open(output_filename, "w") as fp:
        fp.write(header + "\n")
        for chrom in self.chr_list:
            fp.write(f"# {chrom.get_gaussians()}\n")
        data.to_csv(fp, **kwargs)
def plot_count_per_sample(self, fontsize=12, rotation=45):
    """Number of mapped and annotated reads (i.e. counts) per sample. Each color
    for each replicate

    :param int fontsize: font size of the labels and title.
    :param rotation: rotation of the sample names on the x-axis.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()
    """
    pylab.clf()

    totals = self.counts_raw.sum().rename("total_counts")
    df = pd.concat([self.design_df, totals], axis=1)

    pylab.bar(
        df.index,
        df.total_counts / 1000000,  # in millions of reads
        color=df.group_color,
        lw=1,
        zorder=10,
        ec="k",
        width=0.9,
    )

    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("reads (M)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(rotation=rotation, ha="right")
    try:
        pylab.tight_layout()
    except Exception as err:
        # Layout adjustment is cosmetic: keep the figure as is, but do not
        # swallow KeyboardInterrupt/SystemExit like a bare except would
        logger.debug(f"tight_layout failed: {err}")
def plot_percentage_null_read_counts(self):
    """Bars represent the percentage of null counts in each samples.  The dashed
    horizontal line represents the percentage of feature counts being equal
    to zero across all samples

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()
    """
    pylab.clf()

    n_features = self.counts_raw.shape[0]
    is_null = self.counts_raw == 0

    # how many null counts per sample (in percent)?
    percent_null = (is_null.sum() / n_features * 100).rename("percent_null")
    df = pd.concat([self.design_df, percent_null], axis=1)

    pylab.bar(df.index, df.percent_null, color=df.group_color, ec="k", lw=1, zorder=10)

    # Features null in all samples, in percent like the bars (the y-axis
    # is a percentage)
    all_null = is_null.all(axis=1).sum() / n_features * 100

    pylab.axhline(all_null, ls="--", color="black", alpha=0.5)
    pylab.xticks(rotation=45, ha="right")
    pylab.ylabel("Proportion of null counts (%)")
    pylab.grid(True, zorder=0)
    pylab.tight_layout()
def run_enrichment_go(self, taxon, annot_col="Name", out_dir="enrichment"):  # pragma: no cover
    """Run the GO (Panther) enrichment on each comparison and export the results.

    For each comparison, direction (up, down, all) and ontology, the GO
    terms plot and a chart are saved in ``<out_dir>/figures`` when an
    enrichment is found. Results are concatenated in
    :attr:`enrichment_go` and exported in ``<out_dir>/enrichment_go.xlsx``.
    Cases without results are listed in :attr:`failed_go_enrichments`.

    :param taxon: taxon passed to ``PantherEnrichment``.
    :param str annot_col: annotation column used to build the gene lists.
    :param str out_dir: output directory (created if needed).
    """
    out_dir = Path(out_dir) / "figures"
    out_dir.mkdir(exist_ok=True, parents=True)

    gene_lists_dict = self.get_gene_lists(annot_col=annot_col,
                                          Nmax=2000,
                                          dropna=True)
    enrichment = {}
    # Root terms of the three ontologies and the label used in file names:
    # GO:0003674 is molecular_function, GO:0008150 is biological_process
    # and GO:0005575 is cellular_component
    ontologies = {
        "GO:0003674": "MF",
        "GO:0008150": "BP",
        "GO:0005575": "CC"
    }
    failed_enrichments = []

    for compa in self.comparisons:
        gene_lists = gene_lists_dict[compa]
        pe = PantherEnrichment(gene_lists, taxon)
        pe.compute_enrichment(ontologies=ontologies.keys(), progress=False)

        for direction in ["up", "down", "all"]:
            if not pe.enrichment[direction]:
                # No ontology is involved at this stage, only the direction
                logger.warning(
                    f"No enrichment computed, so no plots computed for {compa} {direction}"
                )
                failed_enrichments.append({
                    "comparison": compa,
                    "direction": direction,
                    "GO": "all",
                    "reason": "no enrichment computed",
                })
                continue

            for ontology, label in ontologies.items():
                pylab.figure()
                enrichment_df = pe.plot_go_terms(direction,
                                                 ontology,
                                                 compute_levels=False)
                if enrichment_df.empty:
                    failed_enrichments.append({
                        "comparison": compa,
                        "direction": direction,
                        "GO": ontology,
                        "reason": "no enrichment found",
                    })
                else:
                    enrichment[(compa, direction, ontology)] = enrichment_df
                    pylab.tight_layout()
                    pylab.savefig(
                        out_dir / f"go_{compa}_{direction}_{label}.pdf"
                    )
                    pe.save_chart(
                        enrichment_df,
                        out_dir / f"chart_{compa}_{direction}_{label}.png",
                    )
                # Release the figure: one is created per ontology
                pylab.close()

        logger.info(f"Panther enrichment for {compa} DONE.")

    df = pd.concat(enrichment).sort_index()
    df.index.rename(["comparison", "direction", "GO_category", "index"],
                    inplace=True)

    self.enrichment_go = df
    self.failed_go_enrichments = pd.DataFrame(failed_enrichments)

    # Export results (should be moved to enrichment.py at some point I think)
    with pd.ExcelWriter(out_dir.parent / "enrichment_go.xlsx") as writer:
        df = self.enrichment_go.copy()
        df.reset_index(inplace=True)
        # sheet_name given by keyword: positional use is deprecated
        df.to_excel(writer, sheet_name="go", index=False)
        ws = writer.sheets["go"]
        try:
            ws.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
        except Exception as err:
            # Cosmetic only (depends on the Excel engine): do not fail
            logger.warning(f"XLS formatting issue. {err}")
res.append(record.features[rec_i].type) # CDS res.append(b) # start res.append(e) # end res.append(strand) # strand quals = record.features[rec_i].qualifiers res.append(" ; ".join(quals["db_xref"]) if "db_xref" in quals else None) # gene ID (maybe more than one) res.append(" ; ".join(quals["gene"]) if "gene" in quals else None) # gene name (maybe more than one) res.append(" ; ".join(quals["product"]) if "product" in quals else None) # product of the gene res.append(" ; ".join(quals["note"]) if "note" in quals else None) # note (description) else: ########### variant not in CDS : append empty line res = [None] * len(header_df_results) # append to final result result_annot.append(res) df_result_annot = pd.DataFrame(result_annot) df_result_annot.columns = header_df_results df_result_annot.index = df.index df_ouput = pd.concat([df, df_result_annot], axis=1) df_ouput.to_csv(file_output) ################################ ################################################################################################ ################################ ################################################################################################ ################################ ################################################################################################
".txt", "").replace("fof", "") else: title = fof_input.replace("fof_", "").replace(".txt", "") ################################ INPUT DATA ############################################################################################## # list of input files f = open(fof_input, 'r') list_input = [name.split('\n')[0] for name in f.readlines()] f.close() df = [] for file_input in list_input: df.append(pd.read_csv(file_input)) df = pd.concat(df) df["kmer_size"] = df["label"].str.split("_").str[-2] df["kmer_size"] = df["kmer_size"].str.replace("k", "").astype(int) df["DB_size"] = df["label"].str.split("_").str[-1] ## filter out small kmer df = df[df["kmer_size"] > min_kmer_size] ## plot lines fig1, ax1 = pylab.subplots(1, 1, figsize=(5, 5)) all_size = df["DB_size"].unique() for i in range(len(all_size)): size = all_size[i] df_to_plot = df[df["DB_size"] == size] p = df_to_plot["precison_with_unknown"]
# unclassified df = df_result_merged[ (df_result_merged["status"]=='U') & (df_result_merged["name"] == name)] cl.append(df.shape[0]) # total number of reads cl.append(names.count(name)) classification.append(cl) classification_columns = ["good_classification_at_level", "wrong_classification_at_level", "unknown_taxon_at_level", "good_classification_above_level", "wrong_classification_above_level", "unknown_taxon_above_level","Unclassified","total_N_reads"] classification = pd.DataFrame(classification) classification.columns = classification_columns df_read_lineage_result = pd.concat([df_read_lineage,classification],axis=1) df_read_lineage_result.to_csv(filename_output,index=None) """ df_result_merged[(df_result_merged["status"]=='U') & (df_result_merged["name"] == "P_fermentans")] df_result_merged[(df_result_merged["status"]=='C') & (df_result_merged["name"] == "P_fermentans")] tax = 610130 info_taxon = find_tax_info(k_result, tax) res = {"ID":tax, "superkingdom": 0, "phylum": 0, "class": 0, "order": 0, "family": 0, "genus": 0, "species" : 0} get_lineage_tax(k_result,info_taxon, res) """
list_files = [] for name in f.readlines(): f_name = name.split('\n')[0] list_files.append(f_name) f.close() # concat all result files list_to_concat = [] for file_result in list_files: df = pd.read_csv(file_result, sep=",") short_name = file_result.split("/")[-1].split("__")[0:3] df["rotation"] = [int(short_name[2])] * df.shape[0] df["reference"] = [short_name[1]] * df.shape[0] list_to_concat.append(df) #list_to_concat = [pd.read_csv(file_result,sep=",") for file_result in list_files] result = pd.concat(list_to_concat) # import csv file with genome length df_genome_len = pd.read_csv(file_genome_len, sep=',', header=None) df_genome_len.columns = ["name", "length"] #print(df_genome_len) ################################ EXECUTE ############################################################################################## # for each contig, choose the alignement with best score set_contigs = set(result["qName"]) best_contigs = [] for contig in set_contigs: df = result[result["qName"] == contig] best_score = min(df["score_norm"])
def add_individual_report(self, comp, name, counter):
    """Add the sections describing one comparison to the report.

    Three sections are appended to ``self.sections``: the distribution of
    the raw p-values, the interactive volcano plot, and the tables of
    results (all genes and significant genes only).

    :param comp: results of the comparison; its ``df``,
        ``plot_pvalue_hist`` and ``plot_volcano`` members are used.
    :param str name: name of the comparison, used in titles and anchors.
    :param counter: index of the comparison, used in section numbers.
    """
    style = "width:45%"

    description = """<p>
Once the dispersion estimation and model fitting are done, statistical testing is
performed. The distribution of raw p-values computed by the statistical test
is expected to be a mixture of a uniform distribution on [0, 1] and a peak
around 0 corresponding to the differentially expressed features. This may not
always be the case.
</p>"""

    def plot_pvalue_hist(filename):
        import pylab
        pylab.ioff()
        pylab.clf()
        comp.plot_pvalue_hist()
        pylab.savefig(filename)
        pylab.close()

    img1 = self.create_embedded_png(plot_pvalue_hist, "filename", style=style)
    # FIXME. adjusted p-values are not relevant, so their histogram is not
    # generated for now
    img2 = ""

    self.sections.append({
        "name": f"6.{counter}.a pvalue distribution ({name})",
        "anchor": "dge_summary",
        "content": description + img1 + img2
    })

    html_volcano = """<p>The volcano plot here below shows the differentially
expressed features with a adjusted p-value below 0.05 (dashed black line).
The volcano plot represents the log10 of the adjusted P
value as a function of the log2 ratio of differential expression.
</p>"""
    # The static image is replaced by the interactive plotly figure
    img3 = ""
    fig = comp.plot_volcano(plotly=True, annotations=self.rnadiff.annotation)
    from plotly import offline
    plotly = offline.plot(fig, output_type="div", include_plotlyjs=False)

    self.sections.append({
        "name": f"6.{counter}.b volcano plots ({name})",
        "anchor": f"{name}_volcano",
        "content": html_volcano + img3 + "<hr>" + plotly
    })

    # finally, let us add the tables
    from pylab import log10

    df = comp.df.copy()

    # here we need to add the annotation if possible
    try:
        df = pd.concat(
            [df, self.rnadiff.annotation.annotation.loc[comp.df.index]],
            axis=1)
    except Exception as err:
        logger.critical(f"Could not add annotation. {err}")

    df = df.reset_index()

    fold_change = 2**df['log2FoldChange']
    log10padj = -log10(df['padj'])
    df.insert(
        df.columns.get_loc('log2FoldChange') + 1, 'FoldChange', fold_change)
    df.insert(df.columns.get_loc('padj') + 1, 'log10_padj', log10padj)

    # Columns not shown in the report; they may be missing from the table
    df = df.drop(columns=['dispGeneEst', 'lfcSE', 'stat', 'dispersion'],
                 errors="ignore")

    # set options
    options = {
        'scrollX': 'true',
        'pageLength': 10,
        'scrollCollapse': 'true',
        'dom': 'Bfrtip',
        'buttons': ['copy', 'csv']
    }

    datatable = DataTable(df, f'{name}_table_all')
    datatable.datatable.datatable_options = options
    js_all = datatable.create_javascript_function()
    html_tab_all = datatable.create_datatable(float_format='%.3e')

    df_sign = df.query(
        "padj<=0.05 and (log2FoldChange>1 or log2FoldChange<-1)")
    datatable = DataTable(df_sign, f'{name}_table_sign')
    datatable.datatable.datatable_options = options
    js_sign = datatable.create_javascript_function()
    html_tab_sign = datatable.create_datatable(float_format='%.3e')

    self.sections.append({
        'name': f"6.{counter}.c {name} Tables ({name})",
        'anchor': f"{name} stats",
        'content': f"""<p>The following tables give all DGE results. The
first table contains all significant genes (adjusted p-value below 0.05 and
absolute log2 fold change above 1). The second table contains all results
without any filtering. Here is a short explanation for each column:
<ul>
<li> baseMean: base mean over all samples</li>
<li> norm.sampleName: rounded normalized counts per sample</li>
<li> FC: fold change in natural base</li>
<li> log2FoldChange: log2 Fold Change estimated by the model. Reflects change
between the condition versus the reference condition</li>
<li> stat: Wald statistic for the coefficient (contrast) tested</li>
<li> pvalue: raw p-value from statistical test</li>
<li> padj: adjusted pvalue. Used for cutoff at 0.05 </li>
<li> betaConv: convergence of the coefficients of the model </li>
<li> maxCooks: maximum Cook's distance of the feature </li>
<li> outlier: indicate if the feature is an outlier according to Cook's distance
</li>
</ul>
</p>
<h3>Significative only<a id="{name}_table_sign"></a></h3>
here below is a subset of the next table. It contains all genes below adjusted
p-value of 0.05 and absolute log2 fold change above 1.
{js_sign} {html_tab_sign}
<h3>All genes<a id="{name}_table_all"></a></h3>
{js_all} {html_tab_all}"""
    })
# add column with type of mutation mut_indel = type_variants_smrt(df_smrt, 'ALT') df_smrt['mut_indel'] = mut_indel df_to_merge = df_smrt[[ 'POS', 'REF', 'ALT', 'QUAL', 'mut_indel', 'data_type' ]] df_to_merge.columns = [ 'position', 'reference', 'alternative', 'score', 'mut_indel', 'data_type' ] list_df_to_merge.append(df_to_merge) concat_variant = pd.concat(list_df_to_merge) concat_variant.sort_values(by='position', axis=0, inplace=True) print("Create results table") # format columns names score_names = [analysis + '_score' for analysis in analysis_names] col_names = [] for i in range(len(analysis_names)): col_names.extend([analysis_names[i], score_names[i]]) res_variant_names = ['position'] + col_names + ['mut_indel'] res_variant = [] for i in set(concat_variant['position']): df = concat_variant[concat_variant['position'] == i] # position
f.close() # concat all result files list_to_concat = [] for file_result in list_files: df = pd.read_csv(file_result,sep=",") short_name = file_result.split("/")[-1].split("002929_")[-1].split(".fasta")[0] short_name = short_name.split("_") df["ref_type"] = [short_name[0]]*df.shape[0] if len(short_name) > 1: df["rotation"] = [int(short_name[1])]*df.shape[0] else: df["rotation"] = [0]*df.shape[0] list_to_concat.append(df) #list_to_concat = [pd.read_csv(file_result,sep=",") for file_result in list_files] result = pd.concat(list_to_concat) ################################ EXECUTE ############################################################################################## # for each contig, choose the alignement with best score set_contigs = set(result["qName"]) best_contigs = [] for contig in set_contigs: df = result[result["qName"] == contig] best_score = min(df["score_norm"]) #print(contig, best_score) best_contigs.append(pd.DataFrame(df[df["score_norm"] == best_score].iloc[0,:]).T) res_best = pd.concat(best_contigs)
def plot_pca(
    self,
    n_components=2,
    colors=None,
    plotly=False,
    max_features=500,
    genes_to_remove=None,
):
    """Plot the PCA of the samples based on the most variable genes.

    :param int n_components: number of components; must be 3 when
        *plotly* is True.
    :param colors: colors given to the PCA plot when *plotly* is True.
        Otherwise the ``group_color`` column of ``self.design_df`` is
        used.
    :param bool plotly: return an interactive plotly 3D figure.
    :param int max_features: number of most variable genes to keep.
    :param list genes_to_remove: genes excluded from the selected genes.
    :return: the plotly figure if *plotly* is True, otherwise the value
        returned by the ``plot`` method of the PCA.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        path = sequana_data("rnadiff/rnadiff_onecond_1")
        r = RNADiffResults(path)

        colors = {
            'surexp1': 'r',
            'surexp2':'r',
            'surexp3':'r',
            'surexp1': 'b',
            'surexp2':'b',
            'surexp3':'b'}
        r.plot_pca(colors=colors)

    """
    from sequana.viz import PCA

    # Get most variable genes (n=max_features)
    top_features = (self.counts_vst.var(axis=1).sort_values(
        ascending=False).index[:max_features])

    if genes_to_remove:
        # set: constant-time membership test for each selected gene
        excluded = set(genes_to_remove)
        top_features = [x for x in top_features if x not in excluded]

    counts_top_features = self.counts_vst.loc[top_features, :]

    p = PCA(counts_top_features)

    if plotly is True:
        assert n_components == 3
        variance = p.plot(
            n_components=n_components,
            colors=colors,
            show_plot=False,
            max_features=max_features,
        )
        from plotly import express as px

        df = pd.DataFrame(p.Xr)
        df.index = p.df.columns
        df.columns = ["PC1", "PC2", "PC3"]
        df["size"] = [10] * len(df)  # same size for all points ?

        df = pd.concat([df, self.design_df], axis=1)
        df["label"] = df.index
        df["group_color"] = df[self.condition]

        fig = px.scatter_3d(
            df,
            x="PC1",
            y="PC2",
            z="PC3",
            color="group_color",
            labels={
                "PC1": "PC1 ({}%)".format(round(100 * variance[0], 2)),
                "PC2": "PC2 ({}%)".format(round(100 * variance[1], 2)),
                "PC3": "PC3 ({}%)".format(round(100 * variance[2], 2)),
            },
            height=800,
            text="label",
        )
        return fig

    variance = p.plot(
        n_components=n_components,
        colors=self.design_df.group_color,
        max_features=max_features,
    )
    return variance
def plot_volcano(
    self,
    padj=0.05,
    add_broken_axes=False,
    markersize=4,
    limit_broken_line=(20, 40),
    plotly=False,
    annotations=None,
):
    """Volcano plot: -log10 of the adjusted p-value versus the log2 fold change.

    :param float padj: threshold on the adjusted p-value, used to color
        the features and to draw the dashed threshold line.
    :param bool add_broken_axes: break the y-axis when the highest
        -log10(padj) exceeds the first value of *limit_broken_line*.
    :param int markersize: size of the markers (matplotlib version).
    :param limit_broken_line: two values; the first one is the upper
        limit of the lower part of the broken axis, the second one the
        lower limit set on the first broken axis.
    :param bool plotly: return an interactive plotly figure instead of
        drawing with matplotlib.
    :param annotations: if set (plotly only), its ``annotation``
        dataframe is appended to the results to name the points.
    :return: the plotly figure if *plotly* is True, otherwise the object
        used for plotting (pylab or the broken axes).

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_volcano()

    """
    if plotly:
        from plotly import express as px

        df = self.df.copy()

        if annotations is not None:
            try:
                df = pd.concat([df, annotations.annotation], axis=1)
            except Exception as err:
                logger.warning(
                    f"Could not merge rnadiff table with annotation. Full error is: {err}"
                )
        df["log_adj_pvalue"] = -pylab.log10(df.padj)
        df["significance"] = [
            "<{}".format(padj) if x else ">={}".format(padj)
            for x in df.padj < padj
        ]

        # First available identifier column is used to name the points
        hover_name = next(
            (col for col in ("Name", "gene_id", "locus_tag", "ID")
             if col in df.columns),
            None,
        )

        fig = px.scatter(
            df,
            x="log2FoldChange",
            y="log_adj_pvalue",
            hover_name=hover_name,
            hover_data=["baseMean"],
            log_y=False,
            opacity=0.5,
            color="significance",
            height=600,
            labels={"log_adj_pvalue": "log adjusted p-value"},
        )
        # Horizontal threshold line drawn as a shape
        threshold = -pylab.log10(padj)
        fig.update_layout(shapes=[
            dict(
                type="line",
                xref="x",
                x0=df.log2FoldChange.min(),
                x1=df.log2FoldChange.max(),
                yref="y",
                y0=threshold,
                y1=threshold,
                line=dict(color="black", width=1, dash="dash"),
            )
        ])

        return fig

    M = max(-pylab.log10(self.df.padj.dropna()))

    br1, br2 = limit_broken_line
    broken = bool(add_broken_axes and M > br1)
    if broken:
        # Imported only when needed so that the default plot does not
        # require the brokenaxes package
        from brokenaxes import brokenaxes

        bax = brokenaxes(ylims=((0, br1), (M - 10, M)), xlims=None)
    else:
        bax = pylab

    d1 = self.df.query("padj>@padj")
    d2 = self.df.query("padj<=@padj")
    bax.plot(
        d1.log2FoldChange,
        -np.log10(d1.padj),
        marker="o",
        alpha=0.5,
        color="k",
        lw=0,
        markersize=markersize,
    )
    bax.plot(
        d2.log2FoldChange,
        -np.log10(d2.padj),
        marker="o",
        alpha=0.5,
        color="r",
        lw=0,
        markersize=markersize,
    )

    bax.grid(True)

    # Symmetric x-axis around 0
    m1 = abs(min(self.df.log2FoldChange))
    m2 = max(self.df.log2FoldChange)
    limit = max(m1, m2)

    # The broken axes expose the axes API (set_*/get_*) whereas pylab
    # exposes the pyplot functions
    if broken:
        bax.set_xlabel("fold change")
        bax.set_ylabel("log10 adjusted p-value")
        bax.set_xlim([-limit, limit])
        y1, _ = bax.get_ylim()
        bax.axs[0].set_ylim([br2, y1[1] * 1.1])
    else:
        bax.xlabel("fold change")
        bax.ylabel("log10 adjusted p-value")
        bax.xlim([-limit, limit])
        _, y2 = bax.ylim()
        bax.ylim([0, y2])

    # Threshold follows the padj argument (same line as the plotly version)
    bax.axhline(-np.log10(padj),
                lw=2,
                ls="--",
                color="r",
                label="pvalue threshold ({})".format(padj))
    return bax
def summary(self):
    """Return the summaries of all comparisons stacked in a single dataframe."""
    # Only the results are needed, not the comparison names
    summaries = (res.summary() for res in self.comparisons.values())
    return pd.concat(summaries)