Example #1
    def barplot_summary(self,
                        filename=None,
                        color=["green", "red"],
                        alpha=0.8):

        df = self.get_data_reads()
        under = df.query("name=='Undetermined'")

        total = df.query("name!='Undetermined'")
        total = total.groupby("lane").sum().reset_index()
        total["name"] = "Determined"

        df = pd.concat([under, total])

        df = df.pivot(index="lane", columns="name", values="count")
        df = df[["Determined", "Undetermined"]]
        if df.sum().min() > 1e6:
            df /= 1e6
            xlabel = "Number of reads (M)"
        else:
            xlabel = "Number of reads"
        df.plot.barh(stacked=True, color=color, alpha=alpha, ec='k')
        pylab.xlabel(xlabel)
        pylab.legend()

        if filename:
            pylab.savefig(filename, dpi=200)
        return df
Example #2
    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only columns 0, 2, 3 to save memory, which is required on
        # very large files
        try:
            # each call to concat in the for loop below
            # will take time and increase with chunk position.
            # for 15M reads, this has a big cost. So chunksize set to 1M
            # is better than 1000 and still reasonable in memory
            reader = pd.read_csv(self.filename,
                                 sep="\t",
                                 header=None,
                                 usecols=[0, 2, 3],
                                 chunksize=1000000)
        except pd.errors.ParserError:
            # happens with only_classified_output when no classified read is
            # found; handling this case (empty dataframe, all reads
            # unclassified) is not implemented yet
            raise NotImplementedError

        for chunk in reader:
            try:
                self._df
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = (self._df.taxon == 1).sum()
        if count:
            logger.warning("Found %s reads with root taxon ID (1)" % count)

        # This gives the list of taxons as index and their counts
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except KeyError:
            pass  # taxon 0 may not be present
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0
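The comments above note that calling pd.concat inside the loop gets slower as the accumulated frame grows, because each call copies all previously read data. A minimal sketch of the standard alternative (hypothetical file name "kraken.tsv"): collect the chunks in a list and concatenate once at the end, which scales linearly instead of quadratically.

    import pandas as pd

    # Collect chunks in a list; a single final concat avoids re-copying the
    # accumulated data at every iteration.
    chunks = []
    for chunk in pd.read_csv("kraken.tsv", sep="\t", header=None,
                             usecols=[0, 2, 3], chunksize=1_000_000):
        chunks.append(chunk)
    df = pd.concat(chunks, ignore_index=True)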
Example #3
    def summary(self):
        """ Add information of filter.
        """
        Sdefault = self.rnadiff.summary()
        self.rnadiff.log2_fc = 1
        S1 = self.rnadiff.summary()

        # set options
        options = {
            'scrollX': 'true',
            'pageLength': 20,
            'scrollCollapse': 'true',
            'dom': '',
            'buttons': []
        }

        S = pd.concat([Sdefault, S1])

        N = len(Sdefault)
        df = pd.DataFrame({
            'comparison_link': [1] * len(S),
            'comparison':
            S.index.values,
            'Description':
            ['Number of DGE (any FC)'] * N +
            ['Number of DGE (|log2FC| > 1)'] * N,
            'Down':
            S['down'].values,
            'Up':
            S['up'].values,
            'Total':
            S['all'].values
        })
        df = df[[
            'comparison', 'Description', 'Down', 'Up', 'Total',
            'comparison_link'
        ]]

        df['comparison_link'] = [f"#{name}_table_all" for name in Sdefault.index] + \
                                [f"#{name}_table_sign" for name in Sdefault.index]

        dt = DataTable(df, 'dge')
        dt.datatable.set_links_to_column('comparison_link',
                                         'comparison',
                                         new_page=False)
        dt.datatable.datatable_options = options
        js_all = dt.create_javascript_function()
        html = dt.create_datatable(float_format='%d')
        self.sections.append({
            'name':
            "Summary",
            'anchor':
            'filters_option',
            'content':
            f"""<p>Here below is a summary of thfinal Differententially Gene
Expression (DGE) analysis. You can find two entries per comparison. The first
one has no filter except for an adjusted p-value of 0.05. The second shows the
expressed genes with a filter of the log2 fold change of 1 (factor 2 in a normal
scale). Clicking on any of the link will lead you to section of the comparison. 
{js_all} {html} </p>"""
        })
Example #4
    def merge(self, overlap=0.2):
        df = pd.concat([self.df1, self.df2]).sort_values(['chr', 'start'])
        # if peaks overlap by at least one base, we merge them and label them
        # with common information; otherwise we report the original peak

        merged = []
        prev = None
        overlaps = 0
        N1 = 0
        N2 = 0
        N12 = 0
        skip_next = True
        for k, current in df.iterrows():
            if skip_next:
                prev = current
                skip_next = False
                continue

            # current overlaps prev's start, prev's end, or is fully included
            # in prev; in all three cases the two peaks overlap
            if current['start'] <= prev['start'] <= current['end']:
                is_overlap = True
                N12 += 1
            elif current['start'] <= prev['end'] <= current['end']:
                is_overlap = True
                N12 += 1
            elif current['start'] >= prev['start'] and current['end'] <= prev['end']:
                is_overlap = True
                N12 += 1
            else:
                is_overlap = False
                if prev['name'].startswith('1_vs_6_7'):
                    N1 += 1
                elif prev['name'].startswith('2_vs_6_7'):
                    N2 += 1

            if is_overlap:
                m = min(current['start'], prev['start'])
                M = max(current['end'], prev['end'])
                data = current.copy()
                data['start'] = m
                data['end'] = M
                data['stop'] = M  # FIXME: same as 'end'; settle on one column
                data['category'] = 'both'
                merged.append(data)
                skip_next = True
            else:
                merged.append(prev)
                skip_next = False

            prev = current
        df = pd.DataFrame(merged)
        df = df.reset_index(drop=True)
        return df
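The three overlap branches above can be collapsed: two closed intervals [s1, e1] and [s2, e2] overlap (including full containment) if and only if each one starts before the other ends. A minimal sketch of that single test:

    # Equivalent single-condition overlap test for closed intervals
    def intervals_overlap(start1, end1, start2, end2):
        return start1 <= end2 and start2 <= end1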
Example #5
    def _get_total_df(self, filtered=False):
        """Concatenate all rnadiff results in a single dataframe.

        FIXME: Columns relative to significant comparisons do not use
        self.log2_fc and self.alpha
        """

        dfs = []

        for compa, res in self.comparisons.items():
            df = res.filt_df if filtered else res.df
            df = df.transpose().reset_index()
            df["file"] = res.name
            df = df.set_index(["file", "index"])
            dfs.append(df)

        df = pd.concat(dfs, sort=True).transpose()

        # Add number of comparisons which are significant for a given gene
        num_sign_compa = (df.loc[:, (slice(None), "padj")] < 0.05).sum(axis=1)
        df.loc[:, ("statistics",
                   "num_of_significative_comparisons")] = num_sign_compa

        # Add list of comparisons which are significant for a given gene
        df_sign_padj = df.loc[:, (slice(None), "padj")] < 0.05
        sign_compa = df_sign_padj.loc[:, (slice(None), "padj")].apply(
            # Extract column names (comparison names) for significant comparisons
            lambda row:
            {col_name[0]
             for sign, col_name in zip(row, row.index) if sign},
            axis=1,
        )
        df.loc[:, ("statistics", "significative_comparisons")] = sign_compa

        if self.annotation is not None and self.fc_attribute and self.fc_feature:
            df = pd.concat([self.annotation, df], axis=1)
        else:
            logger.warning(
                "Missing any of gff, fc_attribute or fc_feature. No annotation will be added."
            )

        return df
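The selections above rely on a two-level column MultiIndex of the form (comparison, statistic); slice(None) on the first level picks one statistic across every comparison at once. A minimal sketch with toy data:

    import pandas as pd

    # Two comparisons, two statistics per comparison
    cols = pd.MultiIndex.from_product(
        [["A_vs_B", "A_vs_C"], ["log2FoldChange", "padj"]])
    toy = pd.DataFrame([[1.2, 0.01, -0.3, 0.2]], columns=cols)

    padj_only = toy.loc[:, (slice(None), "padj")]  # padj of every comparison
    n_sign = (padj_only < 0.05).sum(axis=1)        # significant comparisons per row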
Example #6
 def plot_corrplot_counts_normed(self, samples=None, log2=True, lower='pie', upper='text'):
     from sequana.viz import corrplot
     if samples is None:
         samples = self.r1.counts_raw.columns
     df1 = self.r1.counts_norm[samples]
     df2 = self.r2.counts_norm[samples]
     df = pd.concat([df1, df2], keys=['r1', 'r2'], axis=1)
     if log2:
         df = pylab.log2(df)
     c = corrplot.Corrplot(df).plot(upper=upper,  lower=lower)
     return df.corr()
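A minimal sketch (toy frames) of what the keys= argument does here: concatenating along axis=1 with keys=['r1', 'r2'] adds a top column level, so the two replicates of each sample remain distinguishable in the correlation matrix.

    import pandas as pd

    df1 = pd.DataFrame({"sampleA": [1, 2], "sampleB": [3, 4]})
    df2 = pd.DataFrame({"sampleA": [1, 3], "sampleB": [2, 4]})
    df = pd.concat([df1, df2], keys=["r1", "r2"], axis=1)
    # columns: ('r1', 'sampleA'), ('r1', 'sampleB'), ('r2', 'sampleA'), ('r2', 'sampleB')
    corr = df.corr()  # 4x4: correlations across all replicate/sample pairs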
Example #7
    def plot_feature_most_present(self):
        """"""

        df = []

        for x, y in self.counts_raw.idxmax().items():

            most_exp_gene_count = self.counts_raw.stack().loc[y, x]
            total_sample_count = self.counts_raw.sum().loc[x]

            df.append({
                "label":
                x,
                "gene_id":
                y,
                "count":
                most_exp_gene_count,
                "total_sample_count":
                total_sample_count,
                "most_exp_percent":
                most_exp_gene_count / total_sample_count * 100,
            })

        df = pd.DataFrame(df).set_index("label")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.clf()
        p = pylab.barh(
            df.index,
            df.most_exp_percent,
            color=df.group_color,
            zorder=10,
            lw=1,
            ec="k",
            height=0.9,
        )

        for idx, rect in enumerate(p):
            pylab.text(
                2,  # * rect.get_height(),
                idx,  # rect.get_x() + rect.get_width() / 2.0,
                df.gene_id.iloc[idx],
                ha="center",
                va="center",
                rotation=0,
                zorder=20,
            )

        self._format_plot(
            # title="Counts monopolized by the most expressed gene",
            # xlabel="Sample",
            xlabel="Percent of total reads", )
        pylab.tight_layout()
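The loop above can also be written with plain column reductions, since idxmax, max and sum all operate per sample (column). A minimal sketch with a hypothetical counts frame (genes as rows, samples as columns):

    import pandas as pd

    counts = pd.DataFrame({"s1": [10, 90], "s2": [50, 50]}, index=["g1", "g2"])
    summary = pd.DataFrame({
        "gene_id": counts.idxmax(),              # most expressed gene per sample
        "count": counts.max(),                   # its count
        "total_sample_count": counts.sum(),
        "most_exp_percent": counts.max() / counts.sum() * 100,
    })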
Example #8
    def running_median(self, n, circular=False):
        """Compute running median of genome coverage

        :param int n: window's size.
        :param bool circular: if a mapping is circular (e.g. bacteria
            whole genome sequencing), set to True

        Store the results in the :attr:`df` attribute (dataframe) with a
        column named *rm*.

        .. versionchanged:: 0.1.21
            Use Pandas rolling function to speed up computation.

        """
        self.bed.window_size = n
        self.bed.circular = circular
        mid = n // 2  # half-window size
        self.range = [None, None]
        try:
            if circular:
                # BASED on running_median pure implementation, could be much
                # slower than pure pandas rolling function. Keep those 4 lines
                # for book keeping though.
                #cover = list(self.df["cov"])
                #cover = cover[-mid:] + cover + cover[:mid]
                #rm = running_median.RunningMedian(cover, n).run()
                #self.df["rm"] = rm[mid:-mid]
                rm = pd.concat([self.df['cov'][-mid:],
                                self.df['cov'],
                                self.df['cov'][:mid]]).rolling(
                                n, center=True).median()
                self.df["rm"] = rm[mid:-mid]

            else:
                rm = self.df['cov'].rolling(n, center=True).median()
                # Like in RunningMedian, we copy the NAN with real data
                rm[0:mid] = self.df['cov'][0:mid]
                rm[-mid:] = self.df['cov'][-mid:]
                #rm = running_median.RunningMedian(cover, n).run()

                self.df["rm"] = rm
                # set up slice for gaussian prediction
                self.range = [mid, -mid]
        except Exception:
            # fall back to the raw coverage if the rolling median fails
            self.df["rm"] = self.df["cov"]
Example #9
 def get_stats(self):
     import pandas as pd
     filenames, mode = self._get_files("*.json")
     if mode == "pe":
         df1 = pd.read_json(filenames[0])
         df2 = pd.read_json(filenames[1])
          df = pd.concat([df1, df2])
          # filenames are expected to be sorted (R1 before R2)
          df.index = ['R1', 'R2']
     else:
         df = pd.read_json(filenames[0])
         df.index = ['R1']
     df = df[["A", "C", "G", "T", "N", "n_reads", "mean quality", "GC content",
             "average read length", "total bases"]]
     for this in "ACGTN":
         df[this] /= df["total bases"] 
         df[this] *= 100
     return df
Example #10
    def run_enrichment_kegg(self,
                            organism,
                            annot_col="Name",
                            out_dir="enrichment"):  # pragma: no cover

        out_dir = Path(out_dir) / "figures"
        out_dir.mkdir(exist_ok=True, parents=True)

        gene_lists_dict = self.get_gene_lists(annot_col=annot_col, dropna=True)
        enrichment = {}

        for compa in self.comparisons:
            gene_lists = gene_lists_dict[compa]
            ke = KeggPathwayEnrichment(gene_lists, organism, progress=False)
            ke.compute_enrichment()

            for direction in ["up", "down", "all"]:
                enrichment[(compa, direction)] = ke._get_final_df(
                    ke.enrichment[direction].results, nmax=10000)
                pylab.figure()
                ke.scatterplot(direction)
                pylab.tight_layout()
                pylab.savefig(out_dir / f"kegg_{compa}_{direction}.pdf")
                pylab.savefig(out_dir / f"kegg_{compa}_{direction}.png")

            logger.info(f"KEGG enrichment for {compa} DONE.")

        df = pd.concat(enrichment).sort_index()
        df.index.rename(["comparison", "direction", "index"], inplace=True)

        self.enrichment_kegg = df

        # Export results (should be moved to enrichment.py at some point I think)
        with pd.ExcelWriter(out_dir.parent / "enrichment_kegg.xlsx") as writer:
            df = self.enrichment_kegg.copy()
            df.reset_index(inplace=True)
            df.to_excel(writer, "kegg", index=False)
            ws = writer.sheets["kegg"]
            try:
                ws.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
            except Exception:
                logger.warning("Could not set the Excel autofilter")
Example #11
    def to_csv(self, output_filename, **kwargs):
        """ Write all data in a csv.

        :param str output_filename: csv output file name.
        :param **dict kwargs: parameters of :meth:`pandas.DataFrame.to_csv`.
        """
        # Concatenate all df
        df_list = [chrom.get_df() for chrom in self.chr_list]
        df = pd.concat(df_list)
        header = ("# sequana_coverage thresholds:{0} window_size:{1} circular:"
                  "{2}".format(self.thresholds.get_args(), self.window_size,
                  self.circular))
        if self.genbank_filename:
            header += ' genbank:' + self.genbank_filename
        if self.gc_window_size:
            header += ' gc_window_size:{0}'.format(self.gc_window_size)
        with open(output_filename, "w") as fp:
            print(header, file=fp)
            for chrom in self.chr_list:
                print("# {0}".format(chrom.get_gaussians()), file=fp)
            df.to_csv(fp, **kwargs)
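The file written above starts with '#'-prefixed metadata lines followed by the concatenated dataframe. A minimal sketch of reading it back (hypothetical file name "coverage.csv"): the comment parameter of read_csv skips those header lines and parses only the table.

    import pandas as pd

    df = pd.read_csv("coverage.csv", comment="#")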
Example #12
    def plot_count_per_sample(self, fontsize=12, rotation=45):
        """Number of mapped and annotated reads (i.e. counts) per sample. Each color
        for each replicate

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_count_per_sample()

        """
        pylab.clf()
        df = self.counts_raw.sum().rename("total_counts")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.bar(
            df.index,
            df.total_counts / 1000000,
            color=df.group_color,
            lw=1,
            zorder=10,
            ec="k",
            width=0.9,
        )

        pylab.xlabel("Samples", fontsize=fontsize)
        pylab.ylabel("reads (M)", fontsize=fontsize)
        pylab.grid(True, zorder=0)
        pylab.title("Total read count per sample", fontsize=fontsize)
        pylab.xticks(rotation=rotation, ha="right")
        # pylab.xticks(range(N), self.sample_names)
        try:
            pylab.tight_layout()
        except Exception:
            pass  # tight_layout may fail depending on backend and labels
Example #13
    def plot_percentage_null_read_counts(self):
        """Bars represent the percentage of null counts in each samples.  The dashed
        horizontal line represents the percentage of feature counts being equal
        to zero across all samples

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_percentage_null_read_counts()

        """
        pylab.clf()
        # how many null counts ?
        df = (self.counts_raw == 0).sum() / self.counts_raw.shape[0] * 100
        df = df.rename("percent_null")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.bar(df.index,
                  df.percent_null,
                  color=df.group_color,
                  ec="k",
                  lw=1,
                  zorder=10)

        all_null = ((self.counts_raw == 0).all(axis=1).sum()
                    / self.counts_raw.shape[0] * 100)

        pylab.axhline(all_null, ls="--", color="black", alpha=0.5)

        pylab.xticks(rotation=45, ha="right")
        pylab.ylabel("Proportion of null counts (%)")
        pylab.grid(True, zorder=0)
        pylab.tight_layout()
Example #14
    def run_enrichment_go(self,
                          taxon,
                          annot_col="Name",
                          out_dir="enrichment"):  # pragma: no cover

        out_dir = Path(out_dir) / "figures"
        out_dir.mkdir(exist_ok=True, parents=True)

        gene_lists_dict = self.get_gene_lists(annot_col=annot_col,
                                              Nmax=2000,
                                              dropna=True)
        enrichment = {}
        ontologies = {
            "GO:0003674": "MF",
            "GO:0008150": "BP",
            "GO:0005575": "CC"
        }
        failed_enrichments = []

        for compa in self.comparisons:
            gene_lists = gene_lists_dict[compa]
            pe = PantherEnrichment(gene_lists, taxon)
            pe.compute_enrichment(ontologies=ontologies.keys(), progress=False)

            for direction in ["up", "down", "all"]:
                if not pe.enrichment[direction]:
                    logger.warning(
                        f"No enrichment computed, so no plots computed for {compa} {direction}"
                    )
                    failed_enrichments.append({
                        "comparison":
                        compa,
                        "direction":
                        direction,
                        "GO":
                        "all",
                        "reason":
                        "no enrichment computed",
                    })
                    continue

                for ontology in ontologies.keys():
                    pylab.figure()
                    enrichment_df = pe.plot_go_terms(direction,
                                                     ontology,
                                                     compute_levels=False)
                    if enrichment_df.empty:
                        failed_enrichments.append({
                            "comparison":
                            compa,
                            "direction":
                            direction,
                            "GO":
                            ontology,
                            "reason":
                            "no enrichment found",
                        })
                    else:
                        enrichment[(compa, direction,
                                    ontology)] = enrichment_df
                        pylab.tight_layout()
                        pylab.savefig(
                            out_dir /
                            f"go_{compa}_{direction}_{ontologies[ontology]}.pdf"
                        )
                        pe.save_chart(
                            enrichment_df,
                            out_dir /
                            f"chart_{compa}_{direction}_{ontologies[ontology]}.png",
                        )

            logger.info(f"Panther enrichment for {compa} DONE.")

        df = pd.concat(enrichment).sort_index()
        df.index.rename(["comparison", "direction", "GO_category", "index"],
                        inplace=True)

        self.enrichment_go = df
        self.failed_go_enrichments = pd.DataFrame(failed_enrichments)

        # Export results (should be moved to enrichment.py at some point I think)
        with pd.ExcelWriter(out_dir.parent / "enrichment_go.xlsx") as writer:
            df = self.enrichment_go.copy()
            df.reset_index(inplace=True)
            df.to_excel(writer, "go", index=False)
            ws = writer.sheets["go"]
            try:
                ws.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
            except Exception:
                logger.warning("Could not set the Excel autofilter")
Example #15
        res.append(record.features[rec_i].type)  # CDS
        res.append(b)  # start
        res.append(e)  # end
        res.append(strand)  # strand
        quals = record.features[rec_i].qualifiers
        res.append(" ; ".join(quals["db_xref"]) if "db_xref" in quals else
                   None)  # gene ID (maybe more than one)
        res.append(" ; ".join(quals["gene"]) if "gene" in quals else
                   None)  # gene name (maybe more than one)
        res.append(" ; ".join(quals["product"])
                   if "product" in quals else None)  # product of the gene
        res.append(" ; ".join(quals["note"])
                   if "note" in quals else None)  # note (description)
    else:
        ########### variant not in CDS : append empty line
        res = [None] * len(header_df_results)

    # append to final result
    result_annot.append(res)

df_result_annot = pd.DataFrame(result_annot)
df_result_annot.columns = header_df_results
df_result_annot.index = df.index

df_output = pd.concat([df, df_result_annot], axis=1)
df_output.to_csv(file_output)

Example #16
        ".txt", "").replace("fof", "")
else:
    title = fof_input.replace("fof_", "").replace(".txt", "")

################################ INPUT DATA ##############################################################################################

# list of input files
with open(fof_input, 'r') as f:
    list_input = [line.strip() for line in f]

df = []
for file_input in list_input:
    df.append(pd.read_csv(file_input))

df = pd.concat(df)
df["kmer_size"] = df["label"].str.split("_").str[-2]
df["kmer_size"] = df["kmer_size"].str.replace("k", "").astype(int)

df["DB_size"] = df["label"].str.split("_").str[-1]

## filter out small kmer
df = df[df["kmer_size"] > min_kmer_size]

## plot lines
fig1, ax1 = pylab.subplots(1, 1, figsize=(5, 5))
all_size = df["DB_size"].unique()
for i in range(len(all_size)):
    size = all_size[i]
    df_to_plot = df[df["DB_size"] == size]
    p = df_to_plot["precison_with_unknown"]
Example #17
	
	# unclassified
	df = df_result_merged[ (df_result_merged["status"]=='U') & (df_result_merged["name"] == name)]
	cl.append(df.shape[0])

	# total number of reads
	cl.append(names.count(name))
	
	classification.append(cl)

classification_columns = [
    "good_classification_at_level", "wrong_classification_at_level",
    "unknown_taxon_at_level", "good_classification_above_level",
    "wrong_classification_above_level", "unknown_taxon_above_level",
    "Unclassified", "total_N_reads"]
classification = pd.DataFrame(classification)
classification.columns = classification_columns

df_read_lineage_result = pd.concat([df_read_lineage, classification], axis=1)
df_read_lineage_result.to_csv(filename_output, index=None)


"""
df_result_merged[(df_result_merged["status"]=='U') & (df_result_merged["name"] == "P_fermentans")]
df_result_merged[(df_result_merged["status"]=='C') & (df_result_merged["name"] == "P_fermentans")]

tax = 610130
info_taxon = find_tax_info(k_result, tax)

res = {"ID":tax, "superkingdom": 0, "phylum": 0, "class": 0, "order": 0, "family": 0, "genus": 0, "species" : 0}
get_lineage_tax(k_result,info_taxon, res)

"""
Example #18
list_files = []
for name in f.readlines():
    f_name = name.split('\n')[0]
    list_files.append(f_name)
f.close()

# concat all result files
list_to_concat = []
for file_result in list_files:
    df = pd.read_csv(file_result, sep=",")
    short_name = file_result.split("/")[-1].split("__")[0:3]
    df["rotation"] = [int(short_name[2])] * df.shape[0]
    df["reference"] = [short_name[1]] * df.shape[0]
    list_to_concat.append(df)
#list_to_concat = [pd.read_csv(file_result,sep=",") for file_result in list_files]
result = pd.concat(list_to_concat)

# import csv file with genome length
df_genome_len = pd.read_csv(file_genome_len, sep=',', header=None)
df_genome_len.columns = ["name", "length"]
#print(df_genome_len)

################################ EXECUTE ##############################################################################################

# for each contig, choose the alignement with best score
set_contigs = set(result["qName"])

best_contigs = []
for contig in set_contigs:
    df = result[result["qName"] == contig]
    best_score = min(df["score_norm"])
Example #19
    def add_individual_report(self, comp, name, counter):
        style = "width:45%"

        description = """<p>
In the dispersion estimation and model fitting is done, statistical testing is
performed. The distribution of raw p-values computed by the statistical test 
is expected to be a mixture of a uniform distribution on [0, 1] and a peak
around 0 corresponding to the differentially expressed features. This may not
always be the case. </p>"""

        def plot_pvalue_hist(filename):
            import pylab
            pylab.ioff()
            pylab.clf()
            comp.plot_pvalue_hist()
            pylab.savefig(filename)
            pylab.close()

        def plot_padj_hist(filename):
            import pylab
            pylab.ioff()
            pylab.clf()
            comp.plot_padj_hist()
            pylab.savefig(filename)
            pylab.close()

        img1 = self.create_embedded_png(plot_pvalue_hist,
                                        "filename",
                                        style=style)
        img2 = self.create_embedded_png(plot_padj_hist,
                                        "filename",
                                        style=style)

        # FIXME. pvalues adjusted are not relevant so commented for now
        img2 = ""

        self.sections.append({
            "name": f"6.{counter}.a pvalue distribution ({name})",
            "anchor": f"dge_summary",
            "content": description + img1 + img2
        })

        def plot_volcano(filename):
            import pylab
            pylab.ioff()
            pylab.clf()
            comp.plot_volcano()
            pylab.savefig(filename)
            pylab.close()

        html_volcano = """<p>The volcano plot here below shows the differentially
expressed features with a adjusted p-value below 0.05 (dashed back line). 
The volcano plot represents the log10 of the adjusted P
value as a function of the log2 ratio of differential expression. </p>"""
        #img3 = self.create_embedded_png(plot_volcano, "filename", style=style)
        img3 = ""
        fig = comp.plot_volcano(plotly=True,
                                annotations=self.rnadiff.annotation)
        from plotly import offline
        plotly = offline.plot(fig, output_type="div", include_plotlyjs=False)

        self.sections.append({
            "name": f"6.{counter}.b volcano plots ({name})",
            "anchor": f"{name}_volcano",
            "content": html_volcano + img3 + "<hr>" + plotly
        })

        # finally, let us add the tables
        from pylab import log10

        df = comp.df.copy()  #.reset_index()

        # here we need to add the annotation if possible
        try:
            df = pd.concat(
                [df, self.rnadiff.annotation.annotation.loc[comp.df.index]],
                axis=1)
        except Exception as err:
            logger.critical(f"Could not add annotation. {err}")

        df = df.reset_index()

        fold_change = 2**df['log2FoldChange']
        log10padj = -log10(df['padj'])
        df.insert(
            df.columns.get_loc('log2FoldChange') + 1, 'FoldChange',
            fold_change)
        df.insert(df.columns.get_loc('padj') + 1, 'log10_padj', log10padj)

        # drop columns that are not relevant for the report
        # (dispFit and dispMap could also be dropped)
        for x in ['dispGeneEst', 'lfcSE', 'stat', 'dispersion']:
            try:
                del df[x]
            except KeyError:
                pass  # column may be absent
        # set options
        options = {
            'scrollX': 'true',
            'pageLength': 10,
            'scrollCollapse': 'true',
            'dom': 'Bfrtip',
            'buttons': ['copy', 'csv']
        }

        datatable = DataTable(df, f'{name}_table_all')
        datatable.datatable.datatable_options = options
        js_all = datatable.create_javascript_function()
        html_tab_all = datatable.create_datatable(float_format='%.3e')

        df_sign = df.query(
            "padj<=0.05 and (log2FoldChange>1 or log2FoldChange<-1)")
        datatable = DataTable(df_sign, f'{name}_table_sign')
        datatable.datatable.datatable_options = options
        js_sign = datatable.create_javascript_function()
        html_tab_sign = datatable.create_datatable(float_format='%.3e')

        self.sections.append({
            'name':
            f"6.{counter}.c Tables ({name})",
            'anchor':
            f"{name}_stats",
            'content':
            f"""<p>The following tables give all DGE results. The
first table contains all significant genes (adjusted p-value below 0.05 and
absolute fold change of at least 0.5). The following tables contains all results
without any filtering. Here is a short explanation for each column:
<ul>
<li> baseMean: base mean over all samples</li>
<li> norm.sampleName: rounded normalized counts per sample</li>
<li> FC: fold change in natural base</li>
<li> log2FoldChange: log2 Fold Change estimated by the model. Reflects change
between the condition versus the reference condition</li>
<li> stat: Wald statistic for the coefficient (contrast) tested</li>
<li> pvalue: raw p-value from statistical test</li>
<li> padj: adjusted pvalue. Used for cutoff at 0.05 </li>
<li> betaConv: convergence of the coefficients of the model </li>
<li> maxCooks: maximum Cook's distance of the feature </li>
<li> outlier: indicate if the feature is an outlier according to Cook's distance
</li>
</ul>
</p>
<h3>Significant genes only<a id="{name}_table_sign"></a></h3>
Here below is a subset of the next table: all genes with an adjusted p-value
below 0.05 and an absolute log2 fold change above 1.
{js_sign} {html_tab_sign} 

<h3>All genes<a id="{name}_table_all"></a></h3>
{js_all} {html_tab_all}"""
        })
Example #20
    # add column with type of mutation
    mut_indel = type_variants_smrt(df_smrt, 'ALT')
    df_smrt['mut_indel'] = mut_indel

    df_to_merge = df_smrt[[
        'POS', 'REF', 'ALT', 'QUAL', 'mut_indel', 'data_type'
    ]]
    df_to_merge.columns = [
        'position', 'reference', 'alternative', 'score', 'mut_indel',
        'data_type'
    ]

    list_df_to_merge.append(df_to_merge)

concat_variant = pd.concat(list_df_to_merge)
concat_variant.sort_values(by='position', axis=0, inplace=True)

print("Create results table")
# format columns names
score_names = [analysis + '_score' for analysis in analysis_names]
col_names = []
for i in range(len(analysis_names)):
    col_names.extend([analysis_names[i], score_names[i]])

res_variant_names = ['position'] + col_names + ['mut_indel']
res_variant = []

for i in set(concat_variant['position']):
    df = concat_variant[concat_variant['position'] == i]
    # position
Example #21
f.close()

# concat all result files
list_to_concat = []
for file_result in list_files:
	df = pd.read_csv(file_result,sep=",")
	short_name = file_result.split("/")[-1].split("002929_")[-1].split(".fasta")[0]
	short_name = short_name.split("_")
	df["ref_type"] = [short_name[0]]*df.shape[0]
	if len(short_name) > 1:
		df["rotation"] = [int(short_name[1])]*df.shape[0]
	else:
		df["rotation"] = [0]*df.shape[0]
	list_to_concat.append(df)
#list_to_concat = [pd.read_csv(file_result,sep=",") for file_result in list_files]
result = pd.concat(list_to_concat)


################################ EXECUTE ##############################################################################################

# for each contig, choose the alignement with best score
set_contigs = set(result["qName"])

best_contigs = []
for contig in set_contigs:
	df = result[result["qName"] == contig]
	best_score = min(df["score_norm"])
	#print(contig, best_score)
	best_contigs.append(pd.DataFrame(df[df["score_norm"] == best_score].iloc[0,:]).T)

res_best = pd.concat(best_contigs)
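A minimal sketch of the per-contig selection done in the loop above: groupby plus idxmin picks, for each contig, the row with the lowest normalized score in one pass (assuming unique row labels, hence the reset_index).

    result = result.reset_index(drop=True)
    best_idx = result.groupby("qName")["score_norm"].idxmin()
    res_best = result.loc[best_idx]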
Example #22
    def plot_pca(
        self,
        n_components=2,
        colors=None,
        plotly=False,
        max_features=500,
        genes_to_remove=[],
    ):
        """

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            path = sequana_data("rnadiff/rnadiff_onecond_1")
            r = RNADiffResults(path)

            # hypothetical sample names: three replicates per condition
            colors = {
                'surexp1': 'r',
                'surexp2': 'r',
                'surexp3': 'r',
                'ctrl1': 'b',
                'ctrl2': 'b',
                'ctrl3': 'b'}
            r.plot_pca(colors=colors)
        """
        from sequana.viz import PCA

        # Get most variable genes (n=max_features)
        top_features = (self.counts_vst.var(axis=1).sort_values(
            ascending=False).index[:max_features])

        if genes_to_remove:
            top_features = [
                x for x in top_features if x not in genes_to_remove
            ]

        counts_top_features = self.counts_vst.loc[top_features, :]

        p = PCA(counts_top_features)

        if plotly is True:
            assert n_components == 3
            variance = p.plot(
                n_components=n_components,
                colors=colors,
                show_plot=False,
                max_features=max_features,
            )
            from plotly import express as px

            df = pd.DataFrame(p.Xr)
            df.index = p.df.columns
            df.columns = ["PC1", "PC2", "PC3"]
            df["size"] = [10] * len(df)  # same size for all points ?

            df = pd.concat([df, self.design_df], axis=1)
            df["label"] = df.index
            df["group_color"] = df[self.condition]

            fig = px.scatter_3d(
                df,
                x="PC1",
                y="PC2",
                z="PC3",
                color="group_color",
                labels={
                    "PC1": "PC1 ({}%)".format(round(100 * variance[0], 2)),
                    "PC2": "PC2 ({}%)".format(round(100 * variance[1], 2)),
                    "PC3": "PC3 ({}%)".format(round(100 * variance[2], 2)),
                },
                height=800,
                text="label",
            )
            return fig
        else:
            variance = p.plot(
                n_components=n_components,
                colors=self.design_df.group_color,
                max_features=max_features,
            )

        return variance
Example #23
    def plot_volcano(
        self,
        padj=0.05,
        add_broken_axes=False,
        markersize=4,
        limit_broken_line=[20, 40],
        plotly=False,
        annotations=None,
    ):
        """

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_volcano()

        """

        if plotly:
            from plotly import express as px

            df = self.df.copy()

            if annotations is not None:
                try:
                    df = pd.concat([df, annotations.annotation], axis=1)
                except Exception as err:
                    logger.warning(
                        f"Could not merge rnadiff table with annotation. Full error is: {err}"
                    )
            df["log_adj_pvalue"] = -pylab.log10(df.padj)
            df["significance"] = [
                "<{}".format(padj) if x else ">={}".format(padj)
                for x in df.padj < padj
            ]

            if "Name" in df.columns:
                hover_name = "Name"
            elif "gene_id" in df.columns:
                hover_name = "gene_id"
            elif "locus_tag" in df.columns:
                hover_name = "locus_tag"
            elif "ID" in df.columns:
                hover_name = "ID"
            else:
                hover_name = None
            fig = px.scatter(
                df,
                x="log2FoldChange",
                y="log_adj_pvalue",
                hover_name=hover_name,
                hover_data=["baseMean"],
                log_y=False,
                opacity=0.5,
                color="significance",
                height=600,
                labels={"log_adj_pvalue": "log adjusted p-value"},
            )
            # plotly's add_hline is available in recent versions; for now,
            # drawing a line shape is the way to add a horizontal threshold
            fig.update_layout(shapes=[
                dict(
                    type="line",
                    xref="x",
                    x0=df.log2FoldChange.min(),
                    x1=df.log2FoldChange.max(),
                    yref="y",
                    y0=-pylab.log10(padj),
                    y1=-pylab.log10(padj),
                    line=dict(color="black", width=1, dash="dash"),
                )
            ])

            return fig

        from brokenaxes import brokenaxes

        M = max(-pylab.log10(self.df.padj.dropna()))

        br1, br2 = limit_broken_line
        if M > br1:
            if add_broken_axes:
                bax = brokenaxes(ylims=((0, br1), (M - 10, M)), xlims=None)
            else:
                bax = pylab
        else:
            bax = pylab

        d1 = self.df.query("padj>@padj")
        d2 = self.df.query("padj<=@padj")
        bax.plot(
            d1.log2FoldChange,
            -np.log10(d1.padj),
            marker="o",
            alpha=0.5,
            color="k",
            lw=0,
            markersize=markersize,
        )
        bax.plot(
            d2.log2FoldChange,
            -np.log10(d2.padj),
            marker="o",
            alpha=0.5,
            color="r",
            lw=0,
            markersize=markersize,
        )

        bax.grid(True)
        try:
            bax.set_xlabel("fold change")
            bax.set_ylabel("log10 adjusted p-value")
        except AttributeError:
            # bax is the pylab module when no broken axis is used
            bax.xlabel("fold change")
            bax.ylabel("log10 adjusted p-value")

        m1 = abs(min(self.df.log2FoldChange))
        m2 = max(self.df.log2FoldChange)
        limit = max(m1, m2)
        try:
            bax.set_xlim([-limit, limit])
        except AttributeError:
            bax.xlim([-limit, limit])
        try:
            y1, _ = bax.get_ylim()
            ax1 = bax.axs[0].set_ylim([br2, y1[1] * 1.1])
        except Exception:
            y1, y2 = bax.ylim()
            bax.ylim([0, y2])
        bax.axhline(-np.log10(0.05),
                    lw=2,
                    ls="--",
                    color="r",
                    label="pvalue threshold (0.05)")
        return bax

Example #24
 def summary(self):
     return pd.concat(res.summary() for res in self.comparisons.values())