Example No. 1
# `report` comes from snakemake.utils; the `snakemake` object is injected
# automatically when this file is used via a rule's `script:` directive.
from snakemake.utils import report


def create_report(report_string, pubmed_string, image_files):
    # One ".. image:: {image_files[N]}" directive per file, separated by a
    # newline plus the indentation of the surrounding RST block; report()
    # resolves the placeholders from this function's arguments.
    image_string = "\n    ".join(
        ".. image:: {{image_files[{}]}}".format(idx)
        for idx in range(len(image_files))
    )

    report("""
    RNA-seq gene information report
    ===================================================
    {report_string}

    PubMed
    ----------------------------------------------------
    {pubmed_string}

    """+image_string, snakemake.output[0], metadata="Authors: Daan Gilissen and Koen Rademaker (support: [email protected])")
Example No. 2
        "bcftools call -mv - > {output} ) 2> {log}"



rule call:
    input:
        CALL + "all.vcf"



rule call_report:
    input:
        CALL + "all.vcf"
    output:
        protected(REPORT_CALL + "call.html")
    run:
        from snakemake.utils import report
        with open(input[0]) as vcf:
            n_calls = sum(1 for l in vcf if not l.startswith("#"))

        report("""
        An example variant calling workflow
        ===================================

        Reads were mapped to the Yeast
        reference genome and variants were called jointly with
        SAMtools/BCFtools.

        This resulted in {n_calls} variants (see Table T1_).
        """, output[0], T1=input[0])
Example No. 3
def main(samples, contig_stats, gene_tables, mapping_logs, report_out, combined_stats):
    sample_data = {}
    for sample in samples:
        sample_data[sample] = {}
        for c_stat in contig_stats:
            # underscore version was for simplified local testing
            # if "%s_" % sample in c_stat:
            if "%s/" % sample in c_stat:
                sample_data[sample]["contig_stats"] = c_stat
        for g_table in gene_tables:
            # if "%s_" % sample in g_table:
            if "%s/" % sample in g_table:
                sample_data[sample]["gene_table"] = g_table
        for mapping_log in mapping_logs:
            # if "%s_" % sample in mapping_log:
            if "%s/" % sample in mapping_log:
                sample_data[sample]["mapping_log"] = mapping_log
    df = parse_map_stats(sample_data, combined_stats)
    div = {}
    labels = {
        "Percent_Assembled_Reads": "Percent of Assembled Reads",
        "contig_bp": "Total BP",
        "n_contigs": "Contigs (count)",
        "N_Predicted_Genes": "Predicted Genes (count)",
    }
    for variable in [
        "Percent_Assembled_Reads", "contig_bp", "n_contigs", "N_Predicted_Genes"
    ]:
        y_axis_label = labels[variable]
        div[variable] = offline.plot(
            df[variable].iplot(
                asFigure=True,
                kind="bar",
                xTitle="Samples",
                layout=go.Layout(
                    xaxis=dict(tickangle=45), yaxis=dict(title=y_axis_label)
                ),
            ),
            **PLOTLY_PARAMS,
        )
    div["N50"] = offline.plot(
        df[["N50", "N90"]].iplot(
            asFigure=True,
            kind="bar",
            xTitle="Samples",
            layout=go.Layout(xaxis=dict(tickangle=45), yaxis=(dict(title="Bases"))),
        ),
        **PLOTLY_PARAMS,
    )
    report_str = """

.. raw:: html

    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>


=============================================================
ATLAS_ - Assembly Summary
=============================================================

.. _ATLAS: https://github.com/metagenome-atlas/atlas

.. contents::
    :backlinks: none


Summary
-------

N50
***

.. raw:: html

    {div[N50]}


Assembly Length
***************

.. raw:: html

    {div[contig_bp]}


Number of Contigs
*****************

.. raw:: html

    {div[n_contigs]}


Number of Predicted Genes
*************************

.. raw:: html

    {div[N_Predicted_Genes]}


Percent of Assembled Reads
**************************

.. raw:: html

    {div[Percent_Assembled_Reads]}


For more information see Table_1_


Downloads
---------

"""
    report(
        report_str,
        report_out,
        Table_1=combined_stats,
        stylesheet=os.path.join(atlas_dir, "report", "report.css"),
    )
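The ATLAS examples above and below use a `PLOTLY_PARAMS` constant that is never shown. Given that the report loads plotly.js from a CDN and splices each figure into the RST with `.. raw:: html`, a plausible definition looks roughly like this (an assumption, not the project's verbatim code):

import plotly.offline as offline

# Assumed settings: return each figure as a bare HTML <div> string, without
# embedding plotly.js (the report already pulls it from the CDN).
PLOTLY_PARAMS = dict(
    include_plotlyjs=False,
    show_link=False,
    output_type="div",
)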
Example No. 4
def main(report_out, read_counts, zipfiles_raw, zipfiles_QC, min_quality):
    div = {}

    # N reads / N bases
    df = pd.read_table(read_counts, index_col=[0, 1])
    for variable in ['Total_Reads', 'Total_Bases']:
        data = df[variable].unstack()[df.loc[df.index[0][0]].index.drop(
            'clean')]
        div[variable] = offline.plot(
            data.iplot(
                asFigure=True,
                kind='bar',
                xTitle='Samples',
                yTitle=variable.replace('_', ' '),
                layout=go.Layout(xaxis=dict(tickangle=45)),
            ),
            **PLOTLY_PARAMS,
        )

    Report_numbers = """

Total reads per sample
~~~~~~~~~~~~~~~~~~~~~~

.. raw:: html

    {div[Total_Reads]}

============   ===================================
Step           Output
============   ===================================
raw            the input reads
deduplicated   after (optional) deduplication step
filtered       trimmed, PhiX filtered
qc             passing reads
============   ===================================

Total bases per sample
~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html

    {div[Total_Bases]}

For details see Table Table1_.
"""

    Report_read_quality_qc = """

Reads quality after QC
~~~~~~~~~~~~~~~~~~~~~~
"""

    Quality_pe, Quality_se = get_stats_from_zips(zipfiles_QC)
    max_quality = 1 + np.nanmax(
        (Quality_pe.max().max(), Quality_se.max().max()))
    if Quality_pe.shape[0] > 0:
        div['quality_qc_pe'] = get_pe_read_quality_plot(
            Quality_pe, [min_quality, max_quality])
        Report_read_quality_qc += """
Paired end
**********
.. raw:: html

    {div[quality_qc_pe]}

Single end
**********

Paired end reads that lost their mate during filtering.

"""
    div['quality_qc_se'] = draw_se_read_quality(Quality_se,
                                                [min_quality, max_quality])
    Report_read_quality_qc += """
.. raw:: html

    {div[quality_qc_se]}

"""
    Report_read_quality_raw = """

Reads quality before QC
~~~~~~~~~~~~~~~~~~~~~~~

.. raw:: html

    {div[quality_raw]}

"""
    Quality_pe, Quality_se = get_stats_from_zips(zipfiles_raw)
    if Quality_pe.shape[0] > 0:
        div['quality_raw'] = get_pe_read_quality_plot(
            Quality_pe, [min_quality, max_quality])
    elif Quality_se.shape[0] > 0:
        div['quality_raw'] = draw_se_read_quality(Quality_se,
                                                  [min_quality, max_quality])
    else:
        raise IndexError()

    report_str = """

.. raw:: html

    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>


=============================================================
ATLAS_ - QC Summary
=============================================================

.. _ATLAS: https://github.com/metagenome-atlas/atlas

.. contents::
    :backlinks: none


Summary
-------


"""+\
    Report_numbers+\
    Report_read_quality_qc+\
    Report_read_quality_raw+"""

Downloads
---------

"""

    report(
        report_str,
        report_out,
        Table1=read_counts,
        stylesheet=os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                "report.css"),
    )
Example No. 5
           Per-rule benchmark summary statistics are in Table T1_.

           Per-job compiled benchmark statistics are in Table T2_.


           Overall resource use
           --------------------

           These figures show a summary of the resources used across the
           workflow. 

           {resource_reports}


           Resources per module
           --------------------

           These figures detail the resources used per job, broken down
           into different rules (indicated by color) per module, with each
           type of resource indicated by a separate figure panel. 

           {module_reports}

           """.format(**locals())
        print(report_text)
        # generate report
        report(report_text, output.report, **figs.__dict__)


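The fragment above ends with `report(report_text, output.report, **figs.__dict__)`: every attribute of the (not shown) `figs` object becomes a keyword file and hence a `name_` link target in the RST. A self-contained sketch of that pattern, with invented figure names:

from types import SimpleNamespace

from snakemake.utils import report

# Hypothetical container: each attribute points at a rendered figure file.
figs = SimpleNamespace(
    cpu_time="plots/cpu_time.png",
    max_rss="plots/max_rss.png",
)

report_text = """
Benchmark figures
=================

CPU time per rule is shown in cpu_time_ and peak memory in max_rss_.
"""

# Each attribute of `figs` becomes a keyword file, i.e. a cpu_time_ / max_rss_
# link in the generated HTML report.
report(report_text, "benchmark_report.html", **figs.__dict__)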
Example No. 6
def main(bin_table, report_out):

    div = {}

    df = pd.read_csv(bin_table, sep="\t", index_col=0)

    df.to_csv(bin_table, sep="\t")

    div["bin_scatter"] = offline.plot(
        {
            "data": [{
                "x": df.loc[df["Sample"] == sample, "Completeness"],
                "y": df.loc[df["Sample"] == sample, "Contamination"],
                "name": sample,
                "mode": "markers",
                "text": df.index[df["Sample"] == sample],
                "hoverinfo": "text",
                "showlegend": True,
            } for sample in df.Sample.unique()],
            "layout": {
                "xaxis": {
                    "title": "Completeness"
                },
                "yaxis": {
                    "title": "Contamination"
                },
            },
        },
        **PLOTLY_PARAMS,
    )
    # subset the checkm stats dataframe
    df = df[(df.Contamination <= 5) & (df.Completeness >= 90)].copy()
    df.index.name = "Bin ID"
    df.reset_index(inplace=True)
    df = df[[
        "Bin ID",
        "Completeness",
        "Contamination",
        "Taxonomy (contained)",
        "Taxonomy (sister lineage)",
        "GC",
        "Genome size (Mbp)",
        "Gene count",
    ]]
    df["Taxonomy (contained)"] = df["Taxonomy (contained)"].apply(
        lambda s: "; ".join(str(s).split(";")[-2:]))
    df["Taxonomy (sister lineage)"] = df["Taxonomy (sister lineage)"].apply(
        lambda s: "; ".join(str(s).split(";")[-1:]))
    with pd.option_context("display.precision", 3):
        div["table"] = df.to_html(index=False).replace("\n", "\n" + 10 * " ")
    report_str = """

.. raw:: html

    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>


=============================================================
ATLAS_ - Bin Summary
=============================================================

.. _ATLAS: https://github.com/metagenome-atlas/atlas

.. contents::
    :backlinks: none


Summary
-------

Recovered Bins
**************

.. raw:: html

    {div[bin_scatter]}

In some cases, percentages can be above 100% (See: `CheckM Issue 107`_).

.. _CheckM Issue 107: https://github.com/Ecogenomics/CheckM/issues/107


Best Bins
*********

Genomes with >90% completeness and <5% contamination:

.. raw:: html

    {div[table]}


See full list at Table_1_.


Downloads
---------

    """
    report(
        report_str,
        report_out,
        Table_1=bin_table,
        stylesheet=os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                "report.css"),
    )
Example No. 7
	input:
		OUT_DIR + '{genome}/flagstat'
	output:
		OUT_DIR + '{genome}/summary_flagstat.txt'
	shell:
		"Rscript {SCRIPT_DIR}/merge_flagstats.R {input} {output}"
	
rule report:
	input:
		T1=OUT_DIR + '{genome}/summary_flagstat.txt'
	output:
		html=OUT_DIR + '{genome}/report_align_{genome}.html'
	run:
		report("""
		============================================
		SUMMARY OF ALIGNMENT AGAINST {REFERENCES}
		============================================
		See table T1_
		
		Total_Reads = total number of reads in the BAM file after alignment
		
		Mapped = number of reads that align to the reference genome
		
		properly_paired = number of reads that align to the reference genome as proper pairs (R1 and R2 aligned to the same reference)
		
		Note: to obtain the TOTAL number of reads passing QC (i.e. the number of input reads) and the number of reads aligning to {REFERENCES}, the Supplementary and Secondary alignments must be subtracted
		(reads that align to several references and are counted more than once)
		
		""", output.html, metadata="Laurence Josset", **input)
	
Example No. 8
from snakemake.utils import report

with open(snakemake.input.T1) as vcf:
    n_calls = sum(1 for l in vcf if not l.startswith("#"))

report(
    """
An example variant calling workflow
===================================

Reads were mapped to the Yeast
reference genome and variants were called jointly with
SAMtools/BCFtools.

This resulted in {n_calls} variants (see Table T1_).
Benchmark results for BWA can be found in the tables T2_.
""", snakemake.output[0], **snakemake.input)
Example No. 9
    def snakemake_report(self):
        """
        Generate a SnakeMake report.

        Generates an HTML report containing diagnostic plots of read coverage
        quality. All images are saved to a `plots` subdirectory within the
        output directory given by `self.output_dir`. Finally, a `.csv` file
        containing read summaries is written to `self.output_dir`.
        """

        # check if output dir exists
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)

        # output paths
        csv_path = os.path.join(self.output_dir, 'read_summary.csv')

        # report loc
        html_path = os.path.join(self.output_dir, 'report.html')

        # plot loc
        plot_dir = os.path.join(self.output_dir, 'plots')
        if not os.path.exists(plot_dir):
            os.makedirs(plot_dir)

        violin_path = os.path.join(plot_dir, 'violin.png')
        cdf_path = os.path.join(plot_dir, 'cdf.png')
        stacked_bar_path = os.path.join(plot_dir, 'stacked_barplot.png')

        # write csv file
        self.read_df.to_csv(csv_path)

        # clear any pyplot figures just in case
        plt.cla()

        # create plots
        self.create_violin_plot()
        plt.savefig(violin_path)
        plt.cla()

        self.create_cdf_plot()
        plt.savefig(cdf_path)
        plt.cla()

        self.create_stack_barplot()
        plt.savefig(stacked_bar_path)
        plt.cla()

        rst_markup = """
        ============================================
        Read Summary Following Fastp Quality Control
        ============================================

        Read Counts per Treatment
        =========================
        .. image:: plots/violin.png

        Read Coverage Quality
        =====================
        .. image:: plots/cdf.png

        Coverage Quality per Treatment
        ==============================
        .. image:: plots/stacked_barplot.png
        """.format(violin_path, cdf_path, stacked_bar_path)
        report(rst_markup,
               html_path,
               metadata="Author: Dakota Hawkins ([email protected])")
Example No. 10
report("""
{title}
    .. role:: commd
    .. role:: red
    .. role:: green

**CASCABEL** is designed to run amplicon sequence analysis across single or multiple read libraries. This report consists of the ASV table creation and taxonomic assignment for all the combined accepted reads of given samples or libraries, if multiple.

{txtDescription}

Filter and Trim
---------------
Once all the individual libraries were demultiplexed, the fastq files from all the samples of all the libraries were processed together.

The filtering and trimming steps were both performed with the **filterAndTrim()** function from the R package dada2, according to user parameters.

:red:`Tool:` dada2_ 

:red:`Version:` {dada2Version}

:green:`Function:` filterAndTrim()

:green:`Max Expected Errors (maxEE) FW:` {snakemake.config[dada2_filter][maxEE_FW]}

:green:`Max Expected Errors (maxEE) RV:` {snakemake.config[dada2_filter][maxEE_RV]}

:green:`Forward read truncation:` {snakemake.config[dada2_filter][truncFW]}

:green:`Reverse read truncation:` {snakemake.config[dada2_filter][truncRV]}

**Command:**


:commd:`Scripts/asvFilter.R $PWD {snakemake.config[dada2_filter][generateQAplots]} {snakemake.config[dada2_filter][truncFW]} {snakemake.config[dada2_filter][truncRV]} {snakemake.config[dada2_filter][maxEE_FW]} {snakemake.config[dada2_filter][maxEE_RV]} {snakemake.config[dada2_filter][cpus]} {snakemake.config[dada2_filter][extra_params]} {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/filter_summary.out`


**Output file:**

:green:`- Filtered fastq files:`   {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/<Library>/demultiplexed/filtered/

:green:`- Summary:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/filter_summary.out


:red:`Note:` To speed up downstream computation, consider tightening maxEE. If too few reads are passing the filter, consider relaxing maxEE, perhaps especially on the reverse reads.

Make sure that your forward and reverse reads overlap after length truncation.

{asvFilterBenchmark}

 
Amplicon Sequence Variants
----------------------------
In order to identify ASVs, the dada2 workflow requires several steps. Below is a summary of these steps and their main parameters.

:red:`Tool:` dada2_ 

:red:`Version:` {dada2Version}

Learn errors
~~~~~~~~~~~~~~~~
The first step after filtering the reads is to learn the errors from the fastq files.

:green:`Function:` learnErrors(filteredFQ)

{errorPlots}

ASV inference
~~~~~~~~~~~~~~~
The amplicon sequence variant identification consists of a high resolution sample inference from the amplicon data using the learned errors. 
 
:green:`Function:` dada(filteredFQ, errors, pool='{snakemake.config[dada2_asv][pool]}')
 
Merge pairs
~~~~~~~~~~~~~~~
In this step, forward and reverse reads are paired in order to create full denoised sequences.

:green:`Function:` mergePairs(dadaF, dadaR)

:green:`Min overlap:` {snakemake.config[dada2_merge][minOverlap]}

:green:`Max mismatch:` {snakemake.config[dada2_merge][maxMismatch]}

Length filtering   
~~~~~~~~~~~~~~~~~~
Sequences that are much longer or shorter than expected may be the result of non-specific priming.

:green:`- Shortest length:` {shorts}

:green:`- Longest length:` {longs}

{chimeras}

**Output files:**

:green:`- Representative ASV sequences:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/representative_seq_set.fasta

The total number of different ASVs is: {totalAsvs}


Assign taxonomy
----------------
Given a set of sequences, assign the taxonomy of each sequence.

{assignTaxoStr}

The percentage of successfully assigned ASVs is: {prcAssignedAsvs}

**Output file:**

:green:`- ASV taxonomy assignation:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/representative_seq_set_tax_assignments.txt


The previous steps were performed within a Cascabel R script according to the following command:

**Command**

:commd:`Scripts/asvDada2.R $PWD  {snakemake.config[dada2_asv][pool]}   {snakemake.config[dada2_asv][cpus]}    {snakemake.config[dada2_asv][generateErrPlots]}   {snakemake.config[dada2_asv][extra_params]}  {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/    {snakemake.config[rm_reads][shorts]}    {snakemake.config[rm_reads][longs]}   {snakemake.config[rm_reads][offset]}    {snakemake.config[dada2_asv][chimeras]}    {snakemake.config[dada2_taxonomy][db]}   {snakemake.config[dada2_taxonomy][add_sps][db_sps]}    {snakemake.config[dada2_taxonomy][add_sps][add]}   {snakemake.config[dada2_taxonomy][extra_params]}  {snakemake.config[dada2_merge][minOverlap]}  {snakemake.config[dada2_merge][maxMismatch]}  {snakemake.config[dada2_taxonomy][add_sps][extra_params]}`  


{dada2Benchmark}

Make ASV table
---------------
Tabulates the number of times an ASV is found in each sample, and adds the taxonomic predictions for each ASV in the last column.

**Command:**

:commd:`cat {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/representative_seq_set_tax_assignments.txt | awk 'NR==FNR{{if(NR>1){{tax=$2;for(i=3;i<=NF;i++){{tax=tax";"$i}};h[$1]=tax;}}next;}} {{if(FNR==1){{print $0"\\ttaxonomy"}}else{{print $0"\\t"h[$1]}}' - {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/asv_table.txt  >  {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable.txt`

**Output file:**

:green:`- ASV table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable.txt

{otuTableBenchmark}

Convert ASV table
------------------
Convert from txt to the BIOM table format.

:red:`Tool:` [BIOM]_

:red:`Version:` {convertBiomVersion}

**Command:**

:commd:`biom convert -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable.txt -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable.biom {snakemake.config[biom][tableType]} --table-type "OTU table" --to-hdf5 --process-obs-metadata taxonomy`

**Output file:**

:green:`- Biom format table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable.biom

{convertOtuBenchmark}

Summarize Taxa
---------------
Summarize information of the representation of taxonomic groups within each sample.

:red:`Tool:` [QIIME]_ - summarize_taxa.py

:red:`Version:` {summTaxaVersion}

**Command:**

:commd:`summarize_taxa.py -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/otuTable.biom {snakemake.config[summTaxa][extra_params]} -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/summary/`

**Output file:**

:green:`- Taxonomy summarized counts at different taxonomy levels:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/summary/otuTable_L**N**.txt

Where **N** is the taxonomy level. Default configuration produces levels from 2 to 6.

{summTaxaBenchmark}

Filter ASV table
-----------------
Filter ASVs from an ASV table based on their observed counts or identifier.

:red:`Tool:` [QIIME]_ - filter_otus_from_otu_table.py

:red:`Version:` {filterOTUNoSVersion}

:green:`Minimum observation counts:` {snakemake.config[filterOtu][n]}

**Command:**

:commd:`filter_otus_from_otu_table.py -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable.biom -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable_noSingletons.biom {snakemake.config[filterOtu][extra_params]} -n {snakemake.config[filterOtu][n]}`

**Output file:**

:green:`- Biom table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable_noSingletons.biom

{asvNoSingletonsBenchmark}

Convert Filtered ASV table
---------------------------
Convert the filtered ASV table from the BIOM format to a human-readable format.

:red:`Tool:` [BIOM]_

:red:`Version:` {convertBiomVersion}

**Command:**

:commd:`biom convert -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_dada2/asvTable_noSingletons.biom -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable_noSingletons.txt {snakemake.config[biom][tableType]} {snakemake.config[biom][headerKey]} {snakemake.config[biom][extra_params]} {snakemake.config[biom][outFormat]}`

**Output file:**

:green:`- TSV format table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/asv/taxonomy_dada2/asvTable_noSingletons.txt

{filterASVTableBenchmark}

Filter representative sequences
---------------------------------
Remove sequences according to the filtered OTU biom table.

:red:`Tool:` [QIIME]_ - filter_fasta.py

:red:`Version:` {filterFastaVersion}

**Command:**

:commd:`filter_fasta.py -f {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/asv/representative_seq_set.fasta -o {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/asv/taxonomy_dada2/representative_seq_set_noSingletons.fasta {snakemake.config[filterFasta][extra_params]} -b {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/asv/taxonomy_dada2/otuTable_noSingletons.biom`

**Output file:**

:green:`- Filtered fasta file:` {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/asv/taxonomy_dada2/representative_seq_set_noSingletons.fasta


{alignmentReport}

{kronaReport}

Final counts
-------------

{countTxt}

.. image:: report_files/sequence_numbers_asv.png

:red:`Note:`

:green:`- Assigned ASVs percentage` is the percentage of ASVs that were successfully assigned.

:green:`- No singletons percentage` is the percentage of non-singleton ASVs relative to the complete ASV table.

:green:`- Assigned No singletons` is the number of non-singleton ASVs that were successfully assigned.

References
------------

.. [QIIME] QIIME. Caporaso JG, Kuczynski J, Stombaugh J, Bittinger K, Bushman FD, Costello EK, Fierer N, Gonzalez Pena A, Goodrich JK, Gordon JI, Huttley GA, Kelley ST, Knights D, Koenig JE, Ley RE, Lozupone CA, McDonald D, Muegge BD, Pirrung M, Reeder J, Sevinsky JR, Turnbaugh PJ, Walters WA, Widmann J, Yatsunenko T, Zaneveld J, Knight R. 2010. QIIME allows analysis of high-throughput community sequencing data. Nature Methods 7(5): 335-336.

.. [Cutadapt] Cutadapt v1.15 .Marcel Martin. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011. http://dx.doi.org/10.14806/ej.17.1.200

.. [vsearch] Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. doi: 10.7717/peerj.2584

.. [Krona] Ondov BD, Bergman NH, and Phillippy AM. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics. 2011 Sep 30; 12(1):385.

.. [BIOM] The Biological Observation Matrix (BIOM) format or: how I learned to stop worrying and love the ome-ome. Daniel McDonald, Jose C. Clemente, Justin Kuczynski, Jai Ram Rideout, Jesse Stombaugh, Doug Wendel, Andreas Wilke, Susan Huse, John Hufnagle, Folker Meyer, Rob Knight, and J. Gregory Caporaso.GigaScience 2012, 1:7. doi:10.1186/2047-217X-1-7

{variable_refs}


""", snakemake.output[0], metadata="Author: J. Engelmann & A. Abdala ")
Example No. 11
report("""
{title}
    .. role:: commd
    .. role:: red
    .. role:: green

**CASCABEL** is designed to run amplicon sequence analysis across single or multiple read libraries. This report consists of the OTU creation and taxonomic assignment for all the combined accepted reads of given samples or libraries, if multiple.

{txtDescription}

Combine Reads
---------------

Merge all the reads of the individual libraries into one single file.

**Command:**

{catCommand}

**Output file:**

:green:`- Merged reads:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/seqs_fw_rev_filtered.fasta

The total number of reads is: {totalReads}

{combineBenchmark}

{dereplicateReport}

Cluster OTUs
-------------

Assigns similar sequences to operational taxonomic units, or OTUs, by clustering sequences based on a user-defined similarity threshold.

:red:`Tool:` [QIIME]_ - pick_otus.py

:red:`Version:` {clusterOtuVersion}

:green:`Method:` [{snakemake.config[pickOTU][m]}]_

:green:`Identity:` {snakemake.config[pickOTU][s]}

**Command:**

:commd:`pick_otus.py -m {snakemake.config[pickOTU][m]} -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/seqs_fw_rev_filtered.fasta -o {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/otu/ {snakemake.config[pickOTU][extra_params]} -s {snakemake.config[pickOTU][s]}`

**Output files:**

:green:`- OTU List:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/seqs_fw_rev_filtered_otus.txt

:green:`- Log file:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/seqs_fw_rev_filtered_otus.log

The total number of different OTUs is: {totalOtus}

{otuBenchmark}

Pick representatives
-----------------------
Pick a single representative sequence for each OTU.

:red:`Tool:` [QIIME]_ - pick_rep_set.py

:red:`Version:` {pickRepVersion}

:green:`Method:` {snakemake.config[pickRep][m]}

**Command:**

:commd:`pick_rep_set.py -m {snakemake.config[pickRep][m]} -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/seqs_fw_rev_filtered_otus.txt -f {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/seqs_fw_rev_filtered.fasta -o {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/otu/representative_seq_set.fasta {snakemake.config[pickRep][extra_params]} --log_fp {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/otu/representative_seq_set.log`

**Output file:**

:green:`- Fasta file with representative sequences:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/representative_seq_set.fasta

{pikRepBenchmark}

Assign taxonomy
----------------
Given a set of sequences, assign the taxonomy of each sequence.

{assignTaxoStr}

The percentage of successfully assigned OTUs is: {prcAssignedOtus}

**Output file:**

:green:`- OTU taxonomy assignation:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/representative_seq_set_tax_assignments.txt

{assignTaxaBenchmark}

Make OTU table
---------------
Tabulates the number of times an OTU is found in each sample, and adds the taxonomic predictions for each OTU in the last column.

:red:`Tool:` [QIIME]_ - make_otu_table.py

:red:`Version:` {makeOTUVersion}

**Command:**

:commd:`make_otu_table.py -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/seqs_fw_rev_filtered_otus.txt -t {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/representative_seq_set_tax_assignments.txt {snakemake.config[makeOtu][extra_params]} -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable.biom`

**Output file:**

:green:`- Biom format table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable.biom

{otuTableBenchmark}

Convert OTU table
------------------
Convert from the BIOM table format to a human readable format.

:red:`Tool:` [BIOM]_

:red:`Version:` {convertBiomVersion}

**Command:**

:commd:`biom convert -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable.biom -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable.txt {snakemake.config[biom][tableType]} {snakemake.config[biom][headerKey]} {snakemake.config[biom][extra_params]} {snakemake.config[biom][outFormat]}`

**Output file:**

:green:`- TSV format table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable.txt

{convertOtuBenchmark}

Summarize Taxa
---------------
Summarize information of the representation of taxonomic groups within each sample.

:red:`Tool:` [QIIME]_ - summarize_taxa.py

:red:`Version:` {summTaxaVersion}

**Command:**

:commd:`summarize_taxa.py -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable.biom {snakemake.config[summTaxa][extra_params]} -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/summary/`

**Output file:**

:green:`- Taxonomy summarized counts at different taxonomy levels:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/summary/otuTable_L**N**.txt

Where **N** is the taxonomy level. Default configuration produces levels from 2 to 6.

{summTaxaBenchmark}

Filter OTU table
-----------------
Filter OTUs from an OTU table based on their observed counts or identifier.

:red:`Tool:` [QIIME]_ - filter_otus_from_otu_table.py

:red:`Version:` {filterOTUNoSVersion}

:green:`Minimum observation counts:` {snakemake.config[filterOtu][n]}

**Command:**

:commd:`filter_otus_from_otu_table.py -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable.biom -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable_noSingletons.biom {snakemake.config[filterOtu][extra_params]} -n {snakemake.config[filterOtu][n]}`

**Output file:**

:green:`- Biom table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable_noSingletons.biom

{otuNoSingletonsBenchmark}

Convert Filtered OTU table
---------------------------
Convert the filtered OTU table from the BIOM format to a human-readable format.

:red:`Tool:` [BIOM]_

:red:`Version:` {convertBiomVersion}

**Command:**

:commd:`biom convert -i {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable_noSingletons.biom -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable_noSingletons.txt {snakemake.config[biom][tableType]} {snakemake.config[biom][headerKey]} {snakemake.config[biom][extra_params]} {snakemake.config[biom][outFormat]}`

**Output file:**

:green:`- TSV format table:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/otuTable_noSingletons.txt

{otuNoSingletonsBenchmark}

Filter representative sequences
---------------------------------
Remove sequences according to the filtered OTU biom table.

:red:`Tool:` [QIIME]_ - filter_fasta.py

:red:`Version:` {filterFastaVersion}

**Command:**

:commd:`filter_fasta.py -f {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/otu/representative_seq_set.fasta -o {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/representative_seq_set_noSingletons.fasta {snakemake.config[filterFasta][extra_params]} -b {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/otu/otuTable_noSingletons.biom`

**Output file:**

:green:`- Filtered fasta file:` {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.run}/otu/taxonomy_{snakemake.config[assignTaxonomy][tool]}/representative_seq_set_noSingletons.fasta

{filterBenchmark}

{alignmentReport}

{kronaReport}

Final counts
-------------

{countTxt}

.. image:: report_files/sequence_numbers_all.png

:red:`Note:`

:green:`- Assigned OTUs percentage` is the percentage of OTUs that were successfully assigned.

:green:`- No singletons percentage` is the percentage of non-singleton OTUs relative to the complete OTU table.

:green:`- Assigned No singletons` is the number of non-singleton OTUs that were successfully assigned.

References
------------

.. [QIIME] QIIME. Caporaso JG, Kuczynski J, Stombaugh J, Bittinger K, Bushman FD, Costello EK, Fierer N, Gonzalez Pena A, Goodrich JK, Gordon JI, Huttley GA, Kelley ST, Knights D, Koenig JE, Ley RE, Lozupone CA, McDonald D, Muegge BD, Pirrung M, Reeder J, Sevinsky JR, Turnbaugh PJ, Walters WA, Widmann J, Yatsunenko T, Zaneveld J, Knight R. 2010. QIIME allows analysis of high-throughput community sequencing data. Nature Methods 7(5): 335-336.

.. [Cutadapt] Cutadapt v1.15 .Marcel Martin. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011. http://dx.doi.org/10.14806/ej.17.1.200

.. [vsearch] Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. doi: 10.7717/peerj.2584

.. [Krona] Ondov BD, Bergman NH, and Phillippy AM. Interactive metagenomic visualization in a Web browser. BMC Bioinformatics. 2011 Sep 30; 12(1):385.

.. [BIOM] The Biological Observation Matrix (BIOM) format or: how I learned to stop worrying and love the ome-ome. Daniel McDonald, Jose C. Clemente, Justin Kuczynski, Jai Ram Rideout, Jesse Stombaugh, Doug Wendel, Andreas Wilke, Susan Huse, John Hufnagle, Folker Meyer, Rob Knight, and J. Gregory Caporaso.GigaScience 2012, 1:7. doi:10.1186/2047-217X-1-7

{variable_refs}


""", snakemake.output[0], metadata="Author: J. Engelmann & A. Abdala ")
Example No. 12
import os
from itertools import product
from textwrap import dedent

from snakemake.utils import report

# This fragment assumes that `scenarios`, `param_values`, `param`, `tmpl`,
# `plot_dir`, `text` and `data_uri` are defined earlier in the script.
links = {}
for vals in product(*scenarios.values()):
    sc = dict(zip(scenarios.keys(), vals))
    headline = ", ".join("{}={}".format(k, v) for k, v in sc.items())
    text += headline + "\n" + '-' * len(headline) + "\n\n"
    for p in param_values:
        sc[param] = p
        fn = tmpl.format(**sc)
        #links[fn] = [os.path.join(plot_dir, fn + '.pdf')]
        text += dedent('''
            .. figure:: {data}
               :scale: 50 %

               {param} = {value}

        ''').format(param=param,
                    value=p,
                    link=fn,
                    data=data_uri(os.path.join(plot_dir, fn + '.png')))

# text += dedent('''

#     Attachments
#     -----------
#     ''')

# text += '{} = {}: {}_\n\n.. image:: {}\n\n'.format(wildcards.param, p, fn, data_uri('results/plots/' + fn + '.png'))

report(text=text, path=snakemake.output.html, stylesheet='report.css', **links)
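The snippet above embeds each PNG directly into the HTML via `data_uri()` (in some Snakemake versions it is importable from `snakemake.report`). If that helper is unavailable, an equivalent stand-in is short to write; a sketch, not the original implementation:

import base64
import mimetypes

def data_uri(path):
    """Return the file at `path` as a data: URI, so it can be inlined in HTML."""
    mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
    with open(path, "rb") as fh:
        payload = base64.b64encode(fh.read()).decode("ascii")
    return "data:{};base64,{}".format(mime, payload)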
Example No. 13
report(
    """
===================================
Report for {sample}
===================================

Quality Control
===================================
1. ``guppy_barcoder`` was run if the samples were multiplexed.
2. Reads were aligned to the TB reference {reference} using Minimap2_.
3. All reads which did not map to {reference} were removed. Prior to filtering there were {num_reads_pre_filter} reads; after filtering, {num_reads_post_filter} remain. This means {percent_reads_mapped}% of reads mapped to {reference}. For more statistics on the pre-filtered reads see `stats_pre_filter`_ and for the post-filtered reads see `stats_post_filter`_. For quality-control plots of the reads after this step (including read percent identity to {reference}) see `plot_post_filter`_. Stats were produced with NanoStat_ and plots with Pistis_.

Mykrobe Analysis
===================================

**Phylogenetic group:** {phylo_group}

**Species:** {species}

**Lineage:** {lineage}

A summary of the susceptibility information from `Mykrobe predict`_ is shown here. For the full report, see mykrobe_. If resistance is identified for a drug, the predicted responsible variant(s) are given, along with supporting information.

{mykrobe_report}


.. _Minimap2: https://github.com/lh3/minimap2
.. _NanoStat: https://github.com/wdecoster/nanostat
.. _Pistis: https://github.com/mbhall88/pistis
.. _`Mykrobe predict`: https://github.com/Mykrobe-tools/mykrobe
""",
    snakemake.output[0],
    metadata="Author: Michael Hall ([email protected])",
    **snakemake.input
)
Example No. 14
def main(combined_stats, report_out):

    df = pd.read_csv(combined_stats, sep="\t", index_col=0)
    div = {}
    labels = {
        "Percent_Assembled_Reads": "Percent of Assembled Reads",
        "contig_bp": "Total BP",
        "n_contigs": "Contigs (count)",
        "N_Predicted_Genes": "Predicted Genes (count)",
    }
    for variable in [
        "Percent_Assembled_Reads",
        "contig_bp",
        "n_contigs",
        "N_Predicted_Genes",
    ]:
        y_axis_label = labels[variable]
        div[variable] = offline.plot(
            df[variable].iplot(
                asFigure=True,
                kind="bar",
                xTitle="Samples",
                layout=go.Layout(
                    xaxis=dict(tickangle=45), yaxis=dict(title=y_axis_label)
                ),
            ),
            **PLOTLY_PARAMS,
        )
    div["L50"] = offline.plot(
        df[["L50", "L90"]].iplot(
            asFigure=True,
            kind="bar",
            xTitle="Samples",
            layout=go.Layout(xaxis=dict(tickangle=45), yaxis=(dict(title="Bases"))),
        ),
        **PLOTLY_PARAMS,
    )
    report_str = """

.. raw:: html

    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>


=============================================================
ATLAS_ - Assembly Summary
=============================================================

.. _ATLAS: https://github.com/metagenome-atlas/atlas

.. contents::
    :backlinks: none


Summary
-------

Fragmentation
*************

L50/L90 is a measure of how fragmented an assembly is:
50%/90% of the assembly is made up of contigs of length L50/L90 or longer. These values are sometimes referred to as N50/N90.


.. raw:: html

    {div[L50]}


Assembly Length
***************

.. raw:: html

    {div[contig_bp]}


Number of Contigs
*****************

.. raw:: html

    {div[n_contigs]}


Number of Predicted Genes
*************************

.. raw:: html

    {div[N_Predicted_Genes]}


Percent of Assembled Reads
**************************

.. raw:: html

    {div[Percent_Assembled_Reads]}


For more information see Table_1_


Downloads
---------

"""
    report(
        report_str,
        report_out,
        Table_1=combined_stats,
        stylesheet=os.path.join(atlas_dir, "report", "report.css"),
    )
Example No. 15
        report("""
        ============================
        RNA-seq analysis - Jeff Cole
        ============================
        
        :Project:              Jeff Cole
        :Collaboration:        Alain Dolla
        :Analysis workflow:    Justine Long, Jeanne Chèneby & Jacques van Helden
        
        Contents
        ========
        
        - `Flowcharts`_
        - `Datasets`_
            - `Sample identifiers`_
            - `Raw reads`_
        - `Result files`_
            - `Quality control (raw reads)`_
            - `Quality control (trimmed reads)`_
            - `Trimming (reads forward, with Sickle)`_
            - `Trimming (reads reverse, with Sickle)`_
            - `Alignment (Bowtie2)`_
            - `Alignment in BAM (Samtools)`_
            - `Sorting by names or positions (Samtools)`_
            - `Index of the mapping results`_
            - `Count (HTSeq)`_
            - `Parameters file for DE`_
            - `All counts in one file`_
            - `Differential Expression (edgeR)`_
            - `Differential Expression (DESeq2)`_


        -----------------------------------------------------

        Flowcharts
        ==========

        - Sample treatment: dag_pdf_
        - Workflow: rulegraph_pdf_

        .. image:: rulegraph.png

        -----------------------------------------------------

        Datasets
        ========
        
        Sample identifiers
        ------------------

        {SAMPLE_IDS_OL} 


        Raw reads
        ----------
        {RAWR_FILES_OL}



        Result files
        ============

        Quality control (raw reads)
        ---------------------------

        {RAW_FASTQC_OL}
        
        
        Trimming (reads forward, with Sickle)
        -------------------------------------

        {TRIMMED_FW_OL}

        Trimming (reads reverse, with Sickle)
        -------------------------------------

        {TRIMMED_REV_OL}

        Quality control (trimmed reads)
        -------------------------------

        {TRIMMED_FASTQC_OL}
        
        Alignment (Bowtie2)
        -------------------

        {BOWTIE2_OL}


        Alignment in BAM (Samtools)
        ----------------------------

        {SAM_BAM_OL}


        Sorting by names or positions (Samtools)
        ----------------------------------------

        {BAM_SORTED_OL}


        Index of the mapping results 
        -----------------------------

        {BAM_INDEX_OL}


        Count (HTSeq)
        -------------

        {FEATURECOUNTS_OL}


        Parameters file for DE
        ----------------------

            {PARAMS_R_OL}


        All counts in one file
        ----------------------

            {ALL_COUNTS_OL}


        Differential Expression (edgeR)
        -------------------------------

        {RESULTS_EDGER_OL}


        Differential Expression (DESeq2)
        --------------------------------

        {RESULTS_DESEQ2_OL}







        """, output.html, metadata="Jacques van Helden ([email protected])", **input)
        expand("logs/transrate/{sample}.transrate.log", sample=SAMPLES),
        expand( config["homedir"] + "busco/run_{sample}.busco/short_summary_{sample}.busco.txt", sample=SAMPLES),
        expand( config["homedir"] + "busco/run_{sample}.good.busco/short_summary_{sample}.good.busco.txt", sample=SAMPLES),
        "transrate/summary/cat.transrate.assemblies"
    output:
        "report.html"
    run:
        from snakemake.utils import report
        # `trans.logs` is not a valid binding target; use a plain identifier.
        with open(input[0]) as trans_logs:
            print(trans_logs.read())

        report("""
        An example variant calling workflow
        ===================================

        Reads were mapped to the Yeast
        reference genome and variants were called jointly with
        SAMtools/BCFtools.

        This resulted in {n_calls} variants (see Table T1_).
        """, output[0], T1=input[0])


# Finishing up --------------------------------------------------------------
# onsuccess:
#     print("Workflow finished, no error")

# onerror:
#     print("An error occurred with the snakemake run")
#     # here it would be good to include timstamping
#     # TODO config for email
#     shell:
Example No. 17
rule report:
	input:
		T1=ALN_DIR+ 'summary_flagstat.txt',
		F1=ALN_DIR + 'cov.pdf',
		T2=ALN_DIR+ 'mean_coverage.txt'
	output:
		html=ALN_DIR+ 'report_align_ref.html'
	run:
		REFERENCES = ['nCoV_noA']
		report("""
		============================================
		SUMMARY OF ALIGNMENT AGAINST {REFERENCES}
		============================================
		Analysis of the reads that DO NOT ALIGN TO THE HUMAN GENOME

		See table T1_

		Total number of reads passing QC

		Mapped = number of reads that align to the reference sample

		properly_paired = number of reads that align to the reference sample as proper pairs (R1 and R2 aligned to the same sample)

		Note: to obtain the TOTAL number of reads and the number of reads aligning to {REFERENCES}, the Supplementary and Secondary alignments must be subtracted
		(reads that align to several references and are counted more than once)

		Table T2_ : depth of coverage at each position

		Figure F1_ : coverage. TO BE CHECKED FOR SAMPLE VALIDATION

		""", output.html, metadata="Laurence Josset", **input)
Example No. 18
def main(report_out, read_counts, zipfiles_QC, min_quality, zipfiles_raw=None):
    div = {}

    # N reads / N bases
    df = pd.read_csv(read_counts, index_col=[0, 1], sep="\t")
    for variable in ["Total_Reads", "Total_Bases"]:

        data = df[variable].unstack()[df.loc[df.index[0][0]].index]

        if "clean" in data.columns:
            data.drop("clean", axis=1, inplace=True)

        div[variable] = offline.plot(
            # .iplot(asFigure=True, ...) (cufflinks), as in the other examples,
            # returns a plotly figure that offline.plot() can render; pandas'
            # own .plot() does not take these keyword arguments.
            data.iplot(
                asFigure=True,
                kind="bar",
                xTitle="Samples",
                yTitle=variable.replace("_", " "),
                layout=go.Layout(xaxis=dict(tickangle=45)),
            ),
            **PLOTLY_PARAMS,
        )

    Report_numbers = """

Total reads per sample
~~~~~~~~~~~~~~~~~~~~~~

.. raw:: html

    {div[Total_Reads]}

{Legend}

Total bases per sample
~~~~~~~~~~~~~~~~~~~~~~
.. raw:: html

    {div[Total_Bases]}

For details see Table Table1_.
"""
    if data.shape[1] > 1:
        Legend = """
============   ===================================
Step           Output
============   ===================================
raw            the input reads
deduplicated   after (optional) deduplication step
filtered       trimmed, quality filtered
qc             final reads, contaminants removed
============   ===================================
"""
    else:
        Legend = ""

    Report_read_quality_qc = """

Reads quality after QC
~~~~~~~~~~~~~~~~~~~~~~
"""

    Quality_pe, Quality_se = get_stats_from_zips(zipfiles_QC)

    max_quality = 1 + np.nanmax(
        (Quality_pe.max().max(), Quality_se.max().max()))
    if Quality_pe.shape[0] > 0:
        div["quality_qc_pe"] = get_pe_read_quality_plot(
            Quality_pe, [min_quality, max_quality])
        Report_read_quality_qc += """
Paired end
**********
.. raw:: html

    {div[quality_qc_pe]}


"""

    if Quality_se.shape[0] > 0:

        if (Quality_pe.shape[0] > 0) and (Quality_se.shape[0] > 0):
            Report_read_quality_qc += """
Single end
+++++++++++

Paired end reads that lost their mate during filtering.

"""

        div["quality_qc_se"] = draw_se_read_quality(Quality_se,
                                                    [min_quality, max_quality])
        Report_read_quality_qc += """

.. raw:: html

    {div[quality_qc_se]}

"""

    if zipfiles_raw is None:
        Report_read_quality_raw = ""
    else:

        Report_read_quality_raw = """

Reads quality before QC
~~~~~~~~~~~~~~~~~~~~~~~

.. raw:: html

    {div[quality_raw]}

"""
        Quality_pe, Quality_se = get_stats_from_zips(zipfiles_raw)
        if Quality_pe.shape[0] > 0:
            div["quality_raw"] = get_pe_read_quality_plot(
                Quality_pe, [min_quality, max_quality])
        elif Quality_se.shape[0] > 0:
            div["quality_raw"] = draw_se_read_quality(
                Quality_se, [min_quality, max_quality])
        else:
            raise IndexError()

    report_str = ("""

.. raw:: html

    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>


=============================================================
ATLAS_ - QC Summary
=============================================================

.. _ATLAS: https://github.com/metagenome-atlas/atlas

.. contents::
    :backlinks: none


Summary
-------


""" + Report_numbers + Report_read_quality_qc + Report_read_quality_raw + """

Downloads
---------

""")

    report(
        report_str,
        report_out,
        Table1=read_counts,
        stylesheet=os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                "report.css"),
    )
Example No. 19
report("""
Amplicon Analysis Report for Library: {snakemake.wildcards.sample}
=====================================================================
    .. role:: commd
    .. role:: red
    .. role:: green

**CASCABEL** is designed to run amplicon sequence analysis across single or multiple read libraries.

The objective of this pipeline is to create output files which allow the user to explore the data in a simple and meaningful way, and which facilitate downstream analysis.

Another aim of **CASCABEL** is to encourage documentation: this report is generated so that the data analysis remains reproducible.

{txtDescription}

Below are all the steps that were taken to obtain the final results of the pipeline.

Raw Data
---------
The raw data for this library can be found at:

:green:`- FW raw reads:` {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.sample}/rawdata/fw.fastq

:green:`- RV raw reads:` {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.sample}/rawdata/rv.fastq

:red:`Number of total reads:` {rawCountsStr}

Quality Control
------------------
Evaluate quality on raw reads.

:red:`Tool:` [FastQC]_

:red:`Version:` {fqVersion}

**Command:**

:commd:`fastqc {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.sample}/rawdata/fw.fastq {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.sample}/rawdata/rv.fastq --extract -o {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.sample}/qc/`

You can follow the links below, in order to see the complete FastQC report:

:green:`- FastQC for sample {snakemake.wildcards.sample}_1:` FQ1_

    .. _FQ1: ../../../samples/{snakemake.wildcards.sample}/qc/fw_fastqc.html

:green:`- FastQC for sample {snakemake.wildcards.sample}_2:` FQ2_

    .. _FQ2: ../../../samples/{snakemake.wildcards.sample}/qc/rv_fastqc.html

{fqBench}


Read pairing
----------------
Align paired end reads and merge them into one single sequence in case they overlap.

:red:`Tool:` [PEAR]_

:red:`version:` {pearversion}

**Command:**

:commd:`pear -f {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.sample}/rawdata/fw.fastq -r {snakemake.wildcards.PROJECT}/samples/{snakemake.wildcards.sample}/rawdata/rv.fastq -t {snakemake.config[pear][t]} -v {snakemake.config[pear][v]} -j {snakemake.config[pear][j]} -p {snakemake.config[pear][p]} -o {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/{snakemake.wildcards.sample}_data/peared/seqs > {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/{snakemake.wildcards.sample}_data/peared/seqs.assembled.fastq`

**Output files:**

:green:`- Merged reads:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/{snakemake.wildcards.sample}_data/peared/seqs.assembled.fastq

:green:`- Log file:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/{snakemake.wildcards.sample}_data/peared/pear.log

:red:`Number of peared reads:` {pearedCountsStr} =  {prcPeared}%

{pearBench}

{fastQCPearStr}

{extractBCStr}

{correctBCStr}

{splitStr}

{demultiplexFQ}

{combineFR}

{cutAdaptStr}


Remove too long and too short reads
------------------------------------
Remove very short and very long reads, i.e. reads whose lengths fall more than a set number of standard deviations below or above the mean length.

:green:`- Minimum length expected (shorts):` {shorts}

:green:`- Maximum length expected (longs):` {longs}

**Command:**

:commd:`awk '!/^>/ {{ next }} {{ getline seq }} length(seq) > shorts  && length(seq) < longs {{ print $0 \"\\n\" seq }}'  {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/{snakemake.wildcards.sample}_data/seqs_fw_rev_accepted.fna  >  {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/{snakemake.wildcards.sample}_data/seqs_fw_rev_filtered.fasta`

**Sequence distribution before removing reads**

.. image:: report_files/seqs_dist_hist.{snakemake.wildcards.sample}.png
    :height: 400px
    :width: 400px
    :align: center


**Output file:**

:green:`- Fasta file with correct sequence length:` {snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/{snakemake.wildcards.sample}_data/seqs_fw_rev_filtered.fasta

{trimmedStr}

{rmShorLongBench}


{quimeraStr}


{sampleDistChart}


Final counts
-------------

{countTxt}

.. image:: report_files/sequence_numbers.{snakemake.wildcards.sample}.png

OTU report
---------------------------

Cascabel report on downstream analyses in combination with multiple libraries (if supplied), can be found at the following link: otu_report_ ({snakemake.wildcards.PROJECT}/runs/{snakemake.wildcards.run}/otu_report_{snakemake.config[assignTaxonomy][tool]}.html)

    .. _otu_report: otu_report_{snakemake.config[assignTaxonomy][tool]}.html

References
------------------

.. [FastQC] FastQC v0.11.3. Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data

.. [PEAR] PEAR: a fast and accurate Illumina Paired-End reAd mergeR. Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593

.. [QIIME] QIIME. Caporaso JG, Kuczynski J, Stombaugh J, Bittinger K, Bushman FD, Costello EK, Fierer N, Gonzalez Pena A, Goodrich JK, Gordon JI, Huttley GA, Kelley ST, Knights D, Koenig JE, Ley RE, Lozupone CA, McDonald D, Muegge BD, Pirrung M, Reeder J, Sevinsky JR, Turnbaugh PJ, Walters WA, Widmann J, Yatsunenko T, Zaneveld J, Knight R. 2010. QIIME allows analysis of high-throughput community sequencing data. Nature Methods 7(5): 335-336.

.. [Cutadapt] Cutadapt v1.15 .Marcel Martin. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011. http://dx.doi.org/10.14806/ej.17.1.200

.. [Vsearch] Rognes T, Flouri T, Nichols B, Quince C, Mahé F. (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. doi: 10.7717/peerj.2584


{variable_refs}


""",
       snakemake.output[0],
       metadata="Author: J. Engelmann & A. Abdala ")
Example No. 20
def main(clean_logs, unique_logs, merge_logs, summary_tables, r1_quality_files,
         html, conda_env, function_table, zipped_file):
    clean_logs = glob(clean_logs)
    unique_logs = glob(unique_logs)
    merge_logs = glob(merge_logs)
    summary_tables = glob(summary_tables)
    r1_quality_files = glob(r1_quality_files)
    classifications_per_sample = compile_summary_df(summary_tables)
    value_cols = get_sample_name(summary_tables, "_classifications.txt")
    fig = build_taxonomy_plot(taxonomy_table, value_cols)
    plots = offline.plot(fig, **PLOTLY_PARAMS)
    html_tbl = parse_log_files(merge_logs, unique_logs, clean_logs,
                               classifications_per_sample)
    quality_plot = build_quality_plot(r1_quality_files)
    conda_env = get_conda_env_str(conda_env)
    report_str = """

.. raw:: html

    {STYLE}
    {SCRIPT}

=============================================================
PerSeq_ - Per sequence functional and taxonomic assignments
=============================================================

.. _PerSeq: https://github.com/PNNL-CompBio/perseq


.. contents::
    :backlinks: none
    :depth: 2


Summary
-------

Sequence Counts
***************

.. raw:: html

    <div style="overflow-x:auto;">
    {html_tbl}
    </div>

Sequence Quality
****************

.. raw:: html

    {quality_plot}


Taxonomy Assignment Summary
***************************

Samples are sorted based on their Shannon index calculated from taxonomically
annotated sequences. The order is most to least diverse.
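
For reference, a minimal sketch of the Shannon index used for this ordering (natural logarithm assumed)::

    import numpy as np

    def shannon_index(counts):
        # counts: per-taxon read counts for a single sample
        counts = np.asarray(counts, dtype=float)
        p = counts[counts > 0] / counts.sum()
        return float(-(p * np.log(p)).sum())

    # Applied to each sample's taxon counts, samples are then ordered from
    # highest to lowest index, e.g. shannon_index([10, 5, 1]) is about 0.83.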

.. raw:: html

    {plots}

Methods
-------

Paired-end sequences were evaluated for quality using VSEARCH [1]. Sequence
reads were quality trimmed after successful merging using bbmerge [2].
Sequences were allowed to be extended by up to 300 bp
during the merging process to account for non-overlapping R1 and R2 sequences
(``k=60 extend2=60 iterations=5 qtrim2=t``). If optional subsampling is selected,
Seqtk is used to downsample FASTQ files for faster processing [3]. Merged sequences were
deduplicated using the clumpify tool [2] and then, by default, filtered of PhiX and
rRNA using bbsplit [2]. An arbitrary number of Name:FASTA pairs may be
specified during the decontamination process. Functional annotation and
taxonomic classification were performed following the decontamination step.
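
As an illustration, the merge step could be driven from Python roughly as follows; the file names are placeholders and only the quoted parameters above are taken from the workflow::

    import subprocess

    # Placeholder input/output names; k, extend2, iterations and qtrim2 are the
    # parameters quoted above.
    subprocess.run(
        [
            "bbmerge.sh",
            "in1=sample_R1.fastq.gz",
            "in2=sample_R2.fastq.gz",
            "out=sample_merged.fastq.gz",
            "k=60",
            "extend2=60",
            "iterations=5",
            "qtrim2=t",
        ],
        check=True,
    )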

Functional Annotation
*********************

The blastx algorithm of DIAMOND [4] was used to align nucleotide sequences to
the KEGG protein reference database [5], consisting of non-redundant, family-level
fungal eukaryotes and genus-level prokaryotes
(``--strand=both --evalue 0.00001``). The highest-scoring alignment per
sequence was used for functional annotation.
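
One way to recover that best hit from DIAMOND's tabular output is sketched below; the file name and the standard twelve-column layout are assumptions of this example::

    import pandas as pd

    # Assumed default tabular (BLAST outfmt 6 style) columns.
    columns = [
        "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
        "qstart", "qend", "sstart", "send", "evalue", "bitscore",
    ]
    hits = pd.read_table("sample_kegg_hits.tsv", names=columns)
    # Sort by score and keep the single best alignment per read.
    best_hits = (
        hits.sort_values("bitscore", ascending=False)
            .drop_duplicates("qseqid", keep="first")
    )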

Taxonomic Annotation
********************

Kmer-based taxonomic classification was performed on the merged reads using
Kaiju [6] in greedy mode (``-a greedy -E 0.05``). NCBI's nr database [7]
containing reference sequences for archaea, bacteria, viruses, fungi, and
microbial eukaryotes was used as the reference index for Kaiju.
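
The per-read assignments can be summarised directly from Kaiju's plain-text output; the sketch below assumes the default three-column layout (classification status, read identifier, NCBI taxon ID) and an illustrative file name::

    classified = 0
    total = 0
    with open("sample_kaiju.out") as handle:
        for line in handle:
            total += 1
            # Lines starting with "C" are classified, "U" are unclassified.
            if line.startswith("C"):
                classified += 1
    print(classified, "of", total, "reads classified")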

References
**********

1. Rognes T, Flouri T, Nichols B, Quince C, Mahé F. VSEARCH: a versatile open source tool for metagenomics. PeerJ. PeerJ Inc; 2016;4:e2584.
2. Bushnell B. BBTools [Internet]. Available from: https://sourceforge.net/projects/bbmap/
3. Li H. Seqtk [Internet]. Available from: https://github.com/lh3/seqtk
4. Buchfink B, Xie C, Huson DH. Fast and sensitive protein alignment using DIAMOND. Nat. Methods. Nature Publishing Group; 2015;12:59–60.
5. Kanehisa M, Sato Y, Kawashima M, Furumichi M, Tanabe M. KEGG as a reference resource for gene and protein annotation. Nucleic Acids Res. 2016;44:D457–62.
6. Menzel P, Ng KL, Krogh A. Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nat Commun. Nature Publishing Group; 2016;7:11257.
7. NCBI Resource Coordinators. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2018;46:D8–D13.


Execution Environment
---------------------

::

    {conda_env}

Output
------

Classification Tables
*********************

Per-sample classifications in ``tables/`` contain:

.. table::
    :name: classificationtable

    =========================  ==========================================================================================================================================
    Header ID                  Definition
    =========================  ==========================================================================================================================================
    aa_alignment_length        The length of the DIAMOND blastx hit
    aa_percent_id              The percent ID of the DIAMOND blastx hit; could be used to increase post-processing stringency
    ec                         Enzyme Commission number from KEGG; semicolon delimited where multiple
    ko                         KEGG entry ID
    product                    KEGG gene ID and KEGG product, separated by a semicolon
    read_id                    The sequence identifier (unique)
    kaiju_alignment_length     The length of the Kaiju hit
    kaiju_classification       The Kaiju classification in order of superkingdom, phylum, class, order, family, genus, species; "NA" for each taxonomic level not defined
    blastx_lca_classification  The LCA result from the blastx HSPs
    =========================  ==========================================================================================================================================
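
As the ``aa_percent_id`` definition suggests, stringency can be increased by filtering these tables after the fact; a small pandas sketch with an illustrative file name and thresholds::

    import pandas as pd

    classifications = pd.read_table("tables/sample_classifications.txt")
    # Keep only higher-confidence functional hits before summarising.
    confident = classifications[
        (classifications.aa_percent_id >= 60)
        & (classifications.aa_alignment_length >= 40)
    ]
    ko_counts = confident.groupby("ko").size()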

Summary Tables
**************

Taxonomy
````````

Per taxonomy assignments in tables named **summaries/taxonomy/<level>.txt**
contain:

.. table::
    :name: taxonomydeftable

    ====================  ======================================================
    Header ID             Definition
    ====================  ======================================================
    taxonomy_<level>      taxonomic level into which counts have been summed
    sample names          non-normalized, per sample sum at this taxonomic level
    ====================  ======================================================
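
Because the counts are not normalized, per-sample relative abundances can be derived as in the sketch below (the phylum-level file name is only an example)::

    import pandas as pd

    counts = pd.read_table("summaries/taxonomy/phylum.txt", index_col=0)
    # Divide each sample column by its total to obtain relative abundances.
    relative_abundance = counts.div(counts.sum(axis=0), axis=1)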

Function
````````

Per function assignments in tables named **summaries/function/<type>.txt**
contain:

.. table::
    :name: functiondeftable

    ====================  ===================================================================
    Header ID             Definition
    ====================  ===================================================================
    <type>                either KO, EC, or product into which counts have been summed
    sample names          non-normalized, per sample sum for this particular functional group
    level_1               KEGG hierarchy [level 1] if KO defined in first column
    level_2               KEGG hierarchy [level 2] if KO defined in first column
    level_3               KEGG hierarchy [level 3] if KO defined in first column
    ====================  ===================================================================

Combined
````````

Per taxonomy+function assignments in tables named
**summaries/combined/<type>_<level>.txt** contain:

.. table::
    :name: combineddeftable

    ====================  ====================================================================
    Header ID             Definition
    ====================  ====================================================================
    <type>                either KO, EC, or product; counts are summed using <type>+<taxonomy>
    taxonomy_<level>      taxonomic level; counts are summed using <type>+<taxonomy>
    sample names          non-normalized, per sample sum for this particular functional group
    level_1               KEGG hierarchy [level 1] if KO defined in first column
    level_2               KEGG hierarchy [level 2] if KO defined in first column
    level_3               KEGG hierarchy [level 3] if KO defined in first column
    ====================  ====================================================================

Downloads
---------

"""
    report(
        report_str,
        html,
        downloads=zipped_file,
        # file1=function_table,
        # file2=taxonomy_table,
        # file3=taxonomy_function_table,
        # kronaplot_tax=krona_tax,
        # kronaplot_ec=krona_ec,
        stylesheet="",
    )