Пример #1
0
    def gq_seq_utils_exploratory_analysis_rnaseq_denovo_filtered(self):
        """
        Exploratory analysis using the gqSeqUtils R package, restricted to the filtered transcripts.

        Returns three jobs, in this order:
        1. "filter_annotated_components_exploratory": prepends a header line to the filtered
           trinotate report, then merges it (left_join=True) with the isoform counts matrix
           and with the isoform lengths table, writing both results under filtered_assembly.
        2. "gq_seq_utils_exploratory_analysis_rnaseq_denovo": runs the gqSeqUtils exploratory
           analysis on the filtered counts and lengths files.
        3. The Rmarkdown report rendered from the exploratory index.tsv.
        """
        filtered_dir = "filtered_assembly"
        unfiltered_dir = "differential_expression"

        # Outputs of this step
        exploratory_output_dir = os.path.join(filtered_dir, "exploratory")
        counts_file = os.path.join(filtered_dir, "isoforms.counts.matrix")
        lengths_filtered_file = os.path.join(filtered_dir, "isoforms.lengths.tsv")

        # Inputs produced by earlier steps
        lengths_file = os.path.join(unfiltered_dir, "isoforms.lengths.tsv")
        filtered_report = os.path.join("trinotate", "trinotate_annotation_report.tsv" + ".isoforms_filtered.tsv")
        filtered_report_with_header = "trinotate/trinotate_annotation_report.tsv.isoforms_filtered_header.tsv"

        make_output_dir = Job(command="mkdir -p " + exploratory_output_dir)

        # sed inserts a line holding a single space in front of the first line of the report
        add_header = Job(
            [filtered_report],
            [filtered_report_with_header],
            command="sed '1s/^/ \\n/' " + filtered_report + " > " + filtered_report_with_header
        )

        # Keep only the counts of the filtered isoforms
        filter_counts = tools.py_parseMergeCsv(
            [filtered_report_with_header, os.path.join(unfiltered_dir, "isoforms.counts.matrix")],
            "\\\\t",
            counts_file,
            "\'\'",
            left_join=True,
            exclude="\'\'"
        )

        # Keep only the lengths of the filtered isoforms
        filter_lengths = tools.py_parseMergeCsv(
            [filtered_report_with_header, lengths_file],
            "\\\\t",
            lengths_filtered_file,
            "\'\' transcript_id",
            left_join=True,
            exclude="\' \'"
        )

        filtering_job = concat_jobs(
            [make_output_dir, add_header, filter_counts, filter_lengths],
            name="filter_annotated_components_exploratory"
        )

        # gqSeqUtils function call
        exploratory_job = concat_jobs(
            [
                Job(command="mkdir -p " + exploratory_output_dir),
                gq_seq_utils.exploratory_analysis_rnaseq_denovo(
                    counts_file,
                    lengths_filtered_file,
                    exploratory_output_dir
                ),
            ],
            name="gq_seq_utils_exploratory_analysis_rnaseq_denovo"
        )

        # Render Rmarkdown Report
        report_job = rmarkdown.render(
            job_input=os.path.join(exploratory_output_dir, "index.tsv"),
            job_name="gq_seq_utils_exploratory_analysis_rnaseq_denovo_filtered_report",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.gq_seq_utils_exploratory_analysis_rnaseq_filtered.Rmd"),
            render_output_dir='report',
            module_section='report',
            prerun_r='report_dir="report/filtered_assembly"; exploratory_dir="' + exploratory_output_dir + '";'
        )

        return [filtering_job, exploratory_job, report_job]
Пример #2
0
    def differential_expression(self):
        """
        Performs differential gene expression analysis using [DESEQ](http://bioconductor.org/packages/release/bioc/html/DESeq.html) and [EDGER](http://www.bioconductor.org/packages/release/bioc/html/edgeR.html).
        Merges the results of the analysis in a single csv file. Also performs Gene Ontology analysis for RNA-Seq denovo Assembly using the Bioconductor's R package [goseq](http://www.bioconductor.org/packages/release/bioc/html/goseq.html).
        Generates GO annotations for differential genes and isoforms expression analysis, based on associated GOTERMS generated by trinotate.

        Returns one concatenated job for "genes", one for "isoforms", and a final
        Rmarkdown report job that takes the output files of both as its input.
        """
        source_dir = "differential_expression"
        report_dir = 'report'
        annotation_report = os.path.join("trinotate", "trinotate_annotation_report.tsv")
        rmd_template = os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.differential_expression_goseq.Rmd")

        # Run DGE and merge dge results with annotations, once per feature level
        jobs = [
            concat_jobs(
                self.differential_expression_and_goseq_rsem(source_dir, item, annotation_report),
                name="differential_expression_" + item
            )
            for item in ("genes", "isoforms")
        ]

        # The report depends on every output of the DGE jobs; skip files already
        # collected from a previous job
        report_inputs = []
        for dge_job in jobs:
            report_inputs += [path for path in dge_job.output_files if path not in report_inputs]

        # R variables handed to the Rmarkdown template before rendering
        design_file = os.path.relpath(self.args.design.name, self.output_dir)
        contrast_names = '","'.join(contrast.name for contrast in self.contrasts)
        prerun_r = (
            'design_file="' + design_file +
            '"; report_dir="' + report_dir +
            '"; source_dir="' + source_dir +
            '"; ' + 'top_n_results=10; contrasts=c("' + contrast_names + '");'
        )

        # DGE Report
        # Render Rmarkdown Report
        jobs.append(
            rmarkdown.render(
                job_input=report_inputs,
                job_name="differential_expression_goseq_rnaseq_denovo_report",
                input_rmarkdown_file=rmd_template,
                render_output_dir='report',
                module_section='report',
                prerun_r=prerun_r
            )
        )
        return jobs
Пример #3
0
    def gq_seq_utils_exploratory_analysis_rnaseq_denovo(self):
        """
        Exploratory analysis using the gqSeqUtils R package.

        Returns two jobs: the exploratory analysis itself (run on the gene counts
        matrix and gene lengths found under differential_expression, writing to
        the "exploratory" directory) and the Rmarkdown report rendered from
        exploratory/index.tsv.
        """
        dge_dir = "differential_expression"

        # gqSeqUtils function call, preceded by the creation of its output directory
        analysis_job = concat_jobs(
            [
                Job(command="mkdir -p exploratory"),
                gq_seq_utils.exploratory_analysis_rnaseq_denovo(
                    os.path.join(dge_dir, "genes.counts.matrix"),
                    os.path.join(dge_dir, "genes.lengths.tsv"),
                    "exploratory"
                ),
            ],
            name="gq_seq_utils_exploratory_analysis_rnaseq_denovo"
        )

        # Render Rmarkdown Report
        report_job = rmarkdown.render(
            job_input=os.path.join("exploratory", "index.tsv"),
            job_name="gq_seq_utils_exploratory_analysis_rnaseq_denovo_report",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.gq_seq_utils_exploratory_analysis_rnaseq.Rmd"),
            render_output_dir='report',
            module_section='report',  # TODO: this or exploratory?
            prerun_r='report_dir="report";'  # TODO: really necessary or should be hard-coded in exploratory.Rmd?
        )

        return [analysis_job, report_job]
Пример #4
0
    def gq_seq_utils_exploratory_analysis_rnaseq_light(self):
        """
        Exploratory analysis using the gqSeqUtils R package adapted for RnaSeqLight.

        Returns two jobs, both tagged with all samples of the pipeline: the
        exploratory analysis run on the merged kallisto gene abundance file, and
        the Rmarkdown report rendered from exploratory/index.tsv.
        """
        # Gene-level abundance matrix gathered over all readsets
        abundance_file = os.path.join(self.output_dir, "kallisto/All_readsets", "all_readsets.abundance_genes.csv")

        # gqSeqUtils function call, preceded by the creation of its output directory
        analysis_steps = [
            Job(command="mkdir -p exploratory"),
            gq_seq_utils.exploratory_analysis_rnaseq_light(
                abundance_file,
                config.param('gq_seq_utils_exploratory_analysis_rnaseq_light', 'genes', type='filepath'),
                "exploratory"
            ),
        ]
        analysis_job = concat_jobs(
            analysis_steps,
            name="gq_seq_utils_exploratory_analysis_rnaseq_light",
            samples=self.samples
        )

        # Render Rmarkdown Report
        report_job = rmarkdown.render(
            job_input=os.path.join("exploratory", "index.tsv"),
            job_name="report.gq_seq_utils_exploratory_analysis_rnaseq",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqLight.gq_seq_utils_exploratory_analysis_rnaseq_light.Rmd"),
            samples=self.samples,
            render_output_dir='report',
            module_section='report',
            prerun_r='report_dir="report";'
        )

        return [analysis_job, report_job]
Пример #5
0
    def gq_seq_utils_exploratory_analysis_rnaseq(self):
        """
        Exploratory analysis using the gqSeqUtils R package.
        """

        jobs = []

        # gqSeqUtils function call
        sample_fpkm_readcounts = [[
            sample.name,
            os.path.join("cufflinks", sample.name, "isoforms.fpkm_tracking"),
            os.path.join("raw_counts", sample.name + ".readcounts.csv")
        ] for sample in self.samples]
        jobs.append(concat_jobs([
            Job(command="mkdir -p exploratory"),
            gq_seq_utils.exploratory_analysis_rnaseq(
                os.path.join("DGE", "rawCountMatrix.csv"),
                "cuffnorm",
                config.param('gq_seq_utils_exploratory_analysis_rnaseq', 'genes', type='filepath'),
                "exploratory"
            )
        ], name="gq_seq_utils_exploratory_analysis_rnaseq"))

        # Render Rmarkdown Report
        jobs.append(
            rmarkdown.render(
             job_input            = os.path.join("exploratory", "index.tsv"),
             job_name             = "gq_seq_utils_exploratory_analysis_rnaseq_report",
             input_rmarkdown_file = os.path.join(self.report_template_dir, "RnaSeq.gq_seq_utils_exploratory_analysis_rnaseq.Rmd") ,
             render_output_dir    = 'report',
             module_section       = 'report', # TODO: this or exploratory?
             prerun_r             = 'report_dir="report";' # TODO: really necessary or should be hard-coded in exploratory.Rmd?
             )
        )



        report_file = os.path.join("report", "RnaSeq.cuffnorm.md")
        jobs.append(
            Job(
                [os.path.join("cufflinks", "AllSamples","merged.gtf")],
                [report_file],
                command="""\
mkdir -p report && \\
zip -r report/cuffAnalysis.zip cufflinks/ cuffdiff/ cuffnorm/ && \\
cp \\
  {report_template_dir}/{basename_report_file} \\
  {report_file}""".format(
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    report_file=report_file
                ),
                report_files=[report_file],
                name="cuffnorm_report")
        )

        return jobs
Пример #6
0
    def kallisto_count_matrix(self):

        jobs=[]
        kallisto_directory="kallisto"
        all_readset_directory="All_readsets"
        output_dir=os.path.join(self.output_dir,kallisto_directory, all_readset_directory)

        #per trancripts
        input_abundance_files_transcripts = [os.path.join(self.output_dir,kallisto_directory, readset.sample.name, "abundance_transcripts.tsv") for readset in self.readsets]
        job_name_transcripts="kallisto_count_matrix.transcripts"
        data_type_transcripts="transcripts"
        job=tools.r_create_kallisto_count_matrix(input_abundance_files_transcripts, output_dir, data_type_transcripts, job_name_transcripts)
        job.samples = self.samples
        jobs.append(job)

        #per genes
        input_abundance_files_genes = [os.path.join(self.output_dir,kallisto_directory, readset.sample.name, "abundance_genes.tsv") for readset in self.readsets]
        job_name_genes="kallisto_count_matrix.genes"
        data_type_genes="genes"
        job=tools.r_create_kallisto_count_matrix(input_abundance_files_genes, output_dir, data_type_genes, job_name_genes)
        job.samples = [readset.sample]
        jobs.append(job)

        #copy tx2genes file
        jobs.append(
            Job(
                [os.path.join(self.output_dir, "kallisto", "All_readsets","all_readsets.abundance_genes.csv"), os.path.join(self.output_dir, "kallisto", "All_readsets","all_readsets.abundance_transcripts.csv")],
                [],
                command="""\
cp \\
  {tx2genes_file} \\
  {report_dir}""".format(
                    tx2genes_file=config.param('kallisto', 'transcript2genes', type="filepath"),
                    report_dir="report"
                ),
                name="report.copy_tx2genes_file",
                samples=self.samples
            )
        )

        # Create kallisto report
        jobs.append(
            rmarkdown.render(
                job_input            = [os.path.join(self.output_dir, "kallisto", "All_readsets","all_readsets.abundance_genes.csv"), os.path.join(self.output_dir, "kallisto", "All_readsets","all_readsets.abundance_transcripts.csv")],
                job_name             = "report.kallisto_count_matrix",
                input_rmarkdown_file = os.path.join(self.report_template_dir, "RnaSeqLight.kallisto.Rmd"),
                samples              = self.samples,
                render_output_dir    = 'report',
                module_section       = 'report',
                prerun_r             = 'report_dir="report";'
            )
        )

        return jobs
Пример #7
0
    def trinotate(self):
        """
        Perform transcriptome functional annotation and analysis using [Trinotate](http://trinotate.sourceforge.net/).
        All functional annotation data is integrated into a SQLite database and a whole annotation report is created.

        Returns two jobs: the trinotate job (tagged with all samples of the
        pipeline) and the Rmarkdown report rendered from the annotation report.
        """
        trinotate_dir = "trinotate"
        transdecoder_dir = os.path.join(trinotate_dir, "transdecoder")

        # Only the base name of the configured SwissProt database is used in file names
        swissprot_db = os.path.basename(config.param("blastx_trinity_uniprot", "swissprot_db", type='prefixpath'))
        transdecoder_pep = os.path.join(transdecoder_dir, "Trinity.fasta.transdecoder.pep")
        blastp_results = "blastp_" + os.path.basename(transdecoder_pep) + "_" + swissprot_db + ".tsv"
        annotation_report = os.path.join(trinotate_dir, "trinotate_annotation_report.tsv")

        annotation_job = trinotate.trinotate(
            swissprot_db=swissprot_db,
            trinity_fasta=os.path.join("trinity_out_dir", "Trinity.fasta"),
            swissprot_blastx=os.path.join("blast", "blastx_Trinity_" + swissprot_db + ".tsv"),
            transdecoder_pep=transdecoder_pep,
            transdecoder_pfam=os.path.join(transdecoder_dir, "Trinity.fasta.transdecoder.pfam"),
            swissprot_blastp=os.path.join(trinotate_dir, "blastp", blastp_results),
            rnammer=os.path.join(trinotate_dir, "rnammer", "Trinity.fasta.rnammer.gff"),
            signalp=os.path.join(trinotate_dir, "signalp", "signalp.gff"),
            tmhmm=os.path.join(trinotate_dir, "tmhmm", "tmhmm.out"),
            trinotate_sqlite=os.path.join(trinotate_dir, "Trinotate.sqlite"),
            trinotate_report=annotation_report
        )
        annotation_job.samples = self.samples

        # Render Rmarkdown Report
        report_job = rmarkdown.render(
            job_input=annotation_report,
            job_name="trinotate_report",
            input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.trinotate.Rmd"),
            render_output_dir='report',
            module_section='report',
            prerun_r='report_dir="report"; source_dir="trinotate";'
        )

        return [annotation_job, report_job]
    def trinotate(self):
        """
        Perform transcriptome functional annotation and analysis using [Trinotate](http://trinotate.sourceforge.net/).
        All functional annotation data is integrated into a SQLite database and a whole annotation report is created.

        Returns two jobs: the trinotate job and the Rmarkdown report rendered
        from the annotation report it produces.
        """
        # Only the base name of the configured SwissProt database is used in file names
        swissprot_db = os.path.basename(config.param("blastx_trinity_uniprot", "swissprot_db", type='prefixpath'))
        transdecoder_pep = os.path.join("trinotate", "transdecoder", "Trinity.fasta.transdecoder.pep")
        report_tsv = os.path.join("trinotate", "trinotate_annotation_report.tsv")

        # Keyword arguments of the trinotate job, gathered in one place
        trinotate_files = {
            "swissprot_db": swissprot_db,
            "trinity_fasta": os.path.join("trinity_out_dir", "Trinity.fasta"),
            "swissprot_blastx": os.path.join("blast", "blastx_Trinity_" + swissprot_db + ".tsv"),
            "transdecoder_pep": transdecoder_pep,
            "transdecoder_pfam": os.path.join("trinotate", "transdecoder", "Trinity.fasta.transdecoder.pfam"),
            "swissprot_blastp": os.path.join("trinotate", "blastp", "blastp_" + os.path.basename(transdecoder_pep) + "_" + swissprot_db + ".tsv"),
            "rnammer": os.path.join("trinotate", "rnammer", "Trinity.fasta.rnammer.gff"),
            "signalp": os.path.join("trinotate", "signalp", "signalp.gff"),
            "tmhmm": os.path.join("trinotate", "tmhmm", "tmhmm.out"),
            "trinotate_sqlite": os.path.join("trinotate", "Trinotate.sqlite"),
            "trinotate_report": report_tsv,
        }

        jobs = [trinotate.trinotate(**trinotate_files)]

        # Render Rmarkdown Report
        jobs.append(
            rmarkdown.render(
                job_input=report_tsv,
                job_name="trinotate_report",
                input_rmarkdown_file=os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.trinotate.Rmd"),
                render_output_dir='report',
                module_section='report',
                prerun_r='report_dir="report"; source_dir="trinotate";'
            )
        )

        return jobs
Пример #9
0
    def verify_bam_id(self):
        """
        verifyBamID is a software that verifies whether the reads in particular file match previously known
        genotypes for an individual (or group of individuals), and checks whether the reads are contaminated
        as a mixture of two samples. verifyBamID can detect sample contamination and swaps when external
        genotypes are available. When external genotypes are not available, verifyBamID still robustly
        detects sample swaps.

        Returns, in order: one job creating the "variants" and "verify_bam_id"
        directories, one verifyBamID job per sample, and the Rmarkdown report job
        that takes every per-sample ".selfSM" file as input.
        """

        # Known variants file
        population_AF = config.param('verify_bam_id',
                                     'population_AF',
                                     required=False)
        known_variants_annotated = config.param('verify_bam_id',
                                                'verifyBamID_variants_file',
                                                required=False)
        verify_bam_id_directory = "verify_bam_id"
        variants_directory = "variants"

        jobs = [
            Job([known_variants_annotated],
                [variants_directory, verify_bam_id_directory],
                command="mkdir -p " + variants_directory + " " +
                verify_bam_id_directory,
                name="verify_bam_id_create_directories")
        ]

        verify_bam_results = []

        # Defined up front so the report below can still be built when there are
        # no samples; otherwise it ends up holding the value resolved for the
        # last sample processed.
        coverage_bed = None

        for sample in self.samples:
            alignment_directory = os.path.join("alignment", sample.name)

            # Candidate BAM files, in order of preference
            candidate_input_files = [
                [os.path.join(alignment_directory, sample.name + ".sorted.dup.recal.bam")],
                [os.path.join(alignment_directory, sample.name + ".sorted.dedup.bam")],
                [os.path.join(alignment_directory, sample.name + ".sorted.mdup.bam")],  # this one is for RnaSeq pipeline
            ]
            [input_bam] = self.select_input_files(candidate_input_files)

            output_prefix = os.path.join(verify_bam_id_directory, sample.name)

            # Resolved from the first readset of the sample only
            coverage_bed = bvatools.resolve_readset_coverage_bed(
                sample.readsets[0])

            # Run verifyBamID
            job = verify_bam_id.verify(input_bam, known_variants_annotated,
                                       output_prefix)
            job.name = "verify_bam_id_" + sample.name
            job.samples = [sample]

            jobs.append(job)

            verify_bam_results.append(output_prefix + ".selfSM")

        # Coverage bed is null if whole genome experiment
        target_bed = coverage_bed if coverage_bed else ""

        # Render Rmarkdown Report
        jobs.append(
            rmarkdown.render(
                job_input=verify_bam_results,
                job_name="verify_bam_id_report",
                input_rmarkdown_file=os.path.join(
                    self.report_template_dir, "Illumina.verify_bam_id.Rmd"),
                render_output_dir='report',
                module_section='report',
                prerun_r='source_dir="' + verify_bam_id_directory +
                '"; report_dir="report" ; params=list(verifyBamID_variants_file="'
                + known_variants_annotated + '", dbnsfp_af_field="' +
                population_AF + '", coverage_bed="' + target_bed + '");'))

        return jobs
Пример #10
0
 def differential_expression_filtered(self):
     """
     Differential Expression and GOSEQ analysis based on filtered transcripts and genes.

     Returns, in order:
     1. "differential_expression_filtered_get_trinotate": builds the header-prefixed
        lists of filtered genes and filtered isoforms from the filtered trinotate report.
     2. One "differential_expression_filtered_<item>" job for "genes" and one for
        "isoforms": merges the filtered list with the counts matrix, copies the
        lengths file, then runs the DGE/GOSEQ jobs on the filtered directory.
     3. The Rmarkdown report job, which takes the output files of all previous jobs as input.
     """
     source_directory = "differential_expression"
     output_directory = os.path.join("filtered_assembly", "differential_expression")
     report_dir = os.path.join("report", "filtered_assembly")
     trinotate_annotation_report = os.path.join("trinotate", "trinotate_annotation_report.tsv")
     input_rmarkdown_file = os.path.join(self.report_template_dir, "RnaSeqDeNovoAssembly.differential_expression_goseq_filtered.Rmd")

     # Filter input files
     filtered_report = trinotate_annotation_report + ".isoforms_filtered.tsv"
     filtered_headers = {
         "isoforms": trinotate_annotation_report + ".isoforms_filtered_header.tsv",
         "genes": trinotate_annotation_report + ".genes_filtered_header.tsv",
     }
     counts_ids = {'genes': "Genes", 'isoforms': "Isoforms"}

     # NOTE(review): trinotate_filters is read from the configuration but never used below
     if not config.param('filter_annotated_components', 'filters_trinotate', required=False):
         trinotate_filters = None
     else:
         trinotate_filters = config.param('filter_annotated_components', 'filters_trinotate', required=False).split("\n")

     # Create the files containing filtered isoforms and genes with headers
     make_output_dir = Job(command="mkdir -p " + output_directory)
     # awk keeps the first two "_"-separated fields of each line (presumably the gene
     # part of the isoform id - TODO confirm), uniq drops adjacent duplicates and sed
     # inserts a line holding a single space in front of the first line
     genes_header_job = Job(
         [filtered_report],
         [filtered_headers["genes"]],
         command="cat " + filtered_report + " | awk 'BEGIN{OFS=\"_\";FS=\"_\"}{print $1,$2}' | uniq | sed '1s/^/ \\n/' " + "  > " + filtered_headers["genes"]
     )
     isoforms_header_job = Job(
         [filtered_report],
         [filtered_headers["isoforms"]],
         command="sed '1s/^/ \\n/' " + filtered_report + " > " + filtered_headers["isoforms"]
     )
     jobs = [
         concat_jobs(
             [make_output_dir, genes_header_job, isoforms_header_job],
             name="differential_expression_filtered_get_trinotate"
         )
     ]

     # Run DGE and merge dge results with annotations
     for item in ("genes", "isoforms"):
         item_job_name = "differential_expression_filtered_" + item
         matrix_name = item + ".counts.matrix.symbol"
         lengths_name = item + ".lengths.tsv.noheader.tsv"
         source_lengths = os.path.join(source_directory, lengths_name)
         filtered_lengths = os.path.join(output_directory, lengths_name)

         # Restrict the counts matrix to the filtered features
         merge_job = tools.py_parseMergeCsv(
             [filtered_headers[item], os.path.join(source_directory, matrix_name)],
             "\\\\t",
             os.path.join(output_directory, matrix_name),
             "\'\' " + counts_ids[item],
             left_join=True,
             exclude="\' \'"
         )
         # The lengths file is reused unchanged
         copy_lengths_job = Job(
             [source_lengths],
             [filtered_lengths],
             command="cp " + source_lengths + " " + filtered_lengths
         )
         dge_job = concat_jobs(
             self.differential_expression_and_goseq_rsem(output_directory, item, trinotate_annotation_report),
             name=item_job_name
         )
         jobs.append(concat_jobs([merge_job, copy_lengths_job, dge_job], name=item_job_name))

     # Dependencies for report: every output of the jobs above, skipping files
     # already collected from a previous job
     report_inputs = []
     for finished_job in jobs:
         report_inputs += [path for path in finished_job.output_files if path not in report_inputs]

     # DGE Report
     # Render Rmarkdown Report
     contrast_names = '","'.join(contrast.name for contrast in self.contrasts)
     jobs.append(
         rmarkdown.render(
             job_input=report_inputs,
             job_name="differential_expression_goseq_rnaseq_denovo_filtered_report",
             input_rmarkdown_file=input_rmarkdown_file,
             render_output_dir='report',
             module_section='report',
             prerun_r='report_dir="' + report_dir + '"; source_dir="' + output_directory + '"; ' + 'top_n_results=10; contrasts=c("' + contrast_names + '");'
         )
     )

     return jobs