Пример #1
0
    def bam_hard_clip(self):
        """
        Generate a hardclipped version of the bam for the toxedo suite which doesn't support this official sam feature.
        """
        
        jobs = []
        for sample in self.samples:
            alignment_input = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.bam")
            alignment_output = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.hardClip.bam")
            job=pipe_jobs([
                samtools.view(
                    alignment_input,
                    None,
                    "-h"
                ),
                Job(
                    [None],
                    [alignment_output],
                    # awk to transform soft clip into hard clip for tuxedo suite
                    command="""\
awk 'BEGIN {{OFS="\\t"}} {{if (substr($1,1,1)=="@") {{print;next}}; split($6,C,/[0-9]*/); split($6,L,/[SMDIN]/); if (C[2]=="S") {{$10=substr($10,L[1]+1); $11=substr($11,L[1]+1)}}; if (C[length(C)]=="S") {{L1=length($10)-L[length(L)-1]; $10=substr($10,1,L1); $11=substr($11,1,L1); }}; gsub(/[0-9]*S/,"",$6); print}}' """.format()
                ),
                samtools.view(
                    "-",
                    alignment_output,
                    "-hbS"
                ),
            ])
            job.name="tuxedo_hard_clip."+ sample.name
            jobs.append(job)
        return jobs
Пример #2
0
 def extract_bam_unmap(self):
     """
     Extract read subsets from each sample's soft-clip BAMs into extract/<sample>/.

     For every sample, four jobs are produced (ORPHAN, OEAUNMAP.1, OEAUNMAP.2, OEAMAP). Each one
     creates the extract directory if needed, filters a sclip/<sample>/<sample>.*.bam file with
     samtools view using the SAM flag filters listed below, then runs samtools sort on the result
     to produce the matching "<tag>.sName" output.
     Returns the list of jobs, in that order for each sample.
     """
     # (source BAM suffix, output tag, samtools view options, job name prefix)
     extractions = (
         ("scOthers.bam", "ORPHAN", "-b -h -f 12 -F 256", "extract_bam_ORPHAN_"),
         ## OEA close to sclip
         ("sc.bam", "OEAUNMAP.1", "-b -h -f 68 -F 264", "extract_bam_OEAUNMAP1_"),
         ("sc.bam", "OEAUNMAP.2", "-b -h -f 132 -F 264", "extract_bam_OEAUNMAP2_"),
         ("sc.bam", "OEAMAP", "-b -h -f 8 -F 1284", "extract_bam_OEAMAP_"),
     )

     jobs = []
     for sample in self.samples:
         source_prefix = os.path.join("sclip", sample.name, sample.name + ".")
         target_directory = os.path.join("extract", sample.name)
         target_prefix = os.path.join("extract", sample.name, sample.name + ".")

         # One mkdir job object, shared by the four jobs of this sample
         make_target_directory = Job(
             command="if [ ! -d " + target_directory + " ]; then mkdir -p " + target_directory + "; fi"
         )

         for source_suffix, tag, view_options, name_prefix in extractions:
             extracted_bam = target_prefix + tag + ".bam"
             extract_then_sort = concat_jobs([
                 samtools.view(source_prefix + source_suffix, extracted_bam, view_options),
                 # NOTE(review): third argument presumably requests a sort by read name — confirm in samtools.sort
                 samtools.sort(extracted_bam, target_prefix + tag + ".sName", True)
             ])
             jobs.append(
                 concat_jobs([make_target_directory, extract_then_sort], name=name_prefix + sample.name)
             )

     return jobs
Пример #3
0
    def raw_counts(self):
        """
        Count reads in features using [htseq-count](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html).

        For every sample, the query-name-sorted BAM is filtered with samtools view and piped into
        htseq-count, which writes raw_counts/<sample>.readcounts.csv. Returns one job per sample.
        """

        count_jobs = []

        for sample in self.samples:
            name_sorted_bam = os.path.join("alignment", sample.name, sample.name) + ".QueryNameSorted.bam"
            counts_file = os.path.join("raw_counts", sample.name + ".readcounts.csv")

            # Any strand_info other than "fr-unstranded" is counted as reverse-stranded
            if config.param('DEFAULT', 'strand_info') == "fr-unstranded":
                stranded = "no"
            else:
                stranded = "reverse"

            make_counts_directory = Job(command="mkdir -p raw_counts")
            # -F 4 leaves out reads flagged as unmapped
            mapped_reads = samtools.view(name_sorted_bam, options="-F 4")
            # "-" makes htseq-count read the piped alignments from stdin
            count_reads = htseq.htseq_count(
                "-",
                config.param('htseq_count', 'gtf', type='filepath'),
                counts_file,
                config.param('htseq_count', 'options'),
                stranded
            )

            count_jobs.append(
                concat_jobs(
                    [make_counts_directory, pipe_jobs([mapped_reads, count_reads])],
                    name="htseq_count." + sample.name
                )
            )

        return count_jobs
Пример #4
0
    def samtools_view_filter(self):
        """
        Filter unique reads by mapping quality using [Samtools](http://www.htslib.org/).
        """

        jobs = []
        for readset in self.readsets:
            readset_bam_prefix = os.path.join("alignment", readset.sample.name,
                                              readset.name,
                                              readset.name + ".sorted.")
            readset_bam = readset_bam_prefix + "bam"
            filtered_readset_bam = readset_bam_prefix + "filtered.bam"

            job = samtools.view(
                readset_bam, filtered_readset_bam, "-b -F4 -q " + str(
                    config.param(
                        'samtools_view_filter', 'min_mapq', type='int')))
            job.name = "samtools_view_filter." + readset.name
            jobs.append(job)

        report_file = os.path.join("report", "ChipSeq.samtools_view_filter.md")
        jobs.append(
            Job([
                os.path.join("alignment", readset.sample.name, readset.name,
                             readset.name + ".sorted.filtered.bam")
                for readset in self.readsets
            ], [report_file], [['samtools_view_filter', 'module_pandoc']],
                command="""\
mkdir -p report && \\
pandoc --to=markdown \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable min_mapq="{min_mapq}" \\
  {report_template_dir}/{basename_report_file} \\
  > {report_file}""".format(min_mapq=config.param('samtools_view_filter',
                                                  'min_mapq',
                                                  type='int'),
                            report_template_dir=self.report_template_dir,
                            basename_report_file=os.path.basename(report_file),
                            report_file=report_file),
                report_files=[report_file],
                name="samtools_view_filter_report"))

        return jobs
Пример #5
0
    def samtools_view_filter(self):
        """
        Filter unique reads by mapping quality using [Samtools](http://www.htslib.org/).
        """

        jobs = []
        for readset in self.readsets:
            readset_bam_prefix = os.path.join("alignment", readset.sample.name, readset.name, readset.name + ".sorted.")
            readset_bam = readset_bam_prefix + "bam"
            filtered_readset_bam = readset_bam_prefix + "filtered.bam"

            job = samtools.view(readset_bam, filtered_readset_bam, "-b -F4 -q " + str(config.param('samtools_view_filter', 'min_mapq', type='int')))
            job.name = "samtools_view_filter." + readset.name
            jobs.append(job)

        report_file = os.path.join("report", "ChipSeq.samtools_view_filter.md")
        jobs.append(
            Job(
                [os.path.join("alignment", readset.sample.name, readset.name, readset.name + ".sorted.filtered.bam") for readset in self.readsets],
                [report_file],
                [['samtools_view_filter', 'module_pandoc']],
                command="""\
mkdir -p report && \\
pandoc --to=markdown \\
  --template {report_template_dir}/{basename_report_file} \\
  --variable min_mapq="{min_mapq}" \\
  {report_template_dir}/{basename_report_file} \\
  > {report_file}""".format(
                    min_mapq=config.param('samtools_view_filter', 'min_mapq', type='int'),
                    report_template_dir=self.report_template_dir,
                    basename_report_file=os.path.basename(report_file),
                    report_file=report_file
                ),
                report_files=[report_file],
                name="samtools_view_filter_report")
        )

        return jobs
Пример #6
0
    def metrics(self):
        """
        Compute metrics and generate coverage tracks per sample. Multiple metrics are computed at this stage:
        number of raw reads, number of filtered reads, number of aligned reads, number of duplicate reads,
        median, mean and standard deviation of insert sizes of reads after alignment, and whole genome or
        targeted percentage of bases covered at X reads (%_bases_above_50 means the % of exons bases which
        have at least 50 reads). A TDF (.tdf) coverage track is also generated at this step
        for easy visualization of coverage in the IGV browser.

        Output file names are derived from the input BAM by replacing its trailing "bam" / ".bam"
        suffix only, so "bam" appearing elsewhere in the path (directory or sample name) is left alone.

        Returns the list of jobs created for all samples.
        """

        # Check the library status: a sample is PAIRED_END as soon as one of its readsets is,
        # and the last readset providing a BAM supplies the sample's alternative input.
        # setdefault() is used because dict.has_key() no longer exists in Python 3.
        library, bam = {}, {}
        for readset in self.readsets:
            library.setdefault(readset.sample, "SINGLE_END")
            if readset.run_type == "PAIRED_END":
                library[readset.sample] = "PAIRED_END"
            bam.setdefault(readset.sample, "")
            if readset.bam:
                bam[readset.sample] = readset.bam

        jobs = []
        created_interval_lists = []
        for sample in self.samples:
            file_prefix = os.path.join("alignment", sample.name,
                                       sample.name + ".sorted.dedup.")
            coverage_bed = bvatools.resolve_readset_coverage_bed(
                sample.readsets[0])

            # Candidates are the pipeline's deduplicated BAM and, if any, the readset-provided BAM
            candidate_input_files = [[file_prefix + "bam"]]
            if bam[sample]:
                candidate_input_files.append([bam[sample]])
            [input_bam] = self.select_input_files(candidate_input_files)

            job = picard.collect_multiple_metrics(
                input_bam,
                re.sub(r"bam$", "all.metrics", input_bam),
                library_type=library[sample])
            job.name = "picard_collect_multiple_metrics." + sample.name
            job.samples = [sample]
            jobs.append(job)

            # Compute genome coverage with GATK
            job = gatk.depth_of_coverage(
                input_bam,
                re.sub(r"bam$", "all.coverage", input_bam),
                coverage_bed)
            job.name = "gatk_depth_of_coverage.genome." + sample.name
            job.samples = [sample]
            jobs.append(job)

            # Compute genome or target coverage with BVATools
            job = bvatools.depth_of_coverage(
                input_bam,
                re.sub(r"bam$", "coverage.tsv", input_bam),
                coverage_bed,
                other_options=config.param('bvatools_depth_of_coverage',
                                           'other_options',
                                           required=False))
            job.name = "bvatools_depth_of_coverage." + sample.name
            job.samples = [sample]
            jobs.append(job)

            if coverage_bed:
                # Get on-target reads (if on-target context is detected)
                ontarget_bam = re.sub(r"bam$", "ontarget.bam", input_bam)
                flagstat_output = re.sub(r"bam$", "bam.flagstat", ontarget_bam)
                job = concat_jobs([
                    bedtools.intersect(input_bam, ontarget_bam, coverage_bed),
                    samtools.flagstat(ontarget_bam, flagstat_output)
                ])
                job.name = "ontarget_reads." + sample.name
                job.removable_files = [ontarget_bam]
                job.samples = [sample]
                jobs.append(job)

                # Compute on target percent of hybridisation based capture.
                # The interval list replaces the BED file's last extension and is
                # only generated once per distinct BED file.
                interval_list = re.sub(r"\.[^.]+$", ".interval_list",
                                       coverage_bed)
                if interval_list not in created_interval_lists:
                    job = tools.bed2interval_list(None, coverage_bed,
                                                  interval_list)
                    job.name = "interval_list." + os.path.basename(
                        coverage_bed)
                    jobs.append(job)
                    created_interval_lists.append(interval_list)

                # HS metrics always run on the pipeline's deduplicated BAM, not on input_bam
                job = picard.calculate_hs_metrics(file_prefix + "bam",
                                                  file_prefix + "onTarget.tsv",
                                                  interval_list)
                job.name = "picard_calculate_hs_metrics." + sample.name
                job.samples = [sample]
                jobs.append(job)

            # Calculate the number of reads with higher mapping quality than the threshold passed in the ini file
            job = concat_jobs([
                samtools.view(
                    input_bam,
                    re.sub(r"\.bam$", ".filtered_reads.counts.txt", input_bam),
                    "-c " + config.param('mapping_quality_filter',
                                         'quality_threshold'))
            ])
            job.name = "mapping_quality_filter." + sample.name
            job.samples = [sample]
            jobs.append(job)

            # Calculate GC bias: coverage is computed from the input BAM and, when a
            # coverage BED is available, intersected with it before the bias is computed.
            gc_content_file = re.sub(r"\.bam$", ".gc_cov.1M.txt", input_bam)
            job = bedtools.coverage(input_bam, gc_content_file, coverage_bed)
            if coverage_bed:
                gc_content_on_target_file = re.sub(r"\.bam$",
                                                   ".gc_cov.1M.on_target.txt",
                                                   input_bam)
                gc_content_target_job = bedtools.intersect(
                    gc_content_file, gc_content_on_target_file, coverage_bed)
                gc_content_file = gc_content_on_target_file
                job = concat_jobs([job, gc_content_target_job])
            job = concat_jobs([
                job,
                metrics.gc_bias(gc_content_file,
                                re.sub(r"\.bam$", ".GCBias_all.txt", input_bam))
            ])
            job.name = "GC_bias." + sample.name
            job.samples = [sample]
            jobs.append(job)

            job = igvtools.compute_tdf(input_bam, input_bam + ".tdf")
            job.name = "igvtools_compute_tdf." + sample.name
            job.samples = [sample]
            jobs.append(job)

        return jobs
Пример #7
0
    def wiggle(self):
        """
        Generate wiggle tracks suitable for multiple browsers.

        For strand-specific paired-end samples, the sample BAM is first split into forward and
        reverse BAMs and one bedGraph/bigWig pair is declared per strand; otherwise a single
        pair is declared. Returns the list of jobs created for all samples.
        """

        jobs = []

        ## check the library status: a sample is SINGLE_END as soon as one of its readsets is.
        # The `in` operator is used because dict.has_key() no longer exists in Python 3.
        library = {}
        for readset in self.readsets:
            if readset.sample not in library:
                library[readset.sample] = "PAIRED_END"
            if readset.run_type == "SINGLE_END":
                library[readset.sample] = "SINGLE_END"

        for sample in self.samples:
            bam_file_prefix = os.path.join("alignment", sample.name, sample.name + ".sorted.mdup.")
            input_bam = bam_file_prefix + "bam"
            bed_graph_prefix = os.path.join("tracks", sample.name, sample.name)
            big_wig_prefix = os.path.join("tracks", "bigWig", sample.name)
            mkdir_tracks_command = "mkdir -p " + os.path.join("tracks", sample.name) + " " + os.path.join("tracks", "bigWig")

            if (config.param('DEFAULT', 'strand_info') != 'fr-unstranded') and library[sample] == "PAIRED_END":
                input_bam_f1 = bam_file_prefix + "tmp1.forward.bam"
                input_bam_f2 = bam_file_prefix + "tmp2.forward.bam"
                input_bam_r1 = bam_file_prefix + "tmp1.reverse.bam"
                input_bam_r2 = bam_file_prefix + "tmp2.reverse.bam"
                output_bam_f = bam_file_prefix + "forward.bam"
                output_bam_r = bam_file_prefix + "reverse.bam"

                # Each strand BAM is the merge of two flag-selected subsets of the input BAM
                bam_f_job = concat_jobs([
                    samtools.view(input_bam, input_bam_f1, "-bh -F 256 -f 81"),
                    samtools.view(input_bam, input_bam_f2, "-bh -F 256 -f 161"),
                    picard.merge_sam_files([input_bam_f1, input_bam_f2], output_bam_f),
                    Job(command="rm " + input_bam_f1 + " " + input_bam_f2)
                ], name="wiggle." + sample.name + ".forward_strandspec")
                # Remove temporary-then-deleted files from job output files, otherwise job is never up to date
                bam_f_job.output_files.remove(input_bam_f1)
                bam_f_job.output_files.remove(input_bam_f2)

                bam_r_job = concat_jobs([
                    Job(command=mkdir_tracks_command),
                    samtools.view(input_bam, input_bam_r1, "-bh -F 256 -f 97"),
                    samtools.view(input_bam, input_bam_r2, "-bh -F 256 -f 145"),
                    picard.merge_sam_files([input_bam_r1, input_bam_r2], output_bam_r),
                    Job(command="rm " + input_bam_r1 + " " + input_bam_r2)
                ], name="wiggle." + sample.name + ".reverse_strandspec")
                # Remove temporary-then-deleted files from job output files, otherwise job is never up to date
                bam_r_job.output_files.remove(input_bam_r1)
                bam_r_job.output_files.remove(input_bam_r2)

                jobs.extend([bam_f_job, bam_r_job])

                outputs = [
                    [bed_graph_prefix + ".forward.bedGraph", big_wig_prefix + ".forward.bw"],
                    [bed_graph_prefix + ".reverse.bedGraph", big_wig_prefix + ".reverse.bw"],
                ]
            else:
                outputs = [[bed_graph_prefix + ".bedGraph", big_wig_prefix + ".bw"]]

            for bed_graph_output, big_wig_output in outputs:
                # Job name is the bedGraph file name without its extension; the pattern is
                # escaped and anchored so only the trailing ".bedGraph" is stripped.
                track_name = re.sub(r"\.bedGraph$", "", os.path.basename(bed_graph_output))
                job = concat_jobs([
                    Job(command=mkdir_tracks_command, removable_files=["tracks"]),
                    bedtools.graph(input_bam, bed_graph_output, big_wig_output, library[sample])
                ], name="wiggle." + track_name)
                jobs.append(job)

        return jobs
Пример #8
0
    def extract_fastq_oea_sclip(self):
        """
        Create the FASTQ files derived from the extracted OEA BAMs and from the soft-clip sequences.

        For every sample, in this order:
          - OEAUNMAP.{1,2}.sName.bam are converted to OEAUNMAP.{1,2}.fastq.gz;
          - OEAMAP.bam is split by mate flag (-f 64 / -f 128), filtered with the configured
            'min_mapping_quality', and converted to OEAMAP.{1,2}.fastq.gz;
          - each OEAUNMAP fastq is equalized against the opposite-mate OEAMAP fastq, then removed;
          - sclip.{1,2}.fastq.gz are built with awk from the "+" / "-" rows of scSequences.txt.
        Returns the list of jobs.
        """
        jobs = []

        for sample in self.samples:
            extract_directory = os.path.join("extract", sample.name)
            extract_file_prefix = os.path.join("extract", sample.name, sample.name + ".")
            sclip_file_prefix = os.path.join("sclip", sample.name, sample.name + ".")

            ## create fastq of OEA close to sclip
            for mate in ("1", "2"):
                job = pipe_jobs([
                    samtools.view(extract_file_prefix + "OEAUNMAP." + mate + ".sName.bam"),
                    self.get_job_sam_to_fastq(extract_file_prefix + "OEAUNMAP." + mate + ".fastq.gz")
                ], name="extract_fastq_OEA" + mate + "_" + sample.name)
                jobs.append(job)

            # -f 64 / -f 128 select first / second mate of the pair
            for mate, mate_flag in (("1", "64"), ("2", "128")):
                job = pipe_jobs([
                    samtools.view(
                        extract_file_prefix + "OEAMAP.bam",
                        options="-f " + mate_flag + " -q " + config.param('DEFAULT', 'min_mapping_quality')
                    ),
                    self.get_job_sam_to_fastq(extract_file_prefix + "OEAMAP." + mate + ".fastq.gz")
                ], name="extract_fastq_OEAMAP" + mate + "_" + sample.name)
                jobs.append(job)

            ## equal fastq file between OEAMAP and OEAUNMAP (due to the filter of mapq):
            ## each unmapped mate is paired with the opposite mapped mate
            for unmapped_mate, mapped_mate in (("1", "2"), ("2", "1")):
                unmapped_fastq = extract_file_prefix + "OEAUNMAP." + unmapped_mate + ".fastq.gz"
                job = concat_jobs([
                    tools.py_equalFastqFile(
                        extract_file_prefix + "OEAMAP." + mapped_mate + ".fastq.gz",
                        unmapped_fastq,
                        extract_file_prefix + "OEAUNMAP." + unmapped_mate + ".equal.fastq.gz"
                    ),
                    Job(command="rm " + unmapped_fastq)
                ], name="equal_fastq_OEA" + unmapped_mate + "_" + sample.name)
                jobs.append(job)

            ## create fastq sclip
            # Built once here: the original also created an identical, never-used copy
            # at the top of the loop.
            jobMkdir = Job(command="if [ ! -d " + extract_directory + " ]; then mkdir -p " + extract_directory + "; fi")
            sclip_sequences = sclip_file_prefix + "scSequences.txt"
            for mate, strand in (("1", "+"), ("2", "-")):
                sclip_fastq = extract_file_prefix + "sclip." + mate + ".fastq.gz"
                job_name = "fastq_sclip" + mate + "_" + sample.name
                # Skip the first line, keep rows whose 3rd column is the strand, and emit
                # 4-line FASTQ records from columns 4, 5 and 6
                jobFastq = Job(
                    input_files=[sclip_sequences],
                    output_files=[sclip_fastq],
                    command="awk 'NR>1 {if ($3==\"" + strand + "\") { print \"@\"$4; print $5 ;print \"+\";  print $6}}' " +
                            sclip_sequences + " " +
                            "| gzip -c > " + sclip_fastq,
                    name=job_name
                )
                job = concat_jobs([
                    jobMkdir,
                    jobFastq
                ], name=job_name)
                jobs.append(job)

        return jobs