Пример #1
0
def haplotype_caller(
        core_req=16,
        mem_req=29 * 1024,
        in_bams=find('bam$', n='>0'),
        in_bais=find('bai$', n='>0'),
        in_target_bed=find('target.bed'),
        out_vcf=out_dir('raw_variants.g.vcf')):
    """
    Build the GATK HaplotypeCaller command line (GVCF mode).

    :param core_req: thread count handed to ``-nct``.
    :param mem_req: forwarded to ``gatk()``.
    :param in_bams: input BAMs; rendered by ``bam_list_to_inputs`` after ``-I``.
    :param in_bais: not referenced by the command (presumably declared so the
        workflow tracks the BAM indexes -- confirm).
    :param in_target_bed: rendered via ``arg('--intervals', ...)``.
    :param out_vcf: destination passed to ``-o``.
    :return: the formatted shell command string.
    """
    bam_inputs = bam_list_to_inputs(in_bams)
    interval_opt = arg('--intervals', in_target_bed)
    template = r"""
        {gatk} \
        -T HaplotypeCaller \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nct {core_req} \
        --emitRefConfidence GVCF \
        -I {in_bams} \
        -o {out_vcf} \
        {intervals} \
        -A Coverage \
        -A GCContent \
        -A AlleleBalanceBySample \
        -A AlleleBalance \
        -A MappingQualityRankSumTest \
        -A InbreedingCoeff \
        -A FisherStrand \
        -A QualByDepth
    """
    return template.format(s=s,
                           gatk=gatk(mem_req),
                           core_req=core_req,
                           in_bams=bam_inputs,
                           out_vcf=out_vcf,
                           intervals=interval_opt)
Пример #2
0
def realigner_target_creator(core_req=8,
                             mem_req=8 * 1024,
                             in_target_bed=find('target.bed'),
                             in_bams=find('bam$', n='>0'),
                             in_bais=find('bai$', n='>0'),
                             out_bams=forward('in_bams'),
                             out_bais=forward('in_bais'),
                             out_sites=out_dir('denovo_realign_targets.bed')):
    """
    Build the GATK RealignerTargetCreator command line.

    :param core_req: thread count handed to ``-nt``.
    :param mem_req: forwarded to ``gatk()``.
    :param in_target_bed: rendered via ``arg('--intervals', ...)``.
    :param in_bams: input BAMs; rendered by ``bam_list_to_inputs`` after ``-I``.
    :param in_bais: not referenced by the command.
    :param out_bams: not referenced by the command (declared with ``forward``).
    :param out_bais: not referenced by the command (declared with ``forward``).
    :param out_sites: destination passed to ``-o``.
    :return: the formatted shell command string.
    :raises ValueError: if ``s['ref']['version']`` is neither ``'b37'`` nor
        ``'hg38'``.
    """
    in_bams = bam_list_to_inputs(in_bams)

    # Select the known-indel resources for the configured reference build and
    # fail with a clear message rather than leaving ``in_knowns`` unbound.
    ref_version = s['ref']['version']
    if ref_version == 'b37':
        in_knowns = [s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']]
    elif ref_version == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]
    else:
        raise ValueError('Unsupported reference version %r: expected "b37" or "hg38"'
                         % (ref_version,))

    # TODO should we pad intervals?  might be indels on perimeter that need realigner.  Not too worried because we're using HaplotypeCaller, though.
    return r"""
        #could add more knowns from ESP and other seq projects...
        {gatk} \
        -T RealignerTargetCreator \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_sites} \
        {knowns} \
        -nt {core_req} \
        {args}
    """.format(s=s, gatk=gatk(mem_req),
               args=arg('--intervals', in_target_bed),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
Пример #3
0
def haplotype_caller(core_req=16,
                     mem_req=12 * 1024,
                     in_bams=find('bam$', n='>0'),
                     in_bais=find('bai$', n='>0'),
                     in_target_bed=find('target.bed'),
                     out_vcf=out_dir('raw_variants.g.vcf')):
    """
    Build the GATK HaplotypeCaller command line (GVCF mode) with explicit
    call/emit confidence thresholds.

    :param core_req: thread count handed to ``-nct``.
    :param mem_req: forwarded to ``gatk()``.
    :param in_bams: input BAMs; rendered by ``bam_list_to_inputs`` after ``-I``.
    :param in_bais: not referenced by the command (presumably declared so the
        workflow tracks the BAM indexes -- confirm).
    :param in_target_bed: rendered via ``arg('--intervals', ...)``.
    :param out_vcf: destination passed to ``-o``.
    :return: the formatted shell command string.
    """
    in_bams = bam_list_to_inputs(in_bams)
    intervals = arg('--intervals', in_target_bed)
    # ``-I`` and ``-o`` sit on separate continuation lines: a backslash
    # followed by spaces (instead of a newline) escapes a space and hands the
    # tool a stray " " argument.
    return r"""
        {gatk} \
        -T HaplotypeCaller \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nct {core_req} \
        --emitRefConfidence GVCF \
        -stand_call_conf 30 \
        -stand_emit_conf 10 \
        -I {in_bams} \
        -o {out_vcf} \
        {intervals} \
        -A Coverage \
        -A GCContent \
        -A AlleleBalanceBySample \
        -A AlleleBalance \
        -A MappingQualityRankSumTest \
        -A InbreedingCoeff \
        -A FisherStrand \
        -A QualByDepth
    """.format(s=s, gatk=gatk(mem_req), **locals())
Пример #4
0
def indel_realigner(core_req=4,  # proxy for mem_req until i test mem_req out
                    mem_req=8 * 1024,
                    contig=None,
                    in_bams=find('bam$', n='>0'),
                    in_bais=find('bai$', n='>0'),
                    in_sites=find('denovo_realign_targets.bed'),
                    out_bam=out_dir('realigned.bam'),
                    out_bai=out_dir('realigned.bai')):
    """
    Build the GATK IndelRealigner command followed by ``samtools index``.

    :param core_req: not referenced by the command (see inline comment).
    :param mem_req: forwarded to ``gatk()``.
    :param contig: rendered via ``arg('--intervals', contig)``.
    :param in_bams: input BAMs; rendered by ``bam_list_to_inputs`` after ``-I``.
    :param in_bais: not referenced by the command.
    :param in_sites: passed to ``-targetIntervals``.
    :param out_bam: destination passed to ``-o`` and then indexed.
    :param out_bai: not referenced by the command.
    :return: the formatted shell command string.
    :raises ValueError: if ``s['ref']['version']`` is neither ``'b37'`` nor
        ``'hg38'``.
    """
    in_bams = bam_list_to_inputs(in_bams)

    # Select the known-indel resources for the configured reference build and
    # fail with a clear message rather than leaving ``in_knowns`` unbound.
    ref_version = s['ref']['version']
    if ref_version == 'b37':
        in_knowns = [s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']]
    elif ref_version == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]
    else:
        raise ValueError('Unsupported reference version %r: expected "b37" or "hg38"'
                         % (ref_version,))

    return r"""
        # IR does not support parallelization
        {gatk} \
        -T IndelRealigner \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_bam} \
        -targetIntervals {in_sites} \
        {knowns} \
        -model USE_READS \
        --filter_bases_not_stored \
        {intervals}

        {s[opt][samtools]} index {out_bam}
    """.format(s=s,
               intervals=arg('--intervals', contig),
               gatk=gatk(mem_req),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
Пример #5
0
def indel_realigner(core_req=4,  # proxy for mem_req until i test mem_req out
                    mem_req=8 * 1024,
                    contig=None,
                    in_bams=find('bam$', n='>0'),
                    in_bais=find('bai$', n='>0'),
                    in_sites=find('denovo_realign_targets.bed'),
                    out_bam=out_dir('realigned.bam'),
                    out_bai=out_dir('realigned.bai')):
    """
    Return the shell script that runs GATK IndelRealigner and then indexes
    the realigned BAM with samtools.

    :param core_req: not referenced by the command (see inline comment).
    :param mem_req: forwarded to ``gatk()``.
    :param contig: rendered via ``arg('--intervals', contig)``.
    :param in_bams: input BAMs; rendered by ``bam_list_to_inputs`` after ``-I``.
    :param in_bais: not referenced by the command.
    :param in_sites: passed to ``-targetIntervals``.
    :param out_bam: destination passed to ``-o`` and then indexed.
    :param out_bai: not referenced by the command.
    :return: the formatted shell command string.
    :raises ValueError: if ``s['ref']['version']`` is neither ``'b37'`` nor
        ``'hg38'``.
    """
    in_bams = bam_list_to_inputs(in_bams)

    # An unrecognised reference build must fail loudly; otherwise
    # ``in_knowns`` would be unbound when the command is formatted.
    ref_version = s['ref']['version']
    if ref_version == 'b37':
        in_knowns = [s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']]
    elif ref_version == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]
    else:
        raise ValueError('Unsupported reference version %r: expected "b37" or "hg38"'
                         % (ref_version,))

    return r"""
        # IR does not support parallelization
        {gatk} \
        -T IndelRealigner \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_bam} \
        -targetIntervals {in_sites} \
        {knowns} \
        -model USE_READS \
        --filter_bases_not_stored \
        {intervals}

        {s[opt][samtools]} index {out_bam}
    """.format(s=s,
               intervals=arg('--intervals', contig),
               gatk=gatk(mem_req),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
Пример #6
0
def realigner_target_creator(core_req=8,
                             mem_req=8 * 1024,
                             in_target_bed=find('target.bed'),
                             in_bams=find('bam$', n='>0'),
                             in_bais=find('bai$', n='>0'),
                             out_bams=forward('in_bams'),
                             out_bais=forward('in_bais'),
                             out_sites=out_dir('denovo_realign_targets.bed')):
    """
    Return the shell command that runs GATK RealignerTargetCreator.

    :param core_req: thread count handed to ``-nt``.
    :param mem_req: forwarded to ``gatk()``.
    :param in_target_bed: rendered via ``arg('--intervals', ...)``.
    :param in_bams: input BAMs; rendered by ``bam_list_to_inputs`` after ``-I``.
    :param in_bais: not referenced by the command.
    :param out_bams: not referenced by the command (declared with ``forward``).
    :param out_bais: not referenced by the command (declared with ``forward``).
    :param out_sites: destination passed to ``-o``.
    :return: the formatted shell command string.
    :raises ValueError: if ``s['ref']['version']`` is neither ``'b37'`` nor
        ``'hg38'``.
    """
    in_bams = bam_list_to_inputs(in_bams)

    # An unrecognised reference build must fail loudly; otherwise
    # ``in_knowns`` would be unbound when the command is formatted.
    ref_version = s['ref']['version']
    if ref_version == 'b37':
        in_knowns = [s['ref']['1kg_indel_vcf'], s['ref']['mills_vcf']]
    elif ref_version == 'hg38':
        in_knowns = [s['ref']['mills_and_1kg_indel_vcf']]
    else:
        raise ValueError('Unsupported reference version %r: expected "b37" or "hg38"'
                         % (ref_version,))

    # TODO should we pad intervals?  might be indels on perimeter that need realigner.  Not too worried because we're using HaplotypeCaller, though.
    return r"""
        #could add more knowns from ESP and other seq projects...
        {gatk} \
        -T RealignerTargetCreator \
        -R {s[ref][reference_fasta]} \
        -I {in_bams} \
        -o {out_sites} \
        {knowns} \
        -nt {core_req} \
        {args}
    """.format(s=s, gatk=gatk(mem_req),
               args=arg('--intervals', in_target_bed),
               knowns=' '.join('-known %s' % p for p in in_knowns),
               **locals())
Пример #7
0
# The find() patterns are raw strings so the regex escape for the dot is not
# treated as an (invalid) Python string escape; the pattern value is unchanged.
def trim_galore(in_fastq1=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
                in_fastq2=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
                out_directory=out_dir(''),
                out_fastq1=out_dir('trimmed_r1.fastq.gz'),
                out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    """
    Build the paired-end trim_galore command line.

    :param in_fastq1: first-read FASTQ, passed positionally.
    :param in_fastq2: second-read FASTQ, passed positionally.
    :param out_directory: directory passed to ``-o``.
    :param out_fastq1: not referenced by the command.
    :param out_fastq2: not referenced by the command.
    :return: the formatted shell command string.
    """
    # NOTE(review): the command uses --dont_gzip while the declared outputs
    # end in .gz and are never named in the command -- confirm this matches
    # what trim_galore actually writes.
    return r"""
        {s[opt][trim_galore]} \
        --paired \
        --dont_gzip \
        -o {out_directory} \
        --path_to_cutadapt {s[opt][cutadapt]} \
        {in_fastq1} {in_fastq2}
    """.format(s=s, **locals())
Пример #8
0
# Raw-string patterns keep the regex dot escape from being parsed as an
# invalid Python string escape; the resulting pattern text is identical.
def trim_galore(in_fastq1=find(r'fq.gz|\.fastq|fastq.gz',
                               tags=dict(read_pair='1')),
                in_fastq2=find(r'fq.gz|\.fastq|fastq.gz',
                               tags=dict(read_pair='2')),
                out_directory=out_dir(''),
                out_fastq1=out_dir('trimmed_r1.fastq.gz'),
                out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    """
    Return the shell command that runs trim_galore in paired mode.

    :param in_fastq1: first-read FASTQ, passed positionally.
    :param in_fastq2: second-read FASTQ, passed positionally.
    :param out_directory: directory passed to ``-o``.
    :param out_fastq1: not referenced by the command.
    :param out_fastq2: not referenced by the command.
    :return: the formatted shell command string.
    """
    # NOTE(review): --dont_gzip is used although the declared outputs end in
    # .gz and are never named in the command -- confirm against trim_galore's
    # real output names.
    return r"""
        {s[opt][trim_galore]} \
        --paired \
        --dont_gzip \
        -o {out_directory} \
        --path_to_cutadapt {s[opt][cutadapt]} \
        {in_fastq1} {in_fastq2}
    """.format(s=s, **locals())
Пример #9
0
def filter_bed_by_contig(contig,
                         drm='local',
                         in_bed=find('bed$'),
                         out_bed=out_dir('target.bed')):
    """
    Build a ``grep -P`` command that keeps only BED lines starting with
    ``contig`` followed by a tab.

    :param contig: contig name anchored at the start of each line.
    :param drm: not referenced by the command (presumably a scheduler hint for
        the workflow framework -- confirm).
    :param in_bed: BED file to filter.
    :param out_bed: file the matching lines are redirected to.
    :return: the formatted shell command string.
    """
    template = r"""
        grep -P "^{contig}\t" {in_bed} > {out_bed}
    """
    return template.format(s=s, contig=contig, in_bed=in_bed, out_bed=out_bed)
Пример #10
0
def word_count(chars=False,
               in_txts=find('txt$', n='>=1'),
               out_txt=out_dir('wc.txt')):
    """
    Build a ``wc`` command over the given text files.

    :param chars: when truthy, ``-c`` is added to the ``wc`` call.
    :param in_txts: files to count; each is converted with ``str`` and the
        results are space-joined.
    :param out_txt: file the output is redirected to.
    :return: the formatted shell command string.
    """
    flag = ' -c' if chars else ''
    joined_inputs = ' '.join(str(txt) for txt in in_txts)
    return 'wc{c} {input} > {out_txt}'.format(c=flag,
                                              input=joined_inputs,
                                              out_txt=out_txt)
Пример #11
0
def freebayes(reference_fasta=settings['ref']['reference_fasta'],
              max_complex_gap=2,
              no_complex=True,
              in_target_bed=find('bed$'),
              in_bam=find('bam$'),
              out_vcf=out_dir('variants.vcf')):
    """
    Build the freebayes command line.

    :param reference_fasta: reference passed to ``-f``.
    :param max_complex_gap: rendered as ``--max-complex-gap`` by ``args()``.
    :param no_complex: rendered as ``--no-complex`` by ``args()``.
    :param in_target_bed: BED passed to ``--targets``.
    :param in_bam: BAM passed as the final positional argument.
    :param out_vcf: destination passed to ``--vcf``.
    :return: the formatted shell command string.
    """
    extra_opts = args(('--max-complex-gap', max_complex_gap),
                      ('--no-complex', no_complex))
    template = r"""
        {s[opt][freebayes]} -f {reference_fasta} \
        --vcf {out_vcf} \
        --targets {in_target_bed} \
        {args} \
        -m 30 -q 10 -R 0 -S 0 -F 0.1 \
        {in_bam}
    """
    return template.format(s=settings,
                           args=extra_opts,
                           reference_fasta=reference_fasta,
                           out_vcf=out_vcf,
                           in_target_bed=in_target_bed,
                           in_bam=in_bam)
Пример #12
0
def filter_bed_by_contig(
        contig,
        drm='local',
        in_bed=find('bed$'),
        out_bed=out_dir('target.bed')):
    """
    Return a ``grep -P`` command selecting the BED lines whose first column
    is ``contig`` (the name followed by a tab).

    :param contig: contig name anchored at the start of each line.
    :param drm: not referenced by the command (presumably a scheduler hint for
        the workflow framework -- confirm).
    :param in_bed: BED file to filter.
    :param out_bed: file the matching lines are redirected to.
    :return: the formatted shell command string.
    """
    template = r"""
        grep -P "^{contig}\t" {in_bed} > {out_bed}
    """
    return template.format(s=settings,
                           contig=contig,
                           in_bed=in_bed,
                           out_bed=out_bed)
Пример #13
0
def sam_to_fastq_interleave(
        in_bam=find('bam$'),
        out_fastq=out_dir('reads.fastq')):
    """
    Build the Picard SamToFastq command line.

    :param in_bam: BAM passed as ``I=``.
    :param out_fastq: destination passed as ``FASTQ=``.
    :return: the formatted shell command string.
    """
    # NOTE(review): despite the function name, no interleave option appears
    # in the command -- confirm whether one is intended.
    template = r"""
        {picard} SamToFastq \
        I={in_bam} \
        FASTQ={out_fastq}
    """
    return template.format(s=s,
                           picard=picard(),
                           in_bam=in_bam,
                           out_fastq=out_fastq)
Пример #14
0
# The find() patterns are raw strings so the regex escape for the dot is not
# treated as an (invalid) Python string escape; the pattern value is unchanged.
def cut_adapt(minimum_length=50,
              in_fastq1=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='1')),
              in_fastq2=find(r'fq.gz|\.fastq|fastq.gz', tags=dict(read_pair='2')),
              out_fastq1=out_dir('trimmed_r1.fastq.gz'),
              out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    """
    Build the paired-end cutadapt command line with fixed adapter sequences.

    :param minimum_length: rendered as ``--minimum-length`` by ``args()``.
    :param in_fastq1: first-read FASTQ (first positional input).
    :param in_fastq2: second-read FASTQ (second positional input).
    :param out_fastq1: destination passed to ``-o``.
    :param out_fastq2: destination passed to ``-p``.
    :return: the formatted shell command string.
    """
    # out_fastq1='>( gzip > %s)' % out_fastq1
    # out_fastq2='>( gzip > %s)' % out_fastq2
    return r"""
        {s[opt][cutadapt]} \
        -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
        -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
        {args} \
        -o {out_fastq1} -p {out_fastq2} \
        {in_fastq1} {in_fastq2}
    """.format(s=s,
               args=args(('--minimum-length', minimum_length)),
               **locals())
Пример #15
0
# The find() pattern is a raw string so the regex escape for the dot is not
# treated as an (invalid) Python string escape; the pattern value is unchanged.
def split_fastq_file(num_chunks, prefix, out_fastqs, in_fastq=find(r'fq.gz|\.fastq|fastq.gz')):
    """
    Build the command that runs the ``fastq/split_fastq_file.py`` script.

    :param num_chunks: third positional argument to the script.
    :param prefix: second positional argument to the script.
    :param out_fastqs: not referenced by the command.
    :param in_fastq: first positional argument to the script.
    :return: the formatted shell command string.
    """
    return r"""

        python {b} {in_fastq} {prefix} {num_chunks}

    """.format(s=s,
               b=bin('fastq/split_fastq_file.py'),
               **locals())
Пример #16
0
def freebayes(
        reference_fasta=settings['ref']['reference_fasta'],
        max_complex_gap=2,
        no_complex=True,
        in_target_bed=find('bed$'),
        in_bam=find('bam$'),
        out_vcf=out_dir('variants.vcf')):
    """
    Return the shell command that runs freebayes on one BAM.

    :param reference_fasta: reference passed to ``-f``.
    :param max_complex_gap: rendered as ``--max-complex-gap`` by ``args()``.
    :param no_complex: rendered as ``--no-complex`` by ``args()``.
    :param in_target_bed: BED passed to ``--targets``.
    :param in_bam: BAM passed as the final positional argument.
    :param out_vcf: destination passed to ``--vcf``.
    :return: the formatted shell command string.
    """
    option_pairs = (('--max-complex-gap', max_complex_gap),
                    ('--no-complex', no_complex))
    rendered_opts = args(*option_pairs)
    template = r"""
        {s[opt][freebayes]} -f {reference_fasta} \
        --vcf {out_vcf} \
        --targets {in_target_bed} \
        {args} \
        -m 30 -q 10 -R 0 -S 0 -F 0.1 \
        {in_bam}
    """
    return template.format(s=settings,
                           args=rendered_opts,
                           reference_fasta=reference_fasta,
                           out_vcf=out_vcf,
                           in_target_bed=in_target_bed,
                           in_bam=in_bam)
Пример #17
0
# The find() pattern is a raw string so the regex escape for the dot is not
# treated as an (invalid) Python string escape; the pattern value is unchanged.
def ngsutils_fastq_split(num_chunks, prefix, in_fastq=find(r'fq.gz|\.fastq|fastq.gz')):
    """
    Build the ngsutils ``fastqutils split`` command line (gzipped output).

    Doesn't work with streams :(

    :param num_chunks: passed after the prefix on the command line.
    :param prefix: passed after the input file on the command line.
    :param in_fastq: FASTQ file to split.
    :return: the formatted shell command string.
    """
    return r"""
        {s[opt][ngsutils]}/fastqutils split {in_fastq} {prefix} {num_chunks} -gz

    """.format(s=s,
               **locals())
Пример #18
0
def fastq_to_sam(rgid,
                 sample_name,
                 library,
                 platform,
                 platform_unit,
                 in_fastq1=find('.fastq', tags=dict(read_pair='1')),
                 in_fastq2=find('.fastq', tags=dict(read_pair='2')),
                 out_bam=out_dir('unaligned.bam')):
    """
    Build the Picard FastqToSam command line for a read pair.

    :param rgid: value for ``READ_GROUP_NAME``.
    :param sample_name: value for ``SAMPLE_NAME``.
    :param library: value for ``LIBRARY_NAME``.
    :param platform: value for ``PLATFORM``.
    :param platform_unit: value for ``PLATFORM_UNIT``.
    :param in_fastq1: value for ``FASTQ``.
    :param in_fastq2: value for ``FASTQ2``.
    :param out_bam: value for ``O``.
    :return: the formatted shell command string.
    """
    template = r"""
        {picard} FastqToSam \
        FASTQ={in_fastq1} \
        FASTQ2={in_fastq2} \
        O={out_bam} \
        SAMPLE_NAME={sample_name} \
        LIBRARY_NAME={library} \
        PLATFORM_UNIT={platform_unit} \
        PLATFORM={platform} \
        READ_GROUP_NAME={rgid}

    """
    return template.format(s=s,
                           picard=picard(),
                           in_fastq1=in_fastq1,
                           in_fastq2=in_fastq2,
                           out_bam=out_bam,
                           sample_name=sample_name,
                           library=library,
                           platform_unit=platform_unit,
                           platform=platform,
                           rgid=rgid)
Пример #19
0
# Raw-string pattern keeps the regex dot escape from being parsed as an
# invalid Python string escape; the resulting pattern text is identical.
def split_fastq_file(num_chunks,
                     prefix,
                     out_fastqs,
                     in_fastq=find(r'fq.gz|\.fastq|fastq.gz')):
    """
    Return the command that invokes ``fastq/split_fastq_file.py``.

    :param num_chunks: third positional argument to the script.
    :param prefix: second positional argument to the script.
    :param out_fastqs: not referenced by the command.
    :param in_fastq: first positional argument to the script.
    :return: the formatted shell command string.
    """
    return r"""

        python {b} {in_fastq} {prefix} {num_chunks}

    """.format(s=s, b=bin('fastq/split_fastq_file.py'), **locals())
Пример #20
0
def sam_to_fastq_interleave(in_bam=find('bam$'),
                            out_fastq=out_dir('reads.fastq')):
    """
    Return the shell command that runs Picard SamToFastq.

    :param in_bam: BAM passed as ``I=``.
    :param out_fastq: destination passed as ``FASTQ=``.
    :return: the formatted shell command string.
    """
    # NOTE(review): the name says "interleave" but the command carries no
    # interleave option -- confirm the intent.
    picard_cmd = picard()
    template = r"""
        {picard} SamToFastq \
        I={in_bam} \
        FASTQ={out_fastq}
    """
    return template.format(s=s, picard=picard_cmd,
                           in_bam=in_bam, out_fastq=out_fastq)
Пример #21
0
def collect_variant_calling_metrics(
        in_vcf=find('in_vcf'),
        in_dbsnp=s['ref']['dbsnp_vcf'],
        out_path=out_dir('picard.variant_metrics')):
    """
    Build the Picard CollectVariantCallingMetrics command line.

    :param in_vcf: VCF passed as ``I=``.
        NOTE(review): the default pattern is ``'in_vcf'`` while other tools
        match on ``'vcf'`` -- confirm it is intentional.
    :param in_dbsnp: dbSNP VCF passed as ``DBSNP=``.
    :param out_path: destination passed as ``O=``.
    :return: the formatted shell command string.
    """
    template = r"""
        {picard} CollectVariantCallingMetrics \
        I={in_vcf} \
        DBSNP={in_dbsnp} \
        O={out_path}
    """
    return template.format(picard=picard(),
                           in_vcf=in_vcf,
                           in_dbsnp=in_dbsnp,
                           out_path=out_path)
Пример #22
0
def merge_sam_files(
        in_bams=find('bam', n='>=1'),
        out_bam=out_dir('merged.bam'),
        out_bai=out_dir('merged.bai')):
    """
    Build the Picard MergeSamFiles command line (index creation enabled).

    :param in_bams: BAMs to merge; rendered by ``list_to_input``.
    :param out_bam: destination passed as ``O=``.
    :param out_bai: not referenced by the command.
    :return: the formatted shell command string.
    """
    picard_cmd = picard()
    input_opts = list_to_input(in_bams)
    template = r"""
        {picard} MergeSamFiles \
        {inputs} \
        O={out_bam} \
        ASSUME_SORTED=True \
        CREATE_INDEX=True
    """
    return template.format(picard=picard_cmd,
                           inputs=input_opts,
                           out_bam=out_bam)
Пример #23
0
# Raw-string pattern keeps the regex dot escape from being parsed as an
# invalid Python string escape; the resulting pattern text is identical.
def ngsutils_fastq_split(num_chunks,
                         prefix,
                         in_fastq=find(r'fq.gz|\.fastq|fastq.gz')):
    """
    Return the ngsutils ``fastqutils split`` command (gzipped output).

    Doesn't work with streams :(

    :param num_chunks: passed after the prefix on the command line.
    :param prefix: passed after the input file on the command line.
    :param in_fastq: FASTQ file to split.
    :return: the formatted shell command string.
    """
    return r"""
        {s[opt][ngsutils]}/fastqutils split {in_fastq} {prefix} {num_chunks} -gz

    """.format(s=s, **locals())
Пример #24
0
def collect_variant_calling_metrics(in_vcf=find('in_vcf'),
                                    in_dbsnp=s['ref']['dbsnp_vcf'],
                                    out_path=out_dir('picard.variant_metrics')):
    """
    Return the shell command that runs Picard CollectVariantCallingMetrics.

    :param in_vcf: VCF passed as ``I=``.
        NOTE(review): the default pattern is ``'in_vcf'`` while other tools
        match on ``'vcf'`` -- confirm it is intentional.
    :param in_dbsnp: dbSNP VCF passed as ``DBSNP=``.
    :param out_path: destination passed as ``O=``.
    :return: the formatted shell command string.
    """
    picard_cmd = picard()
    template = r"""
        {picard} CollectVariantCallingMetrics \
        I={in_vcf} \
        DBSNP={in_dbsnp} \
        O={out_path}
    """
    return template.format(picard=picard_cmd, in_vcf=in_vcf,
                           in_dbsnp=in_dbsnp, out_path=out_path)
Пример #25
0
# Raw-string patterns keep the regex dot escape from being parsed as an
# invalid Python string escape; the resulting pattern text is identical.
def cut_adapt(minimum_length=50,
              in_fastq1=find(r'fq.gz|\.fastq|fastq.gz',
                             tags=dict(read_pair='1')),
              in_fastq2=find(r'fq.gz|\.fastq|fastq.gz',
                             tags=dict(read_pair='2')),
              out_fastq1=out_dir('trimmed_r1.fastq.gz'),
              out_fastq2=out_dir('trimmed_r2.fastq.gz')):
    """
    Return the paired-end cutadapt command with fixed adapter sequences.

    :param minimum_length: rendered as ``--minimum-length`` by ``args()``.
    :param in_fastq1: first-read FASTQ (first positional input).
    :param in_fastq2: second-read FASTQ (second positional input).
    :param out_fastq1: destination passed to ``-o``.
    :param out_fastq2: destination passed to ``-p``.
    :return: the formatted shell command string.
    """
    # out_fastq1='>( gzip > %s)' % out_fastq1
    # out_fastq2='>( gzip > %s)' % out_fastq2
    return r"""
        {s[opt][cutadapt]} \
        -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
        -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \
        {args} \
        -o {out_fastq1} -p {out_fastq2} \
        {in_fastq1} {in_fastq2}
    """.format(s=s,
               args=args(('--minimum-length', minimum_length)),
               **locals())
Пример #26
0
def mark_illumina_adapters(mem_req=8 * 1024,
                           in_bam=find('bam'),
                           out_bam=out_dir('unaligned_trimmed.bam'),
                           out_metrics=out_dir('adapter.metrics')):
    """
    Build the Picard MarkIlluminaAdapters command line.

    :param mem_req: not referenced by the command.
    :param in_bam: BAM passed as ``I=``.
    :param out_bam: destination passed as ``O=``.
    :param out_metrics: destination passed as ``METRICS=``.
    :return: the formatted shell command string.
    """
    # A space precedes the continuation backslash after the tool name so the
    # command does not rely on the next line's indentation to keep the
    # tokens apart.
    return r"""
        {picard} MarkIlluminaAdapters \
        I={in_bam} \
        O={out_bam} \
        METRICS={out_metrics}
    """.format(s=s, picard=picard(), **locals())
Пример #27
0
def bwa_mem(rgid, sample_name, library, platform, platform_unit,
            reference=s['ref']['reference_fasta'],
            core_req=16,
            in_fastqs=find('.fastq|.fq|.fq.gz|.fastq.gz', n=2),
            out_cutadapt_log=out_dir('cutadapt.log'),
            out_bam=out_dir('aligned.bam'),
            out_bai=out_dir('aligned.bai')):
    """
    Build the ``bwa mem | picard SortSam`` pipeline for one read pair.

    :param rgid: read-group ``ID`` field.
    :param sample_name: read-group ``SM`` field.
    :param library: read-group ``LB`` field.
    :param platform: read-group ``PL`` field.
    :param platform_unit: read-group ``PU`` field.
    :param reference: reference passed to ``bwa mem``.
    :param core_req: cores requested; bwa gets ``core_req - 2`` threads, but
        never fewer than one.
    :param in_fastqs: exactly two FASTQs, unpacked as read 1 and read 2.
    :param out_cutadapt_log: not referenced by the command.
    :param out_bam: destination passed to SortSam as ``O=``.
    :param out_bai: not referenced by the command (SortSam is run with
        ``CREATE_INDEX=true``).
    :return: the formatted shell command string.
    """
    in_fastq1, in_fastq2 = in_fastqs

    # Two cores are held back from bwa (presumably for the SortSam stage --
    # confirm); clamp so a small core_req never yields "-t 0" or a negative
    # thread count.
    bwa_cores = max(1, core_req - 2)

    return r"""
        {s[opt][bwa]} mem \
          -t {bwa_cores} -L 0 -M \
          -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
          {reference} \
          {in_fastq1} \
          {in_fastq2} \
        | {picard} SortSam I=/dev/stdin O={out_bam} CREATE_INDEX=true SORT_ORDER=coordinate
        """.format(s=s,
                   picard=picard.picard(),
                   **locals())


    # @can_stream(['in_fastq1', 'in_fastq2'])
    # def bwa_mem_with_trimming(rgid, sample_name, library, platform, platform_unit,
    # reference=s['ref']['reference_fasta'],
    #             core_req=16,
    #             in_fastq1=find('.fastq', tags=dict(read_pair='1')),
    #             in_fastq2=find('.fastq', tags=dict(read_pair='2')),
    #             out_bam=out_dir('aligned.bam'),
    #             out_bai=out_dir('aligned.bam.bai'),
    #             out_adapter_metrics=out_dir('adapter.metrics')):
    #     return r"""
    #
    #             {fastq_to_sam} \
    #             | {mark_illumina_adapters} \
    #             | {sam_to_fastq}
    #             | {s[opt][bwa]} mem \
    #               -t {core_req} -L 0 -M -p \
    #               -R "@RG\tID:{rgid}\tLB:{library}\tSM:{sample_name}\tPL:{platform}\tPU:{platform_unit}" \
    #               {reference} \
    #               /dev/stdin \
    #             | {s[opt][samtools]} sort -@ 2 -m 2G - {samtools_out}
    #
    #             {s[opt][samtools]} index {out_bam}
    #             """.format(s=s,
    #                        fastq_to_sam=picard.fastq_to_sam(rgid=rgid, sample_name=sample_name, library=library, platform=platform, platform_unit=platform_unit,
    #                                                         in_fastq1=in_fastq1, in_fastq2=in_fastq2, out_bam='/dev/stdout').strip(),
    #                        mark_illumina_adapters=picard.mark_illumina_adapters(in_bam='/dev/stdin', out_bam='/dev/stdout', metrics=out_adapter_metrics).strip(),
    #                        sam_to_fastq=picard.sam_to_fastq_interleave('/dev/stdin', '/dev/stdout'),
    #
    #                        samtools_out=out_bam.replace('.bam', ''),
    #                        **locals())
Пример #28
0
def mark_duplicates(core_req=4,  # for scratch space
                    mem_req=12 * 1024,
                    in_bams=find('bam$', n='>=1'),
                    in_bais=find('bai$', n='>=1'),
                    out_bam=out_dir('deduped.bam'),
                    out_bai=out_dir('deduped.bam.bai'),
                    out_metrics=out_dir('deduped.metrics')):
    """
    Build the Picard MarkDuplicates command followed by ``samtools index``.

    :param core_req: not referenced by the command (see inline comment).
    :param mem_req: not referenced by the command.
    :param in_bams: input BAMs; rendered by ``list_to_input``.
    :param in_bais: not referenced by the command.
    :param out_bam: destination passed as ``O=`` and then indexed.
    :param out_bai: not referenced by the command.
    :param out_metrics: destination passed as ``METRICS_FILE=``.
    :return: the formatted shell command string.
    """
    input_opts = list_to_input(in_bams)
    picard_cmd = picard()
    template = r"""
        {picard} MarkDuplicates \
        {inputs} \
        O={out_bam} \
        METRICS_FILE={out_metrics} \
        ASSUME_SORTED=True \
        MAX_RECORDS_IN_RAM=1000000 \
        VALIDATION_STRINGENCY=SILENT \
        VERBOSITY=INFO

        {s[opt][samtools]} index {out_bam}
    """
    return template.format(inputs=input_opts,
                           s=s,
                           picard=picard_cmd,
                           out_bam=out_bam,
                           out_metrics=out_metrics)
Пример #29
0
def merge(in_bams=find('bam$', n='>0'),
          out_bam=out_dir('merged.bam')):
    """
    Build a command producing ``out_bam`` from one or more BAMs.

    A single input is copied with ``cp``; otherwise ``samtools merge -f`` is
    used.

    :param in_bams: BAMs to merge.
    :param out_bam: destination file.
    :return: the formatted shell command string.
    """
    if len(in_bams) != 1:
        joined_bams = ' '.join(str(bam) for bam in in_bams)
        return r"""
            {s[opt][samtools]} merge -f {out_bam} {in_bams}
        """.format(s=s, out_bam=out_bam, in_bams=joined_bams)

    # Can't merge 1 bam, just copy it
    return r"""
        cp {in_bams[0]} {out_bam}
        """.format(in_bams=in_bams, out_bam=out_bam)
Пример #30
0
def collect_wgs_metrics(
        in_bam=find('bam'),
        out_path=out_dir('picard.raw_wgs_metrics.txt'),
        reference_fasta=s['ref']['reference_fasta']):
    """
    Build the Picard CollectRawWgsMetrics command line.

    :param in_bam: BAM passed as ``I=``.
    :param out_path: destination passed as ``O=``.
    :param reference_fasta: reference passed as ``R=``.
    :return: the formatted shell command string.
    """
    template = r"""
    {picard} CollectRawWgsMetrics \
      I={in_bam} \
      O={out_path} \
      R={reference_fasta} \
      INCLUDE_BQ_HISTOGRAM=true
    """
    return template.format(picard=picard(),
                           in_bam=in_bam,
                           out_path=out_path,
                           reference_fasta=reference_fasta)
Пример #31
0
def fastq_to_sam(rgid, sample_name, library, platform, platform_unit,
                 in_fastq1=find('.fastq', tags=dict(read_pair='1')),
                 in_fastq2=find('.fastq', tags=dict(read_pair='2')),
                 out_bam=out_dir('unaligned.bam')):
    """
    Return the shell command that runs Picard FastqToSam on a read pair.

    :param rgid: value for ``READ_GROUP_NAME``.
    :param sample_name: value for ``SAMPLE_NAME``.
    :param library: value for ``LIBRARY_NAME``.
    :param platform: value for ``PLATFORM``.
    :param platform_unit: value for ``PLATFORM_UNIT``.
    :param in_fastq1: value for ``FASTQ``.
    :param in_fastq2: value for ``FASTQ2``.
    :param out_bam: value for ``O``.
    :return: the formatted shell command string.
    """
    picard_cmd = picard()
    template = r"""
        {picard} FastqToSam \
        FASTQ={in_fastq1} \
        FASTQ2={in_fastq2} \
        O={out_bam} \
        SAMPLE_NAME={sample_name} \
        LIBRARY_NAME={library} \
        PLATFORM_UNIT={platform_unit} \
        PLATFORM={platform} \
        READ_GROUP_NAME={rgid}

    """
    return template.format(s=s, picard=picard_cmd,
                           in_fastq1=in_fastq1, in_fastq2=in_fastq2,
                           out_bam=out_bam, sample_name=sample_name,
                           library=library, platform_unit=platform_unit,
                           platform=platform, rgid=rgid)
Пример #32
0
# The find() patterns are raw strings so the regex escape for the dot is not
# treated as an (invalid) Python string escape; the pattern value is unchanged.
def fastqc(core_req=8,
           in_r1s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    """
    Build the command that streams every FASTQ through FastQC via stdin.

    :param core_req: thread count handed to ``--threads``.
    :param in_r1s: read-1 FASTQs; must be as many as ``in_r2s``.
    :param in_r2s: read-2 FASTQs.
    :param out_dir: output directory, created with ``mkdir -p`` and passed to
        ``-o``. Inside the body this name shadows the module-level
        ``out_dir`` helper.
    :return: the formatted shell command string.
    :raises AssertionError: if the two input lists differ in length.
    """
    assert len(in_r1s) == len(in_r2s)

    # Concatenate all files (read 1 first, then read 2); when the first R1
    # name contains '.gz' the stream is decompressed with zcat, otherwise it
    # is passed through a second cat unchanged.
    cat = 'cat {fqs} | {zcat_or_cat}'.format(fqs=' '.join(in_r1s + in_r2s),
                                             zcat_or_cat='zcat' if '.gz' in in_r1s[0] else 'cat')

    return r"""
            mkdir -p {out_dir}

            {cat} | \
            {s[opt][fastqc]} \
            --threads {core_req} \
            --dir {s[gk][tmp_dir]} \
            -o {out_dir} \
            /dev/stdin

            """.format(s=s, **locals())
Пример #33
0
def combine_gvcfs(
        mem_req=12 * 1024,
        in_vcfs=find('vcf|vcf.gz$', n='>0'),
        out_vcf=out_dir('variants.g.vcf')):
    """
    Build the GATK CombineGVCFs command line.

    :param mem_req: forwarded to ``gatk()``.
    :param in_vcfs: input VCFs; rendered by ``vcf_list_to_input``.
    :param out_vcf: destination passed to ``-o``.
    :return: the formatted shell command string.
    """
    vcf_opts = vcf_list_to_input(in_vcfs)
    template = r"""
        {gatk} \
        -T CombineGVCFs \
        -R {s[ref][reference_fasta]} \
        {in_vcfs} \
        -o {out_vcf}
    """
    return template.format(s=s,
                           gatk=gatk(mem_req),
                           in_vcfs=vcf_opts,
                           out_vcf=out_vcf)
Пример #34
0
def merge_sam_files(in_bams=find('bam', n='>=1'),
                    out_bam=out_dir('merged.bam'),
                    out_bai=out_dir('merged.bai')):
    """
    Return the shell command that runs Picard MergeSamFiles with index
    creation enabled.

    :param in_bams: BAMs to merge; rendered by ``list_to_input``.
    :param out_bam: destination passed as ``O=``.
    :param out_bai: not referenced by the command.
    :return: the formatted shell command string.
    """
    template = r"""
        {picard} MergeSamFiles \
        {inputs} \
        O={out_bam} \
        ASSUME_SORTED=True \
        CREATE_INDEX=True
    """
    return template.format(picard=picard(),
                           inputs=list_to_input(in_bams),
                           out_bam=out_bam)
Пример #35
0
def mark_duplicates(
        core_req=4,  # for scratch space
        mem_req=12 * 1024,
        in_bams=find('bam$', n='>=1'),
        in_bais=find('bai$', n='>=1'),
        out_bam=out_dir('deduped.bam'),
        out_bai=out_dir('deduped.bam.bai'),
        out_metrics=out_dir('deduped.metrics')):
    """
    Return the shell script that runs Picard MarkDuplicates and then indexes
    the result with samtools.

    :param core_req: not referenced by the command (see inline comment).
    :param mem_req: not referenced by the command.
    :param in_bams: input BAMs; rendered by ``list_to_input``.
    :param in_bais: not referenced by the command.
    :param out_bam: destination passed as ``O=`` and then indexed.
    :param out_bai: not referenced by the command.
    :param out_metrics: destination passed as ``METRICS_FILE=``.
    :return: the formatted shell command string.
    """
    rendered_inputs = list_to_input(in_bams)
    template = r"""
        {picard} MarkDuplicates \
        {inputs} \
        O={out_bam} \
        METRICS_FILE={out_metrics} \
        ASSUME_SORTED=True \
        MAX_RECORDS_IN_RAM=1000000 \
        VALIDATION_STRINGENCY=SILENT \
        VERBOSITY=INFO

        {s[opt][samtools]} index {out_bam}
    """
    return template.format(inputs=rendered_inputs, s=s, picard=picard(),
                           out_bam=out_bam, out_metrics=out_metrics)
Пример #36
0
def mark_illumina_adapters(mem_req=8 * 1024,
                           in_bam=find('bam'),
                           out_bam=out_dir('unaligned_trimmed.bam'),
                           out_metrics=out_dir('adapter.metrics')):
    """
    Return the shell command that runs Picard MarkIlluminaAdapters.

    :param mem_req: not referenced by the command.
    :param in_bam: BAM passed as ``I=``.
    :param out_bam: destination passed as ``O=``.
    :param out_metrics: destination passed as ``METRICS=``.
    :return: the formatted shell command string.
    """
    # Keep a space before the continuation backslash after the tool name so
    # token separation does not depend on the next line's indentation.
    return r"""
        {picard} MarkIlluminaAdapters \
        I={in_bam} \
        O={out_bam} \
        METRICS={out_metrics}
    """.format(s=s,
               picard=picard(),
               **locals())
Пример #37
0
def merge(in_bams=find('bam$', n='>0'), out_bam=out_dir('merged.bam')):
    """
    Return the command that writes ``out_bam`` from the input BAMs.

    With exactly one input the file is copied (``cp``); with any other count
    ``samtools merge -f`` is emitted.

    :param in_bams: BAMs to merge.
    :param out_bam: destination file.
    :return: the formatted shell command string.
    """
    single_input = len(in_bams) == 1
    if not single_input:
        bam_args = ' '.join([str(bam) for bam in in_bams])
        merge_template = r"""
            {s[opt][samtools]} merge -f {out_bam} {in_bams}
        """
        return merge_template.format(s=s, out_bam=out_bam, in_bams=bam_args)

    # Can't merge 1 bam, just copy it
    copy_template = r"""
        cp {in_bams[0]} {out_bam}
        """
    return copy_template.format(in_bams=in_bams, out_bam=out_bam)
Пример #38
0
def combine_gvcfs(mem_req=12 * 1024,
                  in_vcfs=find('vcf|vcf.gz$', n='>0'),
                  out_vcf=out_dir('variants.g.vcf')):
    """
    Return the shell command that runs GATK CombineGVCFs.

    :param mem_req: forwarded to ``gatk()``.
    :param in_vcfs: input VCFs; rendered by ``vcf_list_to_input``.
    :param out_vcf: destination passed to ``-o``.
    :return: the formatted shell command string.
    """
    rendered_vcfs = vcf_list_to_input(in_vcfs)
    gatk_cmd = gatk(mem_req)
    template = r"""
        {gatk} \
        -T CombineGVCFs \
        -R {s[ref][reference_fasta]} \
        {in_vcfs} \
        -o {out_vcf}
    """
    return template.format(s=s, gatk=gatk_cmd,
                           in_vcfs=rendered_vcfs, out_vcf=out_vcf)
Пример #39
0
def genotype_gvcfs(
        core_req=8,
        mem_req=12 * 1024,
        in_vcfs=find('vcf|vcf.gz$', n='>0'),
        out_vcf=out_dir('variants.vcf')):
    """
    Build the GATK GenotypeGVCFs command line.

    :param core_req: thread count handed to ``-nt``.
    :param mem_req: forwarded to ``gatk()``.
    :param in_vcfs: input gVCFs; rendered by ``vcf_list_to_input``.
    :param out_vcf: destination passed to ``-o``.
    :return: the formatted shell command string.
    """
    gatk_cmd = gatk(mem_req)
    vcf_opts = vcf_list_to_input(in_vcfs)
    template = r"""
        {gatk} \
        -T GenotypeGVCFs \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nt {core_req} \
        {inputs} \
        -o {out_vcf}
    """
    return template.format(s=s,
                           gatk=gatk_cmd,
                           inputs=vcf_opts,
                           core_req=core_req,
                           out_vcf=out_vcf)
Пример #40
0
def genotype_gvcfs(core_req=8,
                   mem_req=12 * 1024,
                   in_vcfs=find('vcf|vcf.gz$', n='>0'),
                   out_vcf=out_dir('variants.vcf')):
    """
    Return the shell command that runs GATK GenotypeGVCFs.

    :param core_req: thread count handed to ``-nt``.
    :param mem_req: forwarded to ``gatk()``.
    :param in_vcfs: input gVCFs; rendered by ``vcf_list_to_input``.
    :param out_vcf: destination passed to ``-o``.
    :return: the formatted shell command string.
    """
    template = r"""
        {gatk} \
        -T GenotypeGVCFs \
        -R {s[ref][reference_fasta]} \
        -D {s[ref][dbsnp_vcf]} \
        -nt {core_req} \
        {inputs} \
        -o {out_vcf}
    """
    return template.format(s=s,
                           gatk=gatk(mem_req),
                           inputs=vcf_list_to_input(in_vcfs),
                           core_req=core_req,
                           out_vcf=out_vcf)
Пример #41
0
# Raw-string patterns keep the regex dot escape from being parsed as an
# invalid Python string escape; the resulting pattern text is identical.
def fastqc(core_req=8,
           in_r1s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='1')),
           in_r2s=find(r'fq.gz|\.fastq|fastq.gz', n='>=1', tags=dict(read_pair='2')),
           out_dir=out_dir('fastqc/')):
    """
    Build the command that runs FastQC on every input FASTQ.

    :param core_req: thread count handed to ``--threads``.
    :param in_r1s: read-1 FASTQs; must be as many as ``in_r2s``.
    :param in_r2s: read-2 FASTQs.
    :param out_dir: output directory, created with ``mkdir -p`` and passed to
        ``-o``. Inside the body this name shadows the module-level
        ``out_dir`` helper.
    :return: the formatted shell command string.
    :raises AssertionError: if the two input lists differ in length.
    """
    assert len(in_r1s) == len(in_r2s)

    # Files are passed to fastqc as positional arguments, read 1 first and
    # then read 2.
    return r"""
            mkdir -p {out_dir}

            {s[opt][fastqc]} \
            --threads {core_req} \
            --dir {s[gk][tmp_dir]} \
            -o {out_dir} \
            {fqs}

            """.format(s=s, fqs=' '.join(in_r1s + in_r2s), **locals())
Пример #42
0
def collect_multiple_metrics(
        in_bam=find('bam'),
        out_path=out_dir('picard'),
        reference_fasta=s['ref']['reference_fasta']):
    """
    Build the Picard CollectMultipleMetrics command line with a fixed list
    of ``PROGRAM=`` entries.

    :param in_bam: BAM passed as ``I=``.
    :param out_path: destination passed as ``O=``.
    :param reference_fasta: reference passed as ``R=``.
    :return: the formatted shell command string.
    """
    metric_programs = (
        'CollectAlignmentSummaryMetrics',
        'CollectInsertSizeMetrics',
        'QualityScoreDistribution',
        'MeanQualityByCycle',
        'CollectBaseDistributionByCycle',
        'CollectGcBiasMetrics',
        'CollectSequencingArtifactMetrics',
        'CollectQualityYieldMetrics')
    picard_cmd = picard()
    program_opts = ' '.join(['PROGRAM=%s' % name for name in metric_programs])
    template = r"""
      {picard} CollectMultipleMetrics \
      I={in_bam} \
      O={out_path} \
      R={reference_fasta} \
      {programs}
    """
    return template.format(picard=picard_cmd,
                           programs=program_opts,
                           in_bam=in_bam,
                           out_path=out_path,
                           reference_fasta=reference_fasta)
Пример #43
0
def collect_multiple_metrics(in_bam=find('bam'),
                             out_path=out_dir('picard'),
                             reference_fasta=s['ref']['reference_fasta']):
    """
    Return the shell command that runs Picard CollectMultipleMetrics with a
    fixed list of ``PROGRAM=`` entries (including CollectWgsMetrics).

    :param in_bam: BAM passed as ``I=``.
    :param out_path: destination passed as ``O=``.
    :param reference_fasta: reference passed as ``R=``.
    :return: the formatted shell command string.
    """
    metric_programs = (
        'CollectAlignmentSummaryMetrics',
        'CollectInsertSizeMetrics',
        'QualityScoreDistribution',
        'MeanQualityByCycle',
        'CollectBaseDistributionByCycle',
        'CollectGcBiasMetrics',
        'CollectSequencingArtifactMetrics',
        'CollectQualityYieldMetrics',
        'CollectWgsMetrics')
    picard_cmd = picard()
    program_opts = ' '.join(['PROGRAM=%s' % name for name in metric_programs])
    template = r"""
      {picard} CollectMultipleMetrics \
      I={in_bam} \
      O={out_path} \
      R={reference_fasta} \
      {programs}
    """
    return template.format(picard=picard_cmd, programs=program_opts,
                           in_bam=in_bam, out_path=out_path,
                           reference_fasta=reference_fasta)
Пример #44
0
def select_variants(select_type,
                    in_vcfs=find('vcf|vcf.gz$', n='>0'),
                    out_vcf=out_dir('variants.vcf'),
                    in_reference_fasta=s['ref']['reference_fasta'],
                    mem_req=6 * 1024):
    """
    Build the GATK SelectVariants command line.

    :param select_type: "SNP" or "INDEL"
    :param in_vcfs: input VCFs; rendered by ``vcf_list_to_input``.
    :param out_vcf: destination passed to ``-o``.
    :param in_reference_fasta: reference passed to ``-R``.
    :param mem_req: forwarded to ``gatk()``.
    :return: the formatted shell command string.
    """
    gatk_cmd = gatk(mem_req)
    vcf_opts = vcf_list_to_input(in_vcfs)
    template = r"""
        {gatk} \
        -T SelectVariants \
        -R {in_reference_fasta} \
        {inputs} \
        -selectType {select_type} \
        -o {out_vcf}
    """
    return template.format(s=s,
                           gatk=gatk_cmd,
                           inputs=vcf_opts,
                           in_reference_fasta=in_reference_fasta,
                           select_type=select_type,
                           out_vcf=out_vcf)
Пример #45
0
def cat(in_txts=find('txt$', n='>=1'), out_txt=out_dir('cat.txt')):
    """
    Build a ``cat`` command concatenating the inputs into ``out_txt``.

    :param in_txts: files to concatenate; each is converted with ``str``.
    :param out_txt: file the output is redirected to.
    :return: the formatted shell command string.
    """
    joined_inputs = ' '.join(str(txt) for txt in in_txts)
    return 'cat {input_str} > {out_txt}'.format(input_str=joined_inputs,
                                                out_txt=out_txt)
Пример #46
0
def md5sum(in_file=find('.*', n=1), out_md5=out_dir('checksum.md5')):
    """
    Build an ``md5sum`` command whose output is written to ``out_md5``.

    ``out_md5.basename`` is first set to ``in_file.basename + '.md5'``.

    :param in_file: file to checksum.
    :param out_md5: declared output; receives the ``md5sum`` output.
    :return: the formatted shell command string.
    """
    out_md5.basename = in_file.basename + '.md5'
    # Redirect into the declared output; without the redirect the checksum
    # only reaches stdout and out_md5 is never created.
    return 'md5sum {in_file} > {out_md5}'.format(**locals())
Пример #47
0
def md5sum(in_file=find('.*', n=1), out_md5=out_dir('checksum.md5')):
    """
    Return the ``md5sum`` command for ``in_file``, redirected to ``out_md5``.

    The output's ``basename`` attribute is set to the input's basename plus
    ``'.md5'`` before the command is rendered.

    :param in_file: file to checksum.
    :param out_md5: declared output; receives the ``md5sum`` output.
    :return: the formatted shell command string.
    """
    out_md5.basename = in_file.basename + '.md5'
    # The redirect makes the command actually produce its declared output
    # instead of printing the checksum to stdout only.
    return 'md5sum {in_file} > {out_md5}'.format(**locals())
Пример #48
0
def word_count(chars=False,
               in_txts=find('txt$', n='>=1'),
               out_txt=out_dir('wc.txt')):
    """
    Return the ``wc`` command for the given text files.

    :param chars: when truthy, ``-c`` is added to the ``wc`` call.
    :param in_txts: files to count; each is converted with ``str`` and the
        results are space-joined.
    :param out_txt: file the output is redirected to.
    :return: the formatted shell command string.
    """
    if chars:
        char_flag = ' -c'
    else:
        char_flag = ''
    file_args = ' '.join([str(txt) for txt in in_txts])
    return 'wc{c} {input} > {out_txt}'.format(c=char_flag,
                                              input=file_args,
                                              out_txt=out_txt)
Пример #49
0
def cat(in_txts=find('txt$', n='>=1'),
        out_txt=out_dir('cat.txt')):
    """
    Return the ``cat`` command that concatenates the inputs into ``out_txt``.

    :param in_txts: files to concatenate; each is converted with ``str``.
    :param out_txt: file the output is redirected to.
    :return: the formatted shell command string.
    """
    file_args = ' '.join([str(txt) for txt in in_txts])
    return 'cat {input_str} > {out_txt}'.format(input_str=file_args,
                                                out_txt=out_txt)
Пример #50
0
def paste(in_txts=find('txt$', n='>=1'), out_txt=out_dir('paste.txt')):
    """
    Build a ``paste`` command joining the inputs side by side into ``out_txt``.

    :param in_txts: files to paste; each is converted with ``str`` and the
        results are space-joined.
    :param out_txt: file the output is redirected to.
    :return: the formatted shell command string.
    """
    # Map str over the individual files; wrapping the collection in a
    # one-element tuple would stringify the whole collection as one token.
    return 'paste {input} > {out_txt}'.format(
        input=' '.join(map(str, in_txts)),
        **locals()
    )
Пример #51
0
def view(f,
         in_bam=find('bam$'),
         out_bam=out_dir('reads.bam')):
    """
    Build a ``samtools view -f`` command that writes its output (with header,
    ``-h``) to ``out_bam``.

    :param f: value passed to the ``-f`` option.
    :param in_bam: BAM to read.
    :param out_bam: file the output is redirected to.
    :return: the formatted shell command string.
    """
    template = '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'
    return template.format(s=s, f=f, in_bam=in_bam, out_bam=out_bam)
Пример #52
0
def vcf_concat_parts(
        in_vcfs=find('vcf$', n='>0'),
        out_vcf=out_dir('freebayes.vcf')):
    """
    Build the ``vcf_concat_parts`` command over the given VCFs.

    :param in_vcfs: VCFs to concatenate; joined with spaces as-is.
    :param out_vcf: file the output is redirected to.
    :return: the formatted shell command string.
    """
    joined_vcfs = ' '.join(in_vcfs)
    template = r"""
        {s[opt][vcf_concat_parts]} {vcfs} > {out_vcf}
    """
    return template.format(s=settings, vcfs=joined_vcfs, out_vcf=out_vcf)
Пример #53
0
def paste(in_txts=find('txt$', n='>=1'), out_txt=out_dir('paste.txt')):
    """
    Return the ``paste`` command that joins the inputs into ``out_txt``.

    :param in_txts: files to paste; each is converted with ``str`` and the
        results are space-joined.
    :param out_txt: file the output is redirected to.
    :return: the formatted shell command string.
    """
    # Iterate over the files themselves: stringifying a one-element tuple
    # holding the collection would emit the collection's repr as one token.
    return 'paste {input} > {out_txt}'.format(
        input=' '.join(map(str, in_txts)),
        **locals())
Пример #54
0
def vcf_concat_parts(in_vcfs=find('vcf$', n='>0'),
                     out_vcf=out_dir('freebayes.vcf')):
    """
    Return the shell command that runs ``vcf_concat_parts`` on the inputs.

    :param in_vcfs: VCFs to concatenate; joined with spaces as-is.
    :param out_vcf: file the output is redirected to.
    :return: the formatted shell command string.
    """
    template = r"""
        {s[opt][vcf_concat_parts]} {vcfs} > {out_vcf}
    """
    return template.format(s=settings,
                           vcfs=' '.join(in_vcfs),
                           out_vcf=out_vcf)
Пример #55
0
def view(f, in_bam=find('bam$'), out_bam=out_dir('reads.bam')):
    """
    Return the ``samtools view -f`` command (header included via ``-h``)
    redirected to ``out_bam``.

    :param f: value passed to the ``-f`` option.
    :param in_bam: BAM to read.
    :param out_bam: file the output is redirected to.
    :return: the formatted shell command string.
    """
    command = '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'
    return command.format(s=s,
                          f=f,
                          in_bam=in_bam,
                          out_bam=out_bam)