示例#1
0
文件: pipeline.py 项目: LPM-HMS/PvKey
def Pipeline():
    """Assemble the alignment-to-variant-calling workflow.

    Reads two entries from ``wga_settings`` (a name defined outside this
    function):

    * ``wga_settings['test']`` -- when truthy, only interval ``20`` is used
      instead of 1..22 plus ``'X'`` and ``'Y'``.
    * ``wga_settings['target']`` -- when truthy, the per-sample step uses
      ``picard.MERGE_SAMS``; otherwise it uses ``picard.MarkDuplicates``.

    Returns:
        The result of ``sequence_`` applied, in order, to the alignment,
        duplicate-handling, alignment-preprocessing and variant-calling
        sub-sequences.
    """
    testing = wga_settings['test']
    target = wga_settings['target']

    if testing:
        intervals = ('interval', [20])
    else:
        # list(...) is required on Python 3, where range() is not a list and
        # cannot be concatenated with one; on Python 2 it is a harmless copy.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])
    glm = ('glm', ['SNP', 'INDEL'])

    align_to_reference = sequence_(
        apply_(
            reduce_(['sample_name', 'library', 'platform', 'platform_unit', 'chunk'],
                    pipes.AlignAndCleanALN)
        ),
    )

    # Both settings group by sample name; only the Picard tool differs.
    per_sample_tool = picard.MERGE_SAMS if target else picard.MarkDuplicates
    remove_dup = sequence_(
        reduce_(['sample_name'], per_sample_tool)
    )

    preprocess_alignment = sequence_(
        map_(samtools.IndexBam),
        apply_(
            split_([intervals], gatk.RealignerTargetCreator)
        ),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name'], gatk.BQSRGatherer),
            # TODO (original author): BQSRGatherer is added as a parent with
            # a hack inside ApplyBQSR.cmd.
            map_(gatk.ApplyBQSR)
        )
    )

    call_variants = sequence_(
        apply_(
            reduce_split_(['interval'], [glm], gatk.UnifiedGenotyper,
                          tag={'vcf': 'UnifiedGenotyper'}),
            combine=True
        ),
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
        split_([glm], gatk.VQSR),
    )

    return sequence_(
        align_to_reference,
        remove_dup,
        preprocess_alignment,
        call_variants
    )
示例#2
0
def Pipeline():
    """Build the full workflow: align, handle duplicates, preprocess, call.

    Two entries of ``wga_settings`` (defined outside this function) steer
    the result: ``'test'`` narrows the intervals to ``[20]``, and
    ``'target'`` selects ``picard.MERGE_SAMS`` instead of
    ``picard.MarkDuplicates`` for the per-sample step.

    Returns:
        ``sequence_`` of the four sub-sequences, in execution order.
    """
    is_test = wga_settings['test']
    merge_only = wga_settings['target']

    # split_ descriptors: (tag name, list of values).
    interval_values = [20] if is_test else list(range(1, 23)) + ['X', 'Y']
    interval_split = ('interval', interval_values)
    glm_split = ('glm', ['SNP', 'INDEL'])

    # --- alignment -------------------------------------------------------
    read_group_keys = [
        'sample_name', 'library', 'platform', 'platform_unit', 'chunk'
    ]
    alignment = sequence_(apply_(reduce_(read_group_keys, pipes.AlignAndCleanALN)))

    # --- duplicate handling ----------------------------------------------
    per_sample_tool = picard.MERGE_SAMS if merge_only else picard.MarkDuplicates
    duplicates = sequence_(reduce_(['sample_name'], per_sample_tool))

    # --- alignment preprocessing -----------------------------------------
    preprocessing_stages = [
        map_(samtools.IndexBam),
        apply_(split_([interval_split], gatk.RealignerTargetCreator)),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name'], gatk.BQSRGatherer),
            # TODO (original author): BQSRGatherer is added as a parent with
            # a hack inside ApplyBQSR.cmd.
            map_(gatk.ApplyBQSR),
        ),
    ]
    preprocessing = sequence_(*preprocessing_stages)

    # --- variant calling -------------------------------------------------
    genotyping = apply_(
        reduce_split_(
            ['interval'],
            [glm_split],
            gatk.UnifiedGenotyper,
            tag={'vcf': 'UnifiedGenotyper'},
        ),
        combine=True,
    )
    calling = sequence_(
        genotyping,
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
        split_([glm_split], gatk.VQSR),
    )

    return sequence_(alignment, duplicates, preprocessing, calling)
示例#3
0
def pipeline(bams, test_bam=False, chromosome_only_split=False):
    """Build the HaplotypeCaller workflow for a collection of BAM files.

    Args:
        bams: Iterable of BAM file paths.  Each path is passed to
            ``_getHeaderInfo`` and loaded as an ``INPUT`` tagged with the
            file's base name (extension stripped).
        test_bam: When truthy, restrict the run to sequence ``'chr1'``,
            chromosome ``1`` and the ``'SNP'`` model, and pass
            ``skip_VQSR=[True]`` to the final split.
        chromosome_only_split: When truthy, the per-BAM split omits the
            ``rgId`` key and the indel-realignment reduce groups by
            ``'bam'`` only; otherwise both ``'bam'`` and ``'rgId'`` are used.

    Returns:
        The ``sequence_`` describing the HaplotypeCaller workflow.

    Raises:
        ValueError: If ``bams`` yields no paths.
    """
    # split_ tuples.  None of these depend on the individual BAM, so they are
    # decided once here instead of being re-assigned on every loop iteration
    # (which also left them unbound when ``bams`` was empty).
    if test_bam:
        # restrict output for testing
        chrom = ('chrom', [1])
        glm = ('glm', ['SNP'])
        skip_VQSR = ('skip_VQSR', [True])
    else:
        # list(...) keeps this a real list on Python 3, as it was on Python 2.
        chrom = ('chrom', list(range(1, 23)))
        glm = ('glm', ['SNP', 'INDEL'])
        skip_VQSR = ('skip_VQSR', [False])

    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog',
    ])

    if chromosome_only_split:
        # Stop splitting by rgId
        indelrealign_reduce = ['bam']
    else:
        indelrealign_reduce = ['bam', 'rgId']

    bam_seq = None
    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)
        rgid = [h[0] for h in header['rg']]

        if test_bam:
            sn = ['chr1']

        # use everything before the extension as the 'bam' tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})], stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    if bam_seq is None:
        # Checked after the loop so that one-shot iterables are handled too.
        raise ValueError("pipeline() needs at least one BAM path in 'bams'")

    # Previous pipeline.
    # NOTE(review): built but never returned; kept in case constructing it
    # has effects elsewhere -- confirm and remove if not.
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'], pipes.VariantQualityScoreRecalibration, tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations)
    )

    # HaplotypeCaller Pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller),
        reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR], pipes.VariantQualityScoreRecalibration, tag={'vcf': 'main'})
    )

    return hc_pipeline
示例#4
0
def pipeline(bams, test_bam=False, chromosome_only_split=False):
    """Build the HaplotypeCaller workflow for a collection of BAM files.

    Args:
        bams: Iterable of BAM file paths.  Each path is passed to
            ``_getHeaderInfo`` and loaded as an ``INPUT`` whose ``'bam'``
            tag is the file's base name without its extension.
        test_bam: When truthy, restrict the run to sequence ``'chr1'``,
            chromosome ``1`` and the ``'SNP'`` model, and pass
            ``skip_VQSR=[True]`` to the final split.
        chromosome_only_split: When truthy, the per-BAM split omits the
            ``rgId`` key and the indel-realignment reduce groups by
            ``'bam'`` only; otherwise both ``'bam'`` and ``'rgId'`` are used.

    Returns:
        The ``sequence_`` describing the HaplotypeCaller workflow.

    Raises:
        ValueError: If ``bams`` yields no paths.
    """
    # split_ tuples.  They are independent of the individual BAM, so they
    # are chosen once up front; previously they were only bound inside the
    # loop and were therefore undefined for an empty ``bams``.
    # list(...) keeps 'chrom' a real list on Python 3, as it was on Python 2.
    chrom = ('chrom', [1] if test_bam else list(range(1, 23)))
    glm = ('glm', ['SNP'] if test_bam else ['SNP', 'INDEL'])
    skip_VQSR = ('skip_VQSR', [True] if test_bam else [False])

    dbnames = ('dbname', [
        'dbSNP135', 'CytoBand', 'Target_Scan', 'mirBase', 'Self_Chain',
        'Repeat_Masker', 'TFBS', 'Segmental_Duplications', 'SIFT', 'COSMIC',
        'PolyPhen2', 'Mutation_Taster', 'GERP', 'PhyloP', 'LRT', 'Mce46way',
        'Complete_Genomics_69', 'The_1000g_Febuary_all', 'The_1000g_April_all',
        'NHLBI_Exome_Project_euro', 'NHLBI_Exome_Project_aa',
        'NHLBI_Exome_Project_all', 'ENCODE_DNaseI_Hypersensitivity',
        'ENCODE_Transcription_Factor', 'UCSC_Gene', 'Refseq_Gene',
        'Ensembl_Gene', 'CCDS_Gene', 'HGMD_INDEL', 'HGMD_SNP', 'GWAS_Catalog'
    ])

    # Keys the indel-realignment stage reduces by; rgId is dropped when the
    # caller asks to stop splitting by read group.
    if chromosome_only_split:
        indelrealign_reduce = ['bam']
    else:
        indelrealign_reduce = ['bam', 'rgId']

    bam_seq = None

    for b in bams:
        header = _getHeaderInfo(b)
        sn = _getSeqName(header)

        rgid = [h[0] for h in header['rg']]

        # restrict output for testing
        if test_bam:
            sn = ['chr1']

        # use everything before extension as part of tag
        sample_name = os.path.splitext(os.path.basename(b))[0]

        if chromosome_only_split:
            # Stop splitting by rgId
            bam_bwa_split = [('prevSn', sn), ('chromosome_only_split', [True])]
        else:
            bam_bwa_split = [('rgId', rgid), ('prevSn', sn),
                             ('chromosome_only_split', [False])]

        s = sequence_(
            add_([INPUT(b, tags={'bam': sample_name})],
                 stage_name="Load BAMs"),
            split_(bam_bwa_split, pipes.Bam_To_BWA))

        if bam_seq is None:
            bam_seq = s
        else:
            bam_seq = sequence_(bam_seq, s, combine=True)

    # Tested after the loop so that generators and other one-shot iterables
    # are covered as well as empty lists.
    if bam_seq is None:
        raise ValueError("pipeline() needs at least one BAM path in 'bams'")

    # Previous pipeline.
    # NOTE(review): this value is never returned or otherwise used; it is
    # retained in case building it matters elsewhere -- confirm and remove.
    pr_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.ReduceReads),
        reduce_split_(['chrom'], [glm], pipes.UnifiedGenotyper),
        reduce_(['glm'],
                pipes.VariantQualityScoreRecalibration,
                tag={'vcf': 'main'}),
        reduce_(['vcf'], pipes.CombineVariants, "Merge VCF"),
        map_(pipes.Vcf2Anno_in),
        split_([dbnames], pipes.Annotate, tag={'build': 'hg19'}),
        reduce_(['vcf'], pipes.MergeAnnotations))

    # HaplotypeCaller Pipeline: official for GATK 3.0
    hc_pipeline = sequence_(
        bam_seq,
        reduce_split_(indelrealign_reduce, [chrom], pipes.IndelRealigner),
        map_(pipes.MarkDuplicates),
        reduce_(['bam', 'chrom'], pipes.BaseQualityScoreRecalibration),
        map_(pipes.HaplotypeCaller),
        reduce_(['chrom'], pipes.GenotypeGVCFs),
        split_([glm, skip_VQSR],
               pipes.VariantQualityScoreRecalibration,
               tag={'vcf': 'main'}))

    return hc_pipeline
示例#5
0
def Pipeline():
    """Assemble the workflow from alignment through annotation.

    Reads two entries from ``wga_settings`` (a name defined outside this
    function):

    * ``wga_settings['test']`` -- when truthy, only interval ``20`` is used
      instead of 1..22 plus ``'X'`` and ``'Y'``.
    * ``wga_settings['capture']`` -- when falsy, an extra
      ``gatk.ReduceReads`` stage (reduced by sample, split by interval) is
      inserted between alignment preprocessing and variant calling.

    ``massive_annotation`` is also taken from outside this function and is
    appended as the final stage.

    Returns:
        The result of ``sequence_`` applied to the selected stages in order.
    """
    is_capture = wga_settings['capture']
    testing = wga_settings['test']

    # split_ tuples
    if testing:
        intervals = ('interval', [20])
    else:
        # list(...) is required on Python 3, where range() is not a list and
        # cannot be concatenated with one; on Python 2 it is a harmless copy.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])

    glm = ('glm', ['SNP', 'INDEL'])

    align_to_reference = sequence_(
        apply_(
            reduce_(['sample_name', 'library'], misc.FastqStats),
            reduce_([
                'sample_name', 'library', 'platform', 'platform_unit', 'chunk'
            ], pipes.AlignAndClean)), )

    preprocess_alignment = sequence_(
        reduce_(['sample_name'], picard.MarkDuplicates),
        apply_(
            map_(picard.CollectMultipleMetrics),
            split_([intervals], gatk.RealignerTargetCreator)),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name'], gatk.BQSRGatherer),
            # TODO (original author): BQSRGatherer is added as a parent with
            # a hack inside ApplyBQSR.cmd.
            map_(gatk.ApplyBQSR)))

    call_variants = sequence_(
        apply_(
            reduce_split_(['interval'], [glm],
                          gatk.UnifiedGenotyper,
                          tag={'vcf': 'UnifiedGenotyper'}),
            combine=True),
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
        split_([glm], gatk.VQSR),
        map_(gatk.Apply_VQSR),
        reduce_(['vcf'], gatk.CombineVariants, "Combine into Master VCFs"))

    # The capture and non-capture workflows differ only by the ReduceReads
    # stage, so build one ordered stage list instead of two return branches.
    stages = [align_to_reference, preprocess_alignment]
    if not is_capture:
        stages.append(
            reduce_split_(['sample_name'], [intervals], gatk.ReduceReads))
    stages.append(call_variants)
    stages.append(massive_annotation)

    return sequence_(*stages)
示例#6
0
def Pipeline():
    """Assemble the workflow from alignment through annotation.

    Reads two entries from ``wga_settings`` (a name defined outside this
    function):

    * ``wga_settings['test']`` -- when truthy, only interval ``20`` is used
      instead of 1..22 plus ``'X'`` and ``'Y'``.
    * ``wga_settings['capture']`` -- when truthy, variant calling follows
      alignment preprocessing directly; otherwise a ``gatk.ReduceReads``
      stage (reduced by sample, split by interval) runs in between.

    ``massive_annotation`` is also taken from outside this function and is
    always the last stage.

    Returns:
        The result of ``sequence_`` applied to the selected stages in order.
    """
    is_capture = wga_settings['capture']
    testing = wga_settings['test']

    # split_ tuples
    if testing:
        intervals = ('interval', [20])
    else:
        # range() must be materialised with list(...) before concatenation:
        # on Python 3 ``range(...) + [...]`` raises TypeError.  On Python 2
        # this is only a copy of the list range() already returned.
        intervals = ('interval', list(range(1, 23)) + ['X', 'Y'])

    glm = ('glm', ['SNP', 'INDEL'])

    align_to_reference = sequence_(
        apply_(
            reduce_(['sample_name', 'library'], misc.FastqStats),
            reduce_(['sample_name', 'library', 'platform', 'platform_unit', 'chunk'], pipes.AlignAndClean)
        ),
    )

    preprocess_alignment = sequence_(
        reduce_(['sample_name'], picard.MarkDuplicates),
        apply_(
            map_(picard.CollectMultipleMetrics),
            split_([intervals], gatk.RealignerTargetCreator)
        ),
        map_(gatk.IndelRealigner),
        map_(gatk.BQSR),
        apply_(
            reduce_(['sample_name'], gatk.BQSRGatherer),
            # TODO (original author): BQSRGatherer is added as a parent with
            # a hack inside ApplyBQSR.cmd.
            map_(gatk.ApplyBQSR)
        )
    )

    call_variants = sequence_(
        apply_(
            reduce_split_(['interval'], [glm], gatk.UnifiedGenotyper, tag={'vcf': 'UnifiedGenotyper'}),
            combine=True
        ),
        reduce_(['vcf'], gatk.CombineVariants, 'Combine Into Raw VCFs'),
        split_([glm], gatk.VQSR),
        map_(gatk.Apply_VQSR),
        reduce_(['vcf'], gatk.CombineVariants, "Combine into Master VCFs")
    )

    if is_capture:
        return sequence_(
            align_to_reference,
            preprocess_alignment,
            call_variants,
            massive_annotation
        )

    # Non-capture runs add a ReduceReads stage before variant calling.
    return sequence_(
        align_to_reference,
        preprocess_alignment,
        reduce_split_(['sample_name'], [intervals], gatk.ReduceReads),
        call_variants,
        massive_annotation
    )