Exemplo n.º 1
0
def variant_call(ex, aligned_tasks, target_bed_tasks):
    """
    Alignments -> Variants

    Adds one haplotype-caller task per (``sample_name`` group, contig) pair,
    genotypes all of them in a single task, then selects and filters SNPs and
    indels separately, combines the two filtered VCFs into
    ``output/variants.vcf`` and adds a Picard variant-calling-metrics task on
    the combined file.

    :param Execution ex: execution the new tasks are added to
    :param list[Task] aligned_tasks: alignment tasks, grouped here by ``sample_name``
    :param list[Task] target_bed_tasks: tasks tagged with ``contig``; indexed by that tag
    :return: the task that combines the filtered SNP and indel VCFs
    """
    mkdir('work output', ex.output_dir)

    # Index the target-bed tasks by their contig tag (a repeated contig keeps the last task).
    targets_by_contig = {}
    for bed_task in target_bed_tasks:
        targets_by_contig[bed_task.tags['contig']] = bed_task

    # One haplotype-caller task for every sample group x contig combination.
    hapcall_tasks = []
    for sample_tags, sample_parents in group(aligned_tasks, ['sample_name']):
        for contig, bed_task in targets_by_contig.items():
            hapcall_task = ex.add_task(
                gatk.haplotype_caller,
                tags=dict(contig=contig, **sample_tags),
                parents=sample_parents + [bed_task],
                out_dir='SM_{sample_name}/work/contigs/{contig}')
            hapcall_tasks.append(hapcall_task)

    # Disabled alternative:
    # combine_gvcf_tasks = many2one(gatk.combine_gvcfs, hapcall_tasks, groupby=['sample_name'], out_dir='SM_{sample_name}')

    # An empty groupby puts every haplotype-caller task into one group, so only
    # the first (and only expected) genotyping task is kept.
    genotype_results = many2one(gatk.genotype_gvcfs,
                                hapcall_tasks,
                                groupby=[],
                                out_dir='work/variants_raw.vcf')
    genotype_task = genotype_results[0]

    # --- SNP branch ---
    snp_select_tags = {
        'in_vcfs': [genotype_task.output_files[0]],
        'out_vcf': 'work/snps_raw.vcf',
        'select_type': 'SNP',
    }
    select_snps_task = ex.add_task(gatk.select_variants,
                                   tags=snp_select_tags,
                                   parents=genotype_task)

    snp_filters = [
        ('Qual', 'QUAL < 30'),
        ('QD', 'QD < 2.0'),
        ('FS_snp', 'FS > 60.0'),
        ('MQ', 'MQ < 40.0'),
        ('MQRankSum', 'MQRankSum < -12.5'),
        ('ReadPosRankSum', 'ReadPosRankSum < -8.0'),
    ]
    snp_filter_tags = {
        'in_vcfs': [select_snps_task.output_files[0]],
        'out_vcf': 'work/snps_filtered.vcf',
        'filters': snp_filters,
    }
    filter_snps_task = ex.add_task(gatk.variant_filtration,
                                   tags=snp_filter_tags,
                                   parents=select_snps_task)

    # --- INDEL branch ---
    indel_select_tags = {
        'in_vcfs': [genotype_task.output_files[0]],
        'out_vcf': 'work/indels_raw.vcf',
        'select_type': 'INDEL',
    }
    select_indels_task = ex.add_task(gatk.select_variants,
                                     tags=indel_select_tags,
                                     parents=genotype_task)

    indel_filters = [
        ('Qual', 'QUAL < 30'),
        ('QD', 'QD < 2.0'),
        ('FS_indel', 'FS > 200.0'),
        ('ReadPosRankSum_indel', 'ReadPosRankSum < -2'),
    ]
    indel_filter_tags = {
        'in_vcfs': [select_indels_task.output_files[0]],
        'out_vcf': 'work/indels_filtered.vcf',
        'filters': indel_filters,
    }
    filter_indels_task = ex.add_task(gatk.variant_filtration,
                                     tags=indel_filter_tags,
                                     parents=select_indels_task)

    # --- Recombine the two filtered call sets (indels listed first) ---
    filtered_vcfs = [
        filter_indels_task.tags['out_vcf'],
        filter_snps_task.tags['out_vcf'],
    ]
    combine_variants_task = ex.add_task(
        gatk.combine_variants,
        tags={
            'in_vcfs': filtered_vcfs,
            'out_vcf': 'output/variants.vcf',
            'genotype_merge_option': 'PRIORITIZE',
        },
        parents=[filter_snps_task, filter_indels_task])

    # The metrics task is added for its effect on the execution only; nothing
    # below needs a handle on it.
    ex.add_task(
        picard.collect_variant_calling_metrics,
        tags={
            'in_vcf': combine_variants_task.tags['out_vcf'],
            'out_path': 'output/picard',
        },
        parents=[combine_variants_task])

    # Run VQSR?

    return combine_variants_task
Exemplo n.º 2
0
def variant_call(ex, aligned_tasks, target_bed_tasks):
    """
    Alignments -> Variants

    Adds one haplotype-caller task per (``sample_name`` group, contig) pair,
    genotypes them grouped by ``contig``, combines the per-contig VCFs into one
    raw VCF, then selects and filters SNPs and indels separately, combines the
    two filtered VCFs into ``output/variants.vcf`` and adds a Picard
    variant-calling-metrics task on the combined file.

    :param Execution ex: execution the new tasks are added to
    :param list[Task] aligned_tasks: alignment tasks, grouped here by ``sample_name``
    :param list[Task] target_bed_tasks: tasks tagged with ``contig``; indexed by that tag
    :return: the task that combines the filtered SNP and indel VCFs
    """
    mkdir('work output', ex.output_dir)

    # Index the target-bed tasks by their contig tag (a repeated contig keeps the last task).
    targets_by_contig = dict((bed_task.tags['contig'], bed_task)
                             for bed_task in target_bed_tasks)

    # One haplotype-caller task for every sample group x contig combination.
    hapcall_tasks = []
    for sample_tags, sample_parents in group(aligned_tasks, ['sample_name']):
        for contig, bed_task in targets_by_contig.items():
            hapcall_tasks.append(
                ex.add_task(gatk.haplotype_caller,
                            tags=dict(contig=contig, **sample_tags),
                            parents=sample_parents + [bed_task],
                            out_dir='SM_{sample_name}/work/contigs/{contig}'))

    # Disabled alternative:
    # combine_gvcf_tasks = many2one(gatk.combine_gvcfs, hapcall_tasks, groupby=['sample_name'], out_dir='SM_{sample_name}')

    # Genotype grouped by contig.
    genotype_tasks = many2one(
        gatk.genotype_gvcfs,
        hapcall_tasks,
        groupby=['contig'],
        out_dir='work/contigs/{contig}/variants_raw.vcf')

    # Stitch the per-contig raw VCFs back together into one file.
    per_contig_vcfs = [genotype_task.output_files[0] for genotype_task in genotype_tasks]
    combine_variants_task1 = ex.add_task(
        gatk.combine_variants,
        stage_name='Combine_Raw_Variants',
        tags={
            'in_vcfs': per_contig_vcfs,
            'out_vcf': 'work/variants.combined.raw.vcf',
            'genotype_merge_option': 'UNSORTED',
        },
        parents=genotype_tasks)

    # --- SNP branch ---
    select_snps_task = ex.add_task(
        gatk.select_variants,
        parents=[combine_variants_task1],
        tags={
            'select_type': 'SNP',
            'in_vcfs': [combine_variants_task1.output_files[0]],
            'out_vcf': 'work/variants.raw.snps.vcf',
        })

    snp_filters = [
        ('Qual', 'QUAL < 30'),
        ('QD', 'QD < 2.0'),
        ('FS_snp', 'FS > 60.0'),
        ('MQ', 'MQ < 40.0'),
        ('MQRankSum', 'MQRankSum < -12.5'),
        ('ReadPosRankSum', 'ReadPosRankSum < -8.0'),
    ]
    filter_snps_task = ex.add_task(
        gatk.variant_filtration,
        tags={
            'in_vcfs': [select_snps_task.output_files[0]],
            'out_vcf': 'work/variants.filtered.snps.vcf',
            'filters': snp_filters,
        },
        parents=select_snps_task)

    # --- INDEL branch ---
    select_indels_task = ex.add_task(
        gatk.select_variants,
        parents=[combine_variants_task1],
        tags={
            'select_type': 'INDEL',
            'in_vcfs': [combine_variants_task1.output_files[0]],
            'out_vcf': 'work/variants.raw.indels.vcf',
        })

    indel_filters = [
        ('Qual', 'QUAL < 30'),
        ('QD', 'QD < 2.0'),
        ('FS_indel', 'FS > 200.0'),
        ('ReadPosRankSum_indel', 'ReadPosRankSum < -2'),
    ]
    filter_indels_task = ex.add_task(
        gatk.variant_filtration,
        tags={
            'in_vcfs': [select_indels_task.output_files[0]],
            'out_vcf': 'work/variants.filtered.indels.vcf',
            'filters': indel_filters,
        },
        parents=select_indels_task)

    # --- Recombine the two filtered call sets (indels listed first) ---
    filtered_vcfs = [filter_indels_task.tags['out_vcf'],
                     filter_snps_task.tags['out_vcf']]
    combine_variants_task2 = ex.add_task(
        gatk.combine_variants,
        tags={
            'in_vcfs': filtered_vcfs,
            'out_vcf': 'output/variants.vcf',
            'genotype_merge_option': 'PRIORITIZE',
        },
        parents=[filter_snps_task, filter_indels_task])

    # The metrics task is added for its effect on the execution only; nothing
    # below needs a handle on it.
    ex.add_task(
        picard.collect_variant_calling_metrics,
        tags={
            'in_vcf': combine_variants_task2.tags['out_vcf'],
            'out_path': 'output/picard',
        },
        parents=[combine_variants_task2])

    # Run VQSR?

    return combine_variants_task2
Exemplo n.º 3
0
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    Adds, in this order:

    1. one ``bwa.bwa_mem`` task per group of fastq tasks sharing ``sample_name``,
       ``library``, ``platform``, ``platform_unit``, ``rgid`` and ``chunk``;
    2. ``picard.mark_duplicates`` per (``sample_name``, ``library``);
    3. ``gatk.realigner_target_creator`` per (``sample_name`` group, target bed),
       followed one-to-one by ``gatk.indel_realigner``;
    4. one ``samtools.view`` task (``f='12'``, stage
       ``Filter_Both_Pairs_Unmapped``) per duplicate-marked task;
    5. ``picard.merge_sam_files`` per ``sample_name`` over the tasks from 3 and 4;
    6. ``picard.collect_multiple_metrics`` on every merged task.

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: the per-sample merged tasks returned by the ``picard.merge_sam_files`` stage
    """

    # Do we need to split fastqs into smaller pieces?
    read_group_keys = [
        'sample_name', 'library', 'platform', 'platform_unit', 'rgid', 'chunk'
    ]
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks, by=read_group_keys):
        # trim_task = execution.add_task(fastq.trim_galore,
        # tags=dict(**tags),
        # parents=fastq_task_group,
        # out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')

        align_task = execution.add_task(
            bwa.bwa_mem,
            tags=dict(**tags),
            parents=fastq_task_group,
            out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    dedupe = many2one(picard.mark_duplicates,
                      aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples, especially if low coverage
    # for tags, parents in group(dedupe, ['sample_name']):
    # for target_bed_task in target_bed_tasks:
    # d = dict(contig=target_bed_task.tags['contig'],
    # in_target_bed=target_bed_task.output_files[0],
    # **tags)
    #

    rtc_tasks = [
        execution.add_task(gatk.realigner_target_creator,
                           dict(contig=target_bed_task.tags['contig'],
                                in_target_bed=target_bed_task.output_files[0],
                                **tags),
                           parents + [target_bed_task],
                           out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(dedupe, ['sample_name'])  # Many2one
        for target_bed_task in target_bed_tasks
    ]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)

    # Add one samtools-view task per duplicate-marked task, tagged with the
    # pseudo-contig 'BOTH_PAIRS_UNMAPPED' so it is merged alongside the
    # per-contig realigned tasks below.
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           # The file name has no placeholders, so it is passed
                           # as-is: applying ``%`` to it raises TypeError for
                           # any operand that is not a mapping.
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks
    ]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.
    # NOTE(review): the merged per-sample tasks are what is returned, not the
    # per-contig realigned tasks; confirm that downstream stages expect this.
    merged = many2one(picard.merge_sam_files,
                      realigned_by_sample_contig_tasks, ['sample_name'],
                      out_dir='SM_{sample_name}',
                      stage_name="Merge_Sample_Bams")

    # Metrics tasks are added to the execution; their handles are not needed here.
    one2one(picard.collect_multiple_metrics,
            merged,
            out_dir='SM_{sample_name}/metrics')

    return merged
Exemplo n.º 4
0
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    Adds, in this order:

    1. one ``bwa.bwa_mem`` task per group of fastq tasks sharing ``sample_name``,
       ``library``, ``platform``, ``platform_unit``, ``rgid`` and ``chunk``;
    2. ``picard.mark_duplicates`` per (``sample_name``, ``library``);
    3. ``gatk.realigner_target_creator`` per (``sample_name`` group, target bed),
       followed one-to-one by ``gatk.indel_realigner``;
    4. one ``samtools.view`` task (``f='12'``, stage
       ``Filter_Both_Pairs_Unmapped``) per duplicate-marked task;
    5. ``picard.merge_sam_files`` per ``sample_name`` over the tasks from 3 and 4;
    6. ``picard.collect_multiple_metrics`` and ``picard.collect_wgs_metrics`` on
       every merged task.

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: the per-sample merged tasks returned by the ``picard.merge_sam_files`` stage
    """

    # Do we need to split fastqs into smaller pieces?
    aligns = []
    for tags, fastq_task_group in group(fastq_tasks, by=['sample_name', 'library', 'platform', 'platform_unit', 'rgid', 'chunk']):
        # trim_task = execution.add_task(fastq.trim_galore,
        # tags=dict(**tags),
        # parents=fastq_task_group,
        # out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')

        align_task = execution.add_task(bwa.bwa_mem,
                                        tags=dict(**tags),
                                        parents=fastq_task_group,
                                        out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        aligns.append(align_task)

    dedupe = many2one(picard.mark_duplicates, aligns, groupby=['sample_name', 'library'], out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples, especially if low coverage
    # for tags, parents in group(dedupe, ['sample_name']):
    # for target_bed_task in target_bed_tasks:
    # d = dict(contig=target_bed_task.tags['contig'],
    # in_target_bed=target_bed_task.output_files[0],
    # **tags)
    #

    rtc_tasks = [execution.add_task(gatk.realigner_target_creator,
                                    dict(contig=target_bed_task.tags['contig'],
                                         in_target_bed=target_bed_task.output_files[0], **tags),
                                    parents + [target_bed_task],
                                    out_dir='SM_{sample_name}/work/contigs/{contig}')
                 for tags, parents in group(dedupe, ['sample_name'])  # Many2one
                 for target_bed_task in target_bed_tasks]  # One2many

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)

    # Add one samtools-view task per duplicate-marked task, tagged with the
    # pseudo-contig 'BOTH_PAIRS_UNMAPPED' so it is merged alongside the
    # per-contig realigned tasks below.
    # The output file name has no placeholders, so it is passed to out_dir()
    # as-is: applying ``%`` to it raises TypeError for any operand that is not
    # a mapping.
    realigned_by_sample_contig_tasks += [execution.add_task(samtools.view,
                                                            dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                                                 f='12',
                                                                 sample_name=tags['sample_name'],
                                                                 contig='BOTH_PAIRS_UNMAPPED',
                                                                 library=lb_task.tags['library']),
                                                            parents=lb_task,
                                                            out_dir='SM_{sample_name}/work/LB_{library}',
                                                            stage_name='Filter_Both_Pairs_Unmapped')
                                         for tags, sm_tasks in group(dedupe, ['sample_name'])
                                         for lb_task in sm_tasks]

    # Skipping BQSR.  Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam.
    # NOTE(review): the merged per-sample tasks are what is returned, not the
    # per-contig realigned tasks; confirm that downstream stages expect this.
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks, ['sample_name'], out_dir='SM_{sample_name}', stage_name="Merge_Sample_Bams")

    # Metrics tasks are added to the execution; their handles are not needed here.
    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')
    one2one(picard.collect_wgs_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged