def variant_call(ex, aligned_tasks, target_bed_tasks):
    """
    Alignments -> Variants

    Registers on ``ex``: one ``gatk.haplotype_caller`` Task per
    (sample_name group, contig) pair, a ``gatk.genotype_gvcfs`` step over all
    of them, separate select/filter branches for SNPs and indels, a final
    ``gatk.combine_variants`` Task and a picard variant-calling-metrics Task.

    :param Execution ex: the Execution instance to create Tasks in
    :param list[Task] aligned_tasks: alignment Tasks; grouped here by their ``sample_name`` tag
    :param list[Task] target_bed_tasks: target bed Tasks; each is looked up by its ``contig`` tag
    :return: the ``gatk.combine_variants`` Task whose ``out_vcf`` tag is ``output/variants.vcf``
    """
    mkdir('work output', ex.output_dir)

    contig_to_targets = {t.tags['contig']: t for t in target_bed_tasks}

    # Fan out: every sample_name group is paired with every contig's target bed.
    hapcall_tasks = [
        ex.add_task(gatk.haplotype_caller,
                    tags=dict(contig=contig, **tags),
                    parents=parents + [target_bed_task],
                    out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(aligned_tasks, ['sample_name'])
        for contig, target_bed_task in contig_to_targets.items()
    ]

    # NOTE: a per-sample gatk.combine_gvcfs step is currently disabled:
    # combine_gvcf_tasks = many2one(gatk.combine_gvcfs, hapcall_tasks, groupby=['sample_name'],
    #                               out_dir='SM_{sample_name}')

    # groupby=[] puts every HaplotypeCaller task into one group; only the first
    # resulting task is used.
    genotype_task = many2one(gatk.genotype_gvcfs, hapcall_tasks, groupby=[],
                             out_dir='work/variants_raw.vcf')[0]
    raw_vcf = genotype_task.output_files[0]

    # --- SNP branch: select, then hard-filter ---
    select_snps_task = ex.add_task(gatk.select_variants,
                                   tags=dict(in_vcfs=[raw_vcf],
                                             out_vcf='work/snps_raw.vcf',
                                             select_type='SNP'),
                                   parents=[genotype_task])

    snp_filters = [('Qual', 'QUAL < 30'),
                   ('QD', 'QD < 2.0'),
                   ('FS_snp', 'FS > 60.0'),
                   ('MQ', 'MQ < 40.0'),
                   ('MQRankSum', 'MQRankSum < -12.5'),
                   ('ReadPosRankSum', 'ReadPosRankSum < -8.0')]
    filter_snps_task = ex.add_task(gatk.variant_filtration,
                                   tags=dict(in_vcfs=[select_snps_task.output_files[0]],
                                             out_vcf='work/snps_filtered.vcf',
                                             filters=snp_filters),
                                   parents=[select_snps_task])

    # --- Indel branch: select, then hard-filter ---
    select_indels_task = ex.add_task(gatk.select_variants,
                                     tags=dict(in_vcfs=[raw_vcf],
                                               out_vcf='work/indels_raw.vcf',
                                               select_type='INDEL'),
                                     parents=[genotype_task])

    indel_filters = [('Qual', 'QUAL < 30'),
                     ('QD', 'QD < 2.0'),
                     ('FS_indel', 'FS > 200.0'),
                     ('ReadPosRankSum_indel', 'ReadPosRankSum < -2')]
    filter_indels_task = ex.add_task(gatk.variant_filtration,
                                     tags=dict(in_vcfs=[select_indels_task.output_files[0]],
                                               out_vcf='work/indels_filtered.vcf',
                                               filters=indel_filters),
                                     parents=[select_indels_task])

    # Merge the two filtered branches back together; indels are listed first.
    combine_variants_task = ex.add_task(gatk.combine_variants,
                                        tags=dict(in_vcfs=[filter_indels_task.tags['out_vcf'],
                                                           filter_snps_task.tags['out_vcf']],
                                                  out_vcf='output/variants.vcf',
                                                  genotype_merge_option='PRIORITIZE'),
                                        parents=[filter_snps_task, filter_indels_task])

    # Metrics are a leaf of the graph: the task only needs registering, so the
    # returned Task is not kept.
    ex.add_task(picard.collect_variant_calling_metrics,
                tags=dict(in_vcf=combine_variants_task.tags['out_vcf'],
                          out_path='output/picard'),
                parents=[combine_variants_task])

    # TODO: Run VQSR?

    return combine_variants_task
def variant_call(ex, aligned_tasks, target_bed_tasks):
    """
    Alignments -> Variants

    Registers on ``ex``: one ``gatk.haplotype_caller`` Task per
    (sample_name group, contig) pair, ``gatk.genotype_gvcfs`` grouped by
    contig, a combine of the per-contig raw VCFs, separate select/filter
    branches for SNPs and indels, a final ``gatk.combine_variants`` Task and a
    picard variant-calling-metrics Task.

    :param Execution ex: the Execution instance to create Tasks in
    :param list[Task] aligned_tasks: alignment Tasks; grouped here by their ``sample_name`` tag
    :param list[Task] target_bed_tasks: target bed Tasks; each is looked up by its ``contig`` tag
    :return: the ``gatk.combine_variants`` Task whose ``out_vcf`` tag is ``output/variants.vcf``
    """
    mkdir('work output', ex.output_dir)

    contig_to_targets = {t.tags['contig']: t for t in target_bed_tasks}

    # Fan out: every sample_name group is paired with every contig's target bed.
    hapcall_tasks = [
        ex.add_task(gatk.haplotype_caller,
                    tags=dict(contig=contig, **tags),
                    parents=parents + [target_bed_task],
                    out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(aligned_tasks, ['sample_name'])
        for contig, target_bed_task in contig_to_targets.items()
    ]

    # NOTE: a per-sample gatk.combine_gvcfs step is currently disabled:
    # combine_gvcf_tasks = many2one(gatk.combine_gvcfs, hapcall_tasks, groupby=['sample_name'],
    #                               out_dir='SM_{sample_name}')

    # Genotype contig by contig, then stitch the per-contig VCFs together.
    genotype_tasks = many2one(gatk.genotype_gvcfs, hapcall_tasks, groupby=['contig'],
                              out_dir='work/contigs/{contig}/variants_raw.vcf')

    combine_variants_task1 = ex.add_task(gatk.combine_variants,
                                         stage_name='Combine_Raw_Variants',
                                         tags=dict(in_vcfs=[t.output_files[0] for t in genotype_tasks],
                                                   out_vcf='work/variants.combined.raw.vcf',
                                                   genotype_merge_option='UNSORTED'),
                                         parents=genotype_tasks)
    raw_vcf = combine_variants_task1.output_files[0]

    # --- SNP branch: select, then hard-filter ---
    select_snps_task = ex.add_task(gatk.select_variants,
                                   parents=[combine_variants_task1],
                                   tags=dict(select_type='SNP',
                                             in_vcfs=[raw_vcf],
                                             out_vcf='work/variants.raw.snps.vcf'))

    snp_filters = [('Qual', 'QUAL < 30'),
                   ('QD', 'QD < 2.0'),
                   ('FS_snp', 'FS > 60.0'),
                   ('MQ', 'MQ < 40.0'),
                   ('MQRankSum', 'MQRankSum < -12.5'),
                   ('ReadPosRankSum', 'ReadPosRankSum < -8.0')]
    filter_snps_task = ex.add_task(gatk.variant_filtration,
                                   tags=dict(in_vcfs=[select_snps_task.output_files[0]],
                                             out_vcf='work/variants.filtered.snps.vcf',
                                             filters=snp_filters),
                                   parents=[select_snps_task])

    # --- Indel branch: select, then hard-filter ---
    select_indels_task = ex.add_task(gatk.select_variants,
                                     parents=[combine_variants_task1],
                                     tags=dict(select_type='INDEL',
                                               in_vcfs=[raw_vcf],
                                               out_vcf='work/variants.raw.indels.vcf'))

    indel_filters = [('Qual', 'QUAL < 30'),
                     ('QD', 'QD < 2.0'),
                     ('FS_indel', 'FS > 200.0'),
                     ('ReadPosRankSum_indel', 'ReadPosRankSum < -2')]
    filter_indels_task = ex.add_task(gatk.variant_filtration,
                                     tags=dict(in_vcfs=[select_indels_task.output_files[0]],
                                               out_vcf='work/variants.filtered.indels.vcf',
                                               filters=indel_filters),
                                     parents=[select_indels_task])

    # Merge the two filtered branches back together; indels are listed first.
    combine_variants_task2 = ex.add_task(gatk.combine_variants,
                                         tags=dict(in_vcfs=[filter_indels_task.tags['out_vcf'],
                                                            filter_snps_task.tags['out_vcf']],
                                                   out_vcf='output/variants.vcf',
                                                   genotype_merge_option='PRIORITIZE'),
                                         parents=[filter_snps_task, filter_indels_task])

    # Metrics are a leaf of the graph: the task only needs registering, so the
    # returned Task is not kept.
    ex.add_task(picard.collect_variant_calling_metrics,
                tags=dict(in_vcf=combine_variants_task2.tags['out_vcf'],
                          out_path='output/picard'),
                parents=[combine_variants_task2])

    # TODO: Run VQSR?

    return combine_variants_task2
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    Registers bwa mem alignment, duplicate marking per (sample_name, library),
    indel realignment per (sample_name, contig), extraction of read pairs where
    both mates are unmapped, a merge of the resulting bams grouped by
    sample_name, and picard multiple-metrics collection on the merged bams.

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: the ``picard.merge_sam_files`` Tasks (bams merged by ``sample_name``)
    """
    # Do we need to split fastqs into smaller pieces?

    # NOTE: adapter trimming (fastq.trim_galore) ahead of bwa is currently disabled.
    read_group_keys = ['sample_name', 'library', 'platform', 'platform_unit', 'rgid', 'chunk']
    aligns = [
        execution.add_task(bwa.bwa_mem,
                           tags=dict(**tags),
                           parents=fastq_task_group,
                           out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        for tags, fastq_task_group in group(fastq_tasks, by=read_group_keys)
    ]

    dedupe = many2one(picard.mark_duplicates, aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples,
    # especially if low coverage

    # Many2one over a sample's libraries, then one2many over the target beds:
    # one RealignerTargetCreator task per (sample_name, contig).
    rtc_tasks = [
        execution.add_task(gatk.realigner_target_creator,
                           dict(contig=target_bed_task.tags['contig'],
                                in_target_bed=target_bed_task.output_files[0],
                                **tags),
                           parents + [target_bed_task],
                           out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(dedupe, ['sample_name'])
        for target_bed_task in target_bed_tasks
    ]

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)

    # Pairs with both mates unmapped are pulled straight from each library bam
    # and added as an extra pseudo-contig.
    # The output name is a plain literal: the original applied `% lb_task` to a
    # string with no format specifiers, which raises TypeError.
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks
    ]

    # Skipping BQSR. Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam. The merged tasks, not the per-contig
    # realigned ones, are what this function returns.
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks, ['sample_name'],
                      out_dir='SM_{sample_name}',
                      stage_name="Merge_Sample_Bams")

    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged
def align(execution, fastq_tasks, target_bed_tasks):
    """
    Reads -> Alignments

    Registers bwa mem alignment, duplicate marking per (sample_name, library),
    indel realignment per (sample_name, contig), extraction of read pairs where
    both mates are unmapped, a merge of the resulting bams grouped by
    sample_name, and picard multiple-metrics and WGS-metrics collection on the
    merged bams.

    :param Execution execution: The Execution instance to create Tasks in
    :param list[Task] | [(str, dict)] fastq_tasks: Fastq input (file_path, dict) tuples or Tasks
    :param list[Task] target_bed_tasks: target beds to parallelize/split on
    :return: the ``picard.merge_sam_files`` Tasks (bams merged by ``sample_name``)
    """
    # Do we need to split fastqs into smaller pieces?

    # NOTE: adapter trimming (fastq.trim_galore) ahead of bwa is currently disabled.
    read_group_keys = ['sample_name', 'library', 'platform', 'platform_unit', 'rgid', 'chunk']
    aligns = [
        execution.add_task(bwa.bwa_mem,
                           tags=dict(**tags),
                           parents=fastq_task_group,
                           out_dir='SM_{sample_name}/work/RG_{rgid}/CH_{chunk}')
        for tags, fastq_task_group in group(fastq_tasks, by=read_group_keys)
    ]

    dedupe = many2one(picard.mark_duplicates, aligns,
                      groupby=['sample_name', 'library'],
                      out_dir='SM_{sample_name}/work/LB_{library}')

    # Note, could get slightly improved results by indel realigning over multiple samples,
    # especially if low coverage

    # Many2one over a sample's libraries, then one2many over the target beds:
    # one RealignerTargetCreator task per (sample_name, contig).
    rtc_tasks = [
        execution.add_task(gatk.realigner_target_creator,
                           dict(contig=target_bed_task.tags['contig'],
                                in_target_bed=target_bed_task.output_files[0],
                                **tags),
                           parents + [target_bed_task],
                           out_dir='SM_{sample_name}/work/contigs/{contig}')
        for tags, parents in group(dedupe, ['sample_name'])
        for target_bed_task in target_bed_tasks
    ]

    realigned_by_sample_contig_tasks = one2one(gatk.indel_realigner, rtc_tasks)

    # Pairs with both mates unmapped are pulled straight from each library bam
    # and added as an extra pseudo-contig.
    # The output name is a plain literal: the original applied `% lb_task` to a
    # string with no format specifiers, which raises TypeError.
    realigned_by_sample_contig_tasks += [
        execution.add_task(samtools.view,
                           dict(out_bam=out_dir('both_pairs_unmapped.bam'),
                                f='12',
                                sample_name=tags['sample_name'],
                                contig='BOTH_PAIRS_UNMAPPED',
                                library=lb_task.tags['library']),
                           parents=lb_task,
                           out_dir='SM_{sample_name}/work/LB_{library}',
                           stage_name='Filter_Both_Pairs_Unmapped')
        for tags, sm_tasks in group(dedupe, ['sample_name'])
        for lb_task in sm_tasks
    ]

    # Skipping BQSR. Will improve results only slightly, if at all.

    # Merge bams so we have a sample bam. The merged tasks, not the per-contig
    # realigned ones, are what this function returns.
    merged = many2one(picard.merge_sam_files, realigned_by_sample_contig_tasks, ['sample_name'],
                      out_dir='SM_{sample_name}',
                      stage_name="Merge_Sample_Bams")

    # Both metrics stages hang off the merged sample bams and share one output directory.
    one2one(picard.collect_multiple_metrics, merged, out_dir='SM_{sample_name}/metrics')
    one2one(picard.collect_wgs_metrics, merged, out_dir='SM_{sample_name}/metrics')

    return merged