def call_mutect2_gatk(self, inputs, vcf_out):
    '''Call somatic variants using MuTect2'''
    tumor_in, normal_in = inputs
    tumor_samfile = pysam.AlignmentFile(tumor_in, "rb")
    normal_samfile = pysam.AlignmentFile(normal_in, "rb")
    tumor_id = tumor_samfile.header['RG'][0]['SM']
    normal_id = normal_samfile.header['RG'][0]['SM']
    tumor_samfile.close()
    normal_samfile.close()
    # safe_make_dir('variants/mutect2/{sample}'.format(sample=sample_id))
    safe_make_dir('variants/mutect2/')
    # A stricter "--af-of-alleles-not-in-resource 0.00003125" was used previously.
    command = "gatk Mutect2 -R {reference} " \
              "-I {tumor_in} " \
              "-tumor {tumor_id} " \
              "-I {normal_in} " \
              "-normal {normal_id} " \
              "--germline-resource {mutect2_gnomad} " \
              "--af-of-alleles-not-in-resource 0.001 " \
              "-O {out} " \
              "-L {gatk_bed} " \
              "--max-reads-per-alignment-start 0 " \
              "--dont-use-soft-clipped-bases".format(reference=self.reference,
                                                     tumor_in=tumor_in,
                                                     normal_in=normal_in,
                                                     tumor_id=tumor_id,
                                                     normal_id=normal_id,
                                                     mutect2_gnomad=self.mutect2_gnomad,
                                                     gatk_bed=self.gatk_bed,
                                                     out=vcf_out)
    run_stage(self.state, 'call_mutect2_gatk', command)
def apply_undr_rover(self, inputs, vcf_output, sample_id):
    '''Apply undr_rover to call variants from paired end fastq files'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('apply_undr_rover', 'cores')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')
    coverdir = "variants/undr_rover/coverdir"
    coverfile = sample_id + ".coverage"
    command = 'undr_rover --primer_coords {coord_file} ' \
              '--primer_sequences {primer_file} ' \
              '--reference {reference} ' \
              '--out {vcf_output} ' \
              '--coverfile {coverdir}/{coverfile} ' \
              '--proportionthresh {proportionthresh} ' \
              '--absthresh {absthresh} ' \
              '--max_variants {maxvariants} ' \
              '{fastq_read1} {fastq_read2}'.format(
                  coord_file=self.coord_file,
                  primer_file=self.primer_file,
                  reference=self.reference,
                  vcf_output=vcf_output,
                  coverdir=coverdir,
                  proportionthresh=self.proportionthresh,
                  absthresh=self.absthresh,
                  maxvariants=self.maxvariants,
                  coverfile=coverfile,
                  fastq_read1=fastq_read1_in,
                  fastq_read2=fastq_read2_in)
    run_stage(self.state, 'apply_undr_rover', command)
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns
    # them into gatk and undr_rover vcfs.
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
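# Note: inputf.read().split('\n') keeps a trailing empty string when
# all_sample.passed.summary.txt ends with a newline, which would hand an empty
# path to the originate stage. A hedged alternative (not what the pipeline
# currently does) that drops blank lines:
#
#     with open("all_sample.passed.summary.txt") as inputf:
#         passed_files = [line for line in inputf.read().splitlines() if line]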
def apply_multicov(self, bam_in, multicov):
    '''Calculate per-region read counts over the target BED using bedtools multicov'''
    bams = ' '.join(bam_in)
    safe_make_dir('coverage')
    command = 'bedtools multicov -bams {bams} -bed {target_bed} > {multicov}'.format(
        bams=bams, target_bed=self.target_bed, multicov=multicov)
    run_stage(self.state, 'apply_multicov', command)
def apply_samtools_mpileup(self, bam_in, mpileup_out_bcf):
    '''Samtools mpileup'''
    bams = ' '.join(bam_in)
    safe_make_dir('variants')
    command = 'samtools mpileup -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR ' \
              '-go {mpileup_out_bcf} ' \
              '-f {reference} {bams}'.format(
                  mpileup_out_bcf=mpileup_out_bcf, reference=self.reference, bams=bams)
    run_stage(self.state, 'apply_samtools_mpileup', command)
def target_coverage(self, bam_in, coverage_out):
    '''Calculate coverage using Picard CollectHsMetrics'''
    safe_make_dir('coverage')
    picard_args = 'CollectHsMetrics INPUT={bam_in} OUTPUT={coverage_out} ' \
                  'R={reference} BAIT_INTERVALS={interval_file} ' \
                  'TARGET_INTERVALS={interval_file}'.format(
                      bam_in=bam_in, coverage_out=coverage_out,
                      reference=self.reference, interval_file=self.interval_file)
    self.run_picard('target_coverage', picard_args)
def structural_variants_socrates(self, bam_in, variants_out, sample_dir):
    '''Call structural variants with Socrates'''
    threads = self.state.config.get_stage_option('structural_variants_socrates', 'cores')
    # jvm_mem is in gb
    jvm_mem = self.state.config.get_stage_option('structural_variants_socrates', 'jvm_mem')
    bowtie2_ref_dir = self.state.config.get_stage_option('structural_variants_socrates', 'bowtie2_ref_dir')
    output_dir = os.path.join(sample_dir, 'socrates')
    safe_make_dir(output_dir)
    command = \
        '''
        cd {output_dir}
        export _JAVA_OPTIONS='-Djava.io.tmpdir={output_dir}'
        Socrates all -t {threads} --bowtie2_threads {threads} --bowtie2_db {bowtie2_ref_dir} --jvm_memory {jvm_mem}g {bam}
        '''.format(output_dir=output_dir, threads=threads,
                   bowtie2_ref_dir=bowtie2_ref_dir, jvm_mem=jvm_mem, bam=bam_in)
    run_stage(self.state, 'structural_variants_socrates', command)
def align_bwa(self, inputs, bam_out, fam, sample, id):
    '''Align the paired end fastq files to the reference genome using bwa'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('align_bwa', 'cores')
    read_group = '"@RG\\tID:{id}\\tSM:{sample}\\tPL:Illumina"'.format(sample=sample, id=id)
    command = 'bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
              '| samtools view -b -h -o {bam} -' \
              .format(cores=cores, read_group=read_group,
                      fastq_read1=fastq_read1_in, fastq_read2=fastq_read2_in,
                      reference=self.reference, bam=bam_out)
    safe_make_dir('results/alignments/FAM_{fam}_SM_{sample}'.format(fam=fam, sample=sample))
    run_stage(self.state, 'align_bwa', command)
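# For reference, the read-group argument built by the align_bwa stages here
# expands (with made-up values sample='S01', id='run1') to:
#
#     "@RG\tID:run1\tSM:S01\tPL:Illumina"
#
# The outer double quotes are kept so the shell passes the whole header line to
# bwa as a single -R argument; bwa then converts each literal '\t' into a TAB
# separating the read-group tags (ID, SM, PL).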
def align_bwa(self, inputs, bam_out, read_id, lib, lane, sample_id):
    # def align_bwa(self, inputs, bam_out, sample_id):
    '''Align the paired end fastq files to the reference genome using bwa'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('align_bwa', 'cores')
    safe_make_dir('alignments/{sample}'.format(sample=sample_id))
    read_group = '"@RG\\tID:{readid}\\tSM:{sample}\\tPU:lib1\\tLN:{lane}\\tPL:Illumina"' \
        .format(readid=read_id, lib=lib, lane=lane, sample=sample_id)
    command = 'bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
              '| samtools view -b -h -o {bam} -' \
              .format(cores=cores, read_group=read_group,
                      fastq_read1=fastq_read1_in, fastq_read2=fastq_read2_in,
                      reference=self.reference, bam=bam_out)
    run_stage(self.state, 'align_bwa', command)
def align_bwa(self, inputs, bam_out, sample, id):
    """Align the paired end fastq files to the reference genome using bwa"""
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options("align_bwa", "cores")
    read_group = '"@RG\\tID:{id}\\tSM:{sample}\\tPL:Illumina"'.format(sample=sample, id=id)
    command = (
        "bwa mem -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} "
        "| samtools view -b -h -o {bam} -".format(
            cores=cores,
            read_group=read_group,
            fastq_read1=fastq_read1_in,
            fastq_read2=fastq_read2_in,
            reference=self.reference,
            bam=bam_out,
        )
    )
    safe_make_dir("results/alignments/{sample}".format(sample=sample))
    run_stage(self.state, "align_bwa", command)
def align_bwa(self, inputs, bam_out, sample_id, lib):
    '''Align the paired end fastq files to the reference genome using bwa'''
    fastq_read1_in, fastq_read2_in = inputs
    cores = self.get_stage_options('align_bwa', 'cores')
    safe_make_dir('alignments')
    read_group = '"@RG\\tID:{sample}\\tSM:{sample}\\tPU:lib1\\tPL:Illumina"' \
        .format(sample=sample_id)
    command = 'bwa mem -M -t {cores} -R {read_group} {reference} {fastq_read1} {fastq_read2} ' \
              '| {bamclipper} -i -p {primer_bedpe_file} -n 1 ' \
              '| samtools view -b -h -o {bam} -' \
              .format(cores=cores, read_group=read_group,
                      fastq_read1=fastq_read1_in, fastq_read2=fastq_read2_in,
                      reference=self.reference, bamclipper=self.bamclipper,
                      primer_bedpe_file=self.primer_bedpe_file, bam=bam_out)
    run_stage(self.state, 'align_bwa', command)
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and turns
    # them into gatk and undr_rover vcfs.
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the clipped, sorted, high-quality BAM file and grab the sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### concatenate undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
def call_haplotypecaller_gatk(self, bam_in, vcf_out):
    '''Call variants using GATK'''
    safe_make_dir('variants/gatk')
    gatk_args = "-T HaplotypeCaller -R {reference} --min_base_quality_score 20 " \
                "--emitRefConfidence GVCF " \
                "-A AlleleBalance -A AlleleBalanceBySample " \
                "-A ChromosomeCounts -A ClippingRankSumTest " \
                "-A Coverage -A DepthPerAlleleBySample " \
                "-A DepthPerSampleHC -A FisherStrand " \
                "-A GCContent -A GenotypeSummaries " \
                "-A HardyWeinberg -A HomopolymerRun " \
                "-A LikelihoodRankSumTest -A LowMQ " \
                "-A MappingQualityRankSumTest -A MappingQualityZero " \
                "-A QualByDepth " \
                "-A RMSMappingQuality -A ReadPosRankSumTest " \
                "-A SampleList -A SpanningDeletions " \
                "-A StrandBiasBySample -A StrandOddsRatio " \
                "-A TandemRepeatAnnotator -A VariantType " \
                "-I {bam} -L {interval_list} -o {out}".format(reference=self.reference,
                                                              bam=bam_in,
                                                              interval_list=self.interval_file,
                                                              out=vcf_out)
    self.run_gatk('call_haplotypecaller_gatk', gatk_args)
def combine_gvcf_gatk(self, vcf_files_in, vcf_out):
    '''Combine G.VCF files for all samples using GATK'''
    safe_make_dir('processed')
    safe_make_dir('processed/gatk')
    merge_commands = []
    temp_merge_outputs = []
    # Combine the g.vcfs in batches of 200, then combine the batch outputs
    # into the final merged VCF.
    for n in range(0, int(math.ceil(float(len(vcf_files_in)) / 200.0))):
        start = n * 200
        filelist = vcf_files_in[start:start + 200]
        filelist_command = ' '.join(['--variant ' + vcf for vcf in filelist])
        temp_merge_filename = vcf_out.rstrip('.vcf') + ".temp_{start}.vcf".format(start=str(start))
        gatk_args_full = "java -Xmx{mem}g -jar {jar_path} -T CombineGVCFs -R {reference} " \
                         "--disable_auto_index_creation_and_locking_when_reading_rods " \
                         "{g_vcf_files} -o {vcf_out}; ".format(
                             reference=self.reference,
                             jar_path=self.gatk_jar,
                             mem=self.state.config.get_stage_options('combine_gvcf_gatk', 'mem'),
                             g_vcf_files=filelist_command,
                             vcf_out=temp_merge_filename)
        merge_commands.append(gatk_args_full)
        temp_merge_outputs.append(temp_merge_filename)
    final_merge_vcfs = ' '.join(['--variant ' + vcf for vcf in temp_merge_outputs])
    gatk_args_full_final = "java -Xmx{mem}g -jar {jar_path} -T CombineGVCFs -R {reference} " \
                           "--disable_auto_index_creation_and_locking_when_reading_rods " \
                           "{g_vcf_files} -o {vcf_out}".format(
                               reference=self.reference,
                               jar_path=self.gatk_jar,
                               mem=self.state.config.get_stage_options('combine_gvcf_gatk', 'mem'),
                               g_vcf_files=final_merge_vcfs,
                               vcf_out=vcf_out)
    merge_commands.append(gatk_args_full_final)
    final_command = ''.join(merge_commands)
    run_stage(self.state, 'combine_gvcf_gatk', final_command)
def concatenate_vcfs(self, vcf_files_in, vcf_out):
    '''Merge per-sample VCFs with bcftools, processing them in batches of 200'''
    safe_make_dir('processed')
    safe_make_dir('processed/vardict')
    merge_commands = []
    temp_merge_outputs = []
    for n in range(0, int(math.ceil(float(len(vcf_files_in)) / 200.0))):
        start = n * 200
        filelist = vcf_files_in[start:start + 200]
        filelist_command = ' '.join(filelist)
        temp_merge_filename = vcf_out.rstrip('.vcf') + ".temp_{start}.vcf".format(start=str(start))
        command1 = 'bcftools merge -O z -o {vcf_out} {join_vcf_files} && bcftools index -t -f {vcf_out}; '.format(
            vcf_out=temp_merge_filename, join_vcf_files=filelist_command)
        merge_commands.append(command1)
        temp_merge_outputs.append(temp_merge_filename)
    final_merge_vcfs = ' '.join(temp_merge_outputs)
    command2 = 'bcftools merge -O z -o {vcf_out} {join_vcf_files} '.format(
        vcf_out=vcf_out, join_vcf_files=final_merge_vcfs)
    merge_commands.append(command2)
    final_command = ''.join(merge_commands)
    run_stage(self.state, 'concatenate_vcfs', final_command)
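# The two batched merge stages above (combine_gvcf_gatk and concatenate_vcfs)
# split their inputs into groups of 200 before combining, presumably to keep
# the generated GATK/bcftools command lines to a manageable length. A minimal
# standalone sketch of that batching arithmetic; batch_files is illustrative
# only and not part of the pipeline:
def batch_files(paths, batch_size=200):
    '''Yield successive batches of at most batch_size paths.'''
    for start in range(0, len(paths), batch_size):
        yield paths[start:start + batch_size]

# Example: 450 inputs yield batches of 200, 200 and 50 paths, matching the
# slicing done with start = n * 200 in the stages above:
# [len(b) for b in batch_files(['%d.g.vcf' % i for i in range(450)])] == [200, 200, 50]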
def fastqc(self, fastq_in, dir_out):
    '''Quality check fastq file using fastqc'''
    safe_make_dir(dir_out)
    command = "fastqc --quiet -o {dir} {fastq}".format(dir=dir_out, fastq=fastq_in)
    run_stage(self.state, 'fastqc', command)
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for
        # finding out sample specific configuration options.
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
def make_pipeline_process(state):
    # Define empty pipeline
    pipeline = Pipeline(name='haloplexpipe')

    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')

    # Grab files from each of the processed directories in "runs"
    gatk_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))

    stages = Stages(state)

    # Dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='processed/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')

    ####### vardict stuff #######
    vardict_files = []
    for directory in run_directories:
        vardict_files.extend(
            glob.glob(directory + '/variants/vardict/*sorted.vcf.gz'))

    # Dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_vardict,
                       name='glob_vardict',
                       output=vardict_files)

    safe_make_dir('processed/vardict')

    # Concatenate all vardict vcfs
    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_vardict'),
                   output='processed/vardict/combined.vcf.gz')

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise_vardict',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise_vardict'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep_vardict',
        input=output_from('vt_decompose_normalise_vardict'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')

    # Get a list of paths to all the FASTQ files
    # fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")

    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        # filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        # add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # The output only needs to know about one file to track progress of
        # the pipeline, but the second certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
        # output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        # filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        # add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run locatit from Agilent. This should produce sorted bam files, so no
    # sorting is needed at the next step.
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # Generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        # filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Touch passed bams to the pass_samples folder and pass the glob of that
    # folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fampipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # Make directories for outputs
    safe_make_dir('results')
    safe_make_dir('results/alignments')
    safe_make_dir('results/variants')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # IF THE READS ARE SPLIT IN LANES e.g.
        # FAM_f2_SM_f2i5_ID_idx46-TCCCGA-L001-L002_LB_lb_PL_ILLUMINA_R2
        # filter=formatter(
        #     '.+/FAM_(?P<famid>[a-zA-Z0-9]+)_SM_(?P<sample>[a-zA-Z0-9-]+)_ID_(?P<runid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1.fastq.gz'),
        filter=formatter('.+/FAM_(?P<fam>[a-zA-Z0-9]+)_SM_(?P<sample>[a-zA-Z0-9]+)'
                         '_ID_(?P<id>[a-zA-Z0-9-]+)_LB_(?P<lb>[a-zA-Z0-9]+)'
                         '_PL_(?P<pl>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/FAM_{fam[0]}_SM_{sample[0]}_ID_{id[0]}'
                              '_LB_{lb[0]}_PL_{pl[0]}_R2.fastq.gz'),
        # Add "extra" arguments to the state (beyond the inputs and outputs)
        # which are the sample name and run id. These are needed within the
        # stage for finding out sample specific configuration options.
        extras=['{sample[0]}', '{id[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='results/alignments/{sample[0]}/FAM_{fam[0]}_SM_{sample[0]}_ID_{id[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.dedup.realn.recal.bam'),
        output='results/variants/{sample[0]}.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='FAMExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['FAMExomes.snp_recal', 'FAMExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['FAMExomes.indel_recal', 'FAMExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['FAMExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Filter variants using GATK
    pipeline.transform(
        task_func=stages.filter_variants_gatk,
        name='filter_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.filtered.vcf')

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('filter_variants_gatk'),
        filter=suffix('.filtered.vcf'),
        output='.selected.vcf')

    # Rare variant genotyping using FamSeq
    pipeline.transform(
        task_func=stages.rare_variants_famseq,
        name='rare_variants_famseq',
        input=output_from('select_variants_gatk'),
        filter=suffix('.selected.vcf'),
        output='.famseq.vcf')

    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='thepipelinex')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # Create output directories
    safe_make_dir('fastqc')
    safe_make_dir('alignments')
    safe_make_dir('variants')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # FastQC on all FASTQ files
    pipeline.transform(
        task_func=stages.qc_fastqc,
        name='qc_fastqc',
        input=output_from('original_fastqs'),
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_(?P<read>[12]).fastq.gz'),
        ## add_inputs=add_inputs(
        ##     '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_2.fastq.gz'),
        # extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        extras=['{sample[0]}'],
        # The output is a per-sample FastQC directory.
        output='fastqc/{sample[0]}/')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_1.fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        add_inputs=add_inputs(
            '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_2.fastq.gz'),
        # Add "extra" arguments to the state (beyond the inputs and outputs)
        # which are needed within the stage for finding out sample specific
        # configuration options.
        extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        # extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam')
        .follows('qc_fastqc'))

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Local realignment using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('realigner_target_creator'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'),
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Merge lane bams to sample bams
    pipeline.collate(
        task_func=stages.merge_sample_bams,
        name='merge_sample_bams',
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9:-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'),
        # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        input=output_from('print_reads_gatk'),
        output='alignments/{sample[0]}/{sample[0]}.merged.bam')

    # Mark duplicates in the merged BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard2',
        input=output_from('merge_sample_bams'),
        # filter=formatter(
        #     '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'),
        filter=suffix('.merged.bam'),
        # XXX should make metricsdup an extra output?
        output=['.merged.dedup.bam', '.metricsdup'])

    # Local realignment 2 using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator2',
        input=output_from('mark_duplicates_picard2'),
        filter=suffix('.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk2',
        input=output_from('realigner_target_creator2'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).merged.intervals'),
        # filter=formatter(
        #     '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'),
        output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam')
        .follows('mark_duplicates_picard2'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('local_realignment_gatk2'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'),
        output='variants/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_haplotypecaller_gatk'),
        output='variants/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.combined.vcf'),
        output='.raw.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.indel_recal', 'ALL.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['ALL.recal_INDEL.vcf']),
        # output='.combined.vcf')
        output='ALL.raw.vqsr.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # # Select variants using GATK
    # pipeline.transform(
    #     task_func=stages.select_variants_gatk,
    #     name='select_variants_gatk',
    #     input=output_from('combine_variants_gatk'),
    #     filter=suffix('.combined.vcf'),
    #     output='.selected.vcf')

    return pipeline
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for
        # finding out sample specific configuration options.
        extras=['{sample[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.clipped.sort.hq.bam')

    # Generate mapping metrics.
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        # filter=regex(r'.+/(.+BS\d{4,6}.+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])

    summary_file = 'all_sample.summary.txt'

    (pipeline.originate(task_func=stages.grab_summary_file,
                        name='grab_summary_file',
                        output=summary_file).follows('generate_stats'))

    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])

    return pipeline
def make_pipeline_process(state):
    # Define empty pipeline
    pipeline = Pipeline(name='hiplexpipe')

    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')

    # Grab files from each of the processed directories in "runs"
    gatk_files = []
    undr_rover_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))
        undr_rover_files.extend(
            glob.glob(directory + '/variants/undr_rover/*sorted.vcf.gz'))

    # Stages are dependent on the state
    stages = Stages(state)

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Dummy stage to grab the undr_rover files
    pipeline.originate(task_func=stages.glob_undr_rover,
                       name='glob_undr_rover',
                       output=undr_rover_files)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')

    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_undr_rover'),
                   output='variants/undr_rover/combined_undr_rover.vcf.gz')

    pipeline.transform(task_func=stages.index_final_vcf,
                       name='index_final_vcf',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        add_inputs=add_inputs(
            ['variants/undr_rover/combined_undr_rover.vcf.gz']),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf').follows(
            'index_final_vcf'))

    return pipeline