def create_lumpy_workflow(lumpy_vcf, tumour_bam=None, normal_bam=None, single_node=False):
    """Build a workflow that runs lumpyexpress structural-variant calling.

    Either or both of the tumour/normal BAMs may be provided; each provided
    BAM gets its own preprocessing subworkflow (discordant/split read
    extraction), and the final lumpyexpress job name records which inputs
    were present (e.g. 'run_lumpy_normal_tumour').

    :param lumpy_vcf: path for the output lumpy VCF
    :param tumour_bam: optional path to the tumour BAM
    :param normal_bam: optional path to the normal BAM
    :param single_node: run preprocessing as one multi-step job per sample
    :return: pypeliner Workflow
    """
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'

    if normal_bam:
        # wrap the path in a managed input; the preprocessed temp files are
        # produced by the 'preprocess_lumpy_normal' subworkflow below
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
    else:
        normal_disc = None
        normal_split = None

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
    else:
        tumour_disc = None
        tumour_split = None

    if normal_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(normal_bam,
                  mgd.TempOutputFile('normal.discordants.sorted.bam'),
                  mgd.TempOutputFile('normal.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    if tumour_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(tumour_bam,
                  mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                  mgd.TempOutputFile('tumour.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    # run lumpyexpress over whichever inputs are available; absent inputs
    # are passed through as None kwargs
    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(mgd.OutputFile(lumpy_vcf),
              config.default_params('breakpoint_calling')['lumpy_paths']),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy')
        })

    return workflow
def create_consensus_workflow(
        destruct_breakpoints, lumpy_vcf, output, chromosomes
):
    """Build a workflow producing consensus breakpoint calls.

    Parses the destruct and lumpy outputs into per-caller CSVs, then
    intersects them into a single consensus call set.
    """
    calling_params = config.default_params('breakpoint_calling')

    workflow = pypeliner.workflow.Workflow()

    # one parse job per upstream caller: (job name, task name, raw input,
    # temp csv, caller-specific parse params)
    parse_jobs = [
        ('parse_lumpy', 'parse_lumpy_task', lumpy_vcf,
         'lumpy.csv', calling_params["parse_lumpy"]),
        ('parse_destruct', 'parse_destruct_task', destruct_breakpoints,
         'destruct.csv', calling_params["parse_destruct"]),
    ]
    for job_name, task_name, raw_calls, temp_csv, parse_params in parse_jobs:
        workflow.transform(
            name=job_name,
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.breakpoint_calling_consensus.tasks.' + task_name,
            args=(
                mgd.InputFile(raw_calls),
                mgd.TempOutputFile(temp_csv),
                parse_params,
            ),
            kwargs={'chromosomes': chromosomes}
        )

    # intersect the two parsed call sets into the consensus output
    workflow.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(
            mgd.TempInputFile('destruct.csv'),
            mgd.TempInputFile('lumpy.csv'),
            mgd.OutputFile(output, extensions=['.yaml']),
            calling_params['consensus']
        ),
    )

    return workflow
def create_remixt_workflow(
        tumour_path,
        normal_path,
        breakpoints,
        sample_id,
        remixt_results_filename,
        remixt_brk_cn_csv,
        remixt_cn_csv,
        remixt_minor_modes_csv,
        remixt_mix_csv,
        remixt_read_depth_csv,
        remixt_stats_csv,
        remixt_refdata,
        reference,
        single_node=False,
):
    """Build a ReMixT copy-number workflow for one tumour/normal pair.

    Filters (or synthesises an empty set of) destruct breakpoints, runs
    ReMixT either as a single local job or as the upstream
    `remixt.workflow` subworkflow, then parses the results HDF store into
    per-table CSVs.

    :param breakpoints: destruct breakpoint table path, or None to run
        ReMixT with an empty breakpoint set
    :param single_node: run ReMixT in one local multi-core job instead of
        the distributed subworkflow
    :return: pypeliner Workflow
    """
    ctx = {'docker_image': config.containers('wgs')}
    params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is None:
        # no breakpoints supplied: materialise an empty breakpoint table so
        # downstream jobs have a consistent input
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )
        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )
    else:
        # drop breakpoints with too little read support
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(mgd.InputFile(breakpoints),
                  mgd.TempOutputFile('filtered_breakpoints.csv'),
                  params['min_num_reads']))

    if single_node:
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(memory=15, walltime='120:00', ncpus=8),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )
    else:
        # distributed mode delegates to the remixt package's own workflow;
        # the normal sample is keyed as '<sample_id>N'
        workflow.subworkflow(
            name='remixt',
            func="remixt.workflow.create_remixt_bam_workflow",
            ctx={
                'docker_image': config.containers('remixt'),
                'walltime': '48:00'
            },
            args=(
                mgd.TempInputFile('filtered_breakpoints.csv'),
                {
                    sample_id: mgd.InputFile(tumour_path, extensions=['.bai']),
                    sample_id + 'N': mgd.InputFile(normal_path, extensions=['.bai'])
                },
                {
                    sample_id: mgd.OutputFile(remixt_results_filename)
                },
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
            kwargs={
                'normal_id': sample_id + 'N',
            })

    # split the ReMixT results file into one CSV per table; table paths in
    # the results store are paired positionally with the output files
    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(mgd.InputFile(remixt_results_filename),
              [
                  mgd.OutputFile(remixt_brk_cn_csv, extensions=['.yaml']),
                  mgd.OutputFile(remixt_cn_csv, extensions=['.yaml']),
                  mgd.OutputFile(remixt_minor_modes_csv, extensions=['.yaml']),
                  mgd.OutputFile(remixt_mix_csv, extensions=['.yaml']),
                  mgd.OutputFile(remixt_read_depth_csv, extensions=['.yaml']),
                  mgd.OutputFile(remixt_stats_csv, extensions=['.yaml']),
              ],
              ['/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth', '/stats'],
              mgd.TempSpace('tempdir_parse')))

    return workflow
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf, sample_id, reference,
        chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    """Build the TITAN copy-number workflow for one tumour/normal pair.

    Runs MutationSeq (titan mode) to get het SNP allele counts, computes
    corrected read depth, fans TITAN out over a grid of
    (num_clusters, ploidy) solutions, post-processes/plots each solution,
    and finally selects the optimal one and tars all per-solution data.

    :param targets: optional targets file (exome); None for WGS
    :param single_node: run MutationSeq as one multi-core job instead of
        one job per genomic interval
    :return: pypeliner Workflow
    """
    cn_params = config.default_params('copynumber_calling')

    # one TITAN run per (num_clusters, ploidy) combination
    chunks = [(v['num_clusters'], v['ploidy'])
              for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        # one big job over all intervals, writing the merged VCF directly
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='96:00',
                ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        # one job per interval, merged afterwards
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        # merge only needed in the per-interval path; the single-node path
        # already produced museq_vcf
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='4:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    # turn het SNP calls into TITAN allele-count input
    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    # GC/mappability-corrected log ratio from the two wig files
    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    # TITAN itself, fanned out over the (numclusters, ploidy) grid
    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='72:00',
            ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
        ),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    # NOTE(review): 'titan_igv' lacks axes_origin=[] here and in
    # tar_all_data, unlike every sibling input — confirm this is intended.
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    # archive every per-solution output for later inspection
    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00',
        ),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
def create_mutect_workflow(normal_bam, tumour_bam, snv_vcf, snv_maf, reference,
                           reference_vep, chromosomes, normal_id, tumour_id,
                           single_node=None):
    """Build a MuTect SNV-calling workflow for one tumour/normal pair.

    Calls MuTect either as one multi-core job or fanned out over genomic
    intervals, merges/normalizes the calls, and annotates the final VCF
    into a MAF via the vcf2maf subworkflow.

    :param snv_vcf: output path for the finalised SNV VCF
    :param snv_maf: output path for the annotated MAF
    :param single_node: run MuTect as a single multi-core job
    :return: pypeliner Workflow
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutect.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        # single job over all intervals; writes merged.vcf directly
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00', ncpus=8, disk=600),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(mgd.TempSpace("run_mutect_temp"),
                  mgd.TempOutputFile('merged.vcf'),
                  reference,
                  mgd.InputChunks('interval'),
                  mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam)),
        )
    else:
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval',),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(mgd.TempOutputFile('mutect.vcf', 'interval'),
                  reference,
                  mgd.InputInstance('interval'),
                  mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam),
                  mgd.TempSpace('mutect_temp', 'interval')),
        )

        # merge is only needed for the per-interval path; in single-node
        # mode merged.vcf is already produced above (two producers of the
        # same managed temp output would be invalid)
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    # fixed copy-paste job name: this annotates the MuTect SNV VCF, it is
    # not the strelka indel MAF (previous name: 'strelka_indel_maf')
    workflow.subworkflow(
        name="mutect_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    return workflow
def create_museq_workflow(snv_vcf, museqportrait_pdf, reference, chromosomes,
                          thousand_genomes=None, dbsnp=None,
                          germline_refdata=None, tumour_bam=None,
                          normal_bam=None, single_node=None):
    """Build a MutationSeq SNV-calling workflow.

    Supports paired (tumour+normal), tumour-only and normal-only modes; the
    run job name records which BAMs were supplied. Produces a finalised VCF
    plus a museqportrait QC PDF.

    :param single_node: run museq as one multi-core job over all intervals
    :return: pypeliner Workflow
    """
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    # 'single' flags single-sample mode (anything except paired) for the
    # portrait step
    single = False if name == 'run_museq_tumour_normal' else True

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        # one big job over all intervals; writes merged.vcf directly
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00',
                                        ncpus='8', disk=600),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            })
    else:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'docker_image': config.containers('mutationseq'),
            })

        # merge is only needed for the per-interval path; single-node mode
        # already produced merged.vcf
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')})

    # QC portrait over the final call set; germline resources are optional
    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    return workflow
def create_consensus_workflow(museq_germline, museq_snv, strelka_snv,
                              strelka_indel, somatic_calls, somatic_snpeff,
                              somatic_ma, somatic_ids, indel_calls,
                              indel_snpeff, indel_ma, indel_ids,
                              germline_calls, germline_snpeff, germline_ma,
                              germline_ids, refdir):
    """Build a consensus variant-calling workflow.

    Parses the museq germline, museq somatic SNV, strelka SNV and strelka
    indel VCFs into call/snpeff/mutationassessor/ID tables, then merges
    the overlapping museq+strelka somatic SNV tables into the consensus
    somatic outputs. Germline and indel tables are written directly from
    their single callers.

    :param refdir: reference dir used to look up the chromosome list
    :return: pypeliner Workflow
    """
    params = config.default_params('variant_calling')
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_museq_germlines',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(museq_germline, extensions=['.csi', '.tbi']),
              mgd.OutputFile(germline_calls, extensions=['.yaml']),
              mgd.OutputFile(germline_snpeff, extensions=['.yaml']),
              mgd.OutputFile(germline_ma, extensions=['.yaml']),
              mgd.OutputFile(germline_ids, extensions=['.yaml']),
              params["parse_museq"],
              chromosomes,
              mgd.TempSpace("tempdir_parse_germlines")),
    )

    workflow.transform(
        name='parse_strelka_indel',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(strelka_indel, extensions=['.csi', '.tbi']),
              mgd.OutputFile(indel_calls, extensions=['.yaml']),
              mgd.OutputFile(indel_snpeff, extensions=['.yaml']),
              mgd.OutputFile(indel_ma, extensions=['.yaml']),
              mgd.OutputFile(indel_ids, extensions=['.yaml']),
              params["parse_strelka"],
              chromosomes,
              mgd.TempSpace("tempdir_strelka_indel")),
    )

    # the two somatic SNV callers go to temp tables first, then get merged
    workflow.transform(
        name='parse_museq_snv',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(museq_snv, extensions=['.csi', '.tbi']),
              mgd.TempOutputFile('museq_snv.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_snpeff.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_ma.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_ids.csv', extensions=['.yaml']),
              params["parse_museq"],
              chromosomes,
              mgd.TempSpace("tempdir_parse_museq_snv")),
    )

    workflow.transform(
        name='parse_strelka_snv',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(strelka_snv, extensions=['.csi', '.tbi']),
              mgd.TempOutputFile('strelka_snv.csv', extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_snpeff.csv', extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_ma.csv', extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_ids.csv', extensions=['.yaml']),
              params["parse_strelka"],
              chromosomes,
              mgd.TempSpace("tempdir_parse_strelka_snv")),
    )

    workflow.transform(
        name='merge_snvs',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_snv.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_calls, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_snpeff',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_snpeff.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_snpeff.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_snpeff, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ma',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ma.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ma.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_ma, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ids',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ids.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ids.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_ids, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    return workflow
def lumpy_preprocess_workflow(bamfile, discordants_sorted_bam,
                              splitters_sorted_bam, single_node=False):
    """Build the lumpy preprocessing workflow for a single BAM.

    Extracts discordant read pairs and split reads from the BAM and sorts
    both into coordinate-sorted BAMs, either as one combined job
    (single_node) or as four separate jobs.

    NOTE(review): the distributed-mode job names and temp files carry a
    '_normal'/'normal.' suffix even though this workflow is also invoked
    for the tumour BAM (as a separate subworkflow, so no collision) —
    presumably historical naming; confirm before renaming.

    :param bamfile: input BAM path
    :param discordants_sorted_bam: output path for sorted discordant reads
    :param splitters_sorted_bam: output path for sorted split reads
    :param single_node: run everything in one job
    :return: pypeliner Workflow
    """
    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='run_lumpy_preprocess',
            ctx=helpers.get_default_ctx(memory=10, walltime='96:00', disk=300),
            func='wgs.workflows.lumpy.tasks.run_lumpy_preprocess',
            args=(mgd.InputFile(bamfile),
                  mgd.OutputFile(discordants_sorted_bam),
                  mgd.OutputFile(splitters_sorted_bam),
                  mgd.TempSpace("lumpy_preprocess_temp"),
                  config.default_params('breakpoint_calling')['lumpy_paths']),
            kwargs={
                'lumpy_docker_image': config.containers('lumpy'),
                'samtools_docker_image': config.containers('samtools')
            })
    else:
        # extract discordant pairs
        workflow.transform(
            name='run_samtools_view_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_samtools_view',
            args=(
                mgd.InputFile(bamfile),
                mgd.TempOutputFile('normal.discordants.unsorted.bam'),
            ),
            kwargs={'docker_image': config.containers('samtools')})

        # extract split reads with lumpy's bwa-mem helper script
        workflow.transform(
            name='run_lumpy_extract_split_reads_bwamem_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_lumpy_extract_split_reads_bwamem',
            args=(mgd.InputFile(bamfile),
                  mgd.TempOutputFile('normal.splitters.unsorted.bam'),
                  config.default_params('breakpoint_calling')['lumpy_paths']),
            kwargs={'docker_image': config.containers('lumpy')})

        workflow.transform(
            name='run_samtools_sort_discordants_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_samtools_sort',
            args=(
                mgd.TempInputFile('normal.discordants.unsorted.bam'),
                mgd.OutputFile(discordants_sorted_bam),
            ),
            kwargs={'docker_image': config.containers('samtools')})

        workflow.transform(
            name='run_samtools_sort_splitters_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_samtools_sort',
            args=(
                mgd.TempInputFile('normal.splitters.unsorted.bam'),
                mgd.OutputFile(splitters_sorted_bam),
            ),
            kwargs={'docker_image': config.containers('samtools')})

    return workflow
def create_hmmcopy_workflow(
        bam_file,
        sample_id,
        bias_pdf,
        correction_pdf,
        hmmcopy_pdf,
        hmmcopy_table,
        pygenes_table,
        chromosomes,
        map_wig,
        gc_wig,
        pygenes_gtf,
):
    """Build a single-sample HMMcopy copy-number workflow.

    Counts reads into wig bins, applies GC/mappability correction, runs
    HMMcopy segmentation, renders the QC plots, and annotates the
    resulting segments with gene names.
    """
    copynumber_params = config.default_params()['copynumber_calling']

    workflow = pypeliner.workflow.Workflow()

    # bin read counts over the requested chromosomes
    workflow.transform(
        name='hmmcopy_readcounter',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00',
        ),
        func='wgs.workflows.hmmcopy.tasks.hmmcopy_readcounter',
        args=(
            mgd.InputFile(bam_file, extensions=['.bai']),
            mgd.TempOutputFile('infile.wig'),
            chromosomes,
            copynumber_params['readcounter'],
        ))

    # GC and mappability correction of the raw counts
    workflow.transform(
        name='calc_corr',
        func='wgs.workflows.hmmcopy.tasks.calc_corr',
        args=(
            mgd.TempInputFile('infile.wig'),
            mgd.TempOutputFile('infile_copy.txt'),
            mgd.TempOutputFile('infile_copy.obj'),
            gc_wig,
            map_wig,
            copynumber_params['map_cutoff'],
        ),
    )

    # HMM segmentation over the corrected copy signal
    workflow.transform(
        name='run_hmmcopy',
        func='wgs.workflows.hmmcopy.tasks.run_hmmcopy',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('infile_copy.txt'),
            mgd.TempOutputFile('hmmcopy_res.obj'),
            mgd.TempOutputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(hmmcopy_table),
            sample_id,
            copynumber_params['hmmcopy_params'],
        ),
    )

    # bias/correction/segmentation QC plots
    workflow.transform(
        name='plot_hmm',
        func='wgs.workflows.hmmcopy.tasks.plot_hmm',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('hmmcopy_res.obj'),
            mgd.TempSpace('correction_plots_dir'),
            mgd.TempSpace('hmmcopy_plots_dir'),
            mgd.OutputFile(bias_pdf),
            mgd.OutputFile(correction_pdf),
            mgd.OutputFile(hmmcopy_pdf),
        ),
    )

    # attach gene annotations to the segments
    workflow.transform(
        name='annot_hmm',
        func='wgs.workflows.hmmcopy.tasks.annot_hmm',
        args=(
            mgd.TempInputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(pygenes_table),
            pygenes_gtf,
        ))

    return workflow
def create_strelka_workflow(normal_bam_file, tumour_bam_file, snv_vcf_file,
                            snv_maf_file, indel_vcf_file, indel_maf_file,
                            reference, reference_vep, chromosomes, normal_id,
                            tumour_id, single_node=False, is_exome=False):
    """Build a Strelka somatic SNV/indel calling workflow.

    Runs Strelka either as one job or fanned out over regions, merges,
    normalizes and filters the calls into final SNV and indel VCFs, then
    annotates each into a MAF via the vcf2maf subworkflow.

    Fix: the distributed call_genome_segment path previously hard-coded
    ``'is_exome': False`` so the ``is_exome`` parameter was honoured only
    in single-node mode; it is now propagated on both paths.

    :param is_exome: run Strelka in exome mode
    :param single_node: run Strelka as a single job
    :return: pypeliner Workflow
    """
    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        # single job produces the merged indel/snv vcfs directly
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt', 'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                # bug fix: propagate the flag instead of hard-coding False
                'is_exome': is_exome,
            })

        # merging only applies to the per-region path; single-node mode
        # already wrote the merged vcfs
        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    workflow.transform(
        name='bcftools_normalize_snv',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('snvs.vcf.gz'),
            mgd.TempOutputFile('normalized_snvs.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='bcftools_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('indels.vcf.gz'),
            mgd.TempOutputFile('normalized_indels.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="strelka_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    workflow.subworkflow(
        name="strelka_indel_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    return workflow
def collect_bam_metrics(
        bam, markdups_metrics, sample_id, refdir, metrics,
        picard_insert_metrics, picard_insert_pdf, flagstat_metrics,
        picard_gc_metrics, picard_gc_summary, picard_gc_pdf,
        picard_wgs_metrics, bam_tdf, picard_mem=8):
    '''
    Build a workflow that computes QC metrics for a single bam:

    1. samtools flagstat + picard insert-size metrics
    2. picard GC-bias metrics
    3. picard WGS metrics
    4. igvtools coverage tdf

    and collates the flagstat/insert/wgs/markdups metrics into one
    per-sample table.

    :param bam: path to the input bam (indexed)
    :param markdups_metrics: existing mark-duplicates metrics file (input,
        produced upstream; merged into the collated table)
    :param sample_id: sample label written into the collated table
    :param refdir: reference directory; provides the reference genome,
        picard wgs params and reference type
    :param metrics: output collated metrics csv (with .yaml metadata)
    :param picard_insert_metrics: output picard insert-size metrics file
    :param picard_insert_pdf: output picard insert-size histogram pdf
    :param flagstat_metrics: output samtools flagstat file
    :param picard_gc_metrics: output picard GC-bias metrics file
    :param picard_gc_summary: output picard GC-bias summary file
    :param picard_gc_pdf: output picard GC-bias chart pdf
    :param picard_wgs_metrics: output picard WGS metrics file
    :param bam_tdf: output igvtools count tdf for IGV display
    :param picard_mem: java heap size (GB) passed to the picard tasks
    :return: pypeliner workflow
    '''
    ref_genome = config.refdir_data(refdir)['paths']['reference']
    picard_wgs_params = config.default_params('alignment')['picard_wgs_params']
    reftype = config.refdir_data(refdir)['params']['reference_type']

    workflow = pypeliner.workflow.Workflow()

    # produces both the flagstat and the picard insert metrics outputs
    workflow.transform(
        name="calc_picard_insert_metrics",
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        func='wgs.workflows.alignment.tasks.bam_collect_insert_metrics',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(flagstat_metrics),
            mgd.OutputFile(picard_insert_metrics),
            mgd.OutputFile(picard_insert_pdf),
            mgd.TempSpace('picard_insert'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)})

    # GC-bias outputs are standalone files; they are not part of the
    # collated csv built by 'collect_metrics' below
    workflow.transform(
        name="calc_picard_gc_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_gc_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(mgd.InputFile(bam), ref_genome,
              mgd.OutputFile(picard_gc_metrics),
              mgd.OutputFile(picard_gc_summary),
              mgd.OutputFile(picard_gc_pdf),
              mgd.TempSpace('picard_gc')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name="calc_picard_wgs_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_wgs_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(mgd.InputFile(bam), ref_genome,
              mgd.OutputFile(picard_wgs_metrics),
              picard_wgs_params,
              mgd.TempSpace('picard_wgs')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name='igvtools_tdf',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        func='wgs.workflows.alignment.tasks.get_igvtools_count',
        args=(pypeliner.managed.InputFile(bam),
              pypeliner.managed.OutputFile(bam_tdf),
              reftype),
    )

    # merge flagstat + insert + wgs + markdups metrics into one table
    workflow.transform(
        name='collect_metrics',
        func='wgs.workflows.alignment.tasks.bam_collect_all_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00', disk=400),
        args=(mgd.InputFile(flagstat_metrics),
              mgd.InputFile(picard_insert_metrics),
              mgd.InputFile(picard_wgs_metrics),
              mgd.InputFile(markdups_metrics),
              mgd.OutputFile(metrics, extensions=['.yaml']),
              sample_id),
        kwargs={
            'main_dtypes': dtypes()['metrics'],
            'insert_dtypes': dtypes()['insert_metrics']
        })

    return workflow
def align_sample_split(fastq_1, fastq_2, out_file, samtools_flagstat,
                       sample_id, lane_id, sample_info, refdir, picard_mem=2):
    '''
    Build a workflow that aligns one lane of paired fastqs with bwa mem by
    splitting the reads into chunks, aligning each chunk in parallel, then
    sorting, merging, indexing and flagstat-ing the resulting bam.

    :param fastq_1: read-1 fastq path
    :param fastq_2: read-2 fastq path
    :param out_file: output merged, sorted bam (a .bai is written alongside)
    :param samtools_flagstat: output samtools flagstat file
    :param sample_id: sample label for read-group tagging
    :param lane_id: lane label for read-group tagging
    :param sample_info: read-group metadata passed through to bwa mem
    :param refdir: reference directory providing the bwa reference
    :param picard_mem: java heap size (GB) for the picard sort step
    :return: pypeliner workflow
    '''
    ref_genome = config.refdir_data(refdir)['paths']['reference']
    split_size = config.default_params('alignment')['split_size']
    bai_file = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_fastq_1',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            mgd.InputFile(fastq_1),
            mgd.TempOutputFile('read_1', 'split'),
            split_size,
        ),
    )

    # axes_origin=[] so the 'split' axis is defined once (by split_fastq_1)
    # and this job only populates the existing chunks
    workflow.transform(
        name='split_fastq_2',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            mgd.InputFile(fastq_2),
            mgd.TempOutputFile('read_2', 'split', axes_origin=[]),
            split_size,
        ),
    )

    workflow.transform(
        name='align_bwa_mem',
        axes=('split',),
        ctx=helpers.get_default_ctx(memory=8, walltime='16:00', ncpus=8),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            mgd.TempInputFile('read_1', 'split'),
            mgd.TempInputFile('read_2', 'split'),
            ref_genome,
            mgd.TempOutputFile('aligned.bam', 'split'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
        })

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(mgd.TempInputFile('aligned.bam', 'split'),
              mgd.TempOutputFile('sorted.bam', 'split'),
              mgd.TempSpace('bam_sort_by_split', 'split')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    # NOTE(review): 'sort' passes mem as '{N}G' while 'merge' passes the
    # bare integer — confirm against merge_bams' expected format
    workflow.transform(
        name='merge',
        ctx=helpers.get_default_ctx(memory=8, walltime='72:00'),
        func="wgs.workflows.alignment.tasks.merge_bams",
        args=(mgd.TempInputFile('sorted.bam', 'split'),
              mgd.OutputFile(out_file),
              mgd.TempSpace('bam_merge_by_split')),
        kwargs={'mem': picard_mem})

    workflow.commandline(
        name='index',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=('samtools', 'index',
              mgd.InputFile(out_file),
              mgd.OutputFile(bai_file)),
    )

    workflow.commandline(
        name='flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=('samtools', 'flagstat',
              mgd.InputFile(out_file),
              '>',
              mgd.OutputFile(samtools_flagstat)),
    )

    return workflow
def create_freebayes_germline_workflow(
        germline_vcf, germline_maf, bam_file, reference, reference_vep,
        chromosomes, normal_id, single_node=None):
    '''
    Build a workflow that calls germline variants with freebayes over
    genome intervals, merges and normalizes the calls into a single vcf,
    then annotates the vcf into a maf via vcf2maf.

    :param germline_vcf: output germline vcf (tabix/csi indexed)
    :param germline_maf: output annotated maf
    :param bam_file: input normal bam path
    :param reference: reference fasta path
    :param reference_vep: vep reference/cache path for vcf2maf
    :param chromosomes: chromosomes used to generate calling intervals
    :param normal_id: normal sample id passed through to vcf2maf
    :param single_node: if truthy, run all intervals in one multi-core job
        instead of one job per interval
    :return: pypeliner workflow
    '''
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.freebayes.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        # all intervals in one multi-core job; avoids scheduler overhead
        workflow.transform(
            name='freebayes_one_node',
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00',
                                        ncpus=8, disk=600),
            func='wgs.workflows.freebayes.tasks.run_freebayes_one_job',
            args=(mgd.TempSpace("run_freebayes_temp"),
                  mgd.TempOutputFile('merged.vcf'),
                  reference,
                  mgd.InputChunks('interval'),
                  mgd.InputFile(bam_file)))
    else:
        workflow.transform(
            name='freebayes',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval',),
            func='wgs.workflows.freebayes.tasks.run_freebayes_germline',
            args=(mgd.TempOutputFile('freebayes_germline.vcf', 'interval'),
                  reference,
                  mgd.InputInstance('interval'),
                  mgd.InputFile(bam_file),
                  mgd.TempSpace('tempdir_freebayes', 'interval')),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('freebayes_germline.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(germline_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    # fix: the maf output previously declared extensions=['.tbi', '.csi'],
    # but vcf2maf does not index the maf, so those outputs were never
    # produced; the strelka maf subworkflows in this file pass the maf
    # with no extensions, matched here
    workflow.subworkflow(
        name="freebayes_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(germline_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(germline_maf),
            reference_vep,
        ),
        kwargs={'normal_id': normal_id})

    return workflow
def create_samtools_germline_workflow(
        germline_vcf, germline_roh, bam_file, reference, chromosomes,
        single_node=None):
    '''
    Build a workflow that calls germline variants with samtools/bcftools
    over genome intervals, merges and finalises the calls into an indexed
    vcf, and then runs ROH calling on the result.

    :param germline_vcf: output germline vcf (tabix/csi indexed)
    :param germline_roh: output runs-of-homozygosity file
    :param bam_file: input normal bam path
    :param reference: reference fasta path
    :param chromosomes: chromosomes used to generate calling intervals
    :param single_node: if truthy, run all intervals in one multi-core job
        instead of one job per interval
    :return: pypeliner workflow
    '''
    split_size = config.default_params('variant_calling')['split_size']
    samtools_image = config.containers('samtools')
    vcftools_image = config.containers('vcftools')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.samtools_germline.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': split_size})

    if single_node:
        # one multi-core job covering every interval
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00',
                                        ncpus=8, disk=600),
            func='wgs.workflows.samtools_germline.tasks.'
                 'run_samtools_germline_one_job',
            args=(mgd.TempSpace("run_samtools_temp"),
                  mgd.TempOutputFile('merged.vcf'),
                  reference,
                  mgd.InputChunks('interval'),
                  mgd.InputFile(bam_file)),
            kwargs={
                'samtools_docker_image': samtools_image,
                'vcftools_docker_image': vcftools_image,
            })
    else:
        # one job per interval, merged afterwards
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline',
            args=(mgd.TempOutputFile('germline.vcf.gz', 'interval'),
                  reference,
                  mgd.InputInstance('interval'),
                  mgd.InputFile(bam_file)),
            kwargs={'docker_image': samtools_image})

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('germline.vcf.gz', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': vcftools_image})

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(germline_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': vcftools_image})

    workflow.transform(
        name='roh_calling',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.workflows.samtools_germline.tasks.roh_calling',
        args=(mgd.InputFile(germline_vcf, extensions=['.tbi', '.csi']),
              mgd.OutputFile(germline_roh)),
        kwargs={'docker_image': vcftools_image})

    return workflow