def create_battenberg_workflow(
        seqdata_files,
        config,
        out_file,
        raw_data_dir,
        somatic_breakpoint_file=None,
        normal_id=None,
        **kwargs):
    if normal_id is None:
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
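# Illustrative usage sketch (not part of the original module): each key of
# seqdata_files other than normal_id becomes one 'sample_id' chunk via
# setobj/OutputChunks, so run_battenberg fans out per tumour sample. All
# paths and the empty config below are hypothetical placeholders.
def _example_run_battenberg(pypeliner_config):
    seqdata_files = {
        'normal': '/data/normal.h5',      # hypothetical path
        'tumour_a': '/data/tumour_a.h5',  # hypothetical path
    }
    workflow = create_battenberg_workflow(
        seqdata_files,
        config={},                        # placeholder Battenberg config
        out_file='/out/battenberg.h5',
        raw_data_dir='/out/raw',
        normal_id='normal',
    )
    pyp = pypeliner.app.Pypeline([], pypeliner_config)
    pyp.run(workflow)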
def main(args):
    config = cli.load_pypeliner_config(args)

    pyp = pypeliner.app.Pypeline([], config)

    workflow = Workflow()

    workflow.subworkflow(
        name='snpeff',
        func=snpeff.create_snpeff_annotation_workflow,
        args=(
            pypeliner.managed.InputFile(args.target_vcf_file),
            pypeliner.managed.TempOutputFile('snpeff.h5'),
        ),
        kwargs={
            'data_base': args.data_base,
            'split_size': args.split_size,
            'table_name': 'snpeff',
        },
    )

    workflow.transform(
        name='convert_to_tsv',
        ctx={'mem': 2},
        func=convert_hdf5_to_tsv,
        args=(
            pypeliner.managed.TempInputFile('snpeff.h5'),
            'snpeff',
            pypeliner.managed.OutputFile(args.out_file),
        ),
        kwargs={
            'compress': True,
            'index': False,
        },
    )

    pyp.run(workflow)
def create_mappability_wig_file(config, out_file):
    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability_bigwig',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.OutputFile(out_file + '.bigwig'),
        ),
    )

    workflow.commandline(
        name='convert_mappability_to_wig',
        ctx={'mem': 4},
        args=(
            'mapCounter',
            '-w', config['window_size'],
            pypeliner.managed.InputFile(out_file + '.bigwig'),
            '>',
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
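# The commandline job above resolves to a shell command of the form
#   mapCounter -w <window_size> <out_file>.bigwig > <out_file>
# with pypeliner handling the '>' redirection. A minimal sketch of the config
# this workflow expects; the URL and window size are assumptions:
EXAMPLE_MAPPABILITY_CONFIG = {
    'mappability_url': 'http://example.org/mappability.bigwig',  # hypothetical
    'window_size': 1000,                                         # hypothetical
}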
def create_setup_theta_workflow(config, databases, **kwargs):
    mappability_dir = os.path.realpath(
        os.path.join(os.path.dirname(config['mappability_template']), os.pardir))
    map_extract_log = os.path.join(mappability_dir, 'mappability_extract.log')
    chromosomes_dir = os.path.dirname(config['chromosome_template'])

    utils.make_directory(mappability_dir)
    utils.make_directory(chromosomes_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.TempOutputFile('mappability.tar.gz'),
        ),
    )

    workflow.commandline(
        name='extract_mappability',
        args=(
            'tar', '-xzvf',
            pypeliner.managed.TempInputFile('mappability.tar.gz'),
            '-C', mappability_dir,
            '>',
            pypeliner.managed.OutputFile(map_extract_log),
        ),
    )

    for chromosome in config['chromosomes']:
        workflow.subworkflow(
            name='download_chromosome_{}'.format(chromosome),
            func=biowrappers.components.io.download.create_download_workflow,
            args=(
                config['chromosome_url_template'].format(chromosome),
                pypeliner.managed.TempOutputFile('chromosome_{}.fa.gz'.format(chromosome)),
            ),
        )

        workflow.commandline(
            name='extract_chromosome_{}'.format(chromosome),
            args=(
                'gunzip', '-c',
                pypeliner.managed.TempInputFile('chromosome_{}.fa.gz'.format(chromosome)),
                '>',
                pypeliner.managed.OutputFile(config['chromosome_template'].format(chromosome)),
            ),
        )

    return workflow
def realignment_readgroups_pipeline(
        config,
        in_file,
        out_file):
    workflow = Workflow()

    workflow.transform(
        name='get_read_group_configs',
        func=tasks.get_read_group_configs,
        ret=pypeliner.managed.TempOutputObj('read_group_config', 'read_group_id'),
        args=(
            pypeliner.managed.InputFile(in_file),
        ),
    )

    workflow.commandline(
        name='create_read_group_bam',
        axes=('read_group_id',),
        args=(
            'samtools', 'view', '-b',
            '-r', pypeliner.managed.InputInstance('read_group_id'),
            pypeliner.managed.InputFile(in_file),
            '>',
            pypeliner.managed.TempOutputFile('read_group_bam', 'read_group_id'),
        ),
    )

    workflow.subworkflow(
        name='realignment_pipeline',
        axes=('read_group_id',),
        func=realignment_pipeline,
        args=(
            config,
            pypeliner.managed.TempInputFile('read_group_bam', 'read_group_id'),
            pypeliner.managed.TempOutputFile('realigned_read_group_bam', 'read_group_id'),
        ),
        kwargs={
            'read_group_info': pypeliner.managed.TempInputObj('read_group_config', 'read_group_id'),
        },
    )

    # Merges across all read groups into a single bam, so this transform is
    # not split on the 'read_group_id' axis.
    workflow.transform(
        name='merge_and_markdups',
        ctx={'mem': 48, 'num_retry': 3, 'mem_retry_increment': 16},
        func=bam_tasks.mark_duplicates,
        args=(
            pypeliner.managed.TempInputFile('realigned_read_group_bam', 'read_group_id'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp'),
        },
    )

    return workflow
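# Hedged sketch (an assumption, not the actual tasks.get_read_group_configs):
# a task bound to TempOutputObj('read_group_config', 'read_group_id') is
# expected to return a dict keyed by read group ID; pypeliner turns each key
# into one 'read_group_id' chunk for the downstream axed jobs.
def _example_get_read_group_configs(bam_path):
    import pysam  # assumed available alongside the samtools-based steps
    read_groups = {}
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        for rg in bam.header.to_dict().get('RG', []):
            read_groups[rg['ID']] = rg
    return read_groups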
def create_setup_titan_workflow(config, databases, **kwargs):
    workflow = Workflow()

    workflow.subworkflow(
        name='gc_wig',
        func=create_gc_wig_file,
        args=(
            config,
            pypeliner.managed.InputFile(databases['ref_genome']['local_path']),
            pypeliner.managed.OutputFile(config['gc_wig']),
        ),
    )

    workflow.subworkflow(
        name='mappability_wig',
        func=create_mappability_wig_file,
        args=(
            config,
            pypeliner.managed.OutputFile(config['mappability_wig']),
        ),
    )

    return workflow
def create_dbsnp_download_workflow(config, out_file):
    workflow = Workflow()

    workflow.subworkflow(
        name='download',
        func=download.create_download_workflow,
        args=(
            config['url'],
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    workflow.transform(
        name='index',
        ctx={'mem': 4},
        func=vcf_tasks.index_vcf,
        args=(
            pypeliner.managed.InputFile(out_file),
        ),
    )

    return workflow
def call_and_annotate_pipeline(
        config,
        normal_bam_path,
        tumour_bam_paths,
        raw_data_dir,
        results_file,
        chromosomes=default_chromosomes):
    workflow = Workflow()

    workflow.setobj(
        pypeliner.managed.OutputChunks('tumour_sample_id', axes_origin=[0, ]),
        tumour_bam_paths.keys(),
    )

    variant_files = get_variant_files(chromosomes, config, raw_data_dir)

    normal_bam_file = pypeliner.managed.File(normal_bam_path)
    tumour_bam_files = pypeliner.managed.File('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths)
    ref_genome_fasta_file = pypeliner.managed.File(config['databases']['ref_genome']['local_path'])

    #===================================================================================================================
    # Multi sample calling
    #===================================================================================================================
    if 'nuseq_multi_sample' in config:
        workflow.subworkflow(
            name='nuseq_multi_sample',
            axes=(),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(),
                [pypeliner.managed.InputFile(x) for x in tumour_bam_paths.values()],
                ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_output(),
            ),
            kwargs=config['nuseq_multi_sample']['kwargs'])

        workflow.transform(
            name='convert_nuseq_multi_sample_vcf_to_hdf5',
            axes=(),
            ctx=default_ctx,
            func='biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5',
            args=(
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_input(),
                variant_files['snv']['hdf']['nuseq_multi_sample'].as_output(),
                '/snv/vcf/nuseq_multi_sample/all',
            ),
            kwargs={'score_callback': vcf_score_callbacks['snv']['nuseq']})

    #===================================================================================================================
    # Single sample calling
    #===================================================================================================================
    if 'nuseq' in config:
        workflow.subworkflow(
            name='nuseq',
            axes=('tumour_sample_id',),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(),
                [tumour_bam_files.as_input(), ],
                ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq'].as_output(),
            ),
            kwargs=config['nuseq']['kwargs'])

    if 'mutect' in config:
        workflow.subworkflow(
            name='mutect',
            axes=('tumour_sample_id',),
            func='biowrappers.components.variant_calling.mutect.create_mutect_workflow',
            args=(
                normal_bam_file.as_input(),
                tumour_bam_files.as_input(),
                ref_genome_fasta_file.as_input(),
                config['databases']['cosmic']['local_path'],
                config['databases']['dbsnp']['local_path'],
                variant_files['snv']['vcf']['mutect'].as_output(),
            ),
            kwargs=config['mutect']['kwargs'])

    if 'strelka' in config:
        workflow.subworkflow(
            name='strelka',
            axes=('tumour_sample_id',),
            func='biowrappers.components.variant_calling.strelka.create_strelka_workflow',
            args=(
                normal_bam_file.as_input(),
                tumour_bam_files.as_input(),
                ref_genome_fasta_file.as_input(),
                variant_files['indel']['vcf']['strelka'].as_output(),
                variant_files['snv']['vcf']['strelka'].as_output(),
            ),
            kwargs=config['strelka']['kwargs'])

    #===================================================================================================================
    # Convert vcf to hdf5
    #===================================================================================================================
    for var_type in variant_files:
        for prog in variant_files[var_type]['vcf']:
            if prog == 'nuseq_multi_sample':
                continue

            workflow.transform(
                name='convert_{0}_{1}_to_hdf5'.format(prog, var_type),
                axes=('tumour_sample_id',),
                ctx=default_ctx,
                func='biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5',
                args=(
                    variant_files[var_type]['vcf'][prog].as_input(),
                    variant_files[var_type]['hdf'][prog].as_output(),
                    pypeliner.managed.Template(
                        '/{var_type}/vcf/{prog}/{{tumour_sample_id}}'.format(prog=prog, var_type=var_type),
                        'tumour_sample_id'),
                ),
                kwargs={'score_callback': vcf_score_callbacks[var_type][prog]})

    #===================================================================================================================
    # Indel annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_indels',
        ctx=big_mem_ctx,
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=(
            [x.as_input() for x in variant_files['indel']['vcf'].values()],
            pypeliner.managed.TempOutputFile('all.indel.vcf'),
        ))

    workflow.transform(
        name='finalise_indels',
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        args=(
            pypeliner.managed.TempInputFile('all.indel.vcf'),
            pypeliner.managed.TempOutputFile('all.indel.vcf.gz'),
        ))

    workflow.subworkflow(
        name='annotate_indels',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.indel.vcf.gz'),
            pypeliner.managed.TempOutputFile('indel_annotations.h5'),
            os.path.join(raw_data_dir, 'indel'),
        ),
        kwargs={'variant_type': 'indel'})

    #===================================================================================================================
    # SNV annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_snvs',
        ctx=big_mem_ctx,
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=(
            [x.as_input() for x in variant_files['snv']['vcf'].values()],
            pypeliner.managed.TempOutputFile('all.snv.vcf'),
        ))

    workflow.transform(
        name='finalise_snvs',
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        args=(
            pypeliner.managed.TempInputFile('all.snv.vcf'),
            pypeliner.managed.TempOutputFile('all.snv.vcf.gz'),
        ))

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={'variant_type': 'snv'})

    workflow.subworkflow(
        name='normal_snv_counts',
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            normal_bam_file.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        ),
        kwargs=get_kwargs(config['snv_counts']['kwargs'], '/snv/counts/normal'))

    workflow.subworkflow(
        name='tumour_snv_counts',
        axes=('tumour_sample_id',),
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            tumour_bam_files.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'snv', 'counts', '{tumour_sample_id}.h5'),
                'tumour_sample_id'),
        ),
        kwargs=get_kwargs(
            config['snv_counts']['kwargs'],
            pypeliner.managed.Template('/snv/counts/{tumour_sample_id}', 'tumour_sample_id')))

    #===================================================================================================================
    # Create final output
    #===================================================================================================================
    tables = [
        pypeliner.managed.TempInputFile('indel_annotations.h5'),
        pypeliner.managed.TempInputFile('snv_annotations.h5'),
        pypeliner.managed.InputFile(os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', '{tumour_sample_id}.h5'),
            'tumour_sample_id'),
    ]

    for var_type in variant_files:
        for prog in variant_files[var_type]['hdf']:
            tables.append(variant_files[var_type]['hdf'][prog].as_input())

    workflow.transform(
        name='build_results_file',
        ctx=default_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            tables,
            pypeliner.managed.OutputFile(results_file),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    return workflow
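# Hedged usage sketch for the variant calling pipeline above; sample IDs and
# paths are hypothetical, and config must carry one section per enabled caller
# (e.g. 'mutect', 'strelka') plus the 'databases' entries referenced above.
def _example_run_variant_calling(config, pypeliner_config):
    tumour_bam_paths = {'tumour_a': '/bams/tumour_a.bam'}  # hypothetical
    workflow = call_and_annotate_pipeline(
        config,
        '/bams/normal.bam',           # hypothetical
        tumour_bam_paths,
        '/out/variants/raw',
        '/out/variants/results.h5',
    )
    pypeliner.app.Pypeline([], pypeliner_config).run(workflow)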
def call_and_annotate_pipeline(
        config,
        normal_bam_path,
        tumour_bam_paths,
        raw_data_dir,
        results_file,
):
    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id'),
        value=tumour_bam_paths.keys(),
    )

    merge_inputs = {}

    if 'destruct' in config:
        destruct_raw_data = os.path.join(raw_data_dir, 'destruct')
        destruct_results_filename = os.path.join(destruct_raw_data, 'results.h5')
        make_parent_directory(destruct_results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                pypeliner.managed.OutputFile(destruct_results_filename),
                destruct_raw_data,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = pypeliner.managed.InputFile(destruct_results_filename)

    if 'delly' in config:
        delly_raw_data = os.path.join(raw_data_dir, 'delly')
        delly_results_filename = os.path.join(delly_raw_data, 'results.h5')
        make_parent_directory(delly_results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                pypeliner.managed.OutputFile(delly_results_filename),
                delly_raw_data,
            ),
        )

        merge_inputs['/breakpoints/delly'] = pypeliner.managed.InputFile(delly_results_filename)

    if 'lumpysv' in config:
        lumpysv_raw_data = os.path.join(raw_data_dir, 'lumpysv')
        lumpysv_results_filename = os.path.join(lumpysv_raw_data, 'results.h5')
        make_parent_directory(lumpysv_results_filename)

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths),
                pypeliner.managed.OutputFile(lumpysv_results_filename),
                lumpysv_raw_data,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = pypeliner.managed.InputFile(lumpysv_results_filename)

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            pypeliner.managed.OutputFile(results_file),
        ),
    )

    return workflow
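# The breakpoint pipeline above only runs the callers whose keys appear in
# config. A minimal hedged sketch of that structure; all paths are
# hypothetical placeholders:
EXAMPLE_BREAKPOINT_CONFIG = {
    'destruct': {
        'config': {},                                  # destruct-specific settings
        'ref_data_dir': '/refdata/destruct',           # hypothetical
    },
    'delly': {
        'ref_genome_fasta_file': '/refdata/ref.fa',    # hypothetical
        'exclude_file': '/refdata/delly_exclude.tsv',  # hypothetical
    },
    'lumpysv': {},
}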
def create_strelka_workflow(
        normal_bam_file,
        tumour_bam_file,
        snv_vcf_file,
        snv_maf_file,
        indel_vcf_file,
        indel_maf_file,
        reference,
        reference_vep,
        chromosomes,
        normal_id,
        tumour_id,
        single_node=False,
        is_exome=False):
    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func='wgs.workflows.strelka.tasks.count_fasta_bases',
        args=(
            reference,
            mgd.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name='get_chrom_sizes',
        func='wgs.workflows.strelka.tasks.get_known_chromosome_sizes',
        ret=mgd.TempOutputObj('known_sizes'),
        args=(
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ))

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func='wgs.workflows.strelka.tasks.strelka_one_node',
            args=(
                mgd.InputFile(normal_bam_file, extensions=['.bai']),
                mgd.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions',),
            func='wgs.workflows.strelka.tasks.get_chromosome_depth',
            args=(
                mgd.InputInstance('regions'),
                mgd.InputFile(normal_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func='wgs.workflows.strelka.tasks.merge_chromosome_depths',
            args=(
                mgd.TempInputFile('chrom_depth.txt', 'regions', axes_origin=[]),
                mgd.TempOutputFile('merged_chrom_depth.txt'),
            ))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions',),
            func='wgs.workflows.strelka.tasks.call_genome_segment',
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                mgd.InputFile(normal_bam_file, extensions=['.bai']),
                mgd.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(
                mgd.TempInputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('indels_merge'),
            ),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(
                mgd.TempInputFile('snvs.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('snvs_merge'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize_snv',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('snvs.vcf.gz'),
            mgd.TempOutputFile('normalized_snvs.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='bcftools_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('indels.vcf.gz'),
            mgd.TempOutputFile('normalized_indels.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name='strelka_snv_maf',
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        })

    workflow.subworkflow(
        name='strelka_indel_maf',
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        })

    return workflow
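# Hedged usage sketch: with single_node=True all strelka region calls run
# inside one job; the default fans out one job per 'regions' chunk and merges
# the per-region VCFs. Paths, sample IDs and the chromosome list below are
# hypothetical.
def _example_run_strelka():
    return create_strelka_workflow(
        '/bams/normal.bam', '/bams/tumour.bam',             # hypothetical
        '/out/snvs.vcf.gz', '/out/snvs.maf',
        '/out/indels.vcf.gz', '/out/indels.maf',
        '/refdata/ref.fa', '/refdata/vep',                  # hypothetical
        ['chr{}'.format(i) for i in range(1, 23)] + ['chrX'],
        'normal_sample', 'tumour_sample',
        single_node=False,
    )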
def main(args):
    biowrappers.components.utils.make_directory(args.out_dir)

    with open(args.config_file) as config_file:
        config_text = config_file.read()
    config_text = config_text.format(out_dir=args.out_dir, ref_db_dir=args.ref_db_dir)
    config = yaml.safe_load(config_text)

    pypeliner_args = vars(args)
    pypeliner_args['tmpdir'] = os.path.join(args.out_dir, 'pipeline')

    pyp = pypeliner.app.Pypeline(modules=[tasks], config=pypeliner_args)

    download_urls = {}
    for sample in ('tumour', 'normal'):
        lanes = config['lanes'][sample]
        for lane in lanes:
            download_urls[(sample, lane)] = config['lanes'][sample][lane]['url']

    raw_lane_template = os.path.join(args.out_dir, 'lanes', 'raw', '{lane}.bam')
    realigned_lane_template = os.path.join(args.out_dir, 'lanes', 'realigned', '{lane}.bam')
    sample_bam_template = os.path.join(args.out_dir, '{sample}.bam')

    workflow = Workflow(default_ctx={'mem': 8})

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('url', 'sample', 'lane'),
        value=download_urls,
    )

    workflow.subworkflow(
        name='download_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            pypeliner.managed.TempInputObj('url', 'sample', 'lane'),
            pypeliner.managed.OutputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
        ),
    )

    workflow.subworkflow(
        name='realign_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.pipelines.realignment.realignment_pipeline,
        args=(
            config['realignment'],
            pypeliner.managed.InputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
            pypeliner.managed.OutputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
        ),
    )

    workflow.transform(
        name='merge_and_markdups',
        axes=('sample',),
        func=biowrappers.components.io.bam.tasks.mark_duplicates,
        args=(
            pypeliner.managed.InputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
            pypeliner.managed.OutputFile('bam', 'sample', template=sample_bam_template),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp', 'sample'),
        },
    )

    pyp.run(workflow)

    normal_bam_file = sample_bam_template.format(sample='normal')
    tumour_bam_file = sample_bam_template.format(sample='tumour')

    workflow = Workflow(default_ctx={'mem': 8})

    breakpoint_raw_data_dir = os.path.join(args.out_dir, 'breakpoints', 'raw')
    breakpoint_results_file = os.path.join(args.out_dir, 'breakpoints', 'results.h5')

    workflow.subworkflow(
        name='breakpoint_call_and_annotate',
        func=biowrappers.pipelines.breakpoint_call_and_annotate.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            breakpoint_raw_data_dir,
            pypeliner.managed.OutputFile(breakpoint_results_file),
        ),
    )

    somatic_breakpoints_file = os.path.join(args.out_dir, 'somatic_breakpoints.tsv')

    workflow.transform(
        name='extract_somatic_breakpoint',
        ctx={'mem': 4},
        func=tasks.extract_somatic_breakpoint,
        args=(
            pypeliner.managed.InputFile(breakpoint_results_file),
            pypeliner.managed.OutputFile(somatic_breakpoints_file),
            config,
        ),
    )

    copy_number_raw_data_dir = os.path.join(args.out_dir, 'copy_number', 'raw')
    copy_number_results_file = os.path.join(args.out_dir, 'copy_number', 'results.h5')

    workflow.subworkflow(
        name='copy_number_call_and_annotate',
        func=biowrappers.pipelines.copy_number.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            copy_number_raw_data_dir,
            pypeliner.managed.OutputFile(copy_number_results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': pypeliner.managed.InputFile(somatic_breakpoints_file),
        },
    )

    pyp.run(workflow)
def create_setup_tools_workflow(databases, config):
    workflow = Workflow()

    if 'destruct' in config:
        import destruct.create_ref_data

        workflow.transform(
            name='destruct_create_ref_data',
            ctx={'mem': 16},
            func=destruct.create_ref_data.create_ref_data,
            args=(
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
            ),
        )

    if 'delly' in config:
        workflow.subworkflow(
            name='delly_exclude',
            func=download.create_download_workflow,
            args=(
                config['delly']['exclude_url'],
                pypeliner.managed.OutputFile(config['delly']['exclude_file']),
            ),
        )

    if 'remixt' in config:
        workflow.subworkflow(
            name='create_setup_remixt_workflow',
            func=biowrappers.components.copy_number_calling.remixt.create_setup_remixt_workflow,
            args=(
                config['remixt']['config'],
                databases,
            ),
            kwargs={
                'ref_data_dir': config['remixt']['ref_data_dir'],
            },
        )

    if 'titan' in config:
        workflow.subworkflow(
            name='create_setup_titan_workflow',
            func=biowrappers.components.copy_number_calling.titan.create_setup_titan_workflow,
            args=(
                config['titan']['config'],
                databases,
            ),
        )

    if 'theta' in config:
        workflow.subworkflow(
            name='create_setup_theta_workflow',
            func=biowrappers.components.copy_number_calling.theta.create_setup_theta_workflow,
            args=(
                config['theta']['config'],
                databases,
            ),
        )

    if 'clonehd' in config:
        workflow.subworkflow(
            name='create_setup_clonehd_workflow',
            func=biowrappers.components.copy_number_calling.clonehd.create_setup_clonehd_workflow,
            args=(
                config['clonehd']['config'],
                databases,
            ),
        )

    return workflow
def create_ref_genome_download_and_index_workflow(config, out_file):
    workflow = Workflow()

    if config['url'].endswith('gz'):
        workflow.subworkflow(
            name='download',
            func=download.create_download_workflow,
            args=(
                config['url'],
                pypeliner.managed.TempOutputFile('ref.fasta.gz'),
            ),
        )

        workflow.commandline(
            name='gunzip',
            args=(
                'gzip', '-cd',
                pypeliner.managed.TempInputFile('ref.fasta.gz'),
                '>',
                pypeliner.managed.OutputFile(out_file),
            ),
        )
    else:
        workflow.subworkflow(
            name='download',
            func=download.create_download_workflow,
            args=(
                config['url'],
                pypeliner.managed.OutputFile(out_file),
            ),
        )

    workflow.commandline(
        name='build_dict',
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'samtools', 'dict',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(out_file + '.build_dict.log'),
        ),
    )

    workflow.commandline(
        name='build_fai',
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'samtools', 'faidx',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(out_file + '.build_fai.log'),
        ),
    )

    workflow.commandline(
        name='build_bwa_index',
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'bwa', 'index',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(out_file + '.build_bwa_index.log'),
        ),
    )

    return workflow
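# For reference, the three indexing steps above leave the standard
# samtools/bwa artifacts next to out_file, even though only the captured
# stdout logs are declared as pypeliner outputs:
#   samtools dict  -> sequence dictionary on stdout (lands in the .build_dict.log here)
#   samtools faidx -> out_file + '.fai'
#   bwa index      -> out_file + '.amb', '.ann', '.bwt', '.pac', '.sa'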
def create_setup_reference_dbs_workflow(config):
    workflow = Workflow()

    if 'cosmic' in config:
        workflow.transform(
            name='cosmic',
            func=tasks.download_cosmic,
            args=(
                config['cosmic'],
                pypeliner.managed.OutputFile(config['cosmic']['local_path']),
                pypeliner.managed.TempSpace('cosmic_work', cleanup=None),
            ),
        )

    if 'dbsnp' in config:
        workflow.subworkflow(
            name='dbsnp',
            func=create_dbsnp_download_workflow,
            args=(
                config['dbsnp'],
                pypeliner.managed.OutputFile(config['dbsnp']['local_path']),
            ),
        )

    if 'mappability' in config:
        workflow.subworkflow(
            name='mappability',
            func=download.create_download_workflow,
            args=(
                config['mappability']['url'],
                pypeliner.managed.OutputFile(config['mappability']['local_path']),
            ),
        )

    if 'ref_genome' in config and 'url' in config['ref_genome']:
        workflow.subworkflow(
            name='ref_genome',
            func=create_ref_genome_download_and_index_workflow,
            args=(
                config['ref_genome'],
                pypeliner.managed.OutputFile(config['ref_genome']['local_path']),
            ),
        )

    if 'snpeff' in config:
        workflow.commandline(
            name='snpeff',
            args=(
                'snpEff', 'download', config['snpeff']['db'],
            ),
        )

    if 'chrom_info' in config:
        workflow.subworkflow(
            name='chrom_info',
            func=download.create_download_workflow,
            args=(
                config['chrom_info']['url'],
                pypeliner.managed.OutputFile(config['chrom_info']['local_path']),
            ),
        )

    return workflow
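# Hedged sketch of the reference-database config consumed above; every block
# is optional and all URLs/paths are hypothetical placeholders:
EXAMPLE_REF_DB_CONFIG = {
    'dbsnp': {
        'url': 'http://example.org/dbsnp.vcf.gz',   # hypothetical
        'local_path': '/refdata/dbsnp.vcf.gz',
    },
    'ref_genome': {
        'url': 'http://example.org/ref.fa.gz',      # hypothetical
        'local_path': '/refdata/ref.fa',
    },
    'snpeff': {
        'db': 'GRCh37.75',                          # example snpEff database name
    },
}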
def create_annotation_workflow(
        config,
        in_vcf_file,
        cosmic_status_file,
        dbsnp_status_file,
        mappability_file,
        snpeff_file,
        trinuc_file,
        variant_type='snv',
):
    annotators = ('cosmic_status', 'dbsnp_status', 'mappability', 'snpeff', 'tri_nucleotide_context')

    kwargs = {}
    for a in annotators:
        kwargs[a] = get_kwargs(config[a]['kwargs'], '/{0}/{1}'.format(variant_type, a))

    workflow = Workflow()

    workflow.subworkflow(
        name='cosmic_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['cosmic']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(cosmic_status_file),
        ),
        kwargs=config['cosmic_status']['kwargs'])

    workflow.subworkflow(
        name='dbsnp_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['dbsnp']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(dbsnp_status_file),
        ),
        kwargs=config['dbsnp_status']['kwargs'])

    workflow.subworkflow(
        name='mappability',
        func='biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['mappability']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file, extensions=['.tbi']),
            pypeliner.managed.OutputFile(mappability_file),
        ),
        kwargs=config['mappability']['kwargs'])

    workflow.subworkflow(
        name='snpeff',
        func='biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(snpeff_file),
        ),
        kwargs=kwargs['snpeff'])

    workflow.subworkflow(
        name='tri_nucleotide_context',
        func='biowrappers.components.variant_calling.tri_nucleotide_context.create_vcf_tric_nucleotide_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['ref_genome']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(trinuc_file),
        ),
        kwargs=config['tri_nucleotide_context']['kwargs'])

    return workflow
def destruct_pipeline(
        normal_bam_file,
        tumour_bam_files,
        config,
        ref_data_dir,
        out_file,
        raw_data_dir,
        normal_sample_id='normal',
):
    # Copy so the caller's dict is not mutated when the normal sample is added.
    bam_files = dict(tumour_bam_files)
    bam_files[normal_sample_id] = normal_bam_file

    utils.make_directory(os.path.join(raw_data_dir, 'raw'))
    breakpoint_file = os.path.join(raw_data_dir, 'raw', 'breakpoint.tsv')
    breakpoint_library_file = os.path.join(raw_data_dir, 'raw', 'breakpoint_library.tsv')
    breakpoint_read_file = os.path.join(raw_data_dir, 'raw', 'breakpoint_read.tsv')

    utils.make_directory(os.path.join(raw_data_dir, 'somatic'))
    somatic_breakpoint_file = os.path.join(raw_data_dir, 'somatic', 'breakpoint.tsv')
    somatic_breakpoint_library_file = os.path.join(raw_data_dir, 'somatic', 'breakpoint_library.tsv')

    raw_read_data_dir = os.path.join(raw_data_dir, 'read_data')
    utils.make_directory(raw_read_data_dir)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=bam_files.keys(),
    )

    workflow.subworkflow(
        name='run_destruct',
        func='destruct.workflow.create_destruct_workflow',
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile(breakpoint_file),
            pypeliner.managed.OutputFile(breakpoint_library_file),
            pypeliner.managed.OutputFile(breakpoint_read_file),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_read_data_dir,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func='biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints',
        args=(
            pypeliner.managed.InputFile(breakpoint_file),
            pypeliner.managed.InputFile(breakpoint_library_file),
            [normal_sample_id],
            pypeliner.managed.OutputFile(somatic_breakpoint_file),
            pypeliner.managed.OutputFile(somatic_breakpoint_library_file),
        ),
    )

    workflow.transform(
        name='write_store',
        func='biowrappers.components.breakpoint_calling.destruct.tasks.write_store',
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile(somatic_breakpoint_library_file),
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
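# Hedged usage sketch: tumour bams are keyed by sample ID and the normal is
# added under normal_sample_id, so destruct calls all samples jointly and
# breakpoints seen in the normal are then filtered as germline. Paths below
# are hypothetical.
def _example_run_destruct(config, ref_data_dir):
    return destruct_pipeline(
        '/bams/normal.bam',                   # hypothetical
        {'tumour_a': '/bams/tumour_a.bam'},   # hypothetical
        config,
        ref_data_dir,
        '/out/destruct/results.h5',
        '/out/destruct/raw',
    )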
def create_annotation_workflow(
        config,
        in_vcf_file,
        out_file,
        raw_data_dir,
        variant_type='snv',
        docker_config=None,
        snpeff_docker=None,
        vcftools_docker=None,
):
    # Avoid mutable default arguments.
    docker_config = docker_config or {}
    snpeff_docker = snpeff_docker or {}
    vcftools_docker = vcftools_docker or {}

    annotators = ('cosmic_status', 'dbsnp_status', 'mappability', 'snpeff', 'tri_nucleotide_context')

    result_files = {}
    kwargs = {}
    for a in annotators:
        kwargs[a] = get_kwargs(config[a]['kwargs'], '/{0}/{1}'.format(variant_type, a))
        result_files[a] = pypeliner.managed.File(os.path.join(raw_data_dir, '{0}.csv.gz'.format(a)))

    if not os.path.isdir(raw_data_dir):
        os.mkdir(raw_data_dir)
    assert os.path.isdir(raw_data_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='cosmic_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['cosmic']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['cosmic_status'].as_output(),
        ),
        kwargs=config['cosmic_status']['kwargs'])

    workflow.subworkflow(
        name='dbsnp_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['dbsnp']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['dbsnp_status'].as_output(),
        ),
        kwargs=config['dbsnp_status']['kwargs'])

    workflow.subworkflow(
        name='mappability',
        func='biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['mappability']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file, extensions=['.tbi']),
            result_files['mappability'].as_output(),
        ),
        kwargs=config['mappability']['kwargs'])

    workflow.subworkflow(
        name='snpeff',
        func='biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['snpeff'].as_output(),
        ),
        kwargs=dict(snpeff_docker=snpeff_docker, **kwargs['snpeff']))

    workflow.subworkflow(
        name='tri_nucleotide_context',
        func='biowrappers.components.variant_calling.tri_nucleotide_context.create_vcf_tric_nucleotide_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['ref_genome']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['tri_nucleotide_context'].as_output(),
        ),
        kwargs=config['tri_nucleotide_context']['kwargs'])

    workflow.transform(
        name='build_results_file',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            [x.as_input() for x in result_files.values()],
            pypeliner.managed.OutputFile(out_file, extensions=['.yaml']),
        ))

    return workflow