def create_battenberg_workflow(
        seqdata_files,
        config,
        out_file,
        raw_data_dir,
        somatic_breakpoint_file=None,
        normal_id=None,
        **kwargs):
    """Build a workflow running Battenberg per tumour sample and merging results.

    :param seqdata_files: dict mapping sample id -> seqdata file path; must
        include ``normal_id``
    :param config: configuration passed through to the per-sample subworkflow
    :param out_file: merged HDF5 results file
    :param raw_data_dir: directory receiving per-sample result files
    :param somatic_breakpoint_file: optional breakpoints file forwarded to
        each per-sample run
    :param normal_id: id of the normal sample in ``seqdata_files`` (required)
    :raises ValueError: if ``normal_id`` is not given
    """
    if normal_id is None:
        # BUG FIX: message previously said 'cloneHD requires normal sample',
        # a copy-paste from another workflow builder.
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        # list() so the chunk value is a plain, picklable sequence
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            # each sample's table is stored under /sample_<sample_id>
            'table_names': '/sample_{}',
        },
    )

    return workflow
def create_samtools_germline_workflow(
        normal_bam_files,
        normal_bai_files,
        ref_genome_fasta_file,
        vcf_file,
        config,
        chromosomes=default_chromosomes,
        base_docker=None,
        samtools_docker=None,
        vcftools_docker=None):
    """Call germline variants per region with samtools, then concatenate
    the per-region VCFs into one indexed VCF.

    :param normal_bam_files: dict mapping region -> normal bam path
    :param normal_bai_files: dict mapping region -> bam index path
    :param ref_genome_fasta_file: reference genome fasta path
    :param vcf_file: output VCF path (a .tbi index is produced alongside)
    :param config: pipeline config supplying memory and pool settings
    :param chromosomes: accepted for interface compatibility; not referenced
        in this function body
    :param base_docker: optional dict merged into the job context
    :param samtools_docker: docker config for the samtools step
    :param vcftools_docker: docker config for the vcftools steps
    """
    ctx = {
        'mem': config["memory"]['low'],
        'pool_id': config['pools']['standard'],
        'mem_retry_increment': 2,
        'ncpus': 1,
    }
    if base_docker:
        ctx.update(base_docker)

    regions = list(normal_bam_files.keys())

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        ctx=ctx,
        axes=('regions',),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam', 'regions', fnames=normal_bam_files),
            pypeliner.managed.InputFile('normal.split.bam.bai', 'regions', fnames=normal_bai_files),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            # BUG FIX: previously passed samtools_docker here, so the task's
            # vcftools steps ran in the wrong container image.
            'vcftools_docker': vcftools_docker,
        },
    )

    workflow.transform(
        name='concatenate_variants',
        ctx=ctx,
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker},
    )

    return workflow
def create_ascat_workflow(seqdata_files, config, out_file, raw_data_dir,
                          somatic_breakpoint_file=None, normal_id=None, **kwargs):
    """Build a workflow preparing normal and tumour data for ASCAT.

    :param seqdata_files: dict mapping sample id -> seqdata file; must include
        ``normal_id``
    :param config: configuration passed through to the prepare tasks
    :param out_file: intended merged output path (not consumed in this body —
        NOTE(review): no merge step is wired up here; confirm downstream use)
    :param raw_data_dir: directory for per-sample results
    :param somatic_breakpoint_file: accepted for interface compatibility;
        not referenced in this function body
    :param normal_id: id of the normal sample (required)
    :raises ValueError: if ``normal_id`` is not given
    """
    if normal_id is None:
        raise ValueError('ASCAT requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            # BUG FIX: the per-sample tumour outputs were named
            # 'Germline_LogR.txt'/'Germline_BAF.txt', colliding with the
            # axis-less normal-data temp names above (copy-paste from
            # prepare_normal_data). Use distinct tumour names.
            pypeliner.managed.TempOutputFile('Tumour_LogR.txt', 'sample_id'),
            pypeliner.managed.TempOutputFile('Tumour_BAF.txt', 'sample_id'),
            config,
        ),
    )

    return workflow
def create_samtools_germline_workflow(normal_bam_files,
                                      ref_genome_fasta_file,
                                      vcf_file,
                                      config,
                                      samtools_docker=None,
                                      vcftools_docker=None):
    """Call germline variants per region with samtools and concatenate the
    per-region VCFs into one indexed VCF.

    :param normal_bam_files: dict mapping region -> normal bam path (the
        matching .bai is picked up via the 'extensions' mechanism)
    :param ref_genome_fasta_file: reference genome fasta path
    :param vcf_file: output VCF path (a .tbi index is produced alongside)
    :param config: pipeline config supplying memory and docker settings
    :param samtools_docker: docker config for the samtools step
    :param vcftools_docker: docker config for the vcftools steps
    """
    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem': config["memory"]['low'],
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': baseimage,
    }

    regions = list(normal_bam_files.keys())

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        axes=('regions', ),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam', 'regions',
                                        fnames=normal_bam_files,
                                        extensions=['.bai']),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            # BUG FIX: previously passed samtools_docker here, so the task's
            # vcftools steps ran in the wrong container image.
            'vcftools_docker': vcftools_docker,
        },
    )

    workflow.transform(
        name='concatenate_variants',
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker},
    )

    return workflow
def create_download_workflow(url, file_name):
    """Build a two-step workflow: record *url* as a managed object, then
    download it to *file_name* on the local host."""
    mgd = pypeliner.managed

    workflow = Workflow()

    # Stash the URL as a temp object so the download step re-runs if it changes.
    workflow.setobj(
        obj=mgd.TempOutputObj('url'),
        value=url,
    )

    workflow.transform(
        name='download',
        ctx={'local': True},
        func=tasks.download_from_url,
        args=(
            mgd.TempInputObj('url'),
            mgd.OutputFile(file_name),
        ),
    )

    return workflow
def call_and_annotate_pipeline(config, normal_bam_path, tumour_bam_paths,
                               raw_data_dir, results_file,
                               chromosomes=default_chromosomes):
    """Run the configured variant callers, annotate the calls, count SNV
    alleles, and concatenate everything into one HDF5 results file.

    Callers (nuseq / nuseq_multi_sample / mutect / strelka) run only if a
    matching key is present in ``config``.

    :param config: pipeline configuration; supplies databases and per-caller kwargs
    :param normal_bam_path: path to the normal bam
    :param tumour_bam_paths: dict mapping tumour sample id -> bam path
    :param raw_data_dir: directory for intermediate per-caller outputs
    :param results_file: final concatenated HDF5 output
    :param chromosomes: chromosomes to operate on
    """
    workflow = Workflow()

    workflow.setobj(
        pypeliner.managed.OutputChunks('tumour_sample_id', axes_origin=[0, ]),
        tumour_bam_paths.keys())

    variant_files = get_variant_files(chromosomes, config, raw_data_dir)

    normal_bam_file = pypeliner.managed.File(normal_bam_path)
    tumour_bam_files = pypeliner.managed.File('tumour_bams', 'tumour_sample_id',
                                              fnames=tumour_bam_paths)
    ref_genome_fasta_file = pypeliner.managed.File(
        config['databases']['ref_genome']['local_path'])

    #===================================================================================================================
    # Multi sample calling
    #===================================================================================================================
    if 'nuseq_multi_sample' in config:
        workflow.subworkflow(
            name='nuseq_multi_sample',
            axes=(),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(),
                [pypeliner.managed.InputFile(x) for x in tumour_bam_paths.values()],
                ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_output()),
            kwargs=config['nuseq_multi_sample']['kwargs'])

        workflow.transform(
            name='convert_nuseq_multi_sample_vcf_to_hdf5',
            axes=(),
            ctx=default_ctx,
            func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
            args=(
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_input(),
                variant_files['snv']['hdf']['nuseq_multi_sample'].as_output(),
                '/snv/vcf/nuseq_multi_sample/all',
            ),
            kwargs={'score_callback': vcf_score_callbacks['snv']['nuseq']})

    #===================================================================================================================
    # Single sample calling
    #===================================================================================================================
    if 'nuseq' in config:
        workflow.subworkflow(
            name='nuseq',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(normal_bam_file.as_input(),
                  [tumour_bam_files.as_input(), ],
                  ref_genome_fasta_file.as_input(),
                  variant_files['snv']['vcf']['nuseq'].as_output()),
            kwargs=config['nuseq']['kwargs'])

    if 'mutect' in config:
        workflow.subworkflow(
            name='mutect',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.mutect.create_mutect_workflow',
            args=(normal_bam_file.as_input(),
                  tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  config['databases']['cosmic']['local_path'],
                  config['databases']['dbsnp']['local_path'],
                  variant_files['snv']['vcf']['mutect'].as_output()),
            kwargs=config['mutect']['kwargs'])

    if 'strelka' in config:
        workflow.subworkflow(
            name='strelka',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.strelka.create_strelka_workflow',
            args=(normal_bam_file.as_input(),
                  tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  variant_files['indel']['vcf']['strelka'].as_output(),
                  variant_files['snv']['vcf']['strelka'].as_output()),
            kwargs=config['strelka']['kwargs'])

    #===================================================================================================================
    # Convert vcf to hdf5
    #===================================================================================================================
    for var_type in variant_files:
        for prog in variant_files[var_type]['vcf']:
            # multi-sample results are converted separately above
            if prog == 'nuseq_multi_sample':
                continue
            workflow.transform(
                # BUG FIX: name template previously hard-coded 'indel'
                # ('convert_{0}_indel_{1}_to_hdf5'), producing misleading
                # task names such as 'convert_mutect_indel_snv_to_hdf5'.
                name='convert_{0}_{1}_to_hdf5'.format(prog, var_type),
                axes=('tumour_sample_id', ),
                ctx=default_ctx,
                func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
                args=(variant_files[var_type]['vcf'][prog].as_input(),
                      variant_files[var_type]['hdf'][prog].as_output(),
                      pypeliner.managed.Template(
                          '/{var_type}/vcf/{prog}/{{tumour_sample_id}}'.format(
                              prog=prog, var_type=var_type),
                          'tumour_sample_id')),
                kwargs={'score_callback': vcf_score_callbacks[var_type][prog]})

    #===================================================================================================================
    # Indel annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_indels',
        ctx=big_mem_ctx,
        # BUG FIX: func path had a stray '.vcf_tasks' segment
        # ('...io.vcf.tasks.vcf_tasks.merge_vcfs'), inconsistent with the
        # working 'merge_snvs' path below.
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=([x.as_input() for x in variant_files['indel']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.indel.vcf')))

    workflow.transform(
        name='finalise_indels',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.indel.vcf'),
              pypeliner.managed.TempOutputFile('all.indel.vcf.gz')))

    workflow.subworkflow(
        name='annotate_indels',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.indel.vcf.gz'),
            pypeliner.managed.TempOutputFile('indel_annotations.h5'),
            os.path.join(raw_data_dir, 'indel'),
        ),
        kwargs={'variant_type': 'indel'})

    #===================================================================================================================
    # SNV
    #===================================================================================================================
    workflow.transform(
        name='merge_snvs',
        ctx=big_mem_ctx,
        func="biowrappers.components.io.vcf.tasks.merge_vcfs",
        args=([x.as_input() for x in variant_files['snv']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.snv.vcf')))

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.snv.vcf'),
              pypeliner.managed.TempOutputFile('all.snv.vcf.gz')))

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={'variant_type': 'snv'})

    workflow.subworkflow(
        name='normal_snv_counts',
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            normal_bam_file.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        ),
        kwargs=get_kwargs(config['snv_counts']['kwargs'], '/snv/counts/normal'))

    workflow.subworkflow(
        name='tumour_snv_counts',
        axes=('tumour_sample_id', ),
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(tumour_bam_files.as_input(),
              pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'snv', 'counts',
                               '{tumour_sample_id}.h5'),
                  'tumour_sample_id')),
        kwargs=get_kwargs(
            config['snv_counts']['kwargs'],
            pypeliner.managed.Template('/snv/counts/{tumour_sample_id}',
                                       'tumour_sample_id')))

    #===================================================================================================================
    # Create final output
    #===================================================================================================================
    tables = [
        pypeliner.managed.TempInputFile('indel_annotations.h5'),
        pypeliner.managed.TempInputFile('snv_annotations.h5'),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', '{tumour_sample_id}.h5'),
            'tumour_sample_id'),
    ]

    for var_type in variant_files:
        for prog in variant_files[var_type]['hdf']:
            tables.append(variant_files[var_type]['hdf'][prog].as_input())

    workflow.transform(
        name='build_results_file',
        ctx=default_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(tables, pypeliner.managed.OutputFile(results_file)),
        kwargs={
            'drop_duplicates': True,
        })

    return workflow
def create_theta_workflow(seqdata_files, config, out_file, raw_data_dir,
                          somatic_breakpoint_file=None, normal_id=None, **kwargs):
    """Build a workflow running BICseq2 segmentation and THetA per tumour
    sample, then merging the per-sample HDF5 results into *out_file*.

    Requires ``normal_id`` to identify the normal sample in *seqdata_files*;
    raises ValueError otherwise. ``kwargs['num_clones']`` is forwarded to
    THetA when present.
    """
    mgd = pypeliner.managed

    if normal_id is None:
        raise ValueError('Theta requires normal sample')

    normal_file = seqdata_files[normal_id]
    tumour_files = seqdata_files.copy()
    del tumour_files[normal_id]

    results_template = os.path.join(
        raw_data_dir, 'results', 'sample_{sample_id}.h5')
    seg_template = os.path.join(
        raw_data_dir, 'bicseq2', 'bicseq2_{sample_id}.seg')
    utils.make_parent_directory(results_template)
    utils.make_parent_directory(seg_template)

    workflow = Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=tumour_files.keys(),
    )

    # Wrap the breakpoints path as a managed input when supplied.
    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = mgd.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='run_bicseq2',
        axes=('sample_id', ),
        ctx={'mem': 30},
        func=tasks.run_bicseq2_seg,
        args=(
            mgd.OutputFile('bicseq2_seg', 'sample_id', template=seg_template),
            mgd.InputFile('normal_seqdata', template=normal_file),
            mgd.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_files),
            config,
            mgd.TempSpace('bicseq2_work', 'sample_id', cleanup=None),
        ),
    )

    workflow.transform(
        name='run_theta',
        axes=('sample_id', ),
        ctx={'mem': 32},
        func=tasks.run_theta,
        args=(
            mgd.OutputFile('results', 'sample_id', template=results_template),
            mgd.InputFile('normal_seqdata', template=normal_file),
            mgd.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_files),
            mgd.InputFile('bicseq2_seg', 'sample_id', template=seg_template),
            config,
            mgd.TempSpace('theta_work', 'sample_id', cleanup=None),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
            'num_clones': kwargs.get('num_clones', None),
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            mgd.InputFile('results', 'sample_id', template=results_template),
            mgd.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
def create_strelka_workflow(normal_bam_file, tumour_bam_file,
                            ref_genome_fasta_file, indel_vcf_file,
                            snv_vcf_file, config,
                            chromosomes=default_chromosomes,
                            split_size=int(1e7), use_depth_thresholds=True):
    """Build the Strelka somatic calling workflow (docker variant).

    Calls somatic variants per region, filters per chromosome, then merges,
    PASS-filters and finalises indels and SNVs into the two output VCFs.
    ``split_size`` is accepted for interface compatibility; it is not
    referenced in this function body.
    """
    mgd = pypeliner.managed

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3,
        'docker_image': config['docker']['single_cell_pipeline'],
    }
    strelka_docker = {'docker_image': config['docker']['strelka']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    regions = list(normal_bam_file.keys())
    # normal and tumour must be split over identical regions
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(obj=mgd.OutputChunks('chrom'), value=chromosomes)
    workflow.setobj(obj=mgd.OutputChunks('region'), value=regions)

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(
            ref_genome_fasta_file,
            mgd.TempOutputFile('ref_base_counts.tsv'),
            strelka_docker,
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=mgd.TempOutputObj('known_sizes'),
        args=(
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
    )

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region', ),
        args=(
            mgd.InputFile("normal.split.bam", "region",
                          fnames=normal_bam_file, extensions=['.bai']),
            mgd.InputFile("merged_bam", "region",
                          fnames=tumour_bam_file, extensions=['.bai']),
            mgd.TempInputObj('known_sizes'),
            ref_genome_fasta_file,
            mgd.TempOutputFile('somatic.indels.unfiltered.vcf', 'region'),
            mgd.TempOutputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            mgd.TempOutputFile('somatic.snvs.unfiltered.vcf', 'region'),
            mgd.TempOutputFile('strelka.stats', 'region'),
            mgd.InputInstance("region"),
            strelka_docker,
        ),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(
            mgd.TempInputFile('somatic.indels.unfiltered.vcf', 'region'),
            mgd.TempInputFile('strelka.stats', 'region'),
            mgd.TempInputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            mgd.TempOutputFile('somatic.indels.filtered.vcf', 'chrom'),
            mgd.InputInstance("chrom"),
            mgd.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds},
    )

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            mgd.TempInputFile('somatic.snvs.unfiltered.vcf', 'region'),
            mgd.TempInputFile('strelka.stats', 'region'),
            mgd.TempOutputFile('somatic.snvs.filtered.vcf', 'chrom'),
            mgd.InputInstance("chrom"),
            mgd.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds},
    )

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            mgd.TempInputFile('somatic.indels.filtered.vcf', 'chrom'),
            mgd.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            mgd.TempSpace("merge_indels_temp"),
            vcftools_docker,
        ),
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            mgd.TempInputFile('somatic.snvs.filtered.vcf', 'chrom'),
            mgd.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
            mgd.TempSpace("merge_snvs_temp"),
            vcftools_docker,
        ),
    )

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            mgd.TempInputFile('somatic.indels.filtered.vcf.gz'),
            mgd.TempOutputFile('somatic.indels.passed.vcf'),
        ),
    )

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            mgd.TempInputFile('somatic.snvs.filtered.vcf.gz'),
            mgd.TempOutputFile('somatic.snvs.passed.vcf'),
        ),
    )

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            mgd.TempInputFile('somatic.indels.passed.vcf'),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            vcftools_docker,
        ),
    )

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            mgd.TempInputFile('somatic.snvs.passed.vcf'),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            vcftools_docker,
        ),
    )

    return workflow
def call_and_annotate_pipeline(
        config,
        normal_bam_path,
        tumour_bam_paths,
        raw_data_dir,
        results_file,
):
    """Run the configured breakpoint callers (destruct / delly / lumpysv)
    and merge their per-caller HDF5 results into *results_file*.

    Each caller runs only if its key is present in ``config``; its results
    land under ``raw_data_dir/<caller>/results.h5`` and are merged under
    ``/breakpoints/<caller>`` in the output.
    """
    mgd = pypeliner.managed

    workflow = Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_sample_id'),
        value=tumour_bam_paths.keys(),
    )

    # maps output table path -> per-caller results file to merge
    merge_inputs = {}

    if 'destruct' in config:
        raw_dir = os.path.join(raw_data_dir, 'destruct')
        results_filename = os.path.join(raw_dir, 'results.h5')
        make_parent_directory(results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                mgd.InputFile(normal_bam_path),
                mgd.InputFile('tumour_bams', 'tumour_sample_id',
                              fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                mgd.OutputFile(results_filename),
                raw_dir,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = mgd.InputFile(results_filename)

    if 'delly' in config:
        raw_dir = os.path.join(raw_data_dir, 'delly')
        results_filename = os.path.join(raw_dir, 'results.h5')
        make_parent_directory(results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                mgd.InputFile(normal_bam_path),
                mgd.InputFile('tumour_bams', 'tumour_sample_id',
                              fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                mgd.OutputFile(results_filename),
                raw_dir,
            ),
        )

        merge_inputs['/breakpoints/delly'] = mgd.InputFile(results_filename)

    if 'lumpysv' in config:
        raw_dir = os.path.join(raw_data_dir, 'lumpysv')
        results_filename = os.path.join(raw_dir, 'results.h5')
        make_parent_directory(results_filename)

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                mgd.InputFile(normal_bam_path),
                mgd.InputFile('tumour_bams', 'tumour_sample_id',
                              fnames=tumour_bam_paths),
                mgd.OutputFile(results_filename),
                raw_dir,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = mgd.InputFile(results_filename)

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            mgd.OutputFile(results_file),
        ),
    )

    return workflow
def realignment_pipeline(
        config,
        in_file,
        out_file,
        read_group_info=None):
    """Realign a bam: split to fastq, align read pairs with bwa aln/sampe,
    sort the per-split bams and merge them into *out_file*.

    :param config: pipeline config; supplies ref_genome paths, split_size and
        optionally a 'read_group' dict
    :param in_file: input bam to realign
    :param out_file: merged, realigned output bam
    :param read_group_info: optional read-group dict; defaults to
        ``config['read_group']`` (or empty). An 'ID' is generated when absent.
    """
    import hashlib

    if read_group_info is None:
        read_group_info = config.get('read_group', {})
    # BUG FIX: copy before mutating — previously the caller's dict (often
    # config['read_group'] itself) was modified in place.
    read_group_info = dict(read_group_info)

    if 'ID' not in read_group_info:
        # BUG FIX: the ID was derived from hash(in_file), which is randomised
        # per Python process (PYTHONHASHSEED), so re-runs produced different
        # read-group IDs. Use a deterministic digest of the path instead.
        digest = hashlib.md5(str(in_file).encode('utf-8')).hexdigest()
        read_group_info['ID'] = int(digest, 16) % int(1e6)

    ref_genome = pypeliner.managed.InputFile(config['ref_genome']['file'])

    read_1 = pypeliner.managed.TempFile('read_1', 'split')
    read_2 = pypeliner.managed.TempFile('read_2', 'split')
    read_1_sai = pypeliner.managed.TempFile('read_1.sai', 'split')
    read_2_sai = pypeliner.managed.TempFile('read_2.sai', 'split')

    read_group_config = pypeliner.managed.TempObj('read_group_config')

    workflow = Workflow()

    if 'read_group' in config:
        workflow.setobj(
            obj=read_group_config.as_output(),
            value=read_group_info,
        )
    else:
        # No configured read group: derive one from the input bam itself.
        workflow.transform(
            name='get_read_group_config',
            ctx={'local': True},
            func=tasks.get_read_group_config,
            ret=read_group_config.as_output(),
            args=(
                pypeliner.managed.InputFile(in_file),
            )
        )

    workflow.transform(
        name='bam_to_fasta',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.convert_to_fastqs,
        args=(
            pypeliner.managed.InputFile(in_file),
            {
                1: read_1.as_output(),
                2: read_2.as_output(),
            },
            pypeliner.managed.TempSpace('bam_to_fastq'),
        ),
        kwargs={
            'split_size': config['split_size']
        },
    )

    workflow.transform(
        name='aln_read_1',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_1.as_input(),
            ref_genome,
            read_1_sai.as_output(),
        ),
    )

    workflow.transform(
        name='aln_read_2',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_2.as_input(),
            ref_genome,
            read_2_sai.as_output(),
        ),
    )

    workflow.transform(
        name='sampe',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_sampe,
        args=(
            read_1.as_input(),
            read_2.as_input(),
            read_1_sai.as_input(),
            read_2_sai.as_input(),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
        ),
        kwargs={
            'read_group_info': read_group_config.as_input()
        },
    )

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.sort,
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
        ),
    )

    workflow.transform(
        name='write_header_file',
        axes=(),
        ctx={'local': True},
        func=tasks.write_header_file,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.TempOutputFile('header.sam'),
            config['ref_genome']['header']
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.merge,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'header_file': pypeliner.managed.TempInputFile('header.sam'),
        },
    )

    return workflow
def create_strelka_workflow(normal_bam_file, tumour_bam_file,
                            ref_genome_fasta_file, indel_vcf_file,
                            snv_vcf_file, snv_csv_file,
                            chromosomes=default_chromosomes,
                            use_depth_thresholds=True):
    """Build the Strelka somatic calling workflow (non-docker variant).

    Calls somatic variants per region, filters per chromosome, merges and
    PASS-filters indels and SNVs, finalises the two VCFs, and additionally
    converts the SNV VCF to a typed CSV output.
    """
    mgd = pypeliner.managed

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3,
    }

    regions = list(normal_bam_file.keys())
    # normal and tumour must be split over identical regions
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(obj=mgd.OutputChunks('chrom'), value=chromosomes)
    workflow.setobj(obj=mgd.OutputChunks('region'), value=regions)

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(
            ref_genome_fasta_file,
            mgd.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=mgd.TempOutputObj('known_sizes'),
        args=(
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
    )

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region', ),
        args=(
            mgd.InputFile("normal.split.bam", "region",
                          fnames=normal_bam_file, extensions=['.bai']),
            mgd.InputFile("merged_bam", "region",
                          fnames=tumour_bam_file, extensions=['.bai']),
            mgd.TempInputObj('known_sizes'),
            ref_genome_fasta_file,
            mgd.TempOutputFile('somatic.indels.unfiltered.vcf', 'region'),
            mgd.TempOutputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            mgd.TempOutputFile('somatic.snvs.unfiltered.vcf', 'region'),
            mgd.TempOutputFile('strelka.stats', 'region'),
            mgd.InputInstance("region"),
        ),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(
            mgd.TempInputFile('somatic.indels.unfiltered.vcf', 'region'),
            mgd.TempInputFile('strelka.stats', 'region'),
            mgd.TempInputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            mgd.TempOutputFile('somatic.indels.filtered.vcf', 'chrom'),
            mgd.InputInstance("chrom"),
            mgd.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds},
    )

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            mgd.TempInputFile('somatic.snvs.unfiltered.vcf', 'region'),
            mgd.TempInputFile('strelka.stats', 'region'),
            mgd.TempOutputFile('somatic.snvs.filtered.vcf', 'chrom'),
            mgd.InputInstance("chrom"),
            mgd.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds},
    )

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            mgd.TempInputFile('somatic.indels.filtered.vcf', 'chrom'),
            mgd.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            mgd.TempSpace("merge_indels_temp"),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            mgd.TempInputFile('somatic.snvs.filtered.vcf', 'chrom'),
            mgd.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
            mgd.TempSpace("merge_snvs_temp"),
        ),
    )

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            mgd.TempInputFile('somatic.indels.filtered.vcf.gz'),
            mgd.TempOutputFile('somatic.indels.passed.vcf'),
        ),
    )

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            mgd.TempInputFile('somatic.snvs.filtered.vcf.gz'),
            mgd.TempOutputFile('somatic.snvs.passed.vcf'),
        ),
    )

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            mgd.TempInputFile('somatic.indels.passed.vcf'),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            mgd.TempInputFile('somatic.snvs.passed.vcf'),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(snv_vcf_file),
            mgd.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        },
    )

    workflow.transform(
        name='prep_strelka_csv',
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('strelka_snv.csv'),
            mgd.OutputFile(snv_csv_file, extensions=['.yaml']),
        ),
        kwargs={'dtypes': dtypes()['snv_strelka']},
    )

    return workflow
def create_mutect_workflow(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        cosmic_vcf_file,
        dbsnp_vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7)):
    """Run MuTect over genome regions, then merge, PASS-filter and finalise
    the calls into a single output VCF.

    Regions are derived from the tumour bam using *split_size* and
    *chromosomes*.
    """
    mgd = pypeliner.managed

    workflow = Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('regions', 'regions'),
        value=utils.get_bam_regions(
            tumour_bam_file, split_size, chromosomes=chromosomes),
    )

    workflow.transform(
        name='run_classify',
        axes=('regions',),
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 2, 'io': 1},
        func=tasks.run_mutect,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.InputFile(cosmic_vcf_file),
            mgd.InputFile(dbsnp_vcf_file),
            mgd.TempInputObj('regions', 'regions'),
            mgd.TempOutputFile('classified.vcf', 'regions'),
        ),
    )

    workflow.transform(
        name='merge_vcf',
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 8},
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('classified.vcf', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'bcf_index_file': mgd.TempOutputFile('merged.vcf.gz.csi'),
            'vcf_index_file': mgd.TempOutputFile('merged.vcf.gz.tbi'),
        },
    )

    workflow.transform(
        name='filter_snvs',
        ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2},
        func=vcf_tasks.filter_vcf,
        args=(
            mgd.TempInputFile('merged.vcf.gz'),
            mgd.TempOutputFile('merged.filtered.vcf'),
        ),
    )

    workflow.transform(
        name='finalise',
        func=vcf_tasks.finalise_vcf,
        args=(
            mgd.TempInputFile('merged.filtered.vcf'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_titan_workflow(seqdata_files, config, out_file, raw_data_dir,
                          somatic_breakpoint_file=None, normal_id=None, **kwargs):
    """Build a workflow that runs TITAN copy number analysis per tumour sample.

    For each tumour sample, normal and tumour read data are converted to
    wig/allele inputs, TITAN is run for a set of initialization parameters,
    the best solution is selected, per-chromosome plots are produced, and
    per-sample results are merged into a single HDF5 ``out_file``.

    :param seqdata_files: mapping of sample_id -> seqdata file, must include
        ``normal_id``
    :param config: pipeline configuration dict; ``chromosomes`` key selects
        chromosomes to plot (falls back to ``default_chromosomes``)
    :param out_file: merged HDF5 results file
    :param raw_data_dir: directory receiving per-sample results and plots
    :param somatic_breakpoint_file: optional breakpoints passed to solution
        selection
    :param normal_id: key of the normal sample in ``seqdata_files``; required
    :raises ValueError: if ``normal_id`` is None
    :return: a pypeliner Workflow object
    """
    if normal_id is None:
        raise ValueError('Titan requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    # Everything except the normal is treated as a tumour sample.
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    # Template path; one results file per sample_id.
    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    # Normal-derived inputs (depth wig, heterozygous positions) are shared
    # across all tumour samples, so this runs once with no axes.
    workflow.transform(
        name='prepare_normal_data',
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 4},
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.wig'),
            pypeliner.managed.TempOutputFile('het_positions.tsv'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            pypeliner.managed.TempInputFile('het_positions.tsv'),
            pypeliner.managed.TempOutputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv', 'sample_id'),
            config,
        ),
    )

    # Fans out an 'init_param_id' axis: one chunk per initialization
    # parameter set generated from config.
    # NOTE(review): 'intialization' is misspelled but must match the task
    # function name; do not "fix" one without the other.
    workflow.transform(
        name='create_intialization_parameters',
        axes=('sample_id', ),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=tasks.create_intialization_parameters,
        ret=pypeliner.managed.TempOutputObj('init_params', 'sample_id', 'init_param_id'),
        args=(config, ),
    )

    # TITAN itself runs per (sample, initialization) pair.
    workflow.transform(
        name='run_titan',
        axes=('sample_id', 'init_param_id'),
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 4},
        func=tasks.run_titan,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempInputFile('normal.wig'),
            pypeliner.managed.TempInputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempInputFile('tumour_alleles.tsv', 'sample_id'),
            pypeliner.managed.TempOutputFile('cn.tsv', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempOutputFile('params.tsv', 'sample_id', 'init_param_id'),
            config,
        ),
    )

    # Rebind the plain path to a managed input so it can be passed as a
    # kwarg below; stays None when no breakpoint file was supplied.
    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    # Collapse the init_param_id axis: pick the best TITAN solution per
    # sample and write the per-sample outputs.
    workflow.transform(
        name='select_solution',
        axes=('sample_id', ),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=tasks.select_solution,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempInputFile('cn.tsv', 'sample_id', 'init_param_id'),
            pypeliner.managed.TempInputFile('params.tsv', 'sample_id', 'init_param_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_segments.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_igv.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'), 'sample_id'),
            config,
            pypeliner.managed.Template('{sample_id}', 'sample_id'),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
        },
    )

    # Fan out a chromosome axis under each sample for plotting.
    workflow.setobj(obj=pypeliner.managed.OutputChunks('sample_id', 'chromosome'),
                    value=config.get('chromosomes', default_chromosomes),
                    axes=('sample_id', ))

    # External R script: one plot per (sample, chromosome).
    workflow.commandline(
        name='plot_chromosome',
        axes=('sample_id', 'chromosome'),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'plot_titan_chromosome.R',
            pypeliner.managed.Instance('chromosome'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_chr_{chromosome}.png'),
                'sample_id', 'chromosome'),
        ),
    )

    # Merge per-sample HDF5 results into the single output store, one table
    # per sample under '/sample_{}'.
    workflow.transform(
        name='merge_results',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 2},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
def main(args):
    """Run the end-to-end tumour/normal pipeline.

    Stage 1 downloads, realigns and mark-duplicates each sequencing lane and
    merges lanes into one bam per sample. Stage 2 calls and annotates
    breakpoints, extracts somatic breakpoints, then calls copy number using
    those breakpoints.

    :param args: parsed argparse namespace; must provide ``out_dir``,
        ``config_file`` and ``ref_db_dir`` plus pypeliner app arguments
    """
    biowrappers.components.utils.make_directory(args.out_dir)

    with open(args.config_file) as config_file:
        config_text = config_file.read()
    # The config file is a template with {out_dir} / {ref_db_dir} holes.
    config_text = config_text.format(out_dir=args.out_dir, ref_db_dir=args.ref_db_dir)

    # safe_load: the config is plain data, and yaml.load without an explicit
    # Loader is removed in PyYAML 6 (and unsafe on untrusted input).
    config = yaml.safe_load(config_text)

    # Copy so that adding 'tmpdir' does not mutate the caller's args
    # namespace (vars() returns the live __dict__).
    pypeliner_args = dict(vars(args))
    pypeliner_args['tmpdir'] = os.path.join(args.out_dir, 'pipeline')

    pyp = pypeliner.app.Pypeline(modules=[tasks], config=pypeliner_args)

    # Flatten config['lanes'][sample][lane]['url'] into (sample, lane) -> url.
    download_urls = {}
    for sample in ('tumour', 'normal'):
        lanes = config['lanes'][sample]
        for lane in lanes:
            download_urls[(sample, lane)] = config['lanes'][sample][lane]['url']

    raw_lane_template = os.path.join(args.out_dir, 'lanes', 'raw', '{lane}.bam')
    realigned_lane_template = os.path.join(args.out_dir, 'lanes', 'realigned', '{lane}.bam')
    sample_bam_template = os.path.join(args.out_dir, '{sample}.bam')

    # --- Stage 1: per-lane download, realignment, merge + markdup ---
    workflow = Workflow(default_ctx={'mem': 8})

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('url', 'sample', 'lane'),
        value=download_urls,
    )

    workflow.subworkflow(
        name='download_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            pypeliner.managed.TempInputObj('url', 'sample', 'lane'),
            pypeliner.managed.OutputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
        )
    )

    workflow.subworkflow(
        name='realign_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.pipelines.realignment.realignment_pipeline,
        args=(
            config['realignment'],
            pypeliner.managed.InputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
            pypeliner.managed.OutputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
        )
    )

    # Merges over the 'lane' axis (input has it, output does not).
    workflow.transform(
        name='merge_and_markdups',
        axes=('sample',),
        func=biowrappers.components.io.bam.tasks.mark_duplicates,
        args=(
            pypeliner.managed.InputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
            pypeliner.managed.OutputFile('bam', 'sample', template=sample_bam_template),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp', 'sample')
        }
    )

    pyp.run(workflow)

    normal_bam_file = sample_bam_template.format(sample='normal')
    tumour_bam_file = sample_bam_template.format(sample='tumour')

    # --- Stage 2: breakpoint calling, then breakpoint-informed copy number ---
    workflow = Workflow(default_ctx={'mem': 8})

    breakpoint_raw_data_dir = os.path.join(args.out_dir, 'breakpoints', 'raw')
    breakpoint_results_file = os.path.join(args.out_dir, 'breakpoints', 'results.h5')

    workflow.subworkflow(
        name='breakpoint_call_and_annotate',
        func=biowrappers.pipelines.breakpoint_call_and_annotate.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            pypeliner.managed.Template(breakpoint_raw_data_dir),
            pypeliner.managed.OutputFile(breakpoint_results_file),
        ),
    )

    somatic_breakpoints_file = os.path.join(args.out_dir, 'somatic_breakpoints.tsv')

    workflow.transform(
        name='extract_somatic_breakpoint',
        ctx={'mem': 4},
        func=tasks.extract_somatic_breakpoint,
        args=(
            pypeliner.managed.InputFile(breakpoint_results_file),
            pypeliner.managed.OutputFile(somatic_breakpoints_file),
            config,
        )
    )

    copy_number_raw_data_dir = os.path.join(args.out_dir, 'copy_number', 'raw')
    # Distinct name: this previously shadowed breakpoint_results_file.
    copy_number_results_file = os.path.join(args.out_dir, 'copy_number', 'results.h5')

    workflow.subworkflow(
        name='copy_number_call_and_annotate',
        func=biowrappers.pipelines.copy_number.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            copy_number_raw_data_dir,
            pypeliner.managed.OutputFile(copy_number_results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': pypeliner.managed.InputFile(somatic_breakpoints_file),
        },
    )

    pyp.run(workflow)
def delly_pipeline(
    normal_bam_file,
    tumour_bam_files,
    ref_genome_fasta_file,
    delly_excl_chrom,
    out_file,
    raw_data_dir,
):
    """Build a workflow that calls somatic structural variants with Delly.

    All bams are symlinked into ``raw_data_dir`` under library-derived names,
    Delly is run once per SV type, calls are filtered to somatic using a
    tumor/control samples table, concatenated and converted into ``out_file``.

    :param normal_bam_file: path to the normal bam (index expected at
        ``<path>.bai``)
    :param tumour_bam_files: mapping of library id -> tumour bam path
    :param ref_genome_fasta_file: reference genome fasta
    :param delly_excl_chrom: chromosomes-to-exclude argument for delly call
    :param out_file: final converted output file
    :param raw_data_dir: directory receiving the bam symlinks
    :return: a pypeliner Workflow object
    """
    # Symlink every bam (and its .bai) into raw_data_dir; delly is run on
    # the symlinked paths collected in `bams`.
    # NOTE(review): a tumour library literally named 'Normal' would collide
    # with the normal sample's link name and sample_type key — confirm
    # upstream naming rules prevent this.
    bams = list()
    for lib_id, bam_filename in tumour_bam_files.items():
        bams += [utils.symlink(bam_filename, link_name='{0}.bam'.format(lib_id), link_directory=raw_data_dir)]
        utils.symlink(bam_filename + '.bai', link_name='{0}.bam.bai'.format(lib_id), link_directory=raw_data_dir)

    bams += [utils.symlink(normal_bam_file, link_name='Normal.bam', link_directory=raw_data_dir)]
    utils.symlink(normal_bam_file + '.bai', link_name='Normal.bam.bai', link_directory=raw_data_dir)

    # Delly's somatic filter distinguishes 'control' vs 'tumor' samples.
    sample_type = {'Normal': 'control'}
    for lib_id in tumour_bam_files.keys():
        sample_type[lib_id] = 'tumor'

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('sample_type', 'sample_id'),
        value=sample_type,
    )

    # One chunk per structural variant type delly supports.
    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sv_type'),
        value=('DEL', 'DUP', 'INV', 'TRA', 'INS'),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type', ),
        ctx={'mem': 64, 'num_retry': 2, 'mem_retry_factor': 2},
        func=tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    # Tab-separated sample_id -> control/tumor table for delly's filter.
    workflow.transform(
        name='write_samples_table',
        ctx={'mem': 1},
        func=tasks.write_samples_table,
        args=(
            mgd.TempInputObj('sample_type', 'sample_id'),
            mgd.TempOutputFile('samples.tsv'),
        ),
    )

    workflow.transform(
        name='delly_filter_somatic',
        axes=('sv_type', ),
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        func=tasks.run_delly_filter,
        args=(
            mgd.Instance('sv_type'),
            mgd.TempInputFile('samples.tsv'),
            ref_genome_fasta_file,
            mgd.TempInputFile('out.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf', 'sv_type'),
        ),
    )

    # Collapse the sv_type axis into one combined bcf.
    workflow.transform(
        name='concatenate_vcf',
        func=vcf_tasks.concatenate_bcf,
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        args=(
            mgd.TempInputFile('somatic.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=tasks.convert_vcf,
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def destruct_pipeline(
    normal_bam_file,
    tumour_bam_files,
    config,
    ref_data_dir,
    out_file,
    raw_data_dir,
    normal_sample_id='normal',
):
    """Build a workflow that calls breakpoints with destruct and filters to somatic.

    Destruct runs jointly over the tumour bams plus the normal, its raw
    breakpoint tables are filtered/annotated against the normal sample, and
    the somatic tables are written to ``out_file``.

    :param normal_bam_file: path to the normal bam
    :param tumour_bam_files: mapping of sample_id -> tumour bam path; not
        modified by this function
    :param config: destruct configuration dict
    :param ref_data_dir: destruct reference data directory
    :param out_file: final results store
    :param raw_data_dir: directory receiving raw/somatic tables and read data
    :param normal_sample_id: key under which the normal bam is added
    :return: a pypeliner Workflow object
    """
    # Copy before inserting the normal sample: assigning into the argument
    # dict directly would mutate the caller's mapping.
    bam_files = dict(tumour_bam_files)
    bam_files[normal_sample_id] = normal_bam_file

    # Raw destruct outputs.
    utils.make_directory(os.path.join(raw_data_dir, 'raw'))
    breakpoint_file = os.path.join(raw_data_dir, 'raw', 'breakpoint.tsv')
    breakpoint_library_file = os.path.join(raw_data_dir, 'raw', 'breakpoint_library.tsv')
    breakpoint_read_file = os.path.join(raw_data_dir, 'raw', 'breakpoint_read.tsv')

    # Somatic-filtered outputs.
    utils.make_directory(os.path.join(raw_data_dir, 'somatic'))
    somatic_breakpoint_file = os.path.join(raw_data_dir, 'somatic', 'breakpoint.tsv')
    somatic_breakpoint_library_file = os.path.join(raw_data_dir, 'somatic', 'breakpoint_library.tsv')

    raw_read_data_dir = os.path.join(raw_data_dir, 'read_data')
    utils.make_directory(raw_read_data_dir)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=bam_files.keys(),
    )

    # Destruct runs jointly over all samples (normal included).
    workflow.subworkflow(
        name='run_destruct',
        func="destruct.workflow.create_destruct_workflow",
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile(breakpoint_file),
            pypeliner.managed.OutputFile(breakpoint_library_file),
            pypeliner.managed.OutputFile(breakpoint_read_file),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_read_data_dir,
        },
    )

    # Remove breakpoints supported in the normal, annotate the rest.
    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func='biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints',
        args=(
            pypeliner.managed.InputFile(breakpoint_file),
            pypeliner.managed.InputFile(breakpoint_library_file),
            [normal_sample_id],
            pypeliner.managed.OutputFile(somatic_breakpoint_file),
            pypeliner.managed.OutputFile(somatic_breakpoint_library_file),
        ),
    )

    workflow.transform(
        name='write_store',
        func='biowrappers.components.breakpoint_calling.destruct.tasks.write_store',
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile(somatic_breakpoint_library_file),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_vardict_paired_sample_workflow(normal_bam_file,
                                          tumour_bam_file,
                                          ref_genome_fasta_file,
                                          out_file,
                                          chromosomes=default_chromosomes,
                                          java=False,
                                          min_allele_frequency=0.01,
                                          remove_duplicate_reads=False,
                                          sample_names=None,
                                          split_size=int(1e7)):
    """Build a workflow that runs VarDict on a tumour/normal pair.

    The genome is split into regions of at most ``split_size`` bases using
    the normal bam; VarDict runs per region, the per-region VCFs are
    compressed and indexed, then concatenated into ``out_file`` (with ``.csi``
    and ``.tbi`` indexes written alongside).

    :param normal_bam_file: path to the normal bam
    :param tumour_bam_file: path to the tumour bam
    :param ref_genome_fasta_file: reference genome fasta
    :param out_file: final concatenated vcf
    :param chromosomes: chromosomes to split into regions
    :param java: whether to use the java VarDict implementation
    :param min_allele_frequency: minimum allele frequency passed to VarDict
    :param remove_duplicate_reads: whether VarDict skips duplicate reads
    :param sample_names: optional sample names passed to VarDict
    :param split_size: maximum region size in bases
    :return: a pypeliner Workflow object
    """
    mgd = pypeliner.managed

    # Shared context for the lightweight vcf manipulation steps.
    light_ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}

    workflow = Workflow()

    # One chunk per genomic region, derived from the normal bam.
    workflow.setobj(obj=mgd.TempOutputObj('config', 'regions'),
                    value=utils.get_bam_regions(normal_bam_file, split_size, chromosomes=chromosomes))

    # Per-region paired-sample VarDict calling.
    workflow.transform(
        name='run_vardict',
        axes=('regions', ),
        ctx={'mem': 12, 'num_retry': 4, 'mem_retry_increment': 2},
        func=tasks.run_paired_sample_vardict,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempOutputFile('result.vcf', 'regions'),
        ),
        kwargs={
            'java': java,
            'min_allele_frequency': min_allele_frequency,
            'remove_duplicate_reads': remove_duplicate_reads,
            'sample_names': sample_names,
        },
    )

    # bgzip + tabix each per-region vcf so they can be concatenated.
    workflow.transform(
        name='compress_tmp',
        axes=('regions', ),
        ctx=light_ctx,
        func=vcf_tasks.compress_vcf,
        args=(
            mgd.TempInputFile('result.vcf', 'regions'),
            mgd.TempOutputFile('result.vcf.gz', 'regions'),
        ),
        kwargs={
            'index_file': mgd.TempOutputFile('result.vcf.gz.tbi', 'regions'),
        })

    # Collapse the regions axis into the final indexed vcf.
    workflow.transform(
        name='concatenate_vcf',
        ctx=light_ctx,
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('result.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
        kwargs={
            'bcf_index_file': mgd.OutputFile(out_file + '.csi'),
            'vcf_index_file': mgd.OutputFile(out_file + '.tbi'),
        },
    )

    return workflow