def realignment_readgroups_pipeline(config, in_file, out_file):
    workflow = Workflow()

    workflow.transform(
        name='get_read_group_configs',
        func=tasks.get_read_group_configs,
        ret=pypeliner.managed.TempOutputObj('read_group_config', 'read_group_id'),
        args=(
            pypeliner.managed.InputFile(in_file),
        )
    )

    workflow.commandline(
        name='create_read_group_bam',
        axes=('read_group_id',),
        args=(
            'samtools', 'view', '-b',
            '-r', pypeliner.managed.InputInstance('read_group_id'),
            pypeliner.managed.InputFile(in_file),
            '>', pypeliner.managed.TempOutputFile('read_group_bam', 'read_group_id'),
        )
    )

    workflow.subworkflow(
        name='realignment_pipeline',
        axes=('read_group_id',),
        func=realignment_pipeline,
        args=(
            config,
            pypeliner.managed.TempInputFile('read_group_bam', 'read_group_id'),
            pypeliner.managed.TempOutputFile('realigned_read_group_bam', 'read_group_id'),
        ),
        kwargs={
            'read_group_info': pypeliner.managed.TempInputObj('read_group_config', 'read_group_id'),
        }
    )

    # The merge consumes the read_group_id axis, so this transform takes no
    # axes of its own; the original listed axes=('read_group_id',), which
    # would map the single output file onto every read group.
    workflow.transform(
        name='merge_and_markdups',
        ctx={'mem': 48, 'num_retry': 3, 'mem_retry_increment': 16},
        func=bam_tasks.mark_duplicates,
        args=(
            pypeliner.managed.TempInputFile('realigned_read_group_bam', 'read_group_id'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp'),
        }
    )

    return workflow
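# A minimal sketch of what `tasks.get_read_group_configs` is assumed to do:
# read the @RG entries from the BAM header and return them keyed by read
# group ID, so the TempOutputObj above gets one chunk per read group. This
# pysam-based version is an illustration, not the pipeline's actual task.
import pysam  # assumption: pysam is available


def get_read_group_configs_sketch(bam_filename):
    read_group_configs = {}
    with pysam.AlignmentFile(bam_filename, 'rb') as bam_file:
        for read_group in bam_file.header.to_dict().get('RG', []):
            # Each entry is a dict such as {'ID': 'rg1', 'SM': 'sample1', ...}
            read_group_configs[read_group['ID']] = dict(read_group)
    return read_group_configs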
def create_setup_theta_workflow(config, databases, **kwargs):
    mappability_dir = os.path.realpath(
        os.path.join(os.path.dirname(config['mappability_template']), os.pardir))
    map_extract_log = os.path.join(mappability_dir, 'mappability_extract.log')
    chromosomes_dir = os.path.dirname(config['chromosome_template'])

    utils.make_directory(mappability_dir)
    utils.make_directory(chromosomes_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.TempOutputFile('mappability.tar.gz'),
        ))

    workflow.commandline(
        name='extract_mappability',
        args=(
            'tar', '-xzvf', pypeliner.managed.TempInputFile('mappability.tar.gz'),
            '-C', mappability_dir,
            '>', pypeliner.managed.OutputFile(map_extract_log),
        ),
    )

    for chromosome in config['chromosomes']:
        workflow.subworkflow(
            name='download_chromosome_{}'.format(chromosome),
            func=biowrappers.components.io.download.create_download_workflow,
            args=(
                config['chromosome_url_template'].format(chromosome),
                pypeliner.managed.TempOutputFile('chromosome_{}.fa.gz'.format(chromosome)),
            ))

        workflow.commandline(
            name='extract_chromosome_{}'.format(chromosome),
            args=(
                'gunzip', '-c',
                pypeliner.managed.TempInputFile('chromosome_{}.fa.gz'.format(chromosome)),
                '>', pypeliner.managed.OutputFile(config['chromosome_template'].format(chromosome)),
            ),
        )

    return workflow
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):

    workflow = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    if copy_ref_genome:
        workflow.commandline(
            name='copy_genome',
            ctx={'local': True},
            args=(
                'cp',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )
    else:
        workflow.commandline(
            name='link_genome',
            ctx={'local': True},
            args=(
                'ln', '-s',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    workflow.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=(
            mgd.InputFile(local_ref_genome_fasta_path),
            mgd.OutputFile(ref_genome_index_prefix),
        )
    )

    workflow.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=(
            mgd.InputFile(ref_genome_index_prefix),
            mgd.InputFile(transcript_gtf_file),
            mgd.OutputFile(transcriptome_index_prefix),
        )
    )

    return workflow
def create_snpeff_annotation_workflow(
        db,
        data_dir,
        target_vcf_file,
        out_file,
        base_docker={},
        snpeff_docker={},
        classic_mode=True,
        split_size=int(1e3),
        table_name='snpeff'):

    ctx = {'num_retry': 3, 'mem_retry_increment': 2}
    if base_docker:
        ctx.update(base_docker)

    workflow = Workflow()

    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(target_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='run_snpeff',
        axes=('split',),
        ctx=dict(mem=8, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.run_snpeff',
        args=(
            db,
            data_dir,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('snpeff.vcf', 'split'),
        ),
        kwargs={
            'classic_mode': classic_mode,
            'docker_config': snpeff_docker,
        })

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.convert_vcf_to_table',
        args=(
            mgd.TempInputFile('snpeff.vcf', 'split'),
            mgd.TempOutputFile('snpeff.csv.gz', 'split', extensions=['.yaml']),
            table_name,
        ))

    workflow.transform(
        name='concatenate_tables',
        ctx=dict(mem=4, **ctx),
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('snpeff.csv.gz', 'split'),
            mgd.OutputFile(out_file, extensions=['.yaml']),
        ))

    return workflow
def create_battenberg_workflow(
        seqdata_files,
        config,
        out_file,
        raw_data_dir,
        somatic_breakpoint_file=None,
        normal_id=None,
        **kwargs):

    if normal_id is None:
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
def create_samtools_germline_workflow(
        normal_bam_files,
        normal_bai_files,
        ref_genome_fasta_file,
        vcf_file,
        config,
        chromosomes=default_chromosomes,
        base_docker=None,
        samtools_docker=None,
        vcftools_docker=None):

    ctx = {
        'mem': config['memory']['low'],
        'pool_id': config['pools']['standard'],
        'mem_retry_increment': 2,
        'ncpus': 1,
    }
    if base_docker:
        ctx.update(base_docker)

    regions = list(normal_bam_files.keys())

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        ctx=ctx,
        axes=('regions',),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam', 'regions', fnames=normal_bam_files),
            pypeliner.managed.InputFile('normal.split.bam.bai', 'regions', fnames=normal_bai_files),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            # Was passed samtools_docker, which would run vcftools in the
            # wrong container.
            'vcftools_docker': vcftools_docker,
        },
    )

    workflow.transform(
        name='concatenate_variants',
        ctx=ctx,
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker},
    )

    return workflow
def create_clonehd_single_workflow(
        normal_seqdata_file,
        tumour_seqdata_file,
        config,
        results_file,
        somatic_breakpoint_file=None,
        **kwargs):

    workflow = Workflow()

    workflow.transform(
        name='prepare_data',
        ctx={'mem': 20},
        func=tasks.prepare_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile(tumour_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.cna.txt'),
            pypeliner.managed.TempOutputFile('tumour.cna.txt'),
            pypeliner.managed.TempOutputFile('tumour.baf.txt'),
            config,
        ),
    )

    workflow.transform(
        name='run_clonehd',
        ctx={'mem': 8},
        func=tasks.run_clonehd,
        args=(
            pypeliner.managed.TempInputFile('normal.cna.txt'),
            pypeliner.managed.TempInputFile('tumour.cna.txt'),
            pypeliner.managed.TempInputFile('tumour.baf.txt'),
            pypeliner.managed.TempOutputFile('tumour.summary.txt'),
            pypeliner.managed.TempOutputFile('cna_subclone', 'subclone'),
            pypeliner.managed.TempOutputFile('bam_subclone', 'subclone', axes_origin=[]),
            pypeliner.managed.TempSpace('run_clonehd_temp', cleanup=None),
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='report',
        ctx={'mem': 4},
        func=tasks.report,
        args=(
            pypeliner.managed.TempInputFile('tumour.summary.txt'),
            pypeliner.managed.TempInputFile('cna_subclone', 'subclone'),
            pypeliner.managed.TempInputFile('bam_subclone', 'subclone'),
            pypeliner.managed.OutputFile(results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    return workflow
def create_setup_remixt_workflow(config, databases, **kwargs):
    workflow = Workflow()

    ref_data_sentinal = os.path.join(kwargs['ref_data_dir'], 'sentinal')

    workflow.transform(
        name='remixt_create_ref_data',
        func=remixt.ref_data.create_ref_data,
        args=(
            config,
            kwargs['ref_data_dir'],
            pypeliner.managed.OutputFile(ref_data_sentinal),
        ),
    )

    workflow.subworkflow(
        name='remixt_create_bwa_mappability',
        func=remixt.mappability.bwa.workflow.create_bwa_mappability_workflow,
        args=(
            config,
            kwargs['ref_data_dir'],
        ),
        kwargs={
            'ref_data_sentinal': pypeliner.managed.InputFile(ref_data_sentinal),
        },
    )

    return workflow
def create_ascat_workflow(
        seqdata_files,
        config,
        out_file,
        raw_data_dir,
        somatic_breakpoint_file=None,
        normal_id=None,
        **kwargs):

    if normal_id is None:
        raise ValueError('ASCAT requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 4},
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id',),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            # These were named Germline_* in the original, colliding with the
            # axis-free germline temps above; they are the tumour outputs.
            pypeliner.managed.TempOutputFile('Tumour_LogR.txt', 'sample_id'),
            pypeliner.managed.TempOutputFile('Tumour_BAF.txt', 'sample_id'),
            config,
        ),
    )

    return workflow
def create_samtools_germline_workflow(
        normal_bam_files,
        ref_genome_fasta_file,
        vcf_file,
        config,
        samtools_docker=None,
        vcftools_docker=None):

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem': config['memory']['low'],
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': baseimage,
    }

    regions = list(normal_bam_files.keys())

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        axes=('regions',),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile(
                'normal.split.bam', 'regions', fnames=normal_bam_files, extensions=['.bai']),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            # Was passed samtools_docker; vcftools should run in its own image.
            'vcftools_docker': vcftools_docker,
        },
    )

    workflow.transform(
        name='concatenate_variants',
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker})

    return workflow
def create_gc_wig_file(config, genome_file, out_file):
    workflow = Workflow()

    workflow.commandline(
        name='create_gc',
        ctx={'mem': 4},
        args=(
            'gcCounter',
            '-w', config['window_size'],
            pypeliner.managed.InputFile(genome_file),
            '>', pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
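# The create_gc job above renders to a shell command equivalent to the
# following (window size comes from config['window_size']; the paths and
# the value 1000 are illustrative):
#
#   gcCounter -w 1000 genome.fa > gc.wig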
def main(args):
    config = cli.load_pypeliner_config(args)

    pyp = pypeliner.app.Pypeline([], config)

    workflow = Workflow()

    workflow.subworkflow(
        name='snpeff',
        func=snpeff.create_snpeff_annotation_workflow,
        args=(
            pypeliner.managed.InputFile(args.target_vcf_file),
            pypeliner.managed.TempOutputFile('snpeff.h5'),
        ),
        kwargs={
            'data_base': args.data_base,
            'split_size': args.split_size,
            'table_name': 'snpeff',
        })

    workflow.transform(
        name='convert_to_tsv',
        func=convert_hdf5_to_tsv,
        ctx={'mem': 2},
        args=(
            pypeliner.managed.TempInputFile('snpeff.h5'),
            'snpeff',
            pypeliner.managed.OutputFile(args.out_file),
        ),
        kwargs={
            'compress': True,
            'index': False,
        })

    pyp.run(workflow)
def create_mappability_wig_file(config, out_file):
    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability_bigwig',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.OutputFile(out_file + '.bigwig'),
        ))

    workflow.commandline(
        name='convert_mappability_to_wig',
        ctx={'mem': 4},
        args=(
            'mapCounter',
            '-w', config['window_size'],
            pypeliner.managed.InputFile(out_file + '.bigwig'),
            '>', pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
def create_hla_type_workflow(normal_bam_file, hla_type_file):
    workflow = Workflow()

    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools', 'view', '-bh', '-f', '2', '-F', '4',
            pypeliner.managed.InputFile(normal_bam_file),
            '6',
            '|',
            'samtools', 'collate', '-O', '-',
            pypeliner.managed.TempSpace('chr6_collate_temp'),
            '|',
            'samtools', 'bam2fq',
            '-1', pypeliner.managed.TempOutputFile('chr6_reads_1.fq'),
            '-2', pypeliner.managed.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    workflow.transform(
        name='optitype',
        ctx={'mem': 24},
        func=tasks.run_optitype,
        args=(
            pypeliner.managed.TempInputFile('chr6_reads_1.fq'),
            pypeliner.managed.TempInputFile('chr6_reads_2.fq'),
            pypeliner.managed.OutputFile(hla_type_file),
            pypeliner.managed.TempSpace('optitype_temp'),
        )
    )

    return workflow
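# The extract_chr6 job renders to a single shell pipeline, roughly
# (paths illustrative):
#
#   samtools view -bh -f 2 -F 4 normal.bam 6 \
#       | samtools collate -O - <tempdir> \
#       | samtools bam2fq -1 chr6_reads_1.fq -2 chr6_reads_2.fq -
#
# i.e. take properly paired, mapped reads on chromosome 6, group them by
# read name, and split them into FASTQ mates for OptiType.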
def create_download_workflow(url, file_name):
    workflow = Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('url'), value=url)

    workflow.transform(
        name='download',
        ctx={'local': True},
        func=tasks.download_from_url,
        args=(
            pypeliner.managed.TempInputObj('url'),
            pypeliner.managed.OutputFile(file_name),
        ))

    return workflow
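# A minimal sketch of what `tasks.download_from_url` is assumed to look
# like: stream the URL to the managed output path. The real task may add
# retries or checksumming; this illustration uses only the standard library.
import shutil
import urllib.request


def download_from_url_sketch(url, local_path):
    with urllib.request.urlopen(url) as response, open(local_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)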
def create_battenberg_single_workflow(
        normal_seqdata_file,
        tumour_seqdata_file,
        normal_id,
        tumour_id,
        results_file,
        config,
        somatic_breakpoint_file=None,
        **kwargs):

    workflow = Workflow()

    workflow.transform(
        name='prepare_data',
        ctx={'mem': 20},
        func=tasks.prepare_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile(tumour_seqdata_file),
            normal_id,
            tumour_id,
            pypeliner.managed.TempOutputFile('allele_counts.tar.gz'),
            pypeliner.managed.TempSpace('prepare_battenberg_temp', cleanup=None),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='run_battenberg',
        ctx={'mem': 20},
        func=tasks.run_battenberg,
        args=(
            pypeliner.managed.TempInputFile('allele_counts.tar.gz'),
            normal_id,
            tumour_id,
            pypeliner.managed.OutputFile(results_file),
            pypeliner.managed.TempSpace('run_battenberg_temp', cleanup=None),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    return workflow
def download_external_files(config):
    download_keys = [x for x in config if 'url' in config[x]]

    urls = {x: config[x]['url'] for x in download_keys}
    downloaded_files = {x: config[x]['local_path'] for x in download_keys}

    workflow = Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('url', 'files'),
        value=urls,
    )

    workflow.subworkflow(
        name='download',
        func=create_download_workflow,
        axes=('files',),
        args=(
            mgd.TempInputObj('url', 'files'),
            mgd.TempOutputFile('download.file', 'files'),
        ),
    )

    workflow.transform(
        name='unzip',
        axes=('files',),
        func=tasks.unzip,
        args=(
            mgd.TempInputFile('download.file', 'files'),
            mgd.OutputFile('unzipped', 'files', fnames=downloaded_files),
        ),
    )

    return workflow
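# Illustrative config shape for download_external_files: any entry with a
# 'url' key is downloaded, unzipped and written to its 'local_path'
# (the key names and paths below are hypothetical):
example_download_config = {
    'dbsnp': {
        'url': 'https://example.org/dbsnp.vcf.gz',
        'local_path': '/refdata/dbsnp.vcf',
    },
    'ref_genome': {
        'url': 'https://example.org/genome.fa.gz',
        'local_path': '/refdata/genome.fa',
    },
}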
def create_setup_titan_workflow(config, databases, **kwargs):
    workflow = Workflow()

    workflow.subworkflow(
        name='gc_wig',
        func=create_gc_wig_file,
        args=(
            config,
            pypeliner.managed.InputFile(databases['ref_genome']['local_path']),
            pypeliner.managed.OutputFile(config['gc_wig']),
        ))

    workflow.subworkflow(
        name='mappability_wig',
        func=create_mappability_wig_file,
        args=(
            config,
            pypeliner.managed.OutputFile(config['mappability_wig']),
        ))

    return workflow
def create_pvacseq_workflow(
        vcf_file,
        hla_type_file,
        results_file,
        config):

    workflow = Workflow()

    workflow.commandline(
        name='vep',
        ctx={'mem': 16},
        args=(
            'variant_effect_predictor.pl',
            '--input_file', pypeliner.managed.InputFile(vcf_file),
            '--format', 'vcf',
            '--output_file', pypeliner.managed.TempOutputFile('vep_annotated.vcf'),
            '--vcf',
            '--symbol',
            '--terms', 'SO',
            '--plugin', 'Downstream',
            '--plugin', 'Wildtype',
            '--cache',
            '--offline',
            '--force_overwrite',
            '--assembly', 'GRCh37',
            '--dir', config['vep_dir'],
            '--dir_plugins', os.path.join(config['vep_dir'], 'Plugins'),
        ),
    )

    workflow.transform(
        name='run_pvacseq',
        func=tasks.run_pvacseq,
        args=(
            pypeliner.managed.TempInputFile('vep_annotated.vcf'),
            pypeliner.managed.InputFile(hla_type_file),
            pypeliner.managed.OutputFile(results_file),
            pypeliner.managed.TempSpace('pvacseq_temp'),
            config,
        ),
    )

    return workflow
def create_dbsnp_download_workflow(config, out_file):
    workflow = Workflow()

    workflow.subworkflow(
        name='download',
        func=download.create_download_workflow,
        args=(
            config['url'],
            pypeliner.managed.OutputFile(out_file),
        )
    )

    workflow.transform(
        name='index',
        ctx={'mem': 4},
        func=vcf_tasks.index_vcf,
        args=(
            pypeliner.managed.InputFile(out_file),
        )
    )

    return workflow
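# `vcf_tasks.index_vcf` is assumed to wrap tabix indexing of the bgzipped
# VCF, i.e. roughly:
#
#   tabix -f -p vcf dbsnp.vcf.gz
#
# Note the index transform above declares no managed output, so the .tbi
# file is produced as an untracked side effect.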
def create_annotation_workflow(
        config,
        in_vcf_file,
        cosmic_status_file,
        dbsnp_status_file,
        mappability_file,
        snpeff_file,
        trinuc_file,
        variant_type='snv'):

    annotators = (
        'cosmic_status',
        'dbsnp_status',
        'mappability',
        'snpeff',
        'tri_nucleotide_context',
    )

    kwargs = {}
    for a in annotators:
        kwargs[a] = get_kwargs(config[a]['kwargs'], '/{0}/{1}'.format(variant_type, a))

    workflow = Workflow()

    # The per-annotator kwargs computed above were previously ignored for
    # every annotator except snpeff; they are now used consistently.
    workflow.subworkflow(
        name='cosmic_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['cosmic']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(cosmic_status_file),
        ),
        kwargs=kwargs['cosmic_status'])

    workflow.subworkflow(
        name='dbsnp_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['dbsnp']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(dbsnp_status_file),
        ),
        kwargs=kwargs['dbsnp_status'])

    workflow.subworkflow(
        name='mappability',
        func='biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['mappability']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file, extensions=['.tbi']),
            pypeliner.managed.OutputFile(mappability_file),
        ),
        kwargs=kwargs['mappability'])

    workflow.subworkflow(
        name='snpeff',
        func='biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(snpeff_file),
        ),
        kwargs=kwargs['snpeff'])

    workflow.subworkflow(
        name='tri_nucleotide_context',
        func='biowrappers.components.variant_calling.tri_nucleotide_context.create_vcf_tric_nucleotide_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['ref_genome']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(trinuc_file),
        ),
        kwargs=kwargs['tri_nucleotide_context'])

    return workflow
def call_and_annotate_pipeline(
        config,
        normal_bam_path,
        tumour_bam_paths,
        raw_data_dir,
        results_file,
        chromosomes=default_chromosomes):

    workflow = Workflow()

    workflow.setobj(
        pypeliner.managed.OutputChunks('tumour_sample_id', axes_origin=[0]),
        list(tumour_bam_paths.keys()))

    variant_files = get_variant_files(chromosomes, config, raw_data_dir)

    normal_bam_file = pypeliner.managed.File(normal_bam_path)
    tumour_bam_files = pypeliner.managed.File('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths)
    ref_genome_fasta_file = pypeliner.managed.File(config['databases']['ref_genome']['local_path'])

    #===================================================================================================================
    # Multi sample calling
    #===================================================================================================================
    if 'nuseq_multi_sample' in config:
        workflow.subworkflow(
            name='nuseq_multi_sample',
            axes=(),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(),
                [pypeliner.managed.InputFile(x) for x in tumour_bam_paths.values()],
                ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_output(),
            ),
            kwargs=config['nuseq_multi_sample']['kwargs'])

        workflow.transform(
            name='convert_nuseq_multi_sample_vcf_to_hdf5',
            axes=(),
            ctx=default_ctx,
            func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
            args=(
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_input(),
                variant_files['snv']['hdf']['nuseq_multi_sample'].as_output(),
                '/snv/vcf/nuseq_multi_sample/all',
            ),
            kwargs={'score_callback': vcf_score_callbacks['snv']['nuseq']})

    #===================================================================================================================
    # Single sample calling
    #===================================================================================================================
    if 'nuseq' in config:
        workflow.subworkflow(
            name='nuseq',
            axes=('tumour_sample_id',),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(),
                [tumour_bam_files.as_input()],
                ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq'].as_output(),
            ),
            kwargs=config['nuseq']['kwargs'])

    if 'mutect' in config:
        workflow.subworkflow(
            name='mutect',
            axes=('tumour_sample_id',),
            func='biowrappers.components.variant_calling.mutect.create_mutect_workflow',
            args=(
                normal_bam_file.as_input(),
                tumour_bam_files.as_input(),
                ref_genome_fasta_file.as_input(),
                config['databases']['cosmic']['local_path'],
                config['databases']['dbsnp']['local_path'],
                variant_files['snv']['vcf']['mutect'].as_output(),
            ),
            kwargs=config['mutect']['kwargs'])

    if 'strelka' in config:
        workflow.subworkflow(
            name='strelka',
            axes=('tumour_sample_id',),
            func='biowrappers.components.variant_calling.strelka.create_strelka_workflow',
            args=(
                normal_bam_file.as_input(),
                tumour_bam_files.as_input(),
                ref_genome_fasta_file.as_input(),
                variant_files['indel']['vcf']['strelka'].as_output(),
                variant_files['snv']['vcf']['strelka'].as_output(),
            ),
            kwargs=config['strelka']['kwargs'])

    #===================================================================================================================
    # Convert vcf to hdf5
    #===================================================================================================================
    for var_type in variant_files:
        for prog in variant_files[var_type]['vcf']:
            if prog == 'nuseq_multi_sample':
                continue

            workflow.transform(
                # Was 'convert_{0}_indel_{1}_to_hdf5', which mislabelled
                # SNV conversions as indels.
                name='convert_{0}_{1}_to_hdf5'.format(prog, var_type),
                axes=('tumour_sample_id',),
                ctx=default_ctx,
                func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
                args=(
                    variant_files[var_type]['vcf'][prog].as_input(),
                    variant_files[var_type]['hdf'][prog].as_output(),
                    pypeliner.managed.Template(
                        '/{var_type}/vcf/{prog}/{{tumour_sample_id}}'.format(prog=prog, var_type=var_type),
                        'tumour_sample_id'),
                ),
                kwargs={'score_callback': vcf_score_callbacks[var_type][prog]})

    #===================================================================================================================
    # Indel annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_indels',
        ctx=big_mem_ctx,
        # Was 'biowrappers.components.io.vcf.tasks.vcf_tasks.merge_vcfs',
        # which duplicates the module path.
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=(
            [x.as_input() for x in variant_files['indel']['vcf'].values()],
            pypeliner.managed.TempOutputFile('all.indel.vcf'),
        ))

    workflow.transform(
        name='finalise_indels',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('all.indel.vcf'),
            pypeliner.managed.TempOutputFile('all.indel.vcf.gz'),
        ))

    workflow.subworkflow(
        name='annotate_indels',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.indel.vcf.gz'),
            pypeliner.managed.TempOutputFile('indel_annotations.h5'),
            os.path.join(raw_data_dir, 'indel'),
        ),
        kwargs={'variant_type': 'indel'})

    #===================================================================================================================
    # SNV
    #===================================================================================================================
    workflow.transform(
        name='merge_snvs',
        ctx=big_mem_ctx,
        func="biowrappers.components.io.vcf.tasks.merge_vcfs",
        args=(
            [x.as_input() for x in variant_files['snv']['vcf'].values()],
            pypeliner.managed.TempOutputFile('all.snv.vcf'),
        ))

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('all.snv.vcf'),
            pypeliner.managed.TempOutputFile('all.snv.vcf.gz'),
        ))

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={'variant_type': 'snv'})

    workflow.subworkflow(
        name='normal_snv_counts',
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            normal_bam_file.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        ),
        kwargs=get_kwargs(config['snv_counts']['kwargs'], '/snv/counts/normal'))

    workflow.subworkflow(
        name='tumour_snv_counts',
        axes=('tumour_sample_id',),
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            tumour_bam_files.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'snv', 'counts', '{tumour_sample_id}.h5'),
                'tumour_sample_id'),
        ),
        kwargs=get_kwargs(
            config['snv_counts']['kwargs'],
            pypeliner.managed.Template('/snv/counts/{tumour_sample_id}', 'tumour_sample_id')))

    #===================================================================================================================
    # Create final output
    #===================================================================================================================
    tables = [
        pypeliner.managed.TempInputFile('indel_annotations.h5'),
        pypeliner.managed.TempInputFile('snv_annotations.h5'),
        pypeliner.managed.InputFile(os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', '{tumour_sample_id}.h5'),
            'tumour_sample_id'),
    ]

    for var_type in variant_files:
        for prog in variant_files[var_type]['hdf']:
            tables.append(variant_files[var_type]['hdf'][prog].as_input())

    workflow.transform(
        name='build_results_file',
        ctx=default_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            tables,
            pypeliner.managed.OutputFile(results_file),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    return workflow
def call_and_annotate_pipeline(
        config,
        bam_files,
        raw_data_dir,
        results_file,
        normal_id=None,
        somatic_breakpoint_file=None,
        patient_config=None):

    sample_ids = list(bam_files.keys())

    # dict.keys() returns a view in Python 3 and has no remove(); copy to a
    # list before excluding the normal.
    tumour_ids = list(bam_files.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    seq_data_template = os.path.join(raw_data_dir, 'seqdata', 'sample_{sample_id}.h5')

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='extract_seqdata_workflow',
        axes=('sample_id',),
        func=remixt.workflow.create_extract_seqdata_workflow,
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile('seqdata', 'sample_id', template=seq_data_template),
            config['remixt'].get('extract_seqdata', {}),
            config['remixt']['ref_data_dir'],
        ),
    )

    merge_inputs = {}

    if 'remixt' in config:
        remixt_raw_data = os.path.join(raw_data_dir, 'remixt')
        remixt_results_filename = os.path.join(remixt_raw_data, 'results.h5')
        make_parent_directory(remixt_results_filename)

        remixt_config = config['remixt']['config']
        assert 'sample_specific' not in remixt_config
        # Guard against the default patient_config=None, which would crash
        # dict.update().
        if patient_config is not None:
            remixt_config.update(patient_config)

        workflow.subworkflow(
            name='remixt',
            func=biowrappers.components.copy_number_calling.remixt.create_remixt_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                remixt_config,
                pypeliner.managed.OutputFile(remixt_results_filename),
                remixt_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'ref_data_dir': config['remixt']['ref_data_dir'],
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/remixt'] = pypeliner.managed.InputFile(remixt_results_filename)

    if 'titan' in config:
        titan_raw_data = os.path.join(raw_data_dir, 'titan')
        titan_results_filename = os.path.join(titan_raw_data, 'results.h5')
        make_parent_directory(titan_results_filename)

        workflow.subworkflow(
            name='titan',
            func=biowrappers.components.copy_number_calling.titan.create_titan_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                config['titan']['config'],
                pypeliner.managed.OutputFile(titan_results_filename),
                titan_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/titan'] = pypeliner.managed.InputFile(titan_results_filename)

    if 'clonehd' in config:
        clonehd_raw_data = os.path.join(raw_data_dir, 'clonehd')
        clonehd_results_filename = os.path.join(clonehd_raw_data, 'results.h5')
        make_parent_directory(clonehd_results_filename)

        workflow.subworkflow(
            name='clonehd',
            func=biowrappers.components.copy_number_calling.clonehd.create_clonehd_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                config['clonehd']['config'],
                pypeliner.managed.OutputFile(clonehd_results_filename),
                clonehd_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/clonehd'] = pypeliner.managed.InputFile(clonehd_results_filename)

    if 'theta' in config:
        theta_raw_data = os.path.join(raw_data_dir, 'theta')
        theta_results_filename = os.path.join(theta_raw_data, 'results.h5')
        make_parent_directory(theta_results_filename)

        workflow.subworkflow(
            name='theta',
            func=biowrappers.components.copy_number_calling.theta.create_theta_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata', 'sample_id', template=seq_data_template),
                config['theta']['config'],
                pypeliner.managed.OutputFile(theta_results_filename),
                theta_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
                'num_clones': config['theta']['kwargs']['num_clones'],
            },
        )

        merge_inputs['/copy_number/theta'] = pypeliner.managed.InputFile(theta_results_filename)

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            pypeliner.managed.OutputFile(results_file),
        ),
    )

    return workflow
def create_theta_workflow(
        seqdata_files,
        config,
        out_file,
        raw_data_dir,
        somatic_breakpoint_file=None,
        normal_id=None,
        **kwargs):

    if normal_id is None:
        raise ValueError('Theta requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    bicseq2_seg_template = os.path.join(raw_data_dir, 'bicseq2', 'bicseq2_{sample_id}.seg')
    utils.make_parent_directory(results_template)
    utils.make_parent_directory(bicseq2_seg_template)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='run_bicseq2',
        axes=('sample_id',),
        ctx={'mem': 30},
        func=tasks.run_bicseq2_seg,
        args=(
            pypeliner.managed.OutputFile('bicseq2_seg', 'sample_id', template=bicseq2_seg_template),
            pypeliner.managed.InputFile('normal_seqdata', template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            config,
            pypeliner.managed.TempSpace('bicseq2_work', 'sample_id', cleanup=None),
        ),
    )

    workflow.transform(
        name='run_theta',
        axes=('sample_id',),
        ctx={'mem': 32},
        func=tasks.run_theta,
        args=(
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_template),
            pypeliner.managed.InputFile('normal_seqdata', template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            pypeliner.managed.InputFile('bicseq2_seg', 'sample_id', template=bicseq2_seg_template),
            config,
            pypeliner.managed.TempSpace('theta_work', 'sample_id', cleanup=None),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
            'num_clones': kwargs.get('num_clones', None),
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_template),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
def create_strelka_workflow(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        indel_vcf_file,
        snv_vcf_file,
        config,
        chromosomes=default_chromosomes,
        split_size=int(1e7),
        use_depth_thresholds=True):

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3,
        'docker_image': config['docker']['single_cell_pipeline'],
    }
    strelka_docker = {'docker_image': config['docker']['strelka']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    regions = list(normal_bam_file.keys())
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
            strelka_docker,
        ))

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(
            pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ))

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region',),
        args=(
            pypeliner.managed.InputFile(
                'normal.split.bam', 'region', fnames=normal_bam_file, extensions=['.bai']),
            pypeliner.managed.InputFile(
                'merged_bam', 'region', fnames=tumour_bam_file, extensions=['.bai']),
            pypeliner.managed.TempInputObj('known_sizes'),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf', 'region'),
            pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf', 'region'),
            pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
            pypeliner.managed.InputInstance('region'),
            strelka_docker,
        ),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom',),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf', 'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf', 'chrom'),
            pypeliner.managed.InputInstance('chrom'),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom',),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf', 'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf', 'chrom'),
            pypeliner.managed.InputInstance('chrom'),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf', 'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"),
            vcftools_docker,
        ))

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf', 'chrom'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_snvs_temp"),
            vcftools_docker,
        ))

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf'),
        ))

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf'),
        ))

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
            pypeliner.managed.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            vcftools_docker,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
            pypeliner.managed.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            vcftools_docker,
        ))

    return workflow
def call_and_annotate_pipeline(
        config,
        normal_bam_path,
        tumour_bam_paths,
        raw_data_dir,
        results_file):

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id'),
        value=list(tumour_bam_paths.keys()),
    )

    merge_inputs = {}

    if 'destruct' in config:
        destruct_raw_data = os.path.join(raw_data_dir, 'destruct')
        destruct_results_filename = os.path.join(destruct_raw_data, 'results.h5')
        make_parent_directory(destruct_results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                pypeliner.managed.OutputFile(destruct_results_filename),
                destruct_raw_data,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = pypeliner.managed.InputFile(destruct_results_filename)

    if 'delly' in config:
        delly_raw_data = os.path.join(raw_data_dir, 'delly')
        delly_results_filename = os.path.join(delly_raw_data, 'results.h5')
        make_parent_directory(delly_results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                pypeliner.managed.OutputFile(delly_results_filename),
                delly_raw_data,
            ),
        )

        merge_inputs['/breakpoints/delly'] = pypeliner.managed.InputFile(delly_results_filename)

    if 'lumpysv' in config:
        lumpysv_raw_data = os.path.join(raw_data_dir, 'lumpysv')
        lumpysv_results_filename = os.path.join(lumpysv_raw_data, 'results.h5')
        make_parent_directory(lumpysv_results_filename)

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams', 'tumour_sample_id', fnames=tumour_bam_paths),
                pypeliner.managed.OutputFile(lumpysv_results_filename),
                lumpysv_raw_data,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = pypeliner.managed.InputFile(lumpysv_results_filename)

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            pypeliner.managed.OutputFile(results_file),
        ))

    return workflow
def create_strelka_workflow(
        normal_bam_file,
        tumour_bam_file,
        snv_vcf_file,
        snv_maf_file,
        indel_vcf_file,
        indel_maf_file,
        reference,
        reference_vep,
        chromosomes,
        normal_id,
        tumour_id,
        single_node=False,
        is_exome=False):

    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(
            pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ))

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(
                mgd.TempInputFile('chrom_depth.txt', 'regions', axes_origin=[]),
                mgd.TempOutputFile('merged_chrom_depth.txt'),
            ))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                # Was hard-coded to False, silently ignoring the is_exome
                # argument on the multi-node path.
                'is_exome': is_exome,
            })

    workflow.transform(
        name='merge_indels',
        func='wgs.workflows.strelka.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('indels.vcf', 'regions'),
            mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("indels_merge"),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        func='wgs.workflows.strelka.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('snvs.vcf', 'regions'),
            mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("snvs_merge"),
        ),
    )

    workflow.transform(
        name='bcftools_normalize_snv',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('snvs.vcf.gz'),
            mgd.TempOutputFile('normalized_snvs.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='bcftools_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('indels.vcf.gz'),
            mgd.TempOutputFile('normalized_indels.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="strelka_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        })

    workflow.subworkflow(
        name="strelka_indel_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_maf_file),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        })

    return workflow
def build_indexes(config):
    workflow = Workflow()

    if 'ref_genome_fasta_file' in config:
        workflow.transform(
            name='index_ref_genome',
            func=biowrappers.components.ngs.samtools.tasks.faidx,
            args=(
                mgd.InputFile(config['ref_genome_fasta_file']['local_path']),
                mgd.OutputFile(config['ref_genome_fasta_file']['local_path'] + '.fai'),
            ),
        )

    if 'transcriptome_fasta_file' in config:
        workflow.transform(
            name='index_transcriptome',
            func=biowrappers.components.ngs.samtools.tasks.faidx,
            args=(
                mgd.InputFile(config['transcriptome_fasta_file']['local_path']),
                mgd.OutputFile(config['transcriptome_fasta_file']['local_path'] + '.fai'),
            ),
        )

    if 'kallisto' in config:
        workflow.transform(
            name='build_kallisto_index',
            func=biowrappers.components.rna.kallisto.tasks.build_index,
            ctx={'mem': 32, 'num_retry': 3, 'mem_retry_increment': 8},
            args=(
                mgd.OutputFile(config['kallisto']['index']),
                mgd.InputFile(config['transcriptome_fasta_file']['local_path']),
            ),
            kwargs={'kmer_length': config['kallisto']['kmer_length']})

    if 'salmon' in config:
        workflow.transform(
            name='build_salmon_index',
            func=biowrappers.components.rna.salmon.tasks.build_index,
            ctx={'mem': 32, 'num_retry': 3, 'mem_retry_increment': 8},
            args=(
                mgd.OutputFile(os.path.join(config['salmon']['index'], 'index.finished')),
                mgd.InputFile(config['transcriptome_fasta_file']['local_path']),
            ),
            kwargs={
                'kmer_length': config['salmon']['kmer_length'],
                'gencode': config['salmon'].get('gencode', False),
            })

    if 'star' in config:
        workflow.transform(
            name='build_star_index',
            func=biowrappers.components.rna.star.tasks.build_index,
            ctx={
                'mem': 32,
                'num_retry': 3,
                'mem_retry_increment': 8,
                'local': config['star'].get('local', False),
            },
            args=(
                mgd.OutputFile(os.path.join(config['star']['index'], 'index.finished')),
                mgd.InputFile(config['ref_genome_fasta_file']['local_path']),
                mgd.InputFile(config['gene_annotation_gtf_file']['local_path']),
            ),
            kwargs={
                'overhang': config['star']['overhang'],
                'num_threads': config['star'].get('num_threads', 1),
            })

    if 'tophat' in config:
        workflow.subworkflow(
            name='build_tophat_index',
            func=biowrappers.components.rna.tophat.workflow.create_tophat_transcriptome_index_workflow,
            args=(
                mgd.InputFile(config['ref_genome_fasta_file']['local_path']),
                mgd.InputFile(config['gene_annotation_gtf_file']['local_path']),
                mgd.OutputFile(config['tophat']['ref_genome_index']),
                mgd.OutputFile(config['tophat']['transcriptome_index']),
            ),
        )

    return workflow
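# Illustrative config shape accepted by build_indexes; every section is
# optional, and only the tools present in the config get an index job
# (all paths and k-mer values below are hypothetical):
example_indexes_config = {
    'ref_genome_fasta_file': {'local_path': '/refdata/genome.fa'},
    'transcriptome_fasta_file': {'local_path': '/refdata/transcripts.fa'},
    'gene_annotation_gtf_file': {'local_path': '/refdata/genes.gtf'},
    'kallisto': {'index': '/refdata/kallisto.idx', 'kmer_length': 31},
    'salmon': {'index': '/refdata/salmon_index', 'kmer_length': 31},
    'star': {'index': '/refdata/star_index', 'overhang': 100},
    'tophat': {
        'ref_genome_index': '/refdata/tophat/genome',
        'transcriptome_index': '/refdata/tophat/transcriptome',
    },
}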
def destruct_pipeline(
        normal_bam_file,
        tumour_bam_files,
        config,
        ref_data_dir,
        out_file,
        raw_data_dir,
        normal_sample_id='normal'):

    # Copy before adding the normal so the caller's dict is not mutated.
    bam_files = dict(tumour_bam_files)
    bam_files[normal_sample_id] = normal_bam_file

    utils.make_directory(os.path.join(raw_data_dir, 'raw'))
    breakpoint_file = os.path.join(raw_data_dir, 'raw', 'breakpoint.tsv')
    breakpoint_library_file = os.path.join(raw_data_dir, 'raw', 'breakpoint_library.tsv')
    breakpoint_read_file = os.path.join(raw_data_dir, 'raw', 'breakpoint_read.tsv')

    utils.make_directory(os.path.join(raw_data_dir, 'somatic'))
    somatic_breakpoint_file = os.path.join(raw_data_dir, 'somatic', 'breakpoint.tsv')
    somatic_breakpoint_library_file = os.path.join(raw_data_dir, 'somatic', 'breakpoint_library.tsv')

    raw_read_data_dir = os.path.join(raw_data_dir, 'read_data')
    utils.make_directory(raw_read_data_dir)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='run_destruct',
        func="destruct.workflow.create_destruct_workflow",
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile(breakpoint_file),
            pypeliner.managed.OutputFile(breakpoint_library_file),
            pypeliner.managed.OutputFile(breakpoint_read_file),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_read_data_dir,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func='biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints',
        args=(
            pypeliner.managed.InputFile(breakpoint_file),
            pypeliner.managed.InputFile(breakpoint_library_file),
            [normal_sample_id],
            pypeliner.managed.OutputFile(somatic_breakpoint_file),
            pypeliner.managed.OutputFile(somatic_breakpoint_library_file),
        ),
    )

    workflow.transform(
        name='write_store',
        func='biowrappers.components.breakpoint_calling.destruct.tasks.write_store',
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile(somatic_breakpoint_library_file),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def realignment_pipeline(
        config,
        in_file,
        out_file,
        read_group_info=None):

    if read_group_info is None:
        read_group_info = config.get('read_group', {})

    if 'ID' not in read_group_info:
        read_group_info['ID'] = hash(in_file) % int(1e6)

    ref_genome = pypeliner.managed.InputFile(config['ref_genome']['file'])

    read_1 = pypeliner.managed.TempFile('read_1', 'split')
    read_2 = pypeliner.managed.TempFile('read_2', 'split')
    read_1_sai = pypeliner.managed.TempFile('read_1.sai', 'split')
    read_2_sai = pypeliner.managed.TempFile('read_2.sai', 'split')

    read_group_config = pypeliner.managed.TempObj('read_group_config')

    workflow = Workflow()

    if 'read_group' in config:
        workflow.setobj(
            obj=read_group_config.as_output(),
            value=read_group_info,
        )
    else:
        workflow.transform(
            name='get_read_group_config',
            ctx={'local': True},
            func=tasks.get_read_group_config,
            ret=read_group_config.as_output(),
            args=(
                pypeliner.managed.InputFile(in_file),
            )
        )

    workflow.transform(
        # Was named 'bam_to_fasta'; the task converts to FASTQ.
        name='bam_to_fastq',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.convert_to_fastqs,
        args=(
            pypeliner.managed.InputFile(in_file),
            {
                1: read_1.as_output(),
                2: read_2.as_output(),
            },
            pypeliner.managed.TempSpace('bam_to_fastq'),
        ),
        kwargs={
            'split_size': config['split_size'],
        },
    )

    workflow.transform(
        name='aln_read_1',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_1.as_input(),
            ref_genome,
            read_1_sai.as_output(),
        ),
    )

    workflow.transform(
        name='aln_read_2',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_2.as_input(),
            ref_genome,
            read_2_sai.as_output(),
        ),
    )

    workflow.transform(
        name='sampe',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_sampe,
        args=(
            read_1.as_input(),
            read_2.as_input(),
            read_1_sai.as_input(),
            read_2_sai.as_input(),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
        ),
        kwargs={
            'read_group_info': read_group_config.as_input(),
        },
    )

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.sort,
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
        ),
    )

    workflow.transform(
        name='write_header_file',
        axes=(),
        ctx={'local': True},
        func=tasks.write_header_file,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.TempOutputFile('header.sam'),
            config['ref_genome']['header'],
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.merge,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'header_file': pypeliner.managed.TempInputFile('header.sam'),
        },
    )

    return workflow
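# For reference, the aln/sampe transforms above are assumed to wrap the
# classic bwa short-read pipeline, roughly (per split; paths and the read
# group string are illustrative):
#
#   bwa aln ref.fa read_1.fq > read_1.sai
#   bwa aln ref.fa read_2.fq > read_2.sai
#   bwa sampe -r '@RG\tID:rg1' ref.fa read_1.sai read_2.sai read_1.fq read_2.fq > aligned.sam
#
# followed by a coordinate sort per split and a final merge across splits
# using the rewritten header.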