def create_remixt_bam_workflow(
        breakpoint_filename,
        bam_filenames,
        results_filenames,
        raw_data_directory,
        config,
        ref_data_dir,
        normal_id=None,
):
    """Extract seqdata from each bam, then run remixt over the tumour samples.

    Args:
        breakpoint_filename: breakpoint calls consumed by remixt.
        bam_filenames: mapping of sample_id -> bam path (may include the normal).
        results_filenames: mapping of tumour_id -> remixt results path.
        raw_data_directory: directory for intermediate seqdata files.
        config: remixt configuration dict.
        ref_data_dir: remixt reference data directory.
        normal_id: sample id of the normal; excluded from the tumour set.

    Returns:
        pypeliner.workflow.Workflow
    """
    # BUG FIX: materialize the key views into lists. Under python 3,
    # dict.keys() returns a view with no .remove() (the call below raised
    # AttributeError), and both names would otherwise alias live views of
    # the same dict.
    sample_ids = list(bam_filenames.keys())
    tumour_ids = list(bam_filenames.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    seqdata_template = os.path.join(raw_data_directory, 'seqdata', 'sample_{sample_id}.h5')

    # Restrict the results mapping to tumour samples only.
    results_filenames = {tumour_id: results_filenames[tumour_id] for tumour_id in tumour_ids}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    # One seqdata extraction per sample (normal included).
    workflow.subworkflow(
        name='extract_seqdata_workflow',
        axes=('sample_id',),
        func=remixt.workflow.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam', 'sample_id', fnames=bam_filenames),
            mgd.OutputFile('seqdata', 'sample_id', template=seqdata_template),
            config,
            ref_data_dir,
        ),
    )

    workflow.subworkflow(
        name='remixt_seqdata_workflow',
        func=create_remixt_seqdata_workflow,
        args=(
            mgd.InputFile(breakpoint_filename),
            mgd.InputFile('seqdata', 'sample_id', template=seqdata_template),
            mgd.OutputFile('results', 'tumour_id', fnames=results_filenames, axes_origin=[]),
            raw_data_directory,
            config,
            ref_data_dir,
        ),
        kwargs={
            'normal_id': normal_id,
        },
    )

    return workflow
def create_svaba_workflow(
        tumour_bam,
        normal_bam,
        svaba_vcf,
        reference,
):
    """Build a single-job workflow running svaba on a tumour/normal pair.

    Only the somatic SV vcf is kept as a named output; every other svaba
    product stays a pipeline temporary.
    """
    wf = pypeliner.workflow.Workflow()

    svaba_args = (
        mgd.InputFile(tumour_bam),
        mgd.InputFile(normal_bam),
        mgd.TempOutputFile('germline.indel.vcf.gz'),
        mgd.TempOutputFile('germline.sv.vcf.gz'),
        mgd.TempOutputFile('somatic.indel.vcf.gz'),
        mgd.OutputFile(svaba_vcf),
        mgd.TempOutputFile('unfiltered.germline.indel.vcf.gz'),
        mgd.TempOutputFile('unfiltered.germline.sv.vcf.gz'),
        mgd.TempOutputFile('unfiltered.somatic.indel.vcf.gz'),
        mgd.TempOutputFile('unfiltered.somatic.sv.vcf.gz'),
        reference,
        mgd.TempSpace('svaba_tempdir_full'),
    )

    wf.transform(
        name='run_svaba',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', ncpus='8', disk=300),
        func='wgs.workflows.svaba.tasks.run_svaba',
        args=svaba_args,
        kwargs={'ncores': 8},
    )

    return wf
def run_Strelka(config, normal_bam, tumour_bam, snv_output_file, indel_output_file):
    """Compress/index the call-region bed, then run strelka on the pair."""
    wf = pypeliner.workflow.Workflow()

    # Prepare the bed file (bgzip + tabix) for use as a strelka call region.
    wf.transform(
        name='configure_bed',
        func=tasks.configure_bed,
        args=(
            mgd.TempSpace('bed_space'),
            mgd.InputFile(config['bed_file']),
            mgd.TempOutputFile('bed.gz'),
            mgd.TempOutputFile('bed.gz.tbi'),
        ),
    )

    wf.transform(
        name='run_strelka',
        ctx={'mem': 10, 'ncpus': 1, 'walltime': '08:00'},
        func=tasks.run_strelka,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.TempInputFile('bed.gz'),
            mgd.TempInputFile('bed.gz.tbi'),
            mgd.TempSpace('strelka_workspace'),
            mgd.OutputFile(snv_output_file),
            mgd.OutputFile(indel_output_file),
        ),
    )

    return wf
def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    """Run fastqc on both reads of a fastq pair."""
    wf = pypeliner.workflow.Workflow()

    # Identical jobs for R1 and R2, parameterized by filenames.
    jobs = [
        ('fastqc_r1', fastq_r1, r1_html, r1_plot, 'fastqc_R1'),
        ('fastqc_r2', fastq_r2, r2_html, r2_plot, 'fastqc_R2'),
    ]
    for job_name, fastq, html_out, plot_out, scratch in jobs:
        wf.transform(
            name=job_name,
            ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
            func='wgs.workflows.alignment.tasks.run_fastqc',
            args=(
                mgd.InputFile(fastq),
                mgd.OutputFile(html_out),
                mgd.OutputFile(plot_out),
                mgd.TempSpace(scratch),
            ),
        )

    return wf
def create_db_workflow(
        in_file,
        ref_proteome_fasta_file,
        out_file,
        genome_version='GRCh37',
        pyensembl_cache_dir=None):
    """Build a protein database: cleaned reference proteome + variant proteins."""
    sandbox = pypeliner.sandbox.CondaSandbox(pip_packages=['varcode'])

    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    wf.transform(
        name='clean_ref_fasta',
        func=tasks.clean_ref_proteome_ids,
        args=(
            mgd.InputFile(ref_proteome_fasta_file),
            mgd.TempOutputFile('ref.fasta'),
        ),
    )

    wf.transform(
        name='build_variant_table',
        func=tasks.build_variant_table,
        args=(
            mgd.InputFile(in_file),
            mgd.TempOutputFile('variant_table.tsv.gz'),
        ),
        kwargs={
            'genome_version': genome_version,
            'pyensembl_cache_dir': pyensembl_cache_dir,
        },
    )

    wf.transform(
        name='build_variant_fasta',
        func=tasks.build_variant_fasta,
        args=(
            mgd.TempInputFile('variant_table.tsv.gz'),
            mgd.TempOutputFile('var.fasta'),
        ),
    )

    # Final database is the concatenation of reference and variant fastas.
    wf.commandline(
        name='build_db',
        args=(
            'cat',
            mgd.TempInputFile('ref.fasta'),
            mgd.TempInputFile('var.fasta'),
            '>',
            mgd.OutputFile(out_file),
        ),
    )

    return wf
def create_lumpy_workflow(lumpy_vcf, tumour_bam=None, normal_bam=None, single_node=False):
    """Run lumpyexpress on whichever of the tumour/normal bams are provided."""
    wf = pypeliner.workflow.Workflow()

    # The final job name records which inputs were supplied.
    lumpy_job_name = 'run_lumpy'

    normal_disc = normal_split = None
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'

    tumour_disc = tumour_split = None
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'

    # Preprocessing produces the discordant/split read bams consumed above.
    if normal_bam:
        wf.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(
                normal_bam,
                mgd.TempOutputFile('normal.discordants.sorted.bam'),
                mgd.TempOutputFile('normal.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node},
        )

    if tumour_bam:
        wf.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(
                tumour_bam,
                mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node},
        )

    wf.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(
            mgd.OutputFile(lumpy_vcf),
            config.default_params('breakpoint_calling')['lumpy_paths'],
        ),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy'),
        },
    )

    return wf
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    """Run museq per chromosome on a tumour/normal pair, then merge the vcfs.

    Args:
        config: museq configuration passed through to the task.
        normal_bam: path to the normal bam.
        tumour_bam: path to the tumour bam.
        output_file: path for the merged output vcf.

    Returns:
        pypeliner.workflow.Workflow
    """
    workflow = pypeliner.workflow.Workflow()

    # BUG FIX: under python 3, `range(1, 23) + ['X']` raises TypeError
    # (range objects do not support +); materialize the range first.
    chromosomes = list(map(str, list(range(1, 23)) + ['X']))

    workflow.setobj(obj=mgd.OutputChunks('interval',), value=chromosomes)

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
        )
    )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
        )
    )

    return workflow
def annotation_workflow(args):
    """Run the single-cell QC annotation subworkflow and write run metadata.

    Expects args to carry 'input_yaml', 'library_id', 'out_dir' and
    'no_corrupt_tree'; all outputs are placed under args['out_dir'].
    """
    config = inpututils.load_config(args)
    annotation_infiles = inpututils.load_yaml(args['input_yaml'])
    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow(ctx={
        'docker_image': config['annotation']['docker']['single_cell_pipeline']
    }, )

    annotation_dir = args["out_dir"]
    input_yaml_blob = os.path.join(annotation_dir, 'input.yaml')
    # Output filenames are derived from the output dir and library id.
    annotation_files = get_output_files(annotation_dir, lib)
    annotation_meta = os.path.join(annotation_dir, 'metadata.yaml')

    workflow.subworkflow(
        name='annotation_workflow',
        func=qc_annotation.create_qc_annotation_workflow,
        args=(
            mgd.InputFile(annotation_infiles['hmmcopy_metrics']),
            mgd.InputFile(annotation_infiles['hmmcopy_reads']),
            mgd.InputFile(annotation_infiles['alignment_metrics']),
            mgd.InputFile(annotation_infiles['gc_metrics']),
            mgd.InputFile(annotation_infiles['segs_pdf_tar']),
            mgd.OutputFile(annotation_files['merged_metrics_csvs']),
            mgd.OutputFile(annotation_files['qc_report']),
            mgd.OutputFile(annotation_files['corrupt_tree_newick']),
            mgd.OutputFile(annotation_files['consensus_tree_newick']),
            mgd.OutputFile(annotation_files['phylo_csv']),
            mgd.OutputFile(annotation_files['loci_rank_trees']),
            mgd.OutputFile(annotation_files['filtered_data']),
            mgd.OutputFile(annotation_files['corrupt_tree_pdf']),
            mgd.OutputFile(annotation_files['segs_pass']),
            mgd.OutputFile(annotation_files['segs_fail']),
            mgd.OutputFile(annotation_files['corrupt_heatmap_pdf']),
            mgd.OutputFile(annotation_files['heatmap_filt_pdf']),
            config['annotation'],
            lib,
        ),
        kwargs={'no_corrupt_tree': args['no_corrupt_tree']})

    # Record provenance metadata alongside the produced results.
    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            annotation_dir,
            list(annotation_files.values()),
            mgd.OutputFile(annotation_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'type': 'annotation'
            }
        })

    return workflow
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):
    """Align paired fastqs with STAR, sort with sambamba, and index the bam."""
    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    wf.transform(
        name='star_align',
        ctx={
            'mem': 32,
            'mem_retry_increment': 16,
            'num_retry': 3,
            'threads': align_threads,
        },
        func=tasks.align,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            ref_genome_dir,
            mgd.TempOutputFile('aligned.bam'),
            mgd.TempSpace('align_tmp'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'read_group_info': read_group_info,
            'threads': align_threads,
        },
    )

    wf.transform(
        name='sort',
        ctx={
            'mem': 32,
            'mem_retry_increment': 16,
            'num_retry': 3,
            'threads': sort_threads,
        },
        func=soil.wrappers.sambamba.tasks.sort,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('sort_tmp'),
        ),
        kwargs={'threads': sort_threads},
    )

    wf.commandline(
        name='index',
        args=(
            'samtools',
            'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return wf
def destruct_preprocess_workflow(
        normal_bam_files, normal_stats,
        normal_reads_1, normal_reads_2,
        normal_sample_1, normal_sample_2,
        ref_data_directory, destruct_config,
        tag=False):
    """Prepare normal-sample reads for destruct.

    Accepts either a single bulk bam path (str) or a dict mapping
    cell_id -> bam path, and produces the read/sample/stats files that
    destruct consumes downstream.
    """
    workflow = pypeliner.workflow.Workflow()

    # Resolve the full destruct config from the reference-data defaults.
    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        # Bulk normal: extract discordant/split reads from the single bam.
        workflow.transform(
            name='bamdisc_normal',
            func=
            "single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
            ctx={
                'io': 1,
                'mem': 8,
                'disk': 200
            },
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_stats),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.TempSpace('bamdisc_normal_tempspace'),
            ))
    else:
        # Per-cell normals: one chunk per cell id, merged by the subworkflow.
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )
        # NOTE(review): the stats file position differs between branches
        # (third positional above, last positional here) — confirm both
        # callees agree on their argument order.
        workflow.subworkflow(name='process_normal_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 mgd.InputFile('bam', 'normal_cell_id', fnames=normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})

    return workflow
def destruct_preprocess_workflow(
        normal_bam_files, normal_stats,
        normal_reads_1, normal_reads_2,
        normal_sample_1, normal_sample_2,
        ref_data_directory, destruct_config, config,
        tag=False):
    """Prepare normal-sample reads for destruct (docker-enabled variant).

    Accepts either a single bulk bam path (str) or a dict mapping
    cell_id -> bam path; both branches delegate to process_cells_destruct.
    """
    workflow = pypeliner.workflow.Workflow()

    # Resolve the full destruct config inside the destruct container.
    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ctx={
                           'docker_image': config['docker']['destruct'],
                           'disk': 200
                       },
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        # NOTE(review): this branch hands a single bulk bam to
        # process_cells_destruct, while the other branch passes a per-cell
        # axis input — confirm the callee supports both input forms.
        workflow.subworkflow(name='process_individual_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 config,
                                 mgd.InputFile(normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})
    else:
        # Per-cell bams: one chunk per cell id.
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )
        workflow.subworkflow(name='process_individual_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 config,
                                 mgd.InputFile('bam', 'normal_cell_id', fnames=normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})

    return workflow
def create_somatic_consensus_workflow(
        mutect_snv_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        museq_snv_vcf,
        consensus_maf,
        chromosomes,
        reference_vep,
        normal_id,
        tumour_id,
):
    """Consensus-call somatic variants, convert to maf and annotate counts."""
    wf = pypeliner.workflow.Workflow()

    # Combine the three snv callers plus strelka indels into one vcf.
    wf.transform(
        name='snv_consensus',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.somatic_calling_consensus.consensus.main',
        args=(
            mgd.InputFile(museq_snv_vcf),
            mgd.InputFile(strelka_snv_vcf),
            mgd.InputFile(mutect_snv_vcf),
            mgd.InputFile(strelka_indel_vcf),
            mgd.TempOutputFile('consensus.vcf'),
            mgd.TempOutputFile('counts.csv'),
            chromosomes,
        ),
    )

    wf.subworkflow(
        name='consensus_maf',
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.TempInputFile('consensus.vcf'),
            mgd.TempOutputFile('consensus.maf'),
            reference_vep,
        ),
        kwargs={
            'normal_id': normal_id,
            'tumour_id': tumour_id,
        },
    )

    # Inject the caller counts back into the final maf.
    wf.transform(
        name='maf_counts',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.somatic_calling_consensus.tasks.update_maf_counts',
        args=(
            mgd.TempInputFile('consensus.maf'),
            mgd.TempInputFile('counts.csv'),
            mgd.OutputFile(consensus_maf),
        ),
    )

    return wf
def create_consensus_workflow(
        destruct_breakpoints,
        lumpy_vcf,
        output,
        chromosomes
):
    """Derive consensus breakpoint calls from lumpy and destruct output."""
    params = config.default_params('breakpoint_calling')

    wf = pypeliner.workflow.Workflow()

    # Normalize each caller's native output into a common csv layout.
    wf.transform(
        name='parse_lumpy',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
        args=(
            mgd.InputFile(lumpy_vcf),
            mgd.TempOutputFile('lumpy.csv'),
            params["parse_lumpy"],
        ),
        kwargs={'chromosomes': chromosomes},
    )

    wf.transform(
        name='parse_destruct',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
        args=(
            mgd.InputFile(destruct_breakpoints),
            mgd.TempOutputFile('destruct.csv'),
            params["parse_destruct"],
        ),
        kwargs={'chromosomes': chromosomes},
    )

    wf.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(
            mgd.TempInputFile('destruct.csv'),
            mgd.TempInputFile('lumpy.csv'),
            mgd.OutputFile(output, extensions=['.yaml']),
            params['consensus'],
        ),
    )

    return wf
def call_copynumber(
        samples, config, tumours, normals, breakpoints,
        titan_raw_dir, remixt_results, remixt_raw_dir,
        titan_segments, titan_params, titan_markers
):
    """Run titan and remixt copy-number calling for each sample."""
    # Restrict every per-sample mapping to the requested samples.
    breakpoints = {sampid: breakpoints[sampid] for sampid in samples}
    remixt_results = {sampid: remixt_results[sampid] for sampid in samples}
    titan_segments = {sampid: titan_segments[sampid] for sampid in samples}
    titan_params = {sampid: titan_params[sampid] for sampid in samples}
    titan_markers = {sampid: titan_markers[sampid] for sampid in samples}

    wf = pypeliner.workflow.Workflow()

    wf.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    wf.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments', 'sample_id', fnames=titan_segments),
            mgd.OutputFile('titan_params', 'sample_id', fnames=titan_params),
            mgd.OutputFile('titan_markers', 'sample_id', fnames=titan_markers),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
        ),
    )

    wf.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('breakpoints', 'sample_id', fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results', 'sample_id', fnames=remixt_results),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads'],
        ),
    )

    return wf
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):
    """Build bowtie genome and tophat transcriptome indexes for a reference."""
    wf = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    # The fasta must sit next to the bowtie index; copy it or symlink it.
    if copy_ref_genome:
        stage_name, stage_cmd = 'copy_genome', ('cp',)
    else:
        stage_name, stage_cmd = 'link_genome', ('ln', '-s')

    wf.commandline(
        name=stage_name,
        ctx={'local': True},
        args=stage_cmd + (
            mgd.InputFile(ref_genome_fasta_file),
            mgd.OutputFile(local_ref_genome_fasta_path),
        ),
    )

    wf.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=(
            mgd.InputFile(local_ref_genome_fasta_path),
            mgd.OutputFile(ref_genome_index_prefix),
        )
    )

    wf.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=(
            mgd.InputFile(ref_genome_index_prefix),
            mgd.InputFile(transcript_gtf_file),
            mgd.OutputFile(transcriptome_index_prefix),
        )
    )

    return wf
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):
    """Call snps with varscan mpileup2snp per genome region, then merge vcfs."""
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    wf = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    # One chunk per genome region so the pileup parallelizes.
    wf.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes),
    )

    wf.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools', 'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        ),
    )

    wf.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ),
    )

    wf.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    wf.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return wf
def count_haps_workflow(args):
    """Count haplotype allele reads across tumour cells.

    Reads the haplotypes file and per-cell tumour bams from the input yaml
    and writes allele_counts.csv.gz plus run metadata into args['out_dir'].
    """
    config = inpututils.load_config(args)
    config = config['count_haps']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    allele_counts_filename = os.path.join(args["out_dir"], "allele_counts.csv.gz")
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    haplotypes_filename, tumour_cells = inpututils.load_count_haps_input(
        args['input_yaml'])

    # One chunk per tumour cell.
    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(
        name='extract_allele_readcounts',
        func=
        'single_cell.workflows.extract_allele_readcounts.extract_allele_readcounts',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.InputFile('tumour_cells.bam', 'tumour_cell_id',
                          extensions=['.bai'], axes_origin=[],
                          fnames=tumour_cells),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
    )

    # Record provenance metadata for the produced results.
    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [allele_counts_filename],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'count_haps'
            }
        })

    return workflow
def infer_haps_workflow(args):
    """Infer haplotypes from a normal sample (bulk wgs bam or per-cell bams)."""
    config = inpututils.load_config(args)
    config = config['infer_haps']

    baseimage = config['docker']['single_cell_pipeline']
    wf = pypeliner.workflow.Workflow(
        ctx=dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1,
                 docker_image=baseimage))

    haplotypes_filename = os.path.join(args["out_dir"], "haplotypes.tsv")
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    normal_data = inpututils.load_infer_haps_input(args['input_yaml'])

    # Per-cell input arrives as a dict of cell_id -> bam path.
    if isinstance(normal_data, dict):
        wf.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        bam_file = mgd.InputFile(
            'normal.bam', 'normal_cell_id', fnames=normal_data, extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(normal_data, extensions=['.bai'])

    wf.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            config,
        ),
    )

    # Record provenance metadata for the produced results.
    wf.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            [haplotypes_filename],
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'infer_haps'},
        },
    )

    return wf
def patient_workflow(config, patient_id, patient_input, output_file):
    """Align every sample for a patient, then run the downstream analyses."""
    wf = pypeliner.workflow.Workflow()

    patient_bam_dir = config["bam_directory"] + patient_id
    patient_result_dir = config["results_dir"] + patient_id
    for directory in (patient_bam_dir, patient_result_dir):
        helpers.makedirs(directory)

    input_args = helpers.create_input_args(patient_input, patient_bam_dir)

    wf.setobj(obj=mgd.OutputChunks('sample_id', ), value=input_args['all_samples'])

    wf.subworkflow(
        name='align_samples',
        func=alignment.align_sample,
        axes=('sample_id', ),
        args=(
            config,
            mgd.InputFile('fastq_1', 'sample_id', fnames=input_args['fastqs_r1']),
            mgd.InputFile('fastq_2', 'sample_id', fnames=input_args['fastqs_r2']),
            mgd.InputInstance('sample_id'),
            mgd.OutputFile('sample.bam', 'sample_id', fnames=input_args['all_bams']),
            mgd.OutputFile('sample.bam.bai', 'sample_id', fnames=input_args['all_bais']),
        ),
    )

    # Downstream analysis consumes all aligned bams at once.
    wf.subworkflow(
        name='run_analyses',
        func=analysis.partition_tumour,
        args=(
            config,
            input_args,
            patient_id,
            patient_result_dir,
            mgd.InputFile('sample.bam', 'sample_id', fnames=input_args['all_bams'], axes_origin=[]),
            mgd.InputFile('sample.bam.bai', 'sample_id', fnames=input_args['all_bais'], axes_origin=[]),
            mgd.OutputFile(output_file),
        ),
    )

    return wf
def create_eagle_ref_data_workflow(vcf_url_template, out_file, local_download=False):
    """Download per-chromosome vcfs, rename chromosomes, and merge into a bcf."""
    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    # Keep autosomes only and derive each chromosome's download url.
    autosomes = [str(x) for x in range(1, 23)]
    chrom_map = pd.read_csv(chrom_map_file, sep='\t')
    chrom_map = chrom_map[chrom_map['ncbi'].isin(autosomes)]
    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    wf.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    wf.transform(
        name='download_vcf_files',
        axes=('chrom', ),
        ctx={'local': local_download},
        func=soil.ref_data.tasks.download,
        args=(
            mgd.TempInputObj('vcf_url', 'chrom'),
            mgd.TempOutputFile('raw.vcf.gz', 'chrom'),
        ),
    )

    wf.transform(
        name='write_chrom_map',
        func=tasks.write_chrom_map_file,
        args=(
            mgd.InputFile(chrom_map_file),
            mgd.TempOutputFile('chrom_map.tsv'),
        ),
    )

    wf.transform(
        name='rename_chroms',
        axes=('chrom', ),
        func=soil.wrappers.bcftools.tasks.rename_chroms,
        args=(
            mgd.TempInputFile('chrom_map.tsv'),
            mgd.TempInputFile('raw.vcf.gz', 'chrom'),
            mgd.TempOutputFile('renamed.bcf', 'chrom'),
        ),
    )

    wf.transform(
        name='concat_vcfs',
        func=soil.wrappers.bcftools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('renamed.bcf', 'chrom'),
            mgd.OutputFile(out_file),
        ),
        kwargs={'bcf_output': True},
    )

    wf.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            mgd.InputFile(out_file),
            '-o', mgd.OutputFile(out_file + '.csi'),
        ),
    )

    return wf
def create_patient_workflow(pseudo_bulk_group, mafs, sample_all_snv_csvs, mutationreport,
                            merged_maf, high_impact_maf, merged_snvs, merged_high_impact_snvs):
    """Merge per-sample mafs/snv tables, filter for high impact, and build the
    patient mutation report."""
    wf = pypeliner.workflow.Workflow(
        ctx={'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1})

    wf.transform(
        name='merge_mafs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_mafs',
        args=(
            mafs,
            mgd.OutputFile(merged_maf),
        ),
        kwargs={"id_colname": True},
    )

    wf.transform(
        name='filter_merged_maf',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_maf_for_high_impact',
        args=(
            mgd.InputFile(merged_maf),
            mgd.OutputFile(high_impact_maf),
        ),
    )

    wf.transform(
        name='merge_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_snvs',
        args=(
            sample_all_snv_csvs,
            mgd.OutputFile(merged_snvs),
        ),
        kwargs={"id_colname": True},
    )

    wf.transform(
        name='filter_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_snvs_for_high_impact',
        args=(
            mgd.InputFile(merged_snvs),
            mgd.OutputFile(merged_high_impact_snvs),
        ),
    )

    # Report aggregates the merged and filtered tables.
    wf.transform(
        name='mutationreport',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.create_mutation_report',
        args=(
            pseudo_bulk_group,
            mgd.InputFile(merged_maf),
            mgd.InputFile(high_impact_maf),
            mgd.InputFile(merged_high_impact_snvs),
            mgd.OutputFile(mutationreport),
            mgd.TempSpace("mutationreport"),
        ),
    )

    return wf
def conversion_workflow(args):
    """Convert per-cell microscope tif pairs to png and write run metadata."""
    docker = docker_containers()

    converted_dir = args["out_dir"]

    cell_ids, cfse_images, livedead_images = get_cell_images(args['input_yaml'])

    converted_image_template = os.path.join(converted_dir, '{cell_id}.png')

    wf = pypeliner.workflow.Workflow(
        ctx={'docker_image': docker['microscope_image_converter']})

    wf.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    wf.transform(
        name='convert',
        func='microscope_image_converter.tasks.convert',
        axes=('cell_id', ),
        args=(
            mgd.InputFile('livedead.tif', 'cell_id', fnames=livedead_images),
            mgd.InputFile('cfse.tif', 'cell_id', fnames=cfse_images),
            mgd.OutputFile('converted.png', 'cell_id',
                           template=converted_image_template, axes_origin=[]),
        ),
    )

    converted_meta = os.path.join(converted_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(converted_dir, 'input.yaml')

    # Record provenance metadata for the converted images.
    wf.transform(
        name='generate_meta_files_results',
        func='microscope_image_converter.tasks.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            converted_dir,
            mgd.Template('converted.png', 'cell_id', template=converted_image_template),
            mgd.OutputFile(converted_meta),
        ),
        kwargs={
            'input_yaml_data': load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'cell_ids': cell_ids,
                'type': 'dlp_microscope_merged',
            },
        },
    )

    return wf
def infer_haps_workflow(args):
    """Infer haplotypes and allele counts from pseudo-wgs input.

    Selects the normal or tumour side based on args['normal'], preferring
    per-cell bams over a bulk wgs bam when both are present.
    """
    config = helpers.load_config(args)
    config = config['infer_haps']

    baseimage = config['docker']['single_cell_pipeline']
    # BUG FIX: the workflow ctx key must be 'docker_image' (as used by the
    # equivalent infer_haps/count_haps workflows in this codebase);
    # 'baseimage' is not a recognized ctx key, so jobs were not run inside
    # the intended container.
    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results", "allele_counts.tsv")

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    # Prefer the per-cell bams; fall back to the bulk wgs bam.
    if args['normal']:
        bam_file = normal_cells if normal_cells else normal_wgs
    else:
        bam_file = tumour_cells if tumour_cells else tumour_wgs

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )
        bam_file = mgd.InputFile(
            'tumour.bam', 'cell_id', fnames=bam_file, extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(bam_file, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
        kwargs={'normal': args['normal']},
    )

    return workflow
def create_variant_counting_workflow(
        vcfs,
        tumour_cell_bams,
        results_h5,
        config,
):
    """ Count variant reads for multiple sets of variants across cells.

    Args:
        vcfs: list of vcf paths whose variants are merged and counted.
        tumour_cell_bams: mapping of cell_id -> tumour cell bam path.
        results_h5: output path for the allele counts.
        config: pipeline configuration (provides container settings).

    Returns:
        pypeliner.workflow.Workflow
    """
    workflow = pypeliner.workflow.Workflow()

    # BUG FIX: materialize the keys. A python 3 dict view is not picklable,
    # and every sibling workflow in this codebase passes list(...keys())
    # to setobj; keep this one consistent and py3-safe.
    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    # Merge all input vcfs into a single variant set.
    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=(
            [mgd.InputFile(vcf) for vcf in vcfs],
            mgd.TempOutputFile('all.snv.vcf')
        )
    )

    # Compress and index the merged vcf.
    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(
            mgd.TempInputFile('all.snv.vcf'),
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi'])
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(config['containers'], 'vcftools')
        }
    )

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=tumour_cell_bams),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.OutputFile(results_h5),
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
        },
    )

    return workflow
def create_vcf_mappability_annotation_workflow(
        mappability_file,
        vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7),
):
    """Annotate vcf positions with mappability values, region by region."""
    ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}

    wf = pypeliner.workflow.Workflow()

    # Split the vcf into genomic regions for parallel annotation.
    wf.transform(
        name='get_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        ctx=ctx,
        func='biowrappers.components.variant_calling.utils.get_vcf_regions',
        args=(
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            split_size,
        ),
        kwargs={'chromosomes': chromosomes},
    )

    wf.transform(
        name='annotate_db_status',
        axes=('regions',),
        ctx=ctx,
        func='biowrappers.components.variant_calling.mappability.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions'),
        ),
        kwargs={'region': mgd.TempInputObj('regions_obj', 'regions')},
    )

    wf.transform(
        name='merge_tables',
        ctx=ctx,
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return wf
def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
    """Merge per-cell bams into per-region bams, either as one big job or
    one job per region."""
    merged_bams = {region: merged_bams[region] for region in regions}

    wf = pypeliner.workflow.Workflow()

    wf.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    wf.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    if config["one_split_job"]:
        # Single multicore job handling every region at once.
        wf.transform(
            name='merge_bams',
            ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(
                mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
                mgd.OutputFile('merged.bam', "region", fnames=merged_bams,
                               axes_origin=[], extensions=['.bai']),
                regions,
                mgd.TempSpace("merge_bams_tempdir"),
            ),
            kwargs={"ncores": config["max_cores"]},
        )
    else:
        # One merge job per region.
        wf.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region',),
            args=(
                mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile('tumour_regions.bam', 'region', axes_origin=[],
                               extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
            ),
        )

    return wf
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes=None,
                            split_size=int(1e7)):
    """Build a workflow that runs somatic variant calling per region.

    The normal BAM is used to derive genomic regions of roughly
    ``split_size`` bases; somatic calling runs once per region in parallel,
    and the per-region VCFs are concatenated into ``out_file``.

    :param normal_bam_file: normal-sample BAM path
    :param tumour_bam_file: tumour-sample BAM path
    :param ref_genome_fasta_file: reference genome FASTA path
    :param out_file: output merged VCF path
    :param chromosomes: optional chromosome subset; None means all
    :param split_size: approximate region size, in bases
    :return: a ``pypeliner.workflow.Workflow`` instance
    """
    split_regions = utils.get_bam_regions(
        normal_bam_file, split_size, chromosomes=chromosomes)

    workflow = pypeliner.workflow.Workflow()

    # Publish one region value per chunk on the 'regions' axis.
    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=split_regions,
    )

    # Run the somatic caller once per region.
    workflow.transform(
        name='run_somatic',
        axes=('regions', ),
        ctx={'mem': 6, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.run_somatic,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempSpace('varscan_tmp', 'regions'),
        ),
    )

    # Concatenate the per-region VCFs into the final output.
    workflow.transform(
        name='merge',
        axes=(),
        ctx={'mem': 2, 'mem_retry_increment': 2, 'num_retry': 3},
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def run_remixt_local(tempdir,
                     breakpoints,
                     tumour_bam,
                     normal_bam,
                     sample_id,
                     remixt_results_filename,
                     remixt_raw_dir,
                     remixt_config,
                     remixt_refdata,
                     ncpus=None):
    """Run the ReMixT BAM workflow locally and block until completion.

    Sets up a local pypeliner pipeline under ``tempdir``, wires the tumour
    and normal BAMs into ``remixt.workflow.create_remixt_bam_workflow``
    (the normal sample is keyed as ``sample_id + 'N'``), and executes it.

    :param tempdir: scratch directory for pipeline state and temp files
    :param breakpoints: breakpoint input filename
    :param tumour_bam: tumour BAM path
    :param normal_bam: normal BAM path
    :param sample_id: tumour sample identifier
    :param remixt_results_filename: output results filename
    :param remixt_raw_dir: directory for ReMixT raw data
    :param remixt_config: ReMixT config dict
    :param remixt_refdata: ReMixT reference data directory
    :param ncpus: max concurrent local jobs; defaults to the CPU count
    """
    if not ncpus:
        ncpus = multiprocessing.cpu_count()

    # Local-mode pypeliner app configuration.
    pipeline_config = {
        'pipelinedir': os.path.join(tempdir, 'pipeline'),
        'tmpdir': os.path.join(tempdir, 'tmp'),
        'submit': 'local',
        'maxjobs': ncpus,
        'loglevel': 'DEBUG',
    }
    pyp = pypeliner.app.Pypeline(config=pipeline_config)

    workflow = pypeliner.workflow.Workflow()

    # NOTE: this raises the ROOT logger level process-wide, not just for
    # this pipeline run.
    logging.getLogger().setLevel(logging.DEBUG)

    normal_key = sample_id + 'N'

    workflow.subworkflow(
        name='remixt',
        func="remixt.workflow.create_remixt_bam_workflow",
        args=(
            mgd.InputFile(breakpoints),
            {
                sample_id: mgd.InputFile(tumour_bam),
                normal_key: mgd.InputFile(normal_bam),
            },
            {sample_id: mgd.OutputFile(remixt_results_filename)},
            remixt_raw_dir,
            remixt_config,
            remixt_refdata,
        ),
        kwargs={'normal_id': normal_key},
    )

    pyp.run(workflow)
def create_cohort_qc_report(cohort_label,
                            out_dir,
                            filtered_cohort_maf,
                            cna_table,
                            report_path):
    """Build a workflow producing cohort-level QC plots and an HTML report.

    Prepares the cohort MAF for maftools, generates oncoplot, somatic
    interactions, summary and mutation-burden plots, then assembles them
    into a single report at ``report_path``.

    :param cohort_label: cohort name, used in paths and the report
    :param out_dir: base output directory for plot images
    :param filtered_cohort_maf: filtered cohort MAF filename
    :param cna_table: copy-number alteration table filename
    :param report_path: output report filename
    :return: a ``pypeliner.workflow.Workflow`` instance
    """
    # All plot images live under <out_dir>/<cohort_label>/.
    plot_dir = os.path.join(out_dir, cohort_label)
    oncoplot = os.path.join(plot_dir, "cohort_oncoplot.png")
    somatic_interactions_plot = os.path.join(plot_dir, "somatic_interactions.png")
    summary_plot = os.path.join(plot_dir, "summary.png")
    burden_plot = os.path.join(plot_dir, "mutation_burden.png")

    # Variant classifications treated as non-synonymous for maftools.
    non_synonymous_labels = [
        "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
        "Translation_Start_Site", "Nonsense_Mutation", "Nonstop_Mutation",
        "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation"
    ]

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='postprocess_maf',
        func='wgs.workflows.cohort_qc.tasks.prepare_maf_for_maftools',
        args=(
            cohort_label,
            mgd.InputFile(filtered_cohort_maf),
            mgd.TempOutputFile("prepared_maf"),
            non_synonymous_labels,
            mgd.TempOutputFile("vcNames"),
        ),
    )

    workflow.transform(
        name='burden_plot',
        func='wgs.workflows.cohort_qc.tasks.plot_mutation_burden',
        args=(
            mgd.InputFile(filtered_cohort_maf),
            mgd.OutputFile(burden_plot),
        ),
    )

    workflow.transform(
        name='build_gene_list',
        func='wgs.workflows.cohort_qc.tasks.build_gene_list',
        args=(
            mgd.InputFile(cna_table),
            mgd.TempOutputFile("genelist"),
        ),
    )

    workflow.transform(
        name='make_cohort_plots',
        func='wgs.workflows.cohort_qc.tasks.make_R_cohort_plots',
        args=(
            mgd.TempInputFile("prepared_maf"),
            mgd.InputFile(cna_table),
            mgd.OutputFile(oncoplot),
            mgd.OutputFile(somatic_interactions_plot),
            mgd.OutputFile(summary_plot),
            mgd.TempInputFile("vcNames"),
            mgd.TempInputFile("genelist"),
        ),
    )

    workflow.transform(
        name='make_report',
        func='wgs.workflows.cohort_qc.tasks.make_report',
        args=(
            cohort_label,
            mgd.InputFile(oncoplot),
            mgd.InputFile(somatic_interactions_plot),
            mgd.InputFile(summary_plot),
            mgd.InputFile(burden_plot),
            mgd.OutputFile(report_path),
        ),
    )

    return workflow
def create_sample_qc_workflow_normal_only(
        sample_id,
        refdir,
        normal_bam,
        roh,
        germline_calls,
        genome_wide_plot,
        normal_coverage,
        chromosomes,
        bins,
        mapping_qual_threshold,
        single_node=False
):
    """Build a QC workflow for a normal-only sample.

    Computes binned coverage for the normal BAM, then renders a
    genome-wide QC plot combining ROH, germline calls and that coverage.

    :param sample_id: sample identifier shown in the plot
    :param refdir: reference data directory
    :param normal_bam: normal-sample BAM path
    :param roh: runs-of-homozygosity input filename
    :param germline_calls: germline variant calls filename
    :param genome_wide_plot: output plot filename
    :param normal_coverage: output coverage filename
    :param chromosomes: chromosomes to include
    :param bins: number of coverage bins
    :param mapping_qual_threshold: minimum mapping quality for coverage
    :param single_node: run the coverage step on a single node if True
    :return: a ``pypeliner.workflow.Workflow`` instance
    """
    workflow = pypeliner.workflow.Workflow()

    # Coverage must exist before the plot step; pypeliner infers the
    # dependency from the shared normal_coverage file.
    workflow.subworkflow(
        name='coverage_normal_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(normal_bam),
            mgd.OutputFile(normal_coverage),
            refdir,
            chromosomes,
            mapping_qual_threshold,
            bins,
        ),
        kwargs={'single_node': single_node},
    )

    workflow.transform(
        name='generate_genome_wide_plot',
        ctx=helpers.get_default_ctx(memory=10),
        func="wgs.workflows.sample_qc.tasks.genome_wide",
        args=(
            sample_id,
            mgd.InputFile(roh),
            mgd.InputFile(germline_calls),
            mgd.InputFile(normal_coverage),
            chromosomes,
            mgd.OutputFile(genome_wide_plot),
        ),
        kwargs={"normal_only": True},
    )

    return workflow