def realign_bam_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    with open(args['input_yaml']) as input_yaml_fh:
        yamldata = yaml.safe_load(input_yaml_fh)

    samples = list(yamldata.keys())

    input_bams = {sample: yamldata[sample]['input'] for sample in samples}

    output_bams = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics = os.path.join(outdir, '{sample_id}', '{sample_id}.txt')
    metrics_tar = os.path.join(outdir, '{sample_id}', '{sample_id}.tar')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name="realign",
        func=realign_bams,
        ctx=helpers.get_default_ctx(),
        args=(
            samples,
            mgd.InputFile("input.bam", 'sample_id', fnames=input_bams,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.bam", 'sample_id', template=output_bams,
                           extensions=['.bai', '.tdf'], axes_origin=[]),
            mgd.OutputFile("realigned.txt", 'sample_id', template=metrics,
                           extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.tar", 'sample_id', template=metrics_tar,
                           extensions=['.bai'], axes_origin=[]),
            args['refdir'],
        ),
        kwargs={'single_node': args['single_node']}
    )

    outputted_filenames = helpers.expand_list(
        [output_bams, metrics, metrics_tar], samples, 'sample_id')

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'realignment'}
        }
    )

    pyp.run(workflow)
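def _example_run_realign():
    # Illustrative sketch only: every path and sample id below is hypothetical.
    # Note that `args` is passed straight to pypeliner.app.Pypeline(config=args)
    # above, so a real invocation would also carry pypeliner's own scheduler
    # settings. The input YAML parsed by realign_bam_workflow is expected to
    # map each sample name to its input BAM, e.g.:
    #
    #     SAMPLE_A:
    #       input: /data/SAMPLE_A.bam
    #
    realign_bam_workflow({
        'out_dir': '/results/realign',
        'input_yaml': '/data/input.yaml',
        'refdir': '/refdata/GRCh37',
        'single_node': False,
    })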
def create_remixt_bam_workflow(
        breakpoint_filename,
        bam_filenames,
        results_filenames,
        raw_data_directory,
        config,
        ref_data_dir,
        normal_id=None,
):
    # Materialize the key views as lists so that remove() works under python 3,
    # where dict.keys() returns a view without list methods.
    sample_ids = list(bam_filenames.keys())

    tumour_ids = list(bam_filenames.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    seqdata_template = os.path.join(raw_data_directory, 'seqdata', 'sample_{sample_id}.h5')

    results_filenames = {tumour_id: results_filenames[tumour_id] for tumour_id in tumour_ids}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    workflow.subworkflow(
        name='extract_seqdata_workflow',
        axes=('sample_id',),
        func=remixt.workflow.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam', 'sample_id', fnames=bam_filenames),
            mgd.OutputFile('seqdata', 'sample_id', template=seqdata_template),
            config,
            ref_data_dir,
        ),
    )

    workflow.subworkflow(
        name='remixt_seqdata_workflow',
        func=create_remixt_seqdata_workflow,
        args=(
            mgd.InputFile(breakpoint_filename),
            mgd.InputFile('seqdata', 'sample_id', template=seqdata_template),
            mgd.OutputFile('results', 'tumour_id', fnames=results_filenames, axes_origin=[]),
            raw_data_directory,
            config,
            ref_data_dir,
        ),
        kwargs={
            'normal_id': normal_id,
        },
    )

    return workflow
def realign_bams(samples, inputs, outputs, metrics, metrics_tar, refdir, single_node=False):
    outputs = {sampid: outputs[sampid] for sampid in samples}
    inputs = {sampid: inputs[sampid] for sampid in samples}
    metrics = {sampid: metrics[sampid] for sampid in samples}
    metrics_tar = {sampid: metrics_tar[sampid] for sampid in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='realign_bam_file',
        func=realignment.realign_bam_files,
        args=(
            mgd.InputFile("input.bam", "sample_id", axes_origin=[], fnames=inputs),
            mgd.OutputFile("output.bam", "sample_id", axes_origin=[], fnames=outputs),
            mgd.OutputFile("output.txt", "sample_id", axes_origin=[], fnames=metrics),
            mgd.OutputFile("output.tar", "sample_id", axes_origin=[], fnames=metrics_tar),
            refdir,
            samples
        ),
        kwargs={'single_node': single_node}
    )

    return workflow
def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='wgs.workflows.alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.OutputFile(r1_html),
            mgd.OutputFile(r1_plot),
            mgd.TempSpace('fastqc_R1'),
        ),
    )

    workflow.transform(
        name="fastqc_r2",
        func='wgs.workflows.alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.OutputFile(r2_html),
            mgd.OutputFile(r2_plot),
            mgd.TempSpace('fastqc_R2'),
        ),
    )

    return workflow
def run_Strelka(config, normal_bam, tumour_bam, snv_output_file, indel_output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='configure_bed',
        func=tasks.configure_bed,
        args=(
            mgd.TempSpace('bed_space'),
            mgd.InputFile(config['bed_file']),
            mgd.TempOutputFile('bed.gz'),
            mgd.TempOutputFile('bed.gz.tbi'),
        ))

    workflow.transform(
        name='run_strelka',
        ctx={'mem': 10, 'ncpus': 1, 'walltime': '08:00'},
        func=tasks.run_strelka,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.TempInputFile('bed.gz'),
            mgd.TempInputFile('bed.gz.tbi'),
            mgd.TempSpace('strelka_workspace'),
            mgd.OutputFile(snv_output_file),
            mgd.OutputFile(indel_output_file),
        ))

    return workflow
def create_cohort_qc_report(cohort_label, out_dir, filtered_cohort_maf, cna_table, report_path):
    oncoplot = os.path.join(out_dir, cohort_label, "cohort_oncoplot.png")
    somatic_interactions_plot = os.path.join(out_dir, cohort_label, "somatic_interactions.png")
    summary_plot = os.path.join(out_dir, cohort_label, "summary.png")
    burden_plot = os.path.join(out_dir, cohort_label, "mutation_burden.png")

    workflow = pypeliner.workflow.Workflow()

    non_synonymous_labels = [
        "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
        "Translation_Start_Site", "Nonsense_Mutation", "Nonstop_Mutation",
        "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation"
    ]

    workflow.transform(
        name='postprocess_maf',
        func='wgs.workflows.cohort_qc.tasks.prepare_maf_for_maftools',
        args=(
            cohort_label,
            mgd.InputFile(filtered_cohort_maf),
            mgd.TempOutputFile("prepared_maf"),
            non_synonymous_labels,
            mgd.TempOutputFile("vcNames"),
        ),
    )

    workflow.transform(
        name='burden_plot',
        func='wgs.workflows.cohort_qc.tasks.plot_mutation_burden',
        args=(
            mgd.InputFile(filtered_cohort_maf),
            mgd.OutputFile(burden_plot),
        ),
    )

    workflow.transform(
        name='build_gene_list',
        func='wgs.workflows.cohort_qc.tasks.build_gene_list',
        args=(
            mgd.InputFile(cna_table),
            mgd.TempOutputFile("genelist"),
        ),
    )

    workflow.transform(
        name='make_cohort_plots',
        func='wgs.workflows.cohort_qc.tasks.make_R_cohort_plots',
        args=(
            mgd.TempInputFile("prepared_maf"),
            mgd.InputFile(cna_table),
            mgd.OutputFile(oncoplot),
            mgd.OutputFile(somatic_interactions_plot),
            mgd.OutputFile(summary_plot),
            mgd.TempInputFile("vcNames"),
            mgd.TempInputFile("genelist"),
        ))

    workflow.transform(
        name='make_report',
        func='wgs.workflows.cohort_qc.tasks.make_report',
        args=(
            cohort_label,
            mgd.InputFile(oncoplot),
            mgd.InputFile(somatic_interactions_plot),
            mgd.InputFile(summary_plot),
            mgd.InputFile(burden_plot),
            mgd.OutputFile(report_path),
        ))

    return workflow
def _create_download_cosmic_workflow(
        ref_data_version,
        out_file,
        user,
        password,
        host='sftp-cancer.sanger.ac.uk',
        local_download=False):
    host_base_path = '/files/{}/cosmic/v83/VCF'.format(ref_data_version.lower())
    coding_host_path = '/'.join([host_base_path, 'CosmicCodingMuts.vcf.gz'])
    non_coding_host_path = '/'.join([host_base_path, 'CosmicNonCodingVariants.vcf.gz'])

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('coding_host_path'), value=coding_host_path)

    workflow.setobj(obj=mgd.TempOutputObj('non_coding_host_path'), value=non_coding_host_path)

    workflow.subworkflow(
        name='download_coding',
        func=_create_download_cosmic_file_subworkflow,
        args=(
            host,
            mgd.TempInputObj('coding_host_path'),
            user,
            password,
            mgd.TempOutputFile('coding.vcf.gz'),
        ),
        kwargs={'local_download': local_download})

    workflow.subworkflow(
        name='download_non_coding',
        func=_create_download_cosmic_file_subworkflow,
        args=(
            host,
            mgd.TempInputObj('non_coding_host_path'),
            user,
            password,
            mgd.TempOutputFile('non_coding.vcf.gz'),
        ),
        kwargs={'local_download': local_download})

    workflow.transform(
        name='merge_files',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [mgd.TempInputFile('coding.vcf.gz'), mgd.TempInputFile('non_coding.vcf.gz')],
            mgd.OutputFile(out_file),
        ),
        kwargs={
            'allow_overlap': True,
            'index_file': mgd.OutputFile(out_file + '.tbi')
        })

    return workflow
def destruct_preprocess_workflow(
        normal_bam_files,
        normal_stats,
        normal_reads_1,
        normal_reads_2,
        normal_sample_1,
        normal_sample_2,
        ref_data_directory,
        destruct_config,
        config,
        tag=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ctx={'docker_image': config['docker']['destruct'], 'disk': 200},
        ret=mgd.TempOutputObj("destruct_config"),
        args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.subworkflow(
            name='process_individual_cells',
            func=process_cells_destruct,
            args=(
                mgd.TempInputObj("destruct_config"),
                config,
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.OutputFile(normal_stats),
            ),
            kwargs={'tag': tag})
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )
        workflow.subworkflow(
            name='process_individual_cells',
            func=process_cells_destruct,
            args=(
                mgd.TempInputObj("destruct_config"),
                config,
                mgd.InputFile('bam', 'normal_cell_id', fnames=normal_bam_files),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.OutputFile(normal_stats),
            ),
            kwargs={'tag': tag})

    return workflow
def destruct_preprocess_workflow(
        normal_bam_files,
        normal_stats,
        normal_reads_1,
        normal_reads_2,
        normal_sample_1,
        normal_sample_2,
        ref_data_directory,
        destruct_config,
        tag=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ret=mgd.TempOutputObj("destruct_config"),
        args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.transform(
            name='bamdisc_normal',
            func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
            ctx={'io': 1, 'mem': 8, 'disk': 200},
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_stats),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.TempSpace('bamdisc_normal_tempspace'),
            ))
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )
        workflow.subworkflow(
            name='process_normal_cells',
            func=process_cells_destruct,
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile('bam', 'normal_cell_id', fnames=normal_bam_files),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.OutputFile(normal_stats),
            ),
            kwargs={'tag': tag})

    return workflow
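def _example_destruct_preprocess_inputs():
    # Illustrative sketch only: destruct_preprocess_workflow above dispatches
    # on the type of normal_bam_files. A plain string is treated as a single
    # merged BAM and routed through the bamdisc task, while a dict of
    # cell id -> BAM path fans out across the 'normal_cell_id' axis.
    # All paths and cell ids here are hypothetical.
    merged_normal = '/data/normal.bam'
    per_cell_normal = {
        'cell_001': '/data/cells/cell_001.bam',
        'cell_002': '/data/cells/cell_002.bam',
    }
    return merged_normal, per_cell_normal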
def create_align_workflow(
        fastq_file_1,
        fastq_file_2,
        ref_genome_dir,
        out_bam_file,
        add_xs_tag=False,
        align_threads=1,
        read_group_info=None,
        sort_threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='star_align',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': align_threads},
        func=tasks.align,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            ref_genome_dir,
            mgd.TempOutputFile('aligned.bam'),
            mgd.TempSpace('align_tmp'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'read_group_info': read_group_info,
            'threads': align_threads,
        })

    workflow.transform(
        name='sort',
        ctx={'mem': 32, 'mem_retry_increment': 16, 'num_retry': 3, 'threads': sort_threads},
        func=soil.wrappers.sambamba.tasks.sort,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('sort_tmp'),
        ),
        kwargs={'threads': sort_threads})

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ))

    return workflow
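def _example_run_align_workflow():
    # Illustrative sketch only: all file paths are hypothetical. The returned
    # workflow object is inert until handed to a pypeliner Pypeline (the same
    # pattern used by realign_bam_workflow above), which resolves the managed
    # input/output files and schedules the STAR, sort, and index jobs.
    workflow = create_align_workflow(
        '/data/sample_R1.fastq.gz',
        '/data/sample_R2.fastq.gz',
        '/refdata/star_index',
        '/results/sample.bam',
        align_threads=8,
        sort_threads=4,
    )
    pyp = pypeliner.app.Pypeline(config={'tmpdir': '/scratch/pipeline'})
    pyp.run(workflow)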
def create_lumpy_workflow(config, normal_bam, tumour_cell_bams,
                          lumpy_breakpoints_csv, lumpy_breakpoints_evidence,
                          lumpy_breakpoints_bed):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            normal_bam,
            config,
            mgd.TempOutputFile('normal.discordants.sorted.bam'),
            mgd.TempOutputFile('normal.splitters.sorted.bam'),
            mgd.TempOutputFile('hist_normal_formatted.csv'),
            mgd.TempOutputFile('normal_mean_stdev.yaml'),
        ),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'],
                          fnames=tumour_cell_bams),
            config,
            mgd.TempOutputFile('tumour.discordants.sorted.bam'),
            mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            mgd.TempOutputFile('hist_tumour_formatted.csv'),
            mgd.TempOutputFile('tumour_mean_stdev.yaml'),
        ),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        func="single_cell.workflows.lumpy.lumpy_calling_workflow",
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam'),
            mgd.TempInputFile('tumour.splitters.sorted.bam'),
            mgd.TempInputFile('hist_tumour_formatted.csv'),
            mgd.TempInputFile('tumour_mean_stdev.yaml'),
            mgd.OutputFile(lumpy_breakpoints_bed),
            mgd.OutputFile(lumpy_breakpoints_csv, extensions=['.yaml']),
            mgd.OutputFile(lumpy_breakpoints_evidence, extensions=['.yaml']),
        ),
    )

    return workflow
def call_copynumber(
        samples, config, tumours, normals, breakpoints,
        titan_raw_dir, remixt_results, remixt_raw_dir,
        titan_segments, titan_params, titan_markers
):
    breakpoints = {sampid: breakpoints[sampid] for sampid in samples}
    remixt_results = {sampid: remixt_results[sampid] for sampid in samples}
    titan_segments = {sampid: titan_segments[sampid] for sampid in samples}
    titan_params = {sampid: titan_params[sampid] for sampid in samples}
    titan_markers = {sampid: titan_markers[sampid] for sampid in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments', 'sample_id', fnames=titan_segments),
            mgd.OutputFile('titan_params', 'sample_id', fnames=titan_params),
            mgd.OutputFile('titan_markers', 'sample_id', fnames=titan_markers),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('breakpoints', 'sample_id', fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results', 'sample_id', fnames=remixt_results),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads'],
        ),
    )

    return workflow
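def _example_call_copynumber_config():
    # Illustrative sketch only: a minimal config shape inferred from the keys
    # that call_copynumber dereferences above ('globals', 'cna_calling', and
    # its 'titan_intervals', 'remixt_refdata', 'min_num_reads' entries). Real
    # configs for this pipeline carry many more settings, and the structure of
    # each placeholder value here is an assumption.
    return {
        'globals': {},
        'cna_calling': {
            'titan_intervals': [{'num_clusters': 1, 'ploidy': 2}],
            'remixt_refdata': '/refdata/remixt',
            'min_num_reads': 5,
        },
    }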
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):
    workflow = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    if copy_ref_genome:
        workflow.commandline(
            name='copy_genome',
            ctx={'local': True},
            args=(
                'cp',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )
    else:
        workflow.commandline(
            name='link_genome',
            ctx={'local': True},
            args=(
                'ln', '-s',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    workflow.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=(
            mgd.InputFile(local_ref_genome_fasta_path),
            mgd.OutputFile(ref_genome_index_prefix),
        )
    )

    workflow.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=(
            mgd.InputFile(ref_genome_index_prefix),
            mgd.InputFile(transcript_gtf_file),
            mgd.OutputFile(transcriptome_index_prefix),
        )
    )

    return workflow
def count_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['count_haps']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    allele_counts_filename = os.path.join(args["out_dir"], "allele_counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    haplotypes_filename, tumour_cells = inpututils.load_count_haps_input(args['input_yaml'])

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(
        name='extract_allele_readcounts',
        func='single_cell.workflows.extract_allele_readcounts.extract_allele_readcounts',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.InputFile('tumour_cells.bam', 'tumour_cell_id',
                          extensions=['.bai'], axes_origin=[], fnames=tumour_cells),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            [allele_counts_filename],
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'count_haps'}
        })

    return workflow
def infer_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['infer_haps']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haplotypes_filename = os.path.join(args["out_dir"], "haplotypes.tsv")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    normal_data = inpututils.load_infer_haps_input(args['input_yaml'])

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        bam_file = mgd.InputFile('normal.bam', 'normal_cell_id',
                                 fnames=normal_data, extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(normal_data, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            [haplotypes_filename],
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'infer_haps'}
        })

    return workflow
def patient_workflow(config, patient_id, patient_input, output_file):
    workflow = pypeliner.workflow.Workflow()

    patient_bam_dir = config["bam_directory"] + patient_id
    patient_result_dir = config["results_dir"] + patient_id
    helpers.makedirs(patient_bam_dir)
    helpers.makedirs(patient_result_dir)

    input_args = helpers.create_input_args(patient_input, patient_bam_dir)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=input_args['all_samples'])

    workflow.subworkflow(
        name='align_samples',
        func=alignment.align_sample,
        axes=('sample_id',),
        args=(
            config,
            mgd.InputFile('fastq_1', 'sample_id', fnames=input_args['fastqs_r1']),
            mgd.InputFile('fastq_2', 'sample_id', fnames=input_args['fastqs_r2']),
            mgd.InputInstance('sample_id'),
            mgd.OutputFile('sample.bam', 'sample_id', fnames=input_args['all_bams']),
            mgd.OutputFile('sample.bam.bai', 'sample_id', fnames=input_args['all_bais']),
        ))

    workflow.subworkflow(
        name='run_analyses',
        func=analysis.partition_tumour,
        args=(
            config,
            input_args,
            patient_id,
            patient_result_dir,
            mgd.InputFile('sample.bam', 'sample_id',
                          fnames=input_args['all_bams'], axes_origin=[]),
            mgd.InputFile('sample.bam.bai', 'sample_id',
                          fnames=input_args['all_bais'], axes_origin=[]),
            mgd.OutputFile(output_file),
        ))

    return workflow
def create_eagle_ref_data_workflow(vcf_url_template, out_file, local_download=False):
    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')
    chrom_map = chrom_map[chrom_map['ncbi'].isin([str(x) for x in range(1, 23)])]
    chrom_map['url'] = chrom_map['ncbi'].apply(lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(
        name='download_vcf_files',
        axes=('chrom',),
        ctx={'local': local_download},
        func=soil.ref_data.tasks.download,
        args=(
            mgd.TempInputObj('vcf_url', 'chrom'),
            mgd.TempOutputFile('raw.vcf.gz', 'chrom'),
        ))

    workflow.transform(
        name='write_chrom_map',
        func=tasks.write_chrom_map_file,
        args=(
            mgd.InputFile(chrom_map_file),
            mgd.TempOutputFile('chrom_map.tsv'),
        ))

    workflow.transform(
        name='rename_chroms',
        axes=('chrom',),
        func=soil.wrappers.bcftools.tasks.rename_chroms,
        args=(
            mgd.TempInputFile('chrom_map.tsv'),
            mgd.TempInputFile('raw.vcf.gz', 'chrom'),
            mgd.TempOutputFile('renamed.bcf', 'chrom'),
        ))

    workflow.transform(
        name='concat_vcfs',
        func=soil.wrappers.bcftools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('renamed.bcf', 'chrom'),
            mgd.OutputFile(out_file),
        ),
        kwargs={'bcf_output': True})

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            mgd.InputFile(out_file),
            '-o', mgd.OutputFile(out_file + '.csi'),
        ))

    return workflow
def create_patient_workflow(
        pseudo_bulk_group, mafs, sample_all_snv_csvs, mutationreport,
        merged_maf, high_impact_maf, merged_snvs, merged_high_impact_snvs):
    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.transform(
        name='merge_mafs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_mafs',
        args=(
            mafs,
            mgd.OutputFile(merged_maf),
        ),
        kwargs={"id_colname": True})

    workflow.transform(
        name='filter_merged_maf',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_maf_for_high_impact',
        args=(
            mgd.InputFile(merged_maf),
            mgd.OutputFile(high_impact_maf),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_snvs',
        args=(
            sample_all_snv_csvs,
            mgd.OutputFile(merged_snvs),
        ),
        kwargs={"id_colname": True})

    workflow.transform(
        name='filter_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_snvs_for_high_impact',
        args=(
            mgd.InputFile(merged_snvs),
            mgd.OutputFile(merged_high_impact_snvs),
        ),
    )

    workflow.transform(
        name='mutationreport',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.create_mutation_report',
        args=(
            pseudo_bulk_group,
            mgd.InputFile(merged_maf),
            mgd.InputFile(high_impact_maf),
            mgd.InputFile(merged_high_impact_snvs),
            mgd.OutputFile(mutationreport),
            mgd.TempSpace("mutationreport"),
        ),
    )

    return workflow
def conversion_workflow(args):
    docker = docker_containers()

    converted_dir = args["out_dir"]

    cell_ids, cfse_images, livedead_images = get_cell_images(args['input_yaml'])

    converted_image_template = os.path.join(converted_dir, '{cell_id}.png')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': docker['microscope_image_converter']})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='convert',
        func='microscope_image_converter.tasks.convert',
        axes=('cell_id',),
        args=(
            mgd.InputFile('livedead.tif', 'cell_id', fnames=livedead_images),
            mgd.InputFile('cfse.tif', 'cell_id', fnames=cfse_images),
            mgd.OutputFile('converted.png', 'cell_id',
                           template=converted_image_template, axes_origin=[]),
        ),
    )

    converted_meta = os.path.join(converted_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(converted_dir, 'input.yaml')

    workflow.transform(
        name='generate_meta_files_results',
        func='microscope_image_converter.tasks.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            converted_dir,
            mgd.Template('converted.png', 'cell_id', template=converted_image_template),
            mgd.OutputFile(converted_meta),
        ),
        kwargs={
            'input_yaml_data': load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'cell_ids': cell_ids,
                'type': 'dlp_microscope_merged',
            }
        })

    return workflow
def infer_haps_workflow(args):
    config = helpers.load_config(args)
    config = config['infer_haps']

    baseimage = config['docker']['single_cell_pipeline']

    # Pass the container as docker_image, matching the ctx used by the other
    # workflows in this pipeline; a bare 'baseimage' key would be ignored.
    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results", "allele_counts.tsv")

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    if args['normal']:
        bam_file = normal_cells if normal_cells else normal_wgs
    else:
        bam_file = tumour_cells if tumour_cells else tumour_wgs

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )
        bam_file = mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_file,
                                 extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(bam_file, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
        kwargs={'normal': args['normal']},
    )

    return workflow
def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
    merged_bams = {region: merged_bams[region] for region in regions}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    one_split_job = config["one_split_job"]

    if one_split_job:
        workflow.transform(
            name='merge_bams',
            ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(
                mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
                mgd.OutputFile('merged.bam', "region", fnames=merged_bams,
                               axes_origin=[], extensions=['.bai']),
                regions,
                mgd.TempSpace("merge_bams_tempdir"),
            ),
            kwargs={"ncores": config["max_cores"]}
        )
    else:
        workflow.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region',),
            args=(
                mgd.InputFile('tumour_cells.bam', 'cell_id',
                              extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile('tumour_regions.bam', 'region', axes_origin=[],
                               extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
            ),
        )

    return workflow
def create_sample_qc_workflow_normal_only(
        sample_id,
        refdir,
        normal_bam,
        roh,
        germline_calls,
        genome_wide_plot,
        normal_coverage,
        chromosomes,
        bins,
        mapping_qual_threshold,
        single_node=False
):
    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(
        name='coverage_normal_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(normal_bam),
            mgd.OutputFile(normal_coverage),
            refdir,
            chromosomes,
            mapping_qual_threshold,
            bins,
        ),
        kwargs={'single_node': single_node}
    )

    workflow.transform(
        name='generate_genome_wide_plot',
        ctx=helpers.get_default_ctx(memory=10),
        func="wgs.workflows.sample_qc.tasks.genome_wide",
        args=(
            sample_id,
            mgd.InputFile(roh),
            mgd.InputFile(germline_calls),
            mgd.InputFile(normal_coverage),
            chromosomes,
            mgd.OutputFile(genome_wide_plot),
        ),
        kwargs={"normal_only": True}
    )

    return workflow
def create_custom_dna_proteome_from_fastq_workflow(
        normal_fastq_file_1,
        normal_fastq_file_2,
        tumour_fastq_file_1,
        tumour_fastq_file_2,
        ref_genome_fasta_file,
        ref_proteome_fasta_file,
        normal_bam_file,
        tumour_bam_file,
        custom_proteome_file,
        strelka_file,
        genome_version='GRCh37',
        is_exome=False,
        pyensembl_cache_dir=None,
        threads=1):
    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(
        name='align_normal',
        func=create_align_workflow,
        args=(
            mgd.InputFile(normal_fastq_file_1),
            mgd.InputFile(normal_fastq_file_2),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.OutputFile(normal_bam_file),
        ),
        kwargs={'threads': threads})

    workflow.subworkflow(
        name='align_tumour',
        func=create_align_workflow,
        args=(
            mgd.InputFile(tumour_fastq_file_1),
            mgd.InputFile(tumour_fastq_file_2),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.OutputFile(tumour_bam_file),
        ),
        kwargs={'threads': threads})

    workflow.subworkflow(
        name='create_db',
        func=create_custom_proteom_from_bam_workflow,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.InputFile(ref_proteome_fasta_file),
            mgd.OutputFile(custom_proteome_file),
            mgd.OutputFile(strelka_file),
        ),
        kwargs={
            'genome_version': genome_version,
            'is_exome': is_exome,
            'pyensembl_cache_dir': pyensembl_cache_dir
        })

    return workflow
def create_vcf2maf_workflow(vcf_file, maf_file, reference, tumour_id=None, normal_id=None):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='vcf2maf',
        func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('maf_file.maf'),
            mgd.TempSpace('vcf2maf_temp'),
            reference,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    workflow.transform(
        name='update_ids',
        func='wgs.workflows.vcf2maf.tasks.update_ids',
        args=(
            mgd.TempInputFile('maf_file.maf'),
            tumour_id,
            normal_id,
            mgd.OutputFile(maf_file),
        ))

    return workflow
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    # Intervals are chromosomes 1-22 plus X; build the list explicitly so the
    # concatenation works under python 3, where range() is not a list.
    workflow.setobj(
        obj=mgd.OutputChunks('interval'),
        value=[str(c) for c in list(range(1, 23)) + ['X']])

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
        )
    )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
        )
    )

    return workflow
def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(
        name='read',
        func=read_stuff,
        ret=mgd.TempOutputObj('input_data'),
        args=(mgd.InputFile(input_filename),))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'),))

    # Write the object to an output file
    workflow.transform(
        name='write',
        func=write_stuff,
        args=(
            mgd.TempInputObj('output_data'),
            mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(
        name='sub_workflow_2',
        func=create_workflow_2,
        args=(
            mgd.TempInputFile('output_file'),
            mgd.OutputFile(output_filename)))

    return workflow
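def _example_run_workflow_1():
    # Illustrative driver, modeled on the Pypeline usage in realign_bam_workflow
    # above: build the workflow, then hand it to pypeliner to execute. The file
    # names and the tmpdir are hypothetical placeholders.
    workflow = create_workflow_1('/data/input.txt', '/results/output.txt')
    pyp = pypeliner.app.Pypeline(config={'tmpdir': '/scratch/pipeline'})
    pyp.run(workflow)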
def create_lumpy_workflow(lumpy_vcf, tumour_bam=None, normal_bam=None, single_node=False):
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'

    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
    else:
        normal_disc = None
        normal_split = None

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
    else:
        tumour_disc = None
        tumour_split = None

    if normal_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(
                normal_bam,
                mgd.TempOutputFile('normal.discordants.sorted.bam'),
                mgd.TempOutputFile('normal.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node})

    if tumour_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(
                tumour_bam,
                mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node})

    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(
            mgd.OutputFile(lumpy_vcf),
            config.default_params('breakpoint_calling')['lumpy_paths'],
        ),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy')
        })

    return workflow
def create_db_annotation_workflow(in_vcf_file, out_csv_file, db_vcf_file, split_size=int(1e4)):
    workflow = pypeliner.workflow.Workflow(
        ctx=dict(mem=2, num_retry=3, mem_retry_increment=2))

    workflow.transform(
        name='split_vcf',
        func='single_cell.utils.vcfutils.split_vcf',
        args=(
            mgd.InputFile(in_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        func='single_cell.workflows.db_annotation.tasks.annotate_db_status',
        args=(
            db_vcf_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('annotated.csv.gz', 'split', extensions=['.yaml']),
        ))

    workflow.transform(
        name='merge_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('annotated.csv.gz', 'split', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml']),
        ))

    return workflow
def create_extract_seqdata_workflow(
        bam_filename,
        seqdata_filename,
        remixt_config,
        remixt_ref_data_dir,
        config,
):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'docker_image': config['docker']['single_cell_pipeline'],
        'mem': config["memory"]['high']
    }

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='create_cell_seqdata',
        ctx=ctx,
        func="single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata",
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.InputFile(bam_filename, extensions=['.bai']),
            mgd.TempSpace("extract_seqdata_temp"),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={'chromosomes': config['chromosomes']})

    return workflow
def create_db_workflow(
        in_file,
        ref_proteome_fasta_file,
        out_file,
        genome_version='GRCh37',
        pyensembl_cache_dir=None):
    sandbox = pypeliner.sandbox.CondaSandbox(pip_packages=['varcode'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='clean_ref_fasta',
        func=tasks.clean_ref_proteome_ids,
        args=(
            mgd.InputFile(ref_proteome_fasta_file),
            mgd.TempOutputFile('ref.fasta'),
        ))

    workflow.transform(
        name='build_variant_table',
        func=tasks.build_variant_table,
        args=(
            mgd.InputFile(in_file),
            mgd.TempOutputFile('variant_table.tsv.gz'),
        ),
        kwargs={
            'genome_version': genome_version,
            'pyensembl_cache_dir': pyensembl_cache_dir
        })

    workflow.transform(
        name='build_variant_fasta',
        func=tasks.build_variant_fasta,
        args=(
            mgd.TempInputFile('variant_table.tsv.gz'),
            mgd.TempOutputFile('var.fasta'),
        ))

    workflow.commandline(
        name='build_db',
        args=(
            'cat',
            mgd.TempInputFile('ref.fasta'),
            mgd.TempInputFile('var.fasta'),
            '>',
            mgd.OutputFile(out_file),
        ))

    return workflow