# Shared imports for the workflow builders below. Package-specific names
# (helpers, inpututils, config, and the per-tool sub-workflow modules) are
# assumed to be importable from their respective pipeline packages.
import os
import sys

import pypeliner
import pypeliner.app
import pypeliner.managed as mgd


def create_snv_allele_counts_for_vcf_targets_workflow(
        bam_files,
        vcf_file,
        out_file,
        memory_cfg,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        table_name='snv_allele_counts',
        vcf_to_bam_chrom_map=None,
):
    ctx = {
        'mem': memory_cfg['low'],
        'num_retry': 3,
        'mem_retry_increment': 2,
        'ncpus': 1,
        'disk_retry_increment': 50,
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('sample_id', 'library_id', 'cell_id'),
        func="biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets",
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', 'library_id', 'cell_id',
                          fnames=bam_files, extensions=['.bai']),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'sample_id', 'library_id', 'cell_id'),
            table_name,
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
            'cell_id': mgd.Instance('cell_id'),
            'sample_id': mgd.Instance('sample_id'),
            'library_id': mgd.Instance('library_id'),
            'report_zero_count_positions': False,
        })

    workflow.transform(
        name='merge_snv_allele_counts',
        ctx={'mem': memory_cfg['high'], 'disk': 20},
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            mgd.TempInputFile('counts.h5', 'sample_id', 'library_id', 'cell_id'),
            mgd.TempOutputFile('merged_counts.h5'),
        ),
        kwargs={
            'in_memory': False,
        },
    )

    workflow.transform(
        name='convert_h5_to_csv',
        func='single_cell.utils.hdfutils.convert_hdf_to_csv',
        args=(
            mgd.TempInputFile('merged_counts.h5'),
            {
                '/snv_allele_counts': mgd.OutputFile(out_file, extensions=['.yaml']),
            },
        ))

    return workflow
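# Minimal usage sketch for the workflow above. Paths and the memory config
# are hypothetical; the Pypeline runner pattern mirrors wgs_workflow and
# somatic_calling_workflow below.
def _example_run_snv_allele_counts():
    bam_files = {('SA1', 'LIB1', 'CELL1'): '/data/SA1_LIB1_CELL1.bam'}  # hypothetical
    wf = create_snv_allele_counts_for_vcf_targets_workflow(
        bam_files,
        '/data/targets.vcf.gz',           # hypothetical target VCF
        '/out/snv_allele_counts.csv.gz',  # hypothetical output
        {'low': 4, 'high': 16},           # assumed 'low'/'high' memory keys, as read above
    )
    pyp = pypeliner.app.Pypeline(config={})  # runner config elided in this sketch
    pyp.run(wf)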
def germline_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['germline_calling']

    vcftoolsdocker = {'docker_image': config['docker']['vcftools']}
    samtoolsdocker = {'docker_image': config['docker']['samtools']}
    snpeffdocker = {'docker_image': config['docker']['snpeff']}

    normal_bams = inpututils.load_germline_data(args['input_yaml'])

    varcalls_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    out_files = get_output_files(args['out_dir'])

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal_split.bam", "region",
                          extensions=['.bai'], fnames=normal_bams),
            config['ref_genome'],
            mgd.OutputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            config,
        ),
        kwargs={
            'vcftools_docker': vcftoolsdocker,
            'samtools_docker': samtoolsdocker,
        })

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            mgd.OutputFile(out_files['mappability_filename']),
        ),
        kwargs={'chromosomes': config['chromosomes']})

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            mgd.OutputFile(out_files['normal_genotype_filename']),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            mgd.OutputFile(out_files['snpeff_vcf_filename']),
        ),
        kwargs={
            'hdf5_output': False,
            'vcftools_docker': vcftoolsdocker,
            'snpeff_docker': snpeffdocker,
        })

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            list(out_files.values()),
            mgd.OutputFile(varcalls_meta),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'germline_calling'},
        })

    return workflow
def create_alignment_workflow(
        fastq_1_filename,
        fastq_2_filename,
        bam_filename,
        alignment_metrics,
        gc_metrics,
        detailed_fastqscreen_metrics,
        plot_metrics,
        ref_genome,
        config,
        triminfo,
        centerinfo,
        sample_info,
        cell_ids,
        metrics_tar,
        library_id,
        realign=False):
    baseimage = config['docker']['single_cell_pipeline']

    bam_filename = dict([(cellid, bam_filename[cellid]) for cellid in cell_ids])

    chromosomes = config["chromosomes"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq_1_filename.keys()),
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('trim', 'cell_id', 'lane', axes_origin=[]),
        value=triminfo)

    workflow.setobj(
        obj=mgd.TempOutputObj('center', 'cell_id', 'lane', axes_origin=[]),
        value=centerinfo)

    workflow.transform(
        name='run_fastq_screen',
        ctx={'mem': 7, 'ncpus': 1, 'docker_image': baseimage},
        axes=('cell_id', 'lane',),
        func="single_cell.workflows.align.fastqscreen.organism_filter",
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane', fnames=fastq_1_filename),
            mgd.InputFile('fastq_2', 'cell_id', 'lane', fnames=fastq_2_filename),
            mgd.TempOutputFile('fastq_r1_matching_reads.fastq.gz', 'cell_id', 'lane'),
            mgd.TempOutputFile('fastq_r2_matching_reads.fastq.gz', 'cell_id', 'lane'),
            mgd.TempOutputFile('organism_detailed_count_per_lane.csv', 'cell_id', 'lane'),
            mgd.TempOutputFile('organism_summary_count_per_lane.csv', 'cell_id', 'lane'),
            mgd.TempSpace("tempdir_organism_filter", 'cell_id', 'lane'),
            mgd.InputInstance('cell_id'),
            config['fastq_screen_params'],
            config['ref_type'],
        ),
        kwargs={
            'docker_image': config['docker']['fastq_screen'],
            'filter_contaminated_reads': config['fastq_screen_params']['filter_contaminated_reads'],
        })

    workflow.transform(
        name='merge_fastq_screen_metrics',
        ctx={'mem': 7, 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.align.fastqscreen.merge_fastq_screen_counts",
        args=(
            mgd.TempInputFile('organism_detailed_count_per_lane.csv', 'cell_id', 'lane'),
            mgd.TempInputFile('organism_summary_count_per_lane.csv', 'cell_id', 'lane'),
            mgd.OutputFile(detailed_fastqscreen_metrics, extensions=['.yaml']),
            mgd.TempOutputFile('organism_summary_count_per_cell.csv'),
        ))

    workflow.transform(
        name='align_reads',
        ctx={'mem': 7, 'ncpus': 1, 'docker_image': baseimage},
        axes=('cell_id', 'lane',),
        func="single_cell.workflows.align.tasks.align_pe",
        args=(
            mgd.TempInputFile('fastq_r1_matching_reads.fastq.gz', 'cell_id', 'lane'),
            mgd.TempInputFile('fastq_r2_matching_reads.fastq.gz', 'cell_id', 'lane'),
            mgd.TempOutputFile('aligned_per_cell_per_lane.sorted.bam', 'cell_id', 'lane'),
            mgd.TempOutputFile('fastqc_reports.tar.gz', 'cell_id', 'lane'),
            mgd.TempOutputFile('flagstat_metrics.txt', 'cell_id', 'lane'),
            mgd.TempSpace('alignment_temp', 'cell_id', 'lane'),
            ref_genome,
            mgd.TempInputObj('trim', 'cell_id', 'lane'),
            mgd.TempInputObj('center', 'cell_id', 'lane'),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
            mgd.InputInstance('cell_id'),
            mgd.InputInstance('lane'),
            library_id,
            config['aligner'],
            config['docker'],
            config['adapter'],
            config['adapter2'],
            config['fastq_screen_params'],
        ))

    workflow.transform(
        name='merge_bams',
        ctx={'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.align.tasks.merge_bams",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('aligned_per_cell_per_lane.sorted.bam', 'cell_id', 'lane'),
            mgd.TempOutputFile('merged_lanes.bam', 'cell_id'),
            mgd.TempOutputFile('merged_lanes.bam.bai', 'cell_id'),
            config['docker'],
        ))

    if realign:
        workflow.transform(
            name='realignment',
            axes=('chrom',),
            ctx={'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
            func="single_cell.workflows.align.tasks.realign",
            args=(
                mgd.TempInputFile('merged_lanes.bam', 'cell_id'),
                mgd.TempInputFile('merged_lanes.bam.bai', 'cell_id'),
                mgd.TempOutputFile('realigned.bam', 'chrom', 'cell_id'),
                mgd.TempSpace('realignment_temp', 'chrom'),
                config,
                mgd.InputInstance('chrom'),
            ))

        workflow.transform(
            name='merge_realignment',
            ctx={'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
            axes=('cell_id',),
            func="single_cell.workflows.align.tasks.merge_realignment",
            args=(
                mgd.TempInputFile('realigned.bam', 'chrom', 'cell_id'),
                mgd.TempOutputFile('merged_realign.bam', 'cell_id'),
                config,
                mgd.InputInstance('cell_id'),
            ))

    final_bam = mgd.TempInputFile('merged_lanes.bam', 'cell_id')
    if realign:
        final_bam = mgd.TempInputFile('merged_realign.bam', 'cell_id')

    workflow.transform(
        name='postprocess_bam',
        ctx={'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        axes=('cell_id',),
        func="single_cell.workflows.align.tasks.postprocess_bam",
        args=(
            final_bam,
            mgd.OutputFile('sorted_markdups', 'cell_id',
                           fnames=bam_filename, extensions=['.bai']),
            mgd.TempSpace('tempdir', 'cell_id'),
            config['docker'],
        ),
    )

    workflow.subworkflow(
        name='metrics_subworkflow',
        func="single_cell.workflows.align.bam_metrics_workflow",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id',
                          fnames=bam_filename, extensions=['.bai']),
            mgd.TempInputFile('organism_summary_count_per_cell.csv'),
            mgd.TempOutputFile('alignment_metrics.csv', extensions=['.yaml']),
            mgd.OutputFile(gc_metrics, extensions=['.yaml']),
            mgd.TempOutputFile('markdups_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('flagstat_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('wgs_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('gc_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('gc_metrics_summary.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('gc_metrics.pdf', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('insert_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('insert_metrics.pdf', 'cell_id', axes_origin=[]),
            ref_genome,
            sample_info,
            config,
            cell_ids,
        ))

    workflow.transform(
        name='add_contamination_status',
        ctx={'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.align.tasks.add_contamination_status",
        args=(
            mgd.TempInputFile('alignment_metrics.csv', extensions=['.yaml']),
            mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
        ),
        kwargs={
            'reference': config['ref_type'],
            'strict_validation': config['fastq_screen_params']['strict_validation'],
        })

    workflow.transform(
        name='plot_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.align.tasks.plot_metrics",
        args=(
            mgd.InputFile(alignment_metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics),
            'QC pipeline metrics',
            mgd.InputFile(gc_metrics, extensions=['.yaml']),
            config['gc_windows'],
        ))

    workflow.transform(
        name='tar_all_files',
        ctx={'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.utils.helpers.tar_files",
        args=(
            [
                mgd.TempInputFile('fastqc_reports.tar.gz', 'cell_id', 'lane'),
                mgd.TempInputFile('flagstat_metrics.txt', 'cell_id', 'lane'),
                mgd.TempInputFile('markdups_metrics.txt', 'cell_id'),
                mgd.TempInputFile('flagstat_metrics.txt', 'cell_id'),
                mgd.TempInputFile('wgs_metrics.txt', 'cell_id'),
                mgd.TempInputFile('gc_metrics.txt', 'cell_id'),
                mgd.TempInputFile('gc_metrics_summary.txt', 'cell_id'),
                mgd.TempInputFile('gc_metrics.pdf', 'cell_id'),
                mgd.TempInputFile('insert_metrics.txt', 'cell_id'),
                mgd.TempInputFile('insert_metrics.pdf', 'cell_id'),
            ],
            mgd.OutputFile(metrics_tar),
            mgd.TempSpace("merge_metrics_tar"),
        ))

    return workflow
def breakpoint_calling_workflow(workflow, args):
    config = helpers.load_config(args)

    normal_bam_file = args['matched_normal']
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
    raw_data_directory = os.path.join(varcalls_dir, 'raw')
    breakpoints_filename = os.path.join(varcalls_dir, 'breakpoints.h5')
    ref_data_directory = '/refdata'

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='destruct',
        func="biowrappers.components.breakpoint_calling.destruct.destruct_pipeline",
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            config.get('destruct', {}),
            ref_data_directory,
            mgd.OutputFile(breakpoints_filename),
            raw_data_directory,
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'breakpoint_calling', "info.yaml")

    results = {
        'destruct_data': helpers.format_file_yaml(breakpoints_filename),
    }

    input_datasets = {k: helpers.format_file_yaml(v) for k, v in bam_files.items()}
    input_datasets = {'normal': normal_bam_file, 'tumour': input_datasets}

    metadata = {
        'breakpoint_calling': {
            'ref_data': ref_data_directory,
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None,
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(
            mem=config['memory']['med'],
            pool_id=config['pools']['standard'],
            mem_retry_increment=2,
            ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def wgs_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    samples = list(tumours.keys())

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if args['alignment']:
        tumour_fastqs_r1, tumour_fastqs_r2 = get_fastqs(inputs, samples, 'tumour')
        normal_fastqs_r1, normal_fastqs_r2 = get_fastqs(inputs, samples, 'normal')

        normal_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{norm_sample_id}', '{norm_lane}', 'normal')
        tumour_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{tum_sample_id}', '{tum_lane}', 'tumour')

        workflow.subworkflow(
            name='wgs_alignment_paired_lanes',
            func=paired_alignment,
            args=(
                config,
                mgd.OutputFile("tumour.bam", 'sample_id', fnames=tumours,
                               extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile("normal.bam", 'sample_id', fnames=normals,
                               extensions=['.bai'], axes_origin=[]),
                samples,
                tumour_fastqs_r1,
                tumour_fastqs_r2,
                normal_fastqs_r1,
                normal_fastqs_r2,
                normal_alignment_template,
                tumour_alignment_template,
            ))

    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(museq_dir, '{sample_id}', 'paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(museq_dir, '{sample_id}', 'single_museqportrait.pdf')

    workflow.subworkflow(
        name='variant_calling',
        func=call_variants,
        args=(
            samples,
            config,
            mgd.OutputFile('parsed_snv_csv', 'sample_id', template=parsed_snv_csv, axes_origin=[]),
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq', 'sample_id', template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
        ))

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(sv_outdir, 'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')

    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id',
                           template=destruct_raw_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_raw_library', 'sample_id',
                           template=destruct_raw_library, axes_origin=[]),
            mgd.OutputFile('destruct_breakpoints', 'sample_id',
                           template=destruct_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_library', 'sample_id',
                           template=destruct_library, axes_origin=[]),
            mgd.OutputFile('destruct_reads', 'sample_id',
                           template=destruct_reads, axes_origin=[]),
            mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf, axes_origin=[]),
            mgd.OutputFile('parsed_csv', 'sample_id', template=parsed_csv, axes_origin=[]),
        ))

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile("target_list", 'sample_id', fnames=targets, axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id',
                           axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id',
                           axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id',
                           axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id',
                          axes_origin=[], template=destruct_breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id',
                           axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads'],
        ),
    )

    pyp.run(workflow)
def split_bam_workflow(workflow, args):
    config = helpers.load_config(args)

    info_file = os.path.join(args["out_dir"], 'results', 'split_bam', 'info.yaml')

    split_bam_template = args["split_bam_template"]
    split_bai_template = args["split_bam_template"] + ".bai"

    # A "{region}" placeholder in the template selects region-based splitting;
    # otherwise the bam is split into a fixed number of read chunks.
    by_reads = "{region}" not in split_bam_template
    splitkeyword = "reads" if by_reads else "region"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]
        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )
    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': 2,
                'num_retry': 3,
                'mem_retry_increment': 2,
                'pool_id': config['pools']['standard'],
                'ncpus': 1,
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        args=(
            mgd.InputFile(args['wgs_bam']),
            mgd.InputFile(args['wgs_bam'] + ".bai"),
            mgd.OutputFile("normal.split.bam", splitkeyword,
                           template=split_bam_template, axes_origin=[]),
            mgd.OutputFile("normal.split.bam.bai", splitkeyword,
                           template=split_bai_template, axes_origin=[]),
            pypeliner.managed.TempInputObj(splitkeyword),
            config,
        ),
        kwargs={"by_reads": by_reads})

    regions = (mgd.InputChunks('reads') if by_reads
               else pypeliner.managed.TempInputObj('region'))

    workflow.transform(
        name="get_files",
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        args=(regions, split_bam_template, splitkeyword))

    metadata = {
        'split_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': args['wgs_bam'],
            'results': None,
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(
            mem=config['memory']['med'],
            pool_id=config['pools']['standard'],
        ),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
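# A standalone sketch of the template dispatch above (hypothetical template
# strings): a "{region}" placeholder selects region-based splitting, anything
# else falls back to a fixed number of read chunks.
def _example_split_dispatch():
    for template in ('split/{region}.bam', 'split/{reads}.bam'):
        by_reads = '{region}' not in template
        print(template, '->', 'split by reads' if by_reads else 'split by region')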
def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage,
    }

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region',
                          extensions=['.bai'], fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam', 'region',
                          extensions=['.bai'], fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
            config,
        ),
    )

    workflow.subworkflow(
        name='strelka',
        func=strelka.create_strelka_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region',
                          extensions=['.bai'], fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam', 'region',
                          extensions=['.bai'], fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['strelka_indel'], extensions=['.tbi', '.csi']),
            mgd.OutputFile(filepaths['strelka_snv'], extensions=['.tbi', '.csi']),
            config,
        ),
        kwargs={
            "chromosomes": config["chromosomes"],
            "use_depth_thresholds": config['use_depth_thresholds'],
        })

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        ctx=ctx,
        args=(
            [
                mgd.InputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
                mgd.InputFile(filepaths['strelka_snv'], extensions=['.tbi', '.csi']),
            ],
            mgd.TempOutputFile('all.snv.vcf'),
        ),
    )

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        ctx=ctx,
        args=(
            mgd.TempInputFile('all.snv.vcf'),
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_config': vcftools_docker})

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        ctx=ctx,
        func="biowrappers.pipelines.snv_call_and_annotate.create_annotation_workflow",
        args=(
            config,
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempOutputFile('snv_annotations.h5'),
            mgd.TempSpace('raw_data_dir_annotate'),
        ),
        kwargs={
            'variant_type': 'snv',
            'docker_config': basedocker,
            'snpeff_docker': vcftools_docker,
            'vcftools_docker': vcftools_docker,
        })

    workflow.transform(
        name='convert_museq_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['museq_vcf']),
            mgd.TempOutputFile('museq.csv'),
        ),
        kwargs={
            'score_callback': museq_callback,
        })

    workflow.transform(
        name='prep_museq_csv',
        func='single_cell.utils.csvutils.finalize_csv',
        args=(
            mgd.TempInputFile('museq.csv'),
            mgd.OutputFile(filepaths['museq_csv'], extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['strelka_snv']),
            mgd.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        })

    workflow.transform(
        name='prep_strelka_csv',
        func='single_cell.utils.csvutils.finalize_csv',
        args=(
            mgd.TempInputFile('strelka_snv.csv'),
            mgd.OutputFile(filepaths['strelka_csv'], extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='convert_h5_to_csv',
        func='single_cell.utils.hdfutils.convert_hdf_to_csv',
        args=(
            mgd.TempInputFile('snv_annotations.h5'),
            {
                '/snv/cosmic_status': mgd.OutputFile(filepaths['cosmic_csv'], extensions=['.yaml']),
                '/snv/dbsnp_status': mgd.OutputFile(filepaths['dbsnp_csv'], extensions=['.yaml']),
                '/snv/mappability': mgd.OutputFile(filepaths['mappability_csv'], extensions=['.yaml']),
                '/snv/snpeff': mgd.OutputFile(filepaths['snpeff_csv'], extensions=['.yaml']),
                '/snv/tri_nucleotide_context': mgd.OutputFile(filepaths['trinuc_csv'], extensions=['.yaml']),
            },
        ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            list(filepaths.values()),
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'},
        })

    return workflow
def infer_haps(
        bam_file,
        haplotypes_filename,
        config,
        from_tumour=False,
):
    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # dont parallelize over chromosomes for per cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id',),
            func='remixt.workflow.create_extract_seqdata_workflow',
            args=(
                mgd.InputFile('bam_markdups', 'cell_id',
                              fnames=bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                remixt_config,
                remixt_ref_data_dir,
            ),
            kwargs={'no_parallelism': True})

        workflow.transform(
            name='merge_all_seqdata',
            func="remixt.seqdataio.merge_overlapping_seqdata",
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                config["chromosomes"],
            ),
        )
    else:
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if from_tumour:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome',),
        ctx={'mem': 16},
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16},
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16},
        func='remixt.utils.merge_tables',
        args=(
            mgd.TempOutputFile('haplotypes_merged.tsv'),
            mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
        ))

    workflow.transform(
        name='annotate_haps',
        ctx={'mem': 16},
        func='single_cell.workflows.infer_haps.tasks.annotate_ref_alt',
        args=(
            mgd.TempInputFile('haplotypes_merged.tsv'),
            remixt_ref_data_dir,
            mgd.TempOutputFile('haplotypes_annotated.tsv'),
        ))

    workflow.transform(
        name='finalize_csv',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('haplotypes_annotated.tsv'),
            mgd.OutputFile(haplotypes_filename, extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['haplotypes'],
        },
    )

    return workflow
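# Sketch of the two input modes handled by infer_haps above (hypothetical
# paths, deliberately minimal config): a dict of per-cell bams takes the
# per-cell seqdata extraction and merge branch, while a single path takes the
# bulk branch.
def _example_infer_haps_inputs():
    config = {
        'ref_data_dir': '/refdata',  # hypothetical remixt reference dir
        'chromosomes': ['1', '2'],   # assumed chromosome naming
    }
    wf_cells = infer_haps({'CELL1': '/data/CELL1.bam'}, '/out/haplotypes.csv.gz', config)
    wf_bulk = infer_haps('/data/normal.bam', '/out/haplotypes.csv.gz', config)
    return wf_cells, wf_bulk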
def create_museq_workflow(
        snv_vcf,
        snv_maf,
        museqportrait_pdf,
        reference,
        reference_vep,
        chromosomes,
        normal_id=None,
        tumour_id=None,
        thousand_genomes=None,
        dbsnp=None,
        germline_refdata=None,
        tumour_bam=None,
        normal_bam=None,
        single_node=None):
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='48:00',
                ncpus='8',
                disk=600),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
            })

        workflow.transform(
            name='fix_vcf_merged',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.mutationseq.tasks.fix_museq_vcf',
            args=(
                mgd.TempInputFile('merged.vcf'),
                mgd.TempOutputFile('merged_fixed.vcf'),
            ),
        )
    else:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                params['museq_params'],
                mgd.TempSpace('museq_temp', 'interval'),
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
            })

        workflow.transform(
            name='fix_vcf',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            axes=('interval',),
            func='wgs.workflows.mutationseq.tasks.fix_museq_vcf',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq_fixed.vcf', 'interval'),
            ),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq_fixed.vcf', 'interval'),
                mgd.TempOutputFile('merged_fixed.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged_fixed.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold'],
        })

    workflow.subworkflow(
        name="mutationseq_single_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf),
            reference_vep,
        ),
        kwargs={
            'normal_id': normal_id,
            'tumour_id': tumour_id,
        })

    return workflow
def create_alignment_workflow(
        fastq_1_filename,
        fastq_2_filename,
        bam_filename,
        bai_filename,
        ref_genome,
        config,
        args,
        instrumentinfo,
        centerinfo,
        sample_info,
        cell_ids):
    out_dir = args['out_dir']

    merge_metrics = os.path.join(out_dir, 'metrics')
    lane_metrics = os.path.join(args['out_dir'], 'metrics_per_lane', '{lane}')

    bam_filename = dict([(cellid, bam_filename[cellid]) for cellid in cell_ids])
    bai_filename = dict([(cellid, bai_filename[cellid]) for cellid in cell_ids])

    chromosomes = config["chromosomes"]

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq_1_filename.keys()),
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('instrument', 'cell_id', 'lane', axes_origin=[]),
        value=instrumentinfo)

    workflow.setobj(
        obj=mgd.TempOutputObj('center', 'cell_id', 'lane', axes_origin=[]),
        value=centerinfo)

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info)

    fastqc_reports = os.path.join(lane_metrics, "fastqc", "{cell_id}_reports.tar.gz")
    flagstat_metrics = os.path.join(lane_metrics, 'flagstat', '{cell_id}.txt')

    workflow.transform(
        name='align_reads',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id', 'lane',),
        func="single_cell.workflows.align.tasks.align_pe",
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane', fnames=fastq_1_filename),
            mgd.InputFile('fastq_2', 'cell_id', 'lane', fnames=fastq_2_filename),
            mgd.TempOutputFile('aligned_per_cell_per_lane.sorted.bam', 'cell_id', 'lane'),
            mgd.OutputFile(fastqc_reports, 'cell_id', 'lane'),
            mgd.OutputFile(flagstat_metrics, 'cell_id', 'lane'),
            mgd.TempSpace('alignment_temp', 'cell_id', 'lane'),
            ref_genome,
            mgd.TempInputObj('instrument', 'cell_id', 'lane'),
            mgd.TempInputObj('center', 'cell_id', 'lane'),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
            mgd.InputInstance('cell_id'),
            mgd.InputInstance('lane'),
            args['library_id'],
            config,
        ))

    workflow.transform(
        name='merge_bams',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.align.tasks.merge_bams",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('aligned_per_cell_per_lane.sorted.bam', 'cell_id', 'lane'),
            mgd.TempOutputFile('merged_lanes.bam', 'cell_id'),
            mgd.TempOutputFile('merged_lanes.bam.bai', 'cell_id'),
            config,
        ))

    if args['realign']:
        workflow.transform(
            name='realignment',
            axes=('chrom',),
            ctx=dict(mem=config['memory']['high'],
                     pool_id=config['pools']['highmem'],
                     **ctx),
            func="single_cell.workflows.align.tasks.realign",
            args=(
                mgd.TempInputFile('merged_lanes.bam', 'cell_id'),
                mgd.TempInputFile('merged_lanes.bam.bai', 'cell_id'),
                mgd.TempOutputFile('realigned.bam', 'chrom', 'cell_id'),
                mgd.TempSpace('realignment_temp', 'chrom', cleanup='before'),
                config,
                mgd.InputInstance('chrom'),
            ))

        workflow.transform(
            name='merge_realignment',
            ctx=dict(mem=config['memory']['high'],
                     pool_id=config['pools']['highmem'],
                     **ctx),
            axes=('cell_id',),
            func="single_cell.workflows.align.tasks.merge_realignment",
            args=(
                mgd.TempInputFile('realigned.bam', 'chrom', 'cell_id'),
                mgd.TempOutputFile('merged_realign.bam', 'cell_id'),
                config,
                mgd.InputInstance('cell_id'),
            ))

    final_bam = mgd.TempInputFile('merged_lanes.bam', 'cell_id')
    if args["realign"]:
        final_bam = mgd.TempInputFile('merged_realign.bam', 'cell_id')

    markdups_metrics = os.path.join(
        merge_metrics, 'markdups_metrics', '{cell_id}.markdups_metrics.txt')
    flagstat_metrics = os.path.join(
        merge_metrics, 'flagstat_metrics', '{cell_id}.flagstat_metrics.txt')

    workflow.transform(
        name='postprocess_bam',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id',),
        func="single_cell.workflows.align.tasks.postprocess_bam",
        args=(
            final_bam,
            mgd.OutputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.OutputFile('sorted_markdups_index', 'cell_id', fnames=bai_filename),
            mgd.TempSpace('tempdir', 'cell_id'),
            config,
            mgd.OutputFile(markdups_metrics, 'cell_id'),
            mgd.OutputFile(flagstat_metrics, 'cell_id'),
        ),
    )

    return workflow
def alignment_workflow(args):
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    laneinfo = inpututils.get_lane_info(args['input_yaml'])

    cellids = inpututils.get_samples(args['input_yaml'])

    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted(set([v[1] for v in fastq1_files.keys()]))
    cells = sorted(set([v[0] for v in fastq1_files.keys()]))

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane',
                          fnames=fastq1_files, axes_origin=[]),
            mgd.InputFile('fastq_2', 'cell_id', 'lane',
                          fnames=fastq2_files, axes_origin=[]),
            mgd.OutputFile('bam_markdups', 'cell_id',
                           template=bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups', 'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            laneinfo,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            alignment_dir,
            list(alignment_files.values()),
            mgd.OutputFile(alignment_meta),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment',
            }
        })

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            bams_dir,
            mgd.Template('aligned.bam', 'cell_id', template=bam_files_template),
            mgd.OutputFile(bams_meta),
        ),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams',
            },
            'template': (mgd.InputChunks('cell_id'), bam_files_template, 'cell_id'),
        })

    return workflow
def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
    baseimage = config['docker']['single_cell_pipeline']

    merged_bams = dict([(region, merged_bams[region]) for region in regions])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    one_split_job = config["one_split_job"]

    if one_split_job:
        workflow.transform(
            name='merge_bams',
            ctx={
                'mem': config['memory']['med'],
                'ncpus': config['max_cores'],
                'docker_image': baseimage,
            },
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(
                mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
                mgd.OutputFile('merged.bam', "region", fnames=merged_bams,
                               axes_origin=[], extensions=['.bai']),
                regions,
                config['docker']['samtools'],
                mgd.TempSpace("merge_bams_tempdir"),
            ),
            kwargs={"ncores": config["max_cores"]})
    else:
        workflow.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region',),
            args=(
                mgd.InputFile('tumour_cells.bam', 'cell_id',
                              extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile('tumour_regions.bam', 'region', axes_origin=[],
                               extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
                config['docker']['samtools'],
            ),
        )

    return workflow
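# Usage sketch (hypothetical inputs; the config keys mirror the lookups
# above). one_split_job trades a single multi-core merge task against a
# per-region fan-out of independent merge jobs.
def _example_merge_bams():
    regions = ['1-1-100000', '1-100001-200000']  # assumed region naming
    input_bams = {'CELL1': '/data/CELL1.bam'}    # hypothetical per-cell bams
    merged_bams = {r: '/out/merged_{}.bam'.format(r) for r in regions}
    config = {                                   # hypothetical config
        'docker': {'single_cell_pipeline': 'scp:latest', 'samtools': 'samtools:latest'},
        'memory': {'med': 8},
        'max_cores': 8,
        'one_split_job': True,
    }
    return create_merge_bams_workflow(input_bams, merged_bams, regions, config)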
def hmmcopy_workflow(args):
    config = inpututils.load_config(args)
    config = config['hmmcopy']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    bam_files = inpututils.get_bams(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow()

    hmmcopy_dir = args["out_dir"]

    hmmcopy_files = get_output_files(hmmcopy_dir, lib)
    hmmcopy_meta = os.path.join(hmmcopy_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(hmmcopy_dir, 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='hmmcopy_workflow',
        func=hmmcopy.create_hmmcopy_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id',
                          fnames=bam_files, extensions=['.bai']),
            mgd.OutputFile(hmmcopy_files['reads_csvs']),
            mgd.OutputFile(hmmcopy_files['segs_csvs']),
            mgd.OutputFile(hmmcopy_files['metrics_csvs']),
            mgd.OutputFile(hmmcopy_files['params_csvs']),
            mgd.OutputFile(hmmcopy_files['igv_csvs']),
            mgd.OutputFile(hmmcopy_files['segs_pdf']),
            mgd.OutputFile(hmmcopy_files['bias_pdf']),
            mgd.OutputFile(hmmcopy_files['heatmap_pdf']),
            mgd.OutputFile(hmmcopy_files['metrics_pdf']),
            mgd.OutputFile(hmmcopy_files['kernel_density_pdf']),
            mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']),
            cellids,
            config,
            sampleinfo,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            hmmcopy_dir,
            list(hmmcopy_files.values()),
            mgd.OutputFile(hmmcopy_meta),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': list(bam_files.keys()),
                'type': 'hmmcopy',
            }
        }
    )

    return workflow
def process_cells_destruct(
        destruct_config,
        cell_bam_files,
        reads_1,
        reads_2,
        sample_1,
        sample_2,
        stats,
        tag=False
):
    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1}

    cells = list(cell_bam_files.keys())

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cells,
    )

    workflow.transform(
        name='bamdisc_and_numreads_cell',
        func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
        axes=('cell_id',),
        ctx={'io': 1, 'mem': 8},
        args=(
            destruct_config,
            mgd.InputFile('bam', 'cell_id', fnames=cell_bam_files),
            mgd.TempOutputFile('cell_stats', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.TempSpace('bamdisc_cell_tempspace', 'cell_id'),
        ),
    )

    workflow.transform(
        name='merge_reads_r1',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_1),
        ),
    )

    workflow.transform(
        name='merge_reads_r2',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_2),
        ),
    )

    workflow.transform(
        name='merge_sample',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.resample_fastqs",
        args=(
            mgd.TempInputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.OutputFile(sample_1),
            mgd.OutputFile(sample_2),
            destruct_config['num_read_samples'],
        ),
    )

    workflow.transform(
        name='merge_stats',
        ctx={'io': 1, 'mem': 8},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_stats",
        args=(
            mgd.TempInputFile('cell_stats', 'cell_id'),
            mgd.OutputFile(stats),
        ),
    )

    return workflow
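# Minimal invocation sketch (hypothetical config and paths): destruct_config
# needs at least 'num_read_samples', which the merge_sample step above reads.
def _example_process_cells_destruct():
    destruct_config = {'num_read_samples': 1000000}  # assumed key
    return process_cells_destruct(
        destruct_config,
        {'CELL1': '/data/CELL1.bam'},  # hypothetical per-cell bams
        '/out/reads_1.fastq.gz', '/out/reads_2.fastq.gz',
        '/out/sample_1.fastq.gz', '/out/sample_2.fastq.gz',
        '/out/stats.csv',
    )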
def lumpy_preprocess_workflow(bam_files, config, discordants, split_reads, histogram, mean_stdev):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': config['docker']['single_cell_pipeline'],
    }
    lumpydocker = {'docker_image': config['docker']['lumpy']}

    histogram_settings = dict(
        N=10000, skip=0, min_elements=100, mads=10, X=4, read_length=101)
    histogram_settings.update(lumpydocker)

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_files, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_files.keys()),
        )
        workflow.set_filenames('cells.bam', 'cell_id', fnames=bam_files)

        workflow.subworkflow(
            name='process_cells',
            func='single_cell.workflows.lumpy.lumpy_preprocess_cells',
            args=(
                config,
                mgd.InputFile('cells.bam', 'cell_id',
                              fnames=bam_files, extensions=['.bai']),
                mgd.OutputFile(discordants),
                mgd.OutputFile(split_reads),
                mgd.OutputFile(histogram),
                mgd.OutputFile(mean_stdev),
            ),
        )
    else:
        workflow.transform(
            name='process_bulk',
            ctx={'mem': 8, 'ncpus': 1, 'disk': 200},
            func='single_cell.workflows.lumpy.tasks.process_bam',
            args=(
                mgd.InputFile(bam_files, extensions=['.bai']),
                mgd.OutputFile(discordants),
                mgd.OutputFile(split_reads),
                mgd.TempOutputFile('hist_normal.csv'),
                mgd.TempSpace("lumpy_normal_processing"),
            ),
            kwargs=histogram_settings,
        )

        workflow.transform(
            name='format_histo_bulk',
            ctx={'mem': 8, 'ncpus': 1},
            func='single_cell.workflows.lumpy.merge_histograms.merge_histograms',
            args=(
                mgd.TempInputFile('hist_normal.csv'),
                mgd.OutputFile(histogram),
                mgd.OutputFile(mean_stdev),
            ),
        )

    return workflow
def somatic_calling_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    tumour_ids = helpers.get_values_from_input(inputs, 'tumour_id')
    normal_ids = helpers.get_values_from_input(inputs, 'normal_id')

    var_dir = os.path.join(args['out_dir'], 'somatic')

    museq_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_paired_annotated.maf')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}', '{sample_id}_paired_museqportrait.pdf')

    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_snv_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_snv_annotated.maf')
    strelka_indel_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    strelka_indel_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.maf')

    mutect_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_mutect.vcf.gz')
    mutect_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_mutect.maf')

    consensus_somatic_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic.maf')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    workflow.subworkflow(
        name='variant_calling',
        func=somatic_calling.create_somatic_calling_workflow,
        args=(
            samples,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq_vcf', 'sample_id', template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_maf', 'sample_id', template=museq_maf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('strelka_snv_vcf', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv_maf', 'sample_id', template=strelka_snv_maf, axes_origin=[]),
            mgd.OutputFile('strelka_indel_vcf', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel_maf', 'sample_id', template=strelka_indel_maf, axes_origin=[]),
            mgd.OutputFile('mutect_vcf', 'sample_id', template=mutect_vcf, axes_origin=[]),
            mgd.OutputFile('mutect_maf', 'sample_id', template=mutect_maf, axes_origin=[]),
            mgd.OutputFile('consensus_somatic_maf', 'sample_id',
                           template=consensus_somatic_maf, axes_origin=[]),
            args['refdir'],
            normal_ids,
            tumour_ids,
        ),
        kwargs={
            'single_node': args['single_node'],
            'is_exome': args['is_exome'],
        })

    filenames = [
        museq_vcf,
        museq_maf,
        museq_paired_pdf,
        strelka_snv_vcf,
        strelka_snv_maf,
        strelka_indel_vcf,
        strelka_indel_maf,
        mutect_vcf,
        mutect_maf,
        consensus_somatic_maf,
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            outputted_filenames,
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'},
        })

    pyp.run(workflow)
def lumpy_preprocess_cells(config, bam_files, merged_discordants, merged_splitters, hist_csv, mean_stdev_obj):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': config['docker']['single_cell_pipeline'],
    }
    lumpydocker = {'docker_image': config['docker']['lumpy']}

    histogram_settings = dict(
        N=10000, skip=0, min_elements=100, mads=10, X=4, read_length=101)
    histogram_settings.update(lumpydocker)

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='process_tumour_cells',
        axes=('cell_id',),
        ctx={'mem': 8, 'ncpus': 1},
        func='single_cell.workflows.lumpy.tasks.process_bam',
        args=(
            mgd.InputFile('tumour_bam', 'cell_id',
                          fnames=bam_files, extensions=['.bai']),
            mgd.TempOutputFile('tumour.discordants.sorted.bam', 'cell_id'),
            mgd.TempOutputFile('tumour.splitters.sorted.bam', 'cell_id'),
            mgd.TempOutputFile('hist.csv', 'cell_id'),
            mgd.TempSpace("lumpy_tumour_processing", "cell_id"),
        ),
        kwargs=dict(tag=mgd.InputInstance('cell_id'), **histogram_settings),
    )

    workflow.transform(
        name='merge_disc',
        ctx={'mem': 8, 'ncpus': 1},
        func='single_cell.workflows.lumpy.tasks.merge_bams',
        args=(
            mgd.TempInputFile('tumour.discordants.sorted.bam', 'cell_id'),
            mgd.OutputFile(merged_discordants),
            mgd.TempSpace("merge_disc_temp"),
        ),
        kwargs=lumpydocker,
    )

    workflow.transform(
        name='merge_split',
        ctx={'mem': 8, 'ncpus': 1},
        func='single_cell.workflows.lumpy.tasks.merge_bams',
        args=(
            mgd.TempInputFile('tumour.splitters.sorted.bam', 'cell_id'),
            mgd.OutputFile(merged_splitters),
            mgd.TempSpace("merge_split_temp"),
        ),
        kwargs=lumpydocker,
    )

    workflow.transform(
        name='merge_histo',
        ctx={'mem': 8, 'ncpus': 1},
        func='single_cell.workflows.lumpy.merge_histograms.merge_histograms',
        args=(
            mgd.TempInputFile('hist.csv', 'cell_id'),
            mgd.OutputFile(hist_csv),
            mgd.OutputFile(mean_stdev_obj),
        ),
    )

    return workflow
def create_hmmcopy_workflow(
        bam_file, reads, segs, metrics, params, igv_seg_filename,
        segs_pdf, bias_pdf, plot_heatmap_ec_output, plot_metrics_output,
        plot_kernel_density_output, hmmcopy_data_tar, cell_ids, hmmparams,
        sample_info):
    chromosomes = hmmparams["chromosomes"]

    baseimage = hmmparams['docker']['single_cell_pipeline']
    hmmcopy_docker = hmmparams['docker']['hmmcopy']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info)

    workflow.transform(
        name='run_hmmcopy',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']),
            mgd.TempOutputFile('reads.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('segs.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('params.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_metrics.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            hmmparams,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            hmmcopy_docker,
        ),
    )

    workflow.transform(
        name='merge_reads',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile('reads_merged.csv.gz', extensions=['.yaml']),
            'reads',
        ),
        kwargs={'low_memory': True})

    workflow.transform(
        name='add_mappability_bool',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_mappability_col",
        args=(
            mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(reads, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_segs',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(segs, extensions=['.yaml']),
            'segs',
        ),
        kwargs={'low_memory': True})

    workflow.transform(
        name='merge_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            'metrics',
        ),
    )

    workflow.transform(
        name='merge_params',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            None,
        ),
    )

    workflow.transform(
        name='get_max_cn',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_max_cn",
        ret=mgd.TempOutputObj('max_cn'),
        args=(mgd.InputFile(reads, extensions=['.yaml']),))

    workflow.transform(
        name='hmmcopy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            hmmparams['ref_genome'],
            mgd.TempOutputFile('segments.png', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('bias.png', 'cell_id', axes_origin=[]),
            mgd.InputInstance('cell_id'),
        ),
        kwargs={
            'num_states': hmmparams['num_states'],
            'sample_info': mgd.TempInputObj('sampleinfo', 'cell_id'),
            'max_cn': mgd.TempInputObj("max_cn"),
        })

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.add_clustering_order",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(metrics, extensions=['.yaml']),
        ),
        kwargs={
            'chromosomes': hmmparams["chromosomes"],
            'sample_info': sample_info,
        })

    workflow.transform(
        name='merge_hmm_copy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
        args=(
            [
                mgd.TempInputFile('segments.png', 'cell_id'),
                mgd.TempInputFile('bias.png', 'cell_id'),
            ],
            [
                mgd.OutputFile(segs_pdf),
                mgd.OutputFile(bias_pdf),
            ],
            mgd.InputFile(metrics, extensions=['.yaml']),
            None,
            mgd.TempSpace("hmmcopy_plot_merge_temp"),
            ['segments', 'bias'],
        ))

    workflow.transform(
        name='create_igv_seg',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.InputFile(segs, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(igv_seg_filename),
            hmmparams,
        ))

    workflow.transform(
        name='plot_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics_output),
            'QC pipeline metrics',
        ))

    workflow.transform(
        name='plot_kernel_density',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_kernel_density_output),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
        ))

    workflow.transform(
        name='plot_heatmap_ec',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'mappability_threshold': hmmparams["map_cutoff"],
        })

    workflow.transform(
        name='merge_hmmcopy_data_tars',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.utils.helpers.tar_files",
        args=(
            mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]),
            mgd.OutputFile(hmmcopy_data_tar),
            mgd.TempSpace("merge_tarballs"),
        ),
    )

    return workflow
def create_strelka_workflow(
        normal_bam_file, tumour_bam_file, snv_vcf_file, snv_maf_file,
        indel_vcf_file, indel_maf_file, reference, reference_vep,
        chromosomes, normal_id, tumour_id, single_node=False, is_exome=False
):
    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']},
    )

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(
            pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
    )

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={'is_exome': is_exome},
        )
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(
                mgd.TempInputFile('chrom_depth.txt', 'regions', axes_origin=[]),
                mgd.TempOutputFile('merged_chrom_depth.txt'),
            ),
        )

        workflow.transform(
            name='call_genome_segment',
            axes=('regions',),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            # honour the caller's flag, matching the single-node branch
            kwargs={'is_exome': is_exome},
        )

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(
                mgd.TempInputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace("indels_merge"),
            ),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(
                mgd.TempInputFile('snvs.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace("snvs_merge"),
            ),
        )

    workflow.transform(
        name='bcftools_normalize_snv',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('snvs.vcf.gz'),
            mgd.TempOutputFile('normalized_snvs.vcf'),
            reference,
        ),
    )

    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='bcftools_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('indels.vcf.gz'),
            mgd.TempOutputFile('normalized_indels.vcf'),
            reference,
        ),
    )

    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="strelka_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf_file),
            reference_vep,
        ),
        kwargs={'tumour_id': tumour_id, 'normal_id': normal_id},
    )

    workflow.subworkflow(
        name="strelka_indel_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_maf_file),
            reference_vep,
        ),
        kwargs={'tumour_id': tumour_id, 'normal_id': normal_id},
    )

    return workflow
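
# Hedged usage sketch: driving this factory from a plain pypeliner entry
# point. Every literal below (paths, ids, tmpdir) is a placeholder
# assumption, not a pipeline default.
def _example_run_strelka():  # illustrative only; never called by the pipeline
    import pypeliner.app

    workflow = create_strelka_workflow(
        '/data/normal.bam', '/data/tumour.bam',
        '/results/strelka_snvs.vcf.gz', '/results/strelka_snvs.maf',
        '/results/strelka_indels.vcf.gz', '/results/strelka_indels.maf',
        '/refdata/genome.fa', '/refdata/vep',
        [str(c) for c in range(1, 23)] + ['X', 'Y'],
        'normal_sample', 'tumour_sample',
        single_node=True,
    )
    # Pypeline(config=...) takes the usual pypeliner runtime options
    pyp = pypeliner.app.Pypeline(config={'tmpdir': './strelka_tmp'})
    pyp.run(workflow)
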
def create_alignment_workflow(
        fastq_1_filename, fastq_2_filename, bam_filename, alignment_metrics,
        gc_metrics, detailed_fastqscreen_metrics, plot_metrics, ref_genome,
        config, laneinfo, sample_info, cell_ids, metrics_tar, library_id,
):
    baseimage = config['docker']['single_cell_pipeline']

    ctx = {'mem': 7, 'ncpus': 1, 'docker_image': baseimage, 'mem_retry_factor': 1}

    bam_filename = {cellid: bam_filename[cellid] for cellid in cell_ids}

    chromosomes = config["chromosomes"]

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq_1_filename.keys()),
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('laneinfo', 'cell_id', 'lane', axes_origin=[]),
        value=laneinfo,
    )

    workflow.transform(
        name='align_reads',
        axes=('cell_id',),
        func="single_cell.workflows.align.align_tasks.align_lanes",
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane', fnames=fastq_1_filename, axes_origin=[]),
            mgd.InputFile('fastq_2', 'cell_id', 'lane', fnames=fastq_2_filename, axes_origin=[]),
            mgd.OutputFile('sorted_markdups', 'cell_id', fnames=bam_filename, extensions=['.bai']),
            mgd.TempOutputFile('fastqc_reports.tar.gz', 'cell_id'),
            mgd.TempSpace('alignment_temp', 'cell_id'),
            ref_genome,
            mgd.TempInputObj('laneinfo', 'cell_id', 'lane', axes_origin=[]),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
            mgd.InputInstance('cell_id'),
            library_id,
            config['aligner'],
            config['docker'],
            config['adapter'],
            config['adapter2'],
            mgd.TempOutputFile('organism_detailed_count_per_cell.csv.gz', 'cell_id'),
            mgd.TempOutputFile('organism_summary_count_per_cell.csv.gz', 'cell_id'),
            config['fastq_screen_params'],
        ),
    )

    workflow.transform(
        name='merge_fastq_screen_metrics',
        func="single_cell.workflows.align.fastqscreen.merge_fastq_screen_counts",
        args=(
            mgd.TempInputFile('organism_detailed_count_per_cell.csv.gz', 'cell_id'),
            mgd.TempInputFile('organism_summary_count_per_cell.csv.gz', 'cell_id'),
            mgd.OutputFile(detailed_fastqscreen_metrics, extensions=['.yaml']),
            mgd.TempOutputFile('organism_summary_count_per_cell.csv.gz', extensions=['.yaml']),
        ),
    )

    workflow.subworkflow(
        name='metrics_subworkflow',
        func="single_cell.workflows.align.bam_metrics_workflow",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename, extensions=['.bai']),
            mgd.TempInputFile('organism_summary_count_per_cell.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
            mgd.OutputFile(gc_metrics, extensions=['.yaml']),
            mgd.TempOutputFile('markdups_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('flagstat_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('wgs_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('gc_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('gc_metrics_summary.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('gc_metrics.pdf', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('insert_metrics.txt', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('insert_metrics.pdf', 'cell_id', axes_origin=[]),
            ref_genome,
            sample_info,
            config,
            cell_ids,
        ),
    )

    workflow.transform(
        name='plot_metrics',
        ctx={'mem': config['memory']['med']},
        func="single_cell.workflows.align.tasks.plot_metrics",
        args=(
            mgd.InputFile(alignment_metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics),
            'QC pipeline metrics',
            mgd.InputFile(gc_metrics, extensions=['.yaml']),
            config['gc_windows'],
        ),
    )

    workflow.transform(
        name='tar_all_files',
        ctx={'mem': config['memory']['med']},
        func="single_cell.utils.helpers.tar_files",
        args=(
            [
                mgd.TempInputFile('fastqc_reports.tar.gz', 'cell_id'),
                mgd.TempInputFile('markdups_metrics.txt', 'cell_id'),
                mgd.TempInputFile('flagstat_metrics.txt', 'cell_id'),
                mgd.TempInputFile('wgs_metrics.txt', 'cell_id'),
                mgd.TempInputFile('gc_metrics.txt', 'cell_id'),
                mgd.TempInputFile('gc_metrics_summary.txt', 'cell_id'),
                mgd.TempInputFile('gc_metrics.pdf', 'cell_id'),
                mgd.TempInputFile('insert_metrics.txt', 'cell_id'),
                mgd.TempInputFile('insert_metrics.pdf', 'cell_id'),
            ],
            mgd.OutputFile(metrics_tar),
            mgd.TempSpace("merge_metrics_tar"),
        ),
    )

    return workflow
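
# Shape sketch for the lane-level fastq inputs above (hypothetical paths):
# the OutputChunks('cell_id', 'lane') setobj implies fastq_1_filename and
# fastq_2_filename are keyed by (cell_id, lane) tuples, e.g.
#
#   fastq_1_filename = {
#       ('SA039-R01-C01', 'L001'): '/data/fastq/SA039-R01-C01_L001_R1.fastq.gz',
#       ('SA039-R01-C01', 'L002'): '/data/fastq/SA039-R01-C01_L002_R1.fastq.gz',
#   }
#
# align_reads then collapses the 'lane' axis per cell, emitting one
# sorted_markdups bam per cell_id.
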
def merge_bams_workflow(args):
    config = helpers.load_config(args)
    config = config['merge_bams']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1,
        'mem': config["memory"]['low'], 'docker_image': baseimage,
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    bam_files = tumour_cells if tumour_cells else normal_cells
    wgs_bams = tumour_wgs if tumour_cells else normal_wgs

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    if isinstance(wgs_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(wgs_bams.keys()),
        )
        workflow.set_filenames('merged.bam', 'region', fnames=wgs_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ),
        )
        workflow.set_filenames('merged.bam', 'region', template=wgs_bams)

    workflow.subworkflow(
        name="wgs_merge_workflow",
        func=merge_bams.create_merge_bams_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files, extensions=['.bai']),
            mgd.OutputFile("merged.bam", "region", axes_origin=[], extensions=['.bai']),
            mgd.InputChunks('region'),
            config,
        ),
    )

    workflow.transform(
        name="get_files",
        ctx={'mem': config['memory']['med']},
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        args=(
            pypeliner.managed.InputChunks('region'),
            wgs_bams,
            'region',
        ),
    )

    return workflow
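
# The wgs_bams value above is duck-typed; both accepted shapes are sketched
# here with hypothetical paths. A dict pins the 'region' chunks to known
# files (set_filenames(..., fnames=...)), while a string template defers
# chunk discovery to get_regions (set_filenames(..., template=...)):
#
#   wgs_bams = {'1-1-10000000': '/data/merged/1-1-10000000.bam', ...}
#   wgs_bams = '/data/merged/{region}.bam'
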
def pseudo_bulk_qc_workflow(args):
    data = inpututils.load_qc_input(args["input_yaml"])
    config = inpututils.load_config(args)
    config = config["qc"]
    out_dir = args["out_dir"]

    # group-level outputs are templated on the 'patient' axis, so the paths
    # need a {patient} placeholder
    mutationreports = os.path.join(out_dir, '{patient}', "mutationreport.html")
    grouplevelmafs = os.path.join(out_dir, '{patient}', "grouplevelmaf.maf")
    grouplevel_high_impact_mafs = os.path.join(out_dir, '{patient}', "grouplevel_high_impact_maf.maf")
    grouplevel_high_impact_merged_snvs = os.path.join(out_dir, '{patient}', "grouplevel_high_impact_merged_snvs.csv")
    grouplevel_snvs = os.path.join(out_dir, '{patient}', "grouplevel_snvs.csv")

    isabl_ids = {label: paths["isabl_id"] for label, paths in data.items()}
    mappability_files = {label: paths["mappability"] for label, paths in data.items()}
    strelka_files = {label: paths["strelka"] for label, paths in data.items()}
    museq_files = {label: paths["museq"] for label, paths in data.items()}
    cosmic_status_files = {label: paths["cosmic_status"] for label, paths in data.items()}
    snpeff_files = {label: paths["snpeff"] for label, paths in data.items()}
    dbsnp_status_files = {label: paths["dbsnp_status"] for label, paths in data.items()}
    trinuc_files = {label: paths["trinuc"] for label, paths in data.items()}
    counts_files = {label: paths["counts"] for label, paths in data.items()}
    breakpoint_counts_files = {label: paths["destruct_breakpoint_counts"] for label, paths in data.items()}
    destruct_breakpoint_annotation_files = {label: paths["destruct_breakpoint_annotation"] for label, paths in data.items()}
    lumpy_breakpoint_annotation_files = {label: paths["lumpy_breakpoint_annotation"] for label, paths in data.items()}
    lumpy_breakpoint_evidence_files = {label: paths["lumpy_breakpoint_evidence"] for label, paths in data.items()}
    haplotype_allele_data_files = {label: paths["haplotype_allele_data"] for label, paths in data.items()}
    annotation_metrics_files = {label: paths["annotation_metrics"] for label, paths in data.items()}
    hmmcopy_reads_files = {label: paths["hmmcopy_reads"] for label, paths in data.items()}
    hmmcopy_segs_files = {label: paths["hmmcopy_segs"] for label, paths in data.items()}
    hmmcopy_metrics_files = {label: paths["hmmcopy_metrics"] for label, paths in data.items()}
    alignment_metrics_files = {label: paths["alignment_metrics"] for label, paths in data.items()}
    gc_metrics_files = {label: paths["gc_metrics"] for label, paths in data.items()}
    indel_files = {label: paths["indel_file"] for label, paths in data.items()}

    label_dir = os.path.join(out_dir, '{patient}', '{sample_id}', '{library_id}')
    sample_level_report_htmls = os.path.join(label_dir, "mainreport.html")
    sample_level_maf = os.path.join(label_dir, "samplelevelmaf.maf")
    snvs_all = os.path.join(label_dir, 'snvs_all.csv')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('patient', 'sample_id', 'library_id'),
        value=list(data.keys()),
    )

    workflow.subworkflow(
        name='create_sample_level_plots',
        func="single_cell.workflows.pseudo_bulk_qc.create_sample_level_plots",
        axes=('patient', 'sample_id', 'library_id'),
        args=(
            mgd.InputInstance('patient'),
            mgd.InputInstance('sample_id'),
            mgd.InputInstance('library_id'),
            mgd.InputFile('mappability', 'patient', 'sample_id', 'library_id', fnames=mappability_files),
            mgd.InputFile('strelka', 'patient', 'sample_id', 'library_id', fnames=strelka_files),
            mgd.InputFile('museq', 'patient', 'sample_id', 'library_id', fnames=museq_files),
            mgd.InputFile('cosmic_status', 'patient', 'sample_id', 'library_id', fnames=cosmic_status_files),
            mgd.InputFile('snpeff', 'patient', 'sample_id', 'library_id', fnames=snpeff_files),
            mgd.InputFile('dbsnp_status', 'patient', 'sample_id', 'library_id', fnames=dbsnp_status_files),
            mgd.InputFile('trinuc', 'patient', 'sample_id', 'library_id', fnames=trinuc_files),
            mgd.InputFile('counts', 'patient', 'sample_id', 'library_id', fnames=counts_files),
            mgd.InputFile('destruct_breakpoint_annotation', 'patient', 'sample_id', 'library_id', fnames=destruct_breakpoint_annotation_files),
            mgd.InputFile('destruct_breakpoint_counts', 'patient', 'sample_id', 'library_id', fnames=breakpoint_counts_files),
            mgd.InputFile('lumpy_breakpoint_annotation', 'patient', 'sample_id', 'library_id', fnames=lumpy_breakpoint_annotation_files),
            mgd.InputFile('lumpy_breakpoint_evidence', 'patient', 'sample_id', 'library_id', fnames=lumpy_breakpoint_evidence_files),
            mgd.InputFile('haplotype_allele_data', 'patient', 'sample_id', 'library_id', fnames=haplotype_allele_data_files),
            mgd.InputFile('annotation_metrics', 'patient', 'sample_id', 'library_id', fnames=annotation_metrics_files),
            mgd.InputFile('hmmcopy_reads', 'patient', 'sample_id', 'library_id', fnames=hmmcopy_reads_files),
            mgd.InputFile('isabl_ids', 'patient', 'sample_id', 'library_id', fnames=isabl_ids),
            mgd.InputFile('hmmcopy_segs', 'patient', 'sample_id', 'library_id', fnames=hmmcopy_segs_files),
            mgd.InputFile('hmmcopy_metrics', 'patient', 'sample_id', 'library_id', fnames=hmmcopy_metrics_files),
            mgd.InputFile('alignment_metrics', 'patient', 'sample_id', 'library_id', fnames=alignment_metrics_files),
            mgd.InputFile('gc_metrics', 'patient', 'sample_id', 'library_id', fnames=gc_metrics_files),
            mgd.InputFile('indel_files', 'patient', 'sample_id', 'library_id', fnames=indel_files),
            mgd.OutputFile('sample_level_report_htmls', 'patient', 'sample_id', 'library_id', template=sample_level_report_htmls),
            mgd.OutputFile('mafs', 'patient', 'sample_id', 'library_id', template=sample_level_maf),
            mgd.OutputFile('snvs_all', 'patient', 'sample_id', 'library_id', template=snvs_all),
            out_dir,
            config,
        ),
    )

    workflow.subworkflow(
        name='create_patient_workflow',
        func="single_cell.workflows.pseudo_bulk_qc.create_patient_workflow",
        axes=('patient',),
        args=(
            mgd.InputInstance('patient'),
            mgd.InputFile("mafs", "patient", "sample_id", "library_id", template=sample_level_maf, axes_origin=[]),
            mgd.InputFile("snvs_all", "patient", "sample_id", "library_id", template=snvs_all, axes_origin=[]),
            mgd.OutputFile('mutationreport', 'patient', template=mutationreports),
            mgd.OutputFile('grouplevelmaf', 'patient', template=grouplevelmafs),
            mgd.OutputFile('grouplevel_high_impact_maf', 'patient', template=grouplevel_high_impact_mafs),
            mgd.OutputFile('grouplevel_snvs', 'patient', template=grouplevel_snvs),
            mgd.OutputFile('grouplevel_high_impact_merged_snvs', 'patient', template=grouplevel_high_impact_merged_snvs),
            config,
        ),
    )

    return workflow
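
# Shape sketch for the qc input mapping above (paths are placeholders): every
# per-result dict is derived from one yaml-backed mapping keyed by
# (patient, sample_id, library_id) tuples, e.g.
#
#   data = {
#       ('P1', 'SA039', 'A1234'): {
#           'isabl_id': 'EX0001',
#           'mappability': '/results/P1/SA039/A1234/mappability.csv.gz',
#           'strelka': '/results/P1/SA039/A1234/strelka.vcf.gz',
#           # ... one entry per key unpacked above ...
#       },
#   }
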
def create_ltm_workflow(hmmcopy, cn_matrix, output_gml, output_rooted_gml,
                        cnv_annots_csv, cnv_tree_edges_csv, cnv_data_csv,
                        output_rmd, config, root_id, root_id_file,
                        number_jobs, ploidy):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('timepoint'),
        value=list(hmmcopy.keys()),
    )

    workflow.transform(
        name='generate_cn_matrices',
        axes=('timepoint',),
        ctx={'mem': config['memory']['med'], 'pool_id': config['pools']['standard'], 'ncpus': 1},
        func='single_cell.workflows.ltm.tasks.generate_cn_matrices',
        args=(
            mgd.InputFile('hmmcopy.h5', 'timepoint', fnames=hmmcopy),
            mgd.TempOutputFile('cn_matrix.csv', 'timepoint'),
            str(ploidy),
        ),
    )

    # Generate copy number matrix
    workflow.transform(
        name='generate_cn_matrix',
        ctx={'mem': config['memory']['med'], 'pool_id': config['pools']['standard'], 'ncpus': 1},
        func='single_cell.workflows.ltm.tasks.merge_cn_matrices',
        args=(
            mgd.TempInputFile('cn_matrix.csv', 'timepoint'),
            mgd.OutputFile(cn_matrix),
        ),
    )

    node_pair_csvs = ['list_{}.csv'.format(job) for job in range(number_jobs)]

    workflow.transform(
        name='generate_input_csvs',
        ctx={'mem': config['memory']['med'], 'pool_id': config['pools']['standard'], 'ncpus': 1},
        func='single_cell.workflows.ltm.tasks.generate_node_pair_csvs',
        args=(
            mgd.InputFile(cn_matrix),
            number_jobs,
            [mgd.TempOutputFile(csv) for csv in node_pair_csvs],
        ),
    )

    distance_csvs = ['distance_list_{}.csv'.format(job) for job in range(number_jobs)]

    workflow.transform(
        name='calculate_distances',
        ctx={'mem': config['memory']['med'], 'pool_id': config['pools']['standard'], 'ncpus': 1},
        func='single_cell.workflows.ltm.tasks.calculate_distances',
        args=(
            [mgd.TempInputFile(csv) for csv in node_pair_csvs],
            mgd.InputFile(cn_matrix),
            [mgd.TempOutputFile(csv) for csv in distance_csvs],
            config,
        ),
    )

    # Generates a minimum spanning tree
    workflow.transform(
        name='generate_tree',
        ctx={'mem': config['memory']['med'], 'pool_id': config['pools']['standard'], 'ncpus': 1},
        func='single_cell.workflows.ltm.scripts.learn_CL_from_distance.learn_CL_from_distance',
        args=(
            [mgd.TempInputFile(csv) for csv in distance_csvs],
            mgd.OutputFile(output_gml),
        ),
    )

    workflow.transform(
        name='generate_cellscape_inputs',
        ctx={'mem': config['memory']['med'], 'pool_id': config['pools']['standard'], 'ncpus': 1},
        func='single_cell.workflows.ltm.tasks.generate_cellscape_inputs',
        args=(
            mgd.InputFile(cn_matrix),
            mgd.OutputFile(cnv_annots_csv),
            mgd.OutputFile(cnv_tree_edges_csv),
            mgd.OutputFile(cnv_data_csv),
            mgd.InputFile(output_gml),
            mgd.OutputFile(output_rooted_gml),
            root_id,
            mgd.OutputFile(root_id_file),
        ),
    )

    workflow.transform(
        name='create_cellscape_rmarkdown',
        ctx={'mem': config['memory']['med'], 'pool_id': config['pools']['standard'], 'ncpus': 1},
        func='single_cell.workflows.ltm.tasks.move_cellscape',
        args=(mgd.OutputFile(output_rmd),),
    )

    return workflow
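
# Input-shape sketch (hypothetical paths): hmmcopy maps each timepoint label
# to an hmmcopy h5 file, and number_jobs controls how many node-pair /
# distance csv chunks the all-pairs distance computation is fanned out into
# before learn_CL_from_distance assembles the tree:
#
#   hmmcopy = {'T1': '/data/T1_hmmcopy.h5', 'T2': '/data/T2_hmmcopy.h5'}
#   number_jobs = 10   # yields list_0.csv ... list_9.csv
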
def breakpoint_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['breakpoint_calling']

    run_destruct = bool(args['destruct'])
    run_lumpy = bool(args['lumpy'])

    if not run_destruct and not run_lumpy:
        run_destruct = True
        run_lumpy = True

    normal_data, tumour_cells = inpututils.load_breakpoint_calling_input(
        args['input_yaml'])

    bkp_dir = os.path.join(args['out_dir'])
    bkp_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    out_files = get_output_files(bkp_dir, run_destruct, run_lumpy)

    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        normal_bam = mgd.InputFile(
            'normal_cells.bam', 'normal_cell_id',
            extensions=['.bai'], fnames=normal_data)
    else:
        normal_bam = mgd.InputFile(normal_data, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            ctx={'docker_image': config['docker']['single_cell_pipeline']},
            func="single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells),
                config.get('destruct_config', {}),
                config,
                ref_data_directory,
                mgd.OutputFile(out_files['destruct_breakpoints_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_breakpoints_lib_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_cell_counts_filename'], extensions=['.yaml']),
            ),
        )

    if run_lumpy:
        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells, extensions=['.bai']),
                mgd.OutputFile(out_files['lumpy_breakpoints_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_evidence_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_bed']),
            ),
        )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            bkp_dir,
            list(out_files.values()),
            mgd.OutputFile(bkp_meta),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'breakpoint_calling'},
        },
    )

    return workflow
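
# Shape sketch for the normal input above (paths hypothetical): the
# isinstance(normal_data, dict) branch means the normal can be supplied
# either as per-cell bams or as a single bulk bam:
#
#   normal_data = {'SA039N-R01-C01': '/data/normal/SA039N-R01-C01.bam', ...}
#   normal_data = '/data/normal/bulk_normal.bam'
#
# With neither --destruct nor --lumpy requested, both callers are run.
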
def create_aneufinder_workflow(bam_file, cell_ids, config, aneufinder_output,
                               aneufinder_results_filename,
                               aneufinder_pdf_filename, library_id):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    aneufinder_docker = helpers.get_container_ctx(
        config['containers'], 'aneufinder', docker_only=True)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='run_aneufinder_on_individual_cells',
        ctx=dict(mem=config['memory']['med'], pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.aneufinder.tasks.run_aneufinder",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_file', 'cell_id', fnames=bam_file),
            mgd.TempSpace('working_dir', 'cell_id'),
            mgd.InputInstance('cell_id'),
            aneufinder_output,
            mgd.TempOutputFile('segments.csv', 'cell_id'),
            mgd.TempOutputFile('reads.csv', 'cell_id'),
            mgd.TempOutputFile('dnacopy.pdf', 'cell_id'),
        ),
        kwargs={'docker_config': aneufinder_docker},
    )

    workflow.transform(
        name='merge_outputs',
        ctx=dict(mem=config['memory']['med'], pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_outputs_to_hdf",
        args=(
            mgd.TempInputFile('reads.csv', 'cell_id'),
            mgd.TempInputFile('segments.csv', 'cell_id'),
            mgd.OutputFile(aneufinder_results_filename),
            mgd.TempSpace("aneufinder_merge"),
        ),
    )

    workflow.transform(
        name='merge_aneufinder_pdfs',
        ctx=dict(mem=config['memory']['med'], pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_pdf",
        args=(
            [mgd.TempInputFile('dnacopy.pdf', 'cell_id')],
            [mgd.OutputFile(aneufinder_pdf_filename)],
        ),
    )

    return workflow
def hmmcopy_workflow(workflow, args):
    config = helpers.load_config(args)

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(helpers.get_container_ctx(config['containers'], 'single_cell_pipeline'))

    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    lib = args['library_id']

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    for params_tag, params in config["hmmcopy_params"].items():
        params_tag = "hmmcopy_" + params_tag

        results_dir = os.path.join(args['out_dir'], 'results', params_tag)
        plots_dir = os.path.join(results_dir, "plots")
        info_file = os.path.join(results_dir, "info.yaml")

        igv_seg_file = os.path.join(results_dir, '{}_igv_segments.seg'.format(lib))
        hmmcopy_data = os.path.join(results_dir, '{}_hmmcopy.h5'.format(lib))

        segs_pdf = os.path.join(plots_dir, "segments", lib + '_segs.tar.gz')
        bias_pdf = os.path.join(plots_dir, "bias", lib + '_bias.tar.gz')
        heatmap_filt_pdf = os.path.join(plots_dir, '{}_heatmap_by_ec_filtered.pdf'.format(lib))
        heatmap_pdf = os.path.join(plots_dir, '{}_heatmap_by_ec.pdf'.format(lib))
        metrics_pdf = os.path.join(plots_dir, '{}_metrics.pdf'.format(lib))
        kernel_density_pdf = os.path.join(plots_dir, '{}_kernel_density.pdf'.format(lib))

        workflow.subworkflow(
            name='hmmcopy_workflow_' + params_tag,
            func=hmmcopy.create_hmmcopy_workflow,
            args=(
                mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files),
                mgd.InputFile('bam_markdups_index', 'cell_id', fnames=bai_files),
                mgd.OutputFile(hmmcopy_data),
                mgd.OutputFile(igv_seg_file),
                segs_pdf,
                bias_pdf,
                mgd.OutputFile(heatmap_pdf),
                mgd.OutputFile(heatmap_filt_pdf),
                mgd.OutputFile(metrics_pdf),
                mgd.OutputFile(kernel_density_pdf),
                cellids,
                config,
                args,
                params,
                params_tag,
                results_dir,
            ),
            kwargs={'alignment_metrics': args['alignment_metrics']},
        )

        results = {
            'hmmcopy_metrics': helpers.format_file_yaml(hmmcopy_data),
            'segments_plot': helpers.format_file_yaml(segs_pdf),
            'bias_plot': helpers.format_file_yaml(bias_pdf),
            'filtered_heatmap_plot': helpers.format_file_yaml(heatmap_filt_pdf),
            'heatmap_plot': helpers.format_file_yaml(heatmap_pdf),
            'kde_plot': helpers.format_file_yaml(kernel_density_pdf),
            'metrics_plot': helpers.format_file_yaml(metrics_pdf),
        }

        input_datasets = {k: helpers.format_file_yaml(v) for k, v in bam_files.items()}

        metadata = {
            'hmmcopy': {
                'reads_table': '/hmmcopy/reads/0',
                'parameters_table': '/hmmcopy/params/0',
                'segments_table': '/hmmcopy/segments/0',
                'metrics_table': '/hmmcopy/metrics/0',
                'hmmcopy_params_tag': params_tag,
                'hmmcopy_params': params,
                'chromosomes': config['chromosomes'],
                'ref_genome': config['ref_genome'],
                'cell_filters': config["good_cells"],
                'version': single_cell.__version__,
                'results': results,
                'containers': config['containers'],
                'input_datasets': input_datasets,
                'output_datasets': None,
            }
        }

        workflow.transform(
            name='generate_meta_yaml_' + params_tag,  # unique per parameter set
            ctx=dict(mem=config['memory']['med'], pool_id=config['pools']['standard'], **ctx),
            func="single_cell.utils.helpers.write_to_yaml",
            args=(mgd.OutputFile(info_file), metadata),
        )

    return workflow
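
# Config sketch for the multi-parameter-set loop above (keys and values are
# illustrative, not defaults): each entry in config['hmmcopy_params'] yields
# an independent 'hmmcopy_<tag>' subworkflow with its own results directory
# and info.yaml:
#
#   config['hmmcopy_params'] = {
#       'autoploidy': {'num_states': 12, ...},
#       'diploid': {'num_states': 7, ...},
#   }
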
def bam_metrics_workflow(
        bam_filename, summary_fastq_screen_count_per_cell, alignment_metrics,
        gc_metrics, markdups_metrics_percell, flagstat_metrics_percell,
        wgs_metrics_percell, gc_metrics_percell, gc_metrics_summary_percell,
        gc_metrics_pdf_percell, insert_metrics_percell,
        insert_metrics_pdf_percell, ref_genome, sample_info, config, cell_ids
):
    markdups_metrics_percell = {cellid: markdups_metrics_percell[cellid] for cellid in cell_ids}
    flagstat_metrics_percell = {cellid: flagstat_metrics_percell[cellid] for cellid in cell_ids}
    wgs_metrics_percell = {cellid: wgs_metrics_percell[cellid] for cellid in cell_ids}
    gc_metrics_percell = {cellid: gc_metrics_percell[cellid] for cellid in cell_ids}
    gc_metrics_summary_percell = {cellid: gc_metrics_summary_percell[cellid] for cellid in cell_ids}
    gc_metrics_pdf_percell = {cellid: gc_metrics_pdf_percell[cellid] for cellid in cell_ids}
    insert_metrics_percell = {cellid: insert_metrics_percell[cellid] for cellid in cell_ids}
    insert_metrics_pdf_percell = {cellid: insert_metrics_pdf_percell[cellid] for cellid in cell_ids}

    baseimage = config['docker']['single_cell_pipeline']

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='get_duplication_metrics',
        axes=('cell_id',),
        func="single_cell.utils.picardutils.bam_markdups",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.TempOutputFile("temp_markdup_bam.bam", 'cell_id'),
            mgd.OutputFile('markdups_metrics', 'cell_id', fnames=markdups_metrics_percell),
            mgd.TempSpace('tempdir_markdups', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['picard']},
    )

    workflow.transform(
        name='get_flagstat_metrics',
        axes=('cell_id',),
        func="single_cell.utils.bamutils.bam_flagstat",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.OutputFile('flagstat_metrics_percell', 'cell_id', fnames=flagstat_metrics_percell),
        ),
        kwargs={'docker_image': config['docker']['samtools']},
    )

    workflow.transform(
        name='bam_collect_wgs_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.utils.picardutils.bam_collect_wgs_metrics",
        axes=('cell_id',),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile('wgs_metrics_percell', 'cell_id', fnames=wgs_metrics_percell),
            config['picard_wgs_params'],
            mgd.TempSpace('wgs_tempdir', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['picard']},
    )

    workflow.transform(
        name='bam_collect_gc_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.utils.picardutils.bam_collect_gc_metrics",
        axes=('cell_id',),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile('gc_metrics_percell', 'cell_id', fnames=gc_metrics_percell),
            mgd.OutputFile('gc_metrics_summary_percell', 'cell_id', fnames=gc_metrics_summary_percell),
            mgd.OutputFile('gc_metrics_pdf_percell', 'cell_id', fnames=gc_metrics_pdf_percell),
            mgd.TempSpace('gc_tempdir', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['picard']},
    )

    workflow.transform(
        name='bam_collect_insert_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.utils.picardutils.bam_collect_insert_metrics",
        axes=('cell_id',),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.InputFile('flagstat_metrics_percell', 'cell_id', fnames=flagstat_metrics_percell),
            mgd.OutputFile('insert_metrics_percell', 'cell_id', fnames=insert_metrics_percell),
            mgd.OutputFile('insert_metrics_pdf_percell', 'cell_id', fnames=insert_metrics_pdf_percell),
            mgd.TempSpace('insert_tempdir', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['picard']},
    )

    workflow.transform(
        name="collect_gc_metrics",
        func="single_cell.workflows.align.tasks.collect_gc",
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        args=(
            mgd.InputFile('gc_metrics_percell', 'cell_id', axes_origin=[], fnames=gc_metrics_percell),
            mgd.OutputFile(gc_metrics, extensions=['.yaml']),
            mgd.TempSpace("temp_gc"),
        ),
    )

    workflow.transform(
        name='collect_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.workflows.align.tasks.collect_metrics",
        args=(
            mgd.InputFile('flagstat_metrics', 'cell_id', axes_origin=[], fnames=flagstat_metrics_percell),
            mgd.InputFile('markdups_metrics', 'cell_id', axes_origin=[], fnames=markdups_metrics_percell),
            mgd.InputFile('insert_metrics_percell', 'cell_id', axes_origin=[], fnames=insert_metrics_percell),
            mgd.InputFile('wgs_metrics_percell', 'cell_id', axes_origin=[], fnames=wgs_metrics_percell),
            mgd.TempSpace("tempdir_collect_metrics"),
            mgd.TempOutputFile("alignment_metrics.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='annotate_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.utils.csvutils.annotate_csv",
        args=(
            mgd.TempInputFile("alignment_metrics.csv.gz", extensions=['.yaml']),
            sample_info,
            mgd.TempOutputFile('alignment_metrics_annotated.csv.gz', extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='add_fastqscreen_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.utils.csvutils.merge_csv",
        args=(
            [
                mgd.TempInputFile("alignment_metrics_annotated.csv.gz", extensions=['.yaml']),
                mgd.InputFile(summary_fastq_screen_count_per_cell),
            ],
            mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
            'outer',
            ['cell_id'],
        ),
    )

    return workflow
def lumpy_multi_sample_workflow(
        config, normal_bam, tumour_cell_bams, lumpy_breakpoints_csv,
        lumpy_breakpoints_evidence, lumpy_breakpoints_bed
):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    keys = [(sample_id, library_id)
            for (sample_id, library_id, _) in list(tumour_cell_bams.keys())]
    keys = sorted(set(keys))

    lumpy_breakpoints_csv = {key: lumpy_breakpoints_csv(*key) for key in keys}
    lumpy_breakpoints_evidence = {key: lumpy_breakpoints_evidence(*key) for key in keys}
    lumpy_breakpoints_bed = {key: lumpy_breakpoints_bed(*key) for key in keys}

    workflow.set_filenames('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id', fnames=tumour_cell_bams)
    workflow.set_filenames('lumpy_breakpoints.csv.gz', 'sample_id', 'library_id', fnames=lumpy_breakpoints_csv)
    workflow.set_filenames('lumpy_breakpoints_evidence.csv.gz', 'sample_id', 'library_id', fnames=lumpy_breakpoints_evidence)
    workflow.set_filenames('lumpy_breakpoints.bed', 'sample_id', 'library_id', fnames=lumpy_breakpoints_bed)

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            normal_bam,
            config,
            mgd.TempOutputFile('normal.discordants.sorted.bam'),
            mgd.TempOutputFile('normal.splitters.sorted.bam'),
            mgd.TempOutputFile('hist_normal_formatted.csv'),
            mgd.TempOutputFile('normal_mean_stdev.yaml'),
        ),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        axes=('sample_id', 'library_id'),
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id', extensions=['.bai']),
            config,
            mgd.TempOutputFile('tumour.discordants.sorted.bam', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour.splitters.sorted.bam', 'sample_id', 'library_id'),
            mgd.TempOutputFile('hist_tumour_formatted.csv', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_mean_stdev.yaml', 'sample_id', 'library_id'),
        ),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        axes=('sample_id', 'library_id'),
        func="single_cell.workflows.lumpy.create_lumpy_workflow",
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour.splitters.sorted.bam', 'sample_id', 'library_id'),
            mgd.TempInputFile('hist_tumour_formatted.csv', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_mean_stdev.yaml', 'sample_id', 'library_id'),
            mgd.OutputFile('lumpy_breakpoints.bed', 'sample_id', 'library_id'),
            mgd.OutputFile('lumpy_breakpoints.csv.gz', 'sample_id', 'library_id'),
            mgd.OutputFile('lumpy_breakpoints_evidence.csv.gz', 'sample_id', 'library_id'),
        ),
        kwargs={
            'sample_id': mgd.InputInstance('sample_id'),
            'library_id': mgd.InputInstance('library_id'),
        },
    )

    return workflow
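
# The three lumpy_breakpoints_* parameters above are callables, not paths:
# each is invoked with (sample_id, library_id) to produce an output filename.
# A minimal sketch, assuming a flat naming scheme (hypothetical):
#
#   def lumpy_breakpoints_csv(sample_id, library_id):
#       return '/results/{}_{}_lumpy_breakpoints.csv.gz'.format(
#           sample_id, library_id)
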
def create_destruct_workflow(
        normal_bam, tumour_bam_files, destruct_config, config,
        destruct_ref_data_dir, breakpoints_csv, breakpoints_library_csv,
        cell_counts_csv, normal_sample_id='normal',
):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_bam_files.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(
            normal_bam,
            mgd.TempOutputFile('normal_stats'),
            mgd.TempOutputFile('normal_reads_1.fastq.gz'),
            mgd.TempOutputFile('normal_reads_2.fastq.gz'),
            mgd.TempOutputFile('normal_sample_1.fastq.gz'),
            mgd.TempOutputFile('normal_sample_2.fastq.gz'),
            destruct_ref_data_dir,
            destruct_config,
            config,
        ),
    )

    workflow.subworkflow(
        name='tumour_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=tumour_bam_files),
            mgd.TempOutputFile('tumour_stats'),
            mgd.TempOutputFile('tumour_reads_1.fastq.gz'),
            mgd.TempOutputFile('tumour_reads_2.fastq.gz'),
            mgd.TempOutputFile('tumour_sample_1.fastq.gz'),
            mgd.TempOutputFile('tumour_sample_2.fastq.gz'),
            destruct_ref_data_dir,
            destruct_config,
            config,
        ),
        kwargs={'tag': True},
    )

    workflow.subworkflow(
        name='run_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_workflow',
        args=(
            mgd.TempInputFile('normal_stats'),
            mgd.TempInputFile('normal_reads_1.fastq.gz'),
            mgd.TempInputFile('normal_reads_2.fastq.gz'),
            mgd.TempInputFile('normal_sample_1.fastq.gz'),
            mgd.TempInputFile('normal_sample_2.fastq.gz'),
            mgd.TempInputFile('tumour_stats'),
            mgd.TempInputFile('tumour_reads_1.fastq.gz'),
            mgd.TempInputFile('tumour_reads_2.fastq.gz'),
            mgd.TempInputFile('tumour_sample_1.fastq.gz'),
            mgd.TempInputFile('tumour_sample_2.fastq.gz'),
            destruct_config,
            config,
            destruct_ref_data_dir,
            mgd.OutputFile(breakpoints_csv),
            mgd.OutputFile(breakpoints_library_csv),
            mgd.OutputFile(cell_counts_csv),
            mgd.TempSpace("raw_data_dir"),
        ),
    )

    return workflow
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells. """

    strelka_vcf, museq_vcf, tumour_cell_bams = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_output = os.path.join(args['out_dir'], "counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=(
            [
                mgd.InputFile('museq.vcf', 'sample_id', 'library_id',
                              fnames=museq_vcf, extensions=['.tbi', '.csi'], axes_origin=[]),
                mgd.InputFile('strelka.vcf', 'sample_id', 'library_id',
                              fnames=strelka_vcf, extensions=['.tbi', '.csi'], axes_origin=[]),
            ],
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("merge_vcf_temp"),
        ),
        kwargs={'docker_image': config['docker']['vcftools']},
    )

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id',
                          extensions=['.bai'], fnames=tumour_cell_bams, axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(counts_output),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            [counts_output],
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'snv_genotyping'},
        },
    )

    return workflow
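
# Hedged end-to-end sketch: invoking this command-level workflow the way the
# other command workflows are run. The args keys shown are the ones consumed
# above; the actual values are placeholder assumptions.
def _example_run_variant_counting():  # illustrative only
    import pypeliner.app

    args = {
        'input_yaml': '/config/variant_counting_input.yaml',
        'out_dir': '/results/variant_counting',
        'tmpdir': './counting_tmp',
    }
    workflow = create_variant_counting_workflow(args)
    pyp = pypeliner.app.Pypeline(config=args)
    pyp.run(workflow)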