def annotation_workflow(args):
    config = inpututils.load_config(args)
    annotation_infiles = inpututils.load_yaml(args['input_yaml'])
    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['annotation']['docker']['single_cell_pipeline']},
    )

    annotation_dir = args["out_dir"]
    input_yaml_blob = os.path.join(annotation_dir, 'input.yaml')

    annotation_files = get_output_files(annotation_dir, lib)
    annotation_meta = os.path.join(annotation_dir, 'metadata.yaml')

    workflow.subworkflow(
        name='annotation_workflow',
        func=qc_annotation.create_qc_annotation_workflow,
        args=(
            mgd.InputFile(annotation_infiles['hmmcopy_metrics']),
            mgd.InputFile(annotation_infiles['hmmcopy_reads']),
            mgd.InputFile(annotation_infiles['alignment_metrics']),
            mgd.InputFile(annotation_infiles['gc_metrics']),
            mgd.InputFile(annotation_infiles['segs_pdf_tar']),
            mgd.OutputFile(annotation_files['merged_metrics_csvs']),
            mgd.OutputFile(annotation_files['qc_report']),
            mgd.OutputFile(annotation_files['corrupt_tree_newick']),
            mgd.OutputFile(annotation_files['consensus_tree_newick']),
            mgd.OutputFile(annotation_files['phylo_csv']),
            mgd.OutputFile(annotation_files['loci_rank_trees']),
            mgd.OutputFile(annotation_files['filtered_data']),
            mgd.OutputFile(annotation_files['corrupt_tree_pdf']),
            mgd.OutputFile(annotation_files['segs_pass']),
            mgd.OutputFile(annotation_files['segs_fail']),
            mgd.OutputFile(annotation_files['corrupt_heatmap_pdf']),
            mgd.OutputFile(annotation_files['heatmap_filt_pdf']),
            config['annotation'],
            lib,
        ),
        kwargs={'no_corrupt_tree': args['no_corrupt_tree']},
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            annotation_dir,
            list(annotation_files.values()),
            mgd.OutputFile(annotation_meta),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'library_id': lib, 'type': 'annotation'},
        },
    )

    return workflow

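# Example driver (hypothetical; not part of the pipeline). The `args` keys shown
# are assumptions inferred from how annotation_workflow() reads them, and the
# run pattern mirrors cohort_qc_pipeline() below, which builds a
# pypeliner.app.Pypeline and runs the workflow through it:
#
#     args = {
#         'input_yaml': '/path/to/input.yaml',
#         'library_id': 'A12345',
#         'out_dir': '/path/to/annotation',
#         'no_corrupt_tree': False,
#         # ...plus whatever inpututils.load_config(args) expects
#     }
#     pyp = pypeliner.app.Pypeline(config=args)
#     pyp.run(annotation_workflow(args))
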
def count_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['count_haps']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(
        mem_retry_increment=2, disk_retry_increment=50,
        ncpus=1, docker_image=baseimage,
    )
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    allele_counts_filename = os.path.join(args["out_dir"], "allele_counts.csv.gz")
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    haplotypes_filename, tumour_cells = inpututils.load_count_haps_input(
        args['input_yaml'])

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(
        name='extract_allele_readcounts',
        func='single_cell.workflows.extract_allele_readcounts.extract_allele_readcounts',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.InputFile(
                'tumour_cells.bam', 'tumour_cell_id',
                extensions=['.bai'], axes_origin=[], fnames=tumour_cells,
            ),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            [allele_counts_filename],
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'count_haps'},
        },
    )

    return workflow

def infer_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['infer_haps']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(
        mem_retry_increment=2, disk_retry_increment=50,
        ncpus=1, docker_image=baseimage,
    )
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haplotypes_filename = os.path.join(args["out_dir"], "haplotypes.tsv")
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    normal_data = inpututils.load_infer_haps_input(args['input_yaml'])

    # The normal can be either a dict of per-cell bams or a single bam path.
    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        bam_file = mgd.InputFile(
            'normal.bam', 'normal_cell_id',
            fnames=normal_data, extensions=['.bai'],
        )
    else:
        bam_file = mgd.InputFile(normal_data, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            [haplotypes_filename],
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'infer_haps'},
        },
    )

    return workflow

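# For reference, inpututils.load_infer_haps_input() can yield either of the two
# shapes handled by the isinstance() branch above (illustrative values only):
#
#     # per-cell normal bams: dict keyed by normal_cell_id
#     normal_data = {
#         'SA123-A12345-R01-C01': '/path/cells/SA123-A12345-R01-C01.bam',
#         'SA123-A12345-R01-C02': '/path/cells/SA123-A12345-R01-C02.bam',
#     }
#     # or a single whole-genome normal: plain path string
#     normal_data = '/path/normal.bam'
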
def make_meta(args):
    workflow = pypeliner.workflow.Workflow()

    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')

    # Collect every file under the output directory for the metadata manifest.
    filelist = []
    for root, _, files in os.walk(args['out_dir']):
        for filename in files:
            filelist.append(os.path.join(root, filename))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            filelist,
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'cohort_qc'},
        },
    )

    return workflow

def breakpoint_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['breakpoint_calling']

    run_destruct = bool(args['destruct'])
    run_lumpy = bool(args['lumpy'])

    # Default to running both callers if neither was requested explicitly.
    if not run_destruct and not run_lumpy:
        run_destruct = True
        run_lumpy = True

    normal_data, tumour_cells = inpututils.load_breakpoint_calling_input(
        args['input_yaml'])

    bkp_dir = args['out_dir']
    bkp_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    out_files = get_output_files(bkp_dir, run_destruct, run_lumpy)

    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        normal_bam = mgd.InputFile(
            'normal_cells.bam', 'normal_cell_id',
            extensions=['.bai'], fnames=normal_data,
        )
    else:
        normal_bam = mgd.InputFile(normal_data, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            func="single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells),
                config.get('destruct_config', {}),
                config,
                ref_data_directory,
                mgd.OutputFile(out_files['destruct_breakpoints_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_breakpoints_lib_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_cell_counts_filename'], extensions=['.yaml']),
            ),
        )

    if run_lumpy:
        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells,
                              extensions=['.bai']),
                mgd.OutputFile(out_files['lumpy_breakpoints_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_evidence_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_bed']),
            ),
        )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            bkp_dir,
            list(out_files.values()),
            mgd.OutputFile(bkp_meta),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'breakpoint_calling'},
        },
    )

    return workflow

def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(
        args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'], config)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    ctx = {
        'ncpus': 1,
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'mem': config["memory"]['low'],
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai'], fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam', 'region', extensions=['.bai'], fnames=tumour_bams),
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
            mgd.OutputFile(filepaths['museq_csv'], extensions=['.tbi', '.csi']),
            config,
        ),
    )

    workflow.subworkflow(
        name='strelka',
        func=strelka.create_strelka_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai'], fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam', 'region', extensions=['.bai'], fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['strelka_indel'], extensions=['.tbi', '.csi']),
            mgd.OutputFile(filepaths['strelka_snv'], extensions=['.tbi', '.csi']),
            mgd.OutputFile(filepaths['strelka_csv'], extensions=['.yaml']),
        ),
        kwargs={
            "chromosomes": config["chromosomes"],
            "use_depth_thresholds": config['use_depth_thresholds'],
        },
    )

    workflow.subworkflow(
        name='annotate_snv_vcf_files',
        func=snv_annotate.create_snv_annotate_workflow,
        args=(
            config,
            mgd.InputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
            mgd.InputFile(filepaths['strelka_snv'], extensions=['.tbi', '.csi']),
            mgd.OutputFile(filepaths['mappability_csv'], extensions=['.yaml']),
            mgd.OutputFile(filepaths['snpeff_csv'], extensions=['.yaml']),
            mgd.OutputFile(filepaths['trinuc_csv'], extensions=['.yaml']),
            {k: mgd.OutputFile(v) for k, v in filepaths['additional_databases'].items()},
            config['memory'],
        ),
    )

    allfiles = [filepaths[k] for k in filepaths if k != 'additional_databases']
    allfiles += list(filepaths['additional_databases'].values())

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], allfiles, mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'},
        },
    )

    return workflow

def cohort_qc_pipeline(args):
    """Merge mafs, run classifycopynumber, and make cohort plots.

    Args:
        args (dict): pipeline arguments
    """
    config = inpututils.load_config(args)
    config = config["cohort_qc"]

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    out_dir = args["out_dir"]
    api_key = args["API_key"]

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    # inputs
    cohort, germline_mafs, vcfs, hmmcopy = inpututils.load_cohort_qc_inputs(
        args["input_yaml"]
    )

    museq = {label: data["museq"] for label, data in vcfs.items()}
    strelka_snv = {label: data["strelka_snv"] for label, data in vcfs.items()}
    strelka_indel = {label: data["strelka_indel"] for label, data in vcfs.items()}
    hmmcopy_files = {label: data["hmmcopy"] for label, data in hmmcopy.items()}
    hmmcopy_metrics_files = {
        label: data["hmmcopy_metrics"] for label, data in hmmcopy.items()
    }

    # outputs
    cbiofile_paths = get_cbioportal_paths(os.path.join(out_dir, cohort))
    maftools_filepaths = get_maftools_paths(os.path.join(out_dir, cohort))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_label', 'library_label'),
        value=list(museq.keys()),
    )

    workflow.subworkflow(
        name="merge_somatic_mafs",
        func="single_cell.workflows.cohort_qc.merge_somatic_mafs",
        axes=('sample_label',),
        args=(
            mgd.InputInstance('sample_label'),
            config,
            mgd.InputFile(
                'museq', 'sample_label', 'library_label',
                fnames=museq, axes_origin=[],
            ),
            mgd.InputFile(
                'strelka_snv', 'sample_label', 'library_label',
                fnames=strelka_snv, axes_origin=[],
            ),
            mgd.InputFile(
                'strelka_indel', 'sample_label', 'library_label',
                fnames=strelka_indel, axes_origin=[],
            ),
            mgd.TempOutputFile('somatic_maf', 'sample_label'),
        ),
    )

    workflow.subworkflow(
        name="classifycopynumber",
        func="single_cell.workflows.cohort_qc.cna_annotation_workflow",
        args=(
            config,
            mgd.InputFile(
                'hmmcopy_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_files, axes_origin=[],
            ),
            mgd.InputFile(
                'hmmcopy_metrics_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_metrics_files, axes_origin=[],
            ),
            mgd.OutputFile(cbiofile_paths["cna_table"]),
            mgd.OutputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(cbiofile_paths["segments"]),
            config["gtf"],
        ),
    )

    workflow.subworkflow(
        name="maf_annotation_workflow",
        func="single_cell.workflows.cohort_qc.preprocess_mafs_workflow",
        args=(
            config,
            mgd.InputFile(
                'germline_mafs_dict', 'sample_label',
                fnames=germline_mafs, axes_origin=[],
            ),
            mgd.TempInputFile('somatic_maf', 'sample_label', axes_origin=[]),
            mgd.OutputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.OutputFile(cbiofile_paths["annotated_somatic_maf"]),
            api_key,
        ),
    )

    workflow.subworkflow(
        name="make_plots_and_report",
        func="single_cell.workflows.cohort_qc.create_cohort_oncoplot",
        args=(
            config,
            mgd.InputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.InputFile(cbiofile_paths["annotated_somatic_maf"]),
            mgd.InputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(maftools_filepaths["maftools_maf"]),
            mgd.OutputFile(maftools_filepaths["cohort_oncoplot"]),
            mgd.OutputFile(maftools_filepaths["report"]),
            cohort,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            list(cbiofile_paths.values()) + list(maftools_filepaths.values()),
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'cohort_qc'},
        },
    )

    pyp.run(workflow)

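# For reference, the vcfs dict unpacked above is keyed by
# (sample_label, library_label) tuples, which is what lets its keys seed the
# two-level setobj axes (illustrative values only):
#
#     vcfs = {
#         ('SA123', 'A12345'): {
#             'museq': '/path/SA123_A12345_museq.vcf.gz',
#             'strelka_snv': '/path/SA123_A12345_strelka_snv.vcf.gz',
#             'strelka_indel': '/path/SA123_A12345_strelka_indel.vcf.gz',
#         },
#     }
#     # hmmcopy is keyed the same way with 'hmmcopy' and 'hmmcopy_metrics'
#     # entries; germline_mafs is keyed by sample_label alone.
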
def germline_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['germline_calling']

    normal_bams = inpututils.load_germline_data(args['input_yaml'])

    varcalls_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    out_files = get_output_files(args['out_dir'])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal_split.bam", "region", extensions=['.bai'],
                          fnames=normal_bams),
            config['ref_genome'],
            mgd.OutputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            config,
        ),
    )

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            mgd.OutputFile(out_files['mappability_filename']),
        ),
        kwargs={'chromosomes': config['chromosomes']},
    )

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            mgd.OutputFile(out_files['normal_genotype_filename']),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            mgd.InputFile(out_files['samtools_germline_vcf'], extensions=['.tbi']),
            mgd.OutputFile(out_files['snpeff_vcf_filename']),
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(out_files.values()),
              mgd.OutputFile(varcalls_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'germline_calling'},
        },
    )

    return workflow

def alignment_workflow(args):
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    trim = args['trim']
    center = args['sequencing_center']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted({cell_lane[1] for cell_lane in fastq1_files})
    cells = sorted({cell_lane[0] for cell_lane in fastq1_files})

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane', fnames=fastq1_files,
                          axes_origin=[]),
            mgd.InputFile('fastq_2', 'cell_id', 'lane', fnames=fastq2_files,
                          axes_origin=[]),
            mgd.OutputFile('bam_markdups', 'cell_id', template=bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups', 'cell_id', template=mt_bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
            trim,
            center,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], alignment_dir, list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment',
            },
        },
    )

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], bams_dir,
              mgd.Template('aligned.bam', 'cell_id', template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams',
            },
            'template': (mgd.InputChunks('cell_id'), bam_files_template, 'cell_id'),
        },
    )

    return workflow

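# For reference, get_fastqs() returns dicts keyed by (cell_id, lane) tuples
# (illustrative values only), which is why the setobj above seeds two axes and
# why lanes/cells are derived from the tuple elements:
#
#     fastq1_files = {
#         ('SA123-A12345-R01-C01', 'L001'): '/path/c01_L001_R1.fastq.gz',
#         ('SA123-A12345-R01-C01', 'L002'): '/path/c01_L002_R1.fastq.gz',
#     }
#     # lanes -> ['L001', 'L002'], cells -> ['SA123-A12345-R01-C01']
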
def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(
        args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}
    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage,
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai'], fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam', 'region', extensions=['.bai'], fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
            config,
        ),
    )

    workflow.subworkflow(
        name='strelka',
        func=strelka.create_strelka_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai'], fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam', 'region', extensions=['.bai'], fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['strelka_indel'], extensions=['.tbi', '.csi']),
            mgd.OutputFile(filepaths['strelka_snv'], extensions=['.tbi', '.csi']),
            config,
        ),
        kwargs={"chromosomes": config["chromosomes"]},
    )

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        ctx=ctx,
        args=(
            [
                mgd.InputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
                mgd.InputFile(filepaths['strelka_snv'], extensions=['.tbi', '.csi']),
            ],
            mgd.TempOutputFile('all.snv.vcf'),
        ),
    )

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        ctx=ctx,
        args=(
            mgd.TempInputFile('all.snv.vcf'),
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_config': vcftools_docker},
    )

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        ctx=ctx,
        func="biowrappers.pipelines.snv_call_and_annotate.create_annotation_workflow",
        args=(
            config,
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempOutputFile('snv_annotations.h5'),
            mgd.TempSpace('raw_data_dir_annotate'),
        ),
        kwargs={
            'variant_type': 'snv',
            'docker_config': basedocker,
            'snpeff_docker': vcftools_docker,
            'vcftools_docker': vcftools_docker,
        },
    )

    workflow.transform(
        name='convert_museq_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['museq_vcf']),
            mgd.TempOutputFile('museq.csv'),
        ),
        kwargs={'score_callback': museq_callback},
    )

    workflow.transform(
        name='prep_museq_csv',
        func='single_cell.utils.csvutils.prep_csv_files',
        args=(
            mgd.TempInputFile('museq.csv'),
            mgd.OutputFile(filepaths['museq_csv'], extensions=['.yaml']),
        ),
        kwargs={'header': True},
    )

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['strelka_snv']),
            mgd.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={'score_callback': strelka_snv_callback},
    )

    workflow.transform(
        name='prep_strelka_csv',
        func='single_cell.utils.csvutils.prep_csv_files',
        args=(
            mgd.TempInputFile('strelka_snv.csv'),
            mgd.OutputFile(filepaths['strelka_csv'], extensions=['.yaml']),
        ),
        kwargs={'header': True},
    )

    workflow.transform(
        name='convert_h5_to_csv',
        func='single_cell.utils.hdfutils.convert_hdf_to_csv',
        args=(
            mgd.TempInputFile('snv_annotations.h5'),
            {
                '/snv/cosmic_status': mgd.OutputFile(filepaths['cosmic_csv'], extensions=['.yaml']),
                '/snv/dbsnp_status': mgd.OutputFile(filepaths['dbsnp_csv'], extensions=['.yaml']),
                '/snv/mappability': mgd.OutputFile(filepaths['mappability_csv'], extensions=['.yaml']),
                '/snv/snpeff': mgd.OutputFile(filepaths['snpeff_csv'], extensions=['.yaml']),
                '/snv/tri_nucleotide_context': mgd.OutputFile(filepaths['trinuc_csv'], extensions=['.yaml']),
            },
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(filepaths.values()),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'},
        },
    )

    return workflow

def merge_bams_workflow(args):
    config = inpututils.load_config(args)
    config = config['merge_bams']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    bam_files = inpututils.load_merge_cell_bams(args['input_yaml'])

    merge_out_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ),
    )

    workflow.transform(
        name="remove_softclipped_reads",
        func="single_cell.utils.pysamutils.remove_softclipped_reads",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                          extensions=['.bai']),
            mgd.TempOutputFile('bam_rm_softclipped.bam', 'cell_id',
                               extensions=['.bai']),
            args['softclipped_reads_threshold'],
        ),
    )

    workflow.subworkflow(
        name="wgs_merge_workflow",
        func=merge_bams.create_merge_bams_workflow,
        args=(
            mgd.TempInputFile('bam_rm_softclipped.bam', 'cell_id', extensions=['.bai']),
            mgd.OutputFile("merged.bam", "region", axes_origin=[],
                           extensions=['.bai'], template=merge_out_template),
            mgd.InputChunks("region"),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames', 'region', template=merge_out_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'template': (mgd.InputChunks('region'), merge_out_template, 'region'),
            'metadata': {
                'type': 'pseudowgs_regionbams',
                'cell_ids': list(bam_files.keys()),
            },
        },
    )

    return workflow

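# For reference, pypeliner fills the '{region}' placeholder in
# merge_out_template once per region chunk returned by get_regions, using
# ordinary str.format semantics (the region name below is illustrative):
#
#     merge_out_template = os.path.join(args['out_dir'], '{region}.bam')
#     merge_out_template.format(region='1-1-10000000')
#     # -> '<out_dir>/1-1-10000000.bam'
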
def hmmcopy_workflow(args):
    config = inpututils.load_config(args)
    config = config['hmmcopy']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    bam_files = inpututils.get_bams(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
    )

    hmmcopy_dir = args["out_dir"]
    hmmcopy_files = get_output_files(hmmcopy_dir, lib)
    hmmcopy_meta = os.path.join(hmmcopy_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(hmmcopy_dir, 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='hmmcopy_workflow',
        func=hmmcopy.create_hmmcopy_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                          extensions=['.bai']),
            mgd.OutputFile(hmmcopy_files['reads_csvs']),
            mgd.OutputFile(hmmcopy_files['segs_csvs']),
            mgd.OutputFile(hmmcopy_files['metrics_csvs']),
            mgd.OutputFile(hmmcopy_files['params_csvs']),
            mgd.OutputFile(hmmcopy_files['igv_csvs']),
            mgd.OutputFile(hmmcopy_files['segs_pdf']),
            mgd.OutputFile(hmmcopy_files['bias_pdf']),
            mgd.OutputFile(hmmcopy_files['heatmap_pdf']),
            mgd.OutputFile(hmmcopy_files['metrics_pdf']),
            mgd.OutputFile(hmmcopy_files['kernel_density_pdf']),
            mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']),
            cellids,
            config,
            sampleinfo,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], hmmcopy_dir, list(hmmcopy_files.values()),
              mgd.OutputFile(hmmcopy_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': list(bam_files.keys()),
                'type': 'hmmcopy',
            },
        },
    )

    return workflow

def create_variant_counting_workflow(args):
    """Count variant reads for multiple sets of variants across cells."""
    strelka_vcf, museq_vcf, tumour_cell_bams = inpututils.load_variant_counting_input(
        args['input_yaml']
    )

    counts_output = os.path.join(args['out_dir'], "counts.csv.gz")
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']}
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=(
            [
                mgd.InputFile('museq.vcf', 'sample_id', 'library_id',
                              fnames=museq_vcf, extensions=['.tbi', '.csi'],
                              axes_origin=[]),
                mgd.InputFile('strelka.vcf', 'sample_id', 'library_id',
                              fnames=strelka_vcf, extensions=['.tbi', '.csi'],
                              axes_origin=[]),
            ],
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("merge_vcf_temp"),
        ),
        kwargs={'docker_image': config['docker']['vcftools']},
    )

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id',
                          extensions=['.bai'], fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(counts_output),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [counts_output],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'snv_genotyping'},
        },
    )

    return workflow

def split_bam_workflow(args):
    config = inpututils.load_config(args)
    config = config['split_bam']

    bam_file = inpututils.load_split_wgs_input(args['input_yaml'])

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    workflow.transform(
        name="get_regions",
        ctx={'mem': config['memory']['low'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        ctx={'mem': config['memory']['low'], 'ncpus': 1},
        args=(
            mgd.InputFile(bam_file),
            mgd.OutputFile("normal.split.bam", 'region',
                           template=split_bam_template, axes_origin=[]),
            pypeliner.managed.TempInputObj('region'),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames', 'region', template=split_bam_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'wgs_regionbams'},
            'template': (mgd.TempInputObj('region'), split_bam_template, 'region'),
        },
    )

    return workflow

def create_variant_counting_workflow(args):
    """Count variant reads for multiple sets of variants across cells."""
    vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input(
        args['input_yaml']
    )

    counts_template = '{sample_id}_{library_id}_counts.csv.gz'
    counts_output_template = os.path.join(args['out_dir'], counts_template)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=(
            [mgd.InputFile(vcf_file) for vcf_file in vcf_files],
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("merge_vcf_temp"),
        ),
    )

    workflow.subworkflow(
        name='count_alleles',
        axes=('sample_id', 'library_id'),
        func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id', 'cell_id',
                          extensions=['.bai'], fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile('counts.csv.gz', 'sample_id', 'library_id',
                           template=counts_output_template),
            mgd.Instance('sample_id'),
            mgd.Instance('library_id'),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('counts.csv.gz', 'sample_id', 'library_id',
                           template=counts_output_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping',
                'counts': {
                    'template': counts_template,
                    'instances': sample_library,
                },
            },
        },
    )

    return workflow

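# For reference, each (sample_id, library_id) pair in the bam keys produces one
# instance of counts_template via str.format (identifiers illustrative):
#
#     counts_template.format(sample_id='SA123', library_id='A12345')
#     # -> 'SA123_A12345_counts.csv.gz'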