def variant_counting_workflow(args):
    config = helpers.load_config(args)

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    vcfs = args['input_vcfs']
    results_file = os.path.join(
        args['out_dir'], 'results', 'variant_counting', 'counts.h5')

    return create_variant_counting_workflow(
        vcfs, bam_files, results_file, meta_yaml, config)
def infer_haps_workflow(args):
    config = helpers.load_config(args)
    config = config['infer_haps']

    baseimage = config['docker']['single_cell_pipeline']
    ctx = dict(
        mem_retry_increment=2, disk_retry_increment=50,
        ncpus=1, baseimage=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results", "allele_counts.tsv")

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    if args['normal']:
        bam_file = normal_cells if normal_cells else normal_wgs
    else:
        bam_file = tumour_cells if tumour_cells else tumour_wgs

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )
        bam_file = mgd.InputFile(
            'tumour.bam', 'cell_id', fnames=bam_file, extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(bam_file, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
        kwargs={'normal': args['normal']},
    )

    return workflow
def split_bam_workflow(args):
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_config(args)
    config = config['split_bam']

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = args["split_bam_template"]

    by_reads = "{region}" not in split_bam_template
    splitkeyword = "region" if "{region}" in split_bam_template else "reads"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]
        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )
    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': config['memory']['low'],
                'ncpus': 1,
                'docker_image': baseimage
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        args=(
            mgd.InputFile(args['wgs_bam']),
            mgd.OutputFile(
                "normal.split.bam", splitkeyword,
                template=split_bam_template, axes_origin=[]),
            pypeliner.managed.TempInputObj(splitkeyword),
            config,
        ),
        kwargs={"by_reads": by_reads})

    return workflow
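# Illustrative sketch (hypothetical template values, not pipeline code): the
# split mode above falls out of the output template alone. A '{region}'
# placeholder selects region-based splitting; any other template splits the
# bam into num_splits_byreads read chunks keyed '0', '1', ....
def _example_split_mode(split_bam_template):
    by_reads = "{region}" not in split_bam_template
    splitkeyword = "region" if "{region}" in split_bam_template else "reads"
    return splitkeyword, by_reads

# _example_split_mode('split/normal_{region}.bam') -> ('region', False)
# _example_split_mode('split/normal_{reads}.bam')  -> ('reads', True)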
def variant_calling_workflow(args):
    config = helpers.load_config(args)

    ctx = {'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])
    cellids = helpers.get_samples(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')
    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    wgs_bam_template = args["tumour_template"]
    normal_bam_template = args["normal_template"]

    regions = refgenome.get_split_regions(config["split_size"])
    tumour_region_bams = {r: wgs_bam_template.format(region=r) for r in regions}
    normal_region_bams = {r: normal_bam_template.format(region=r) for r in regions}

    return create_variant_calling_workflow(
        bam_files,
        tumour_region_bams,
        normal_region_bams,
        museq_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        snv_h5,
        config,
        raw_data_dir,
    )
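# Illustrative sketch (hypothetical paths and region ids, not pipeline code):
# the '{region}' templates passed on the command line are expanded into
# per-region bam paths exactly as in the dict comprehensions above.
def _example_expand_region_template():
    template = '/data/tumour_{region}.bam'  # hypothetical template
    regions = ['1-1-10000000', '1-10000001-20000000']  # hypothetical region ids
    region_bams = {r: template.format(region=r) for r in regions}
    # {'1-1-10000000': '/data/tumour_1-1-10000000.bam', ...}
    return region_bams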
def ltm_workflow(args):
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_config(args)

    hmmcopy, timepoints = ltmutils.read_input_file(args['input_csv'])

    cn_matrix = os.path.join(args['out_dir'], 'cn_matrix.csv')
    output_gml = os.path.join(args['out_dir'], 'tree.gml')
    output_rooted_gml = os.path.join(args['out_dir'], 'rooted_tree.gml')

    # Outputs required for visualization with cellscape
    cnv_annots_csv = os.path.join(args['out_dir'], 'cnv_annots.csv')
    cnv_tree_edges_csv = os.path.join(args['out_dir'], 'cnv_tree_edges.csv')
    cnv_data_csv = os.path.join(args['out_dir'], 'cnv_data.csv')
    output_rmd = os.path.join(args['out_dir'], 'cellscape.Rmd')
    root_id_file = os.path.join(args['out_dir'], 'root_id.txt')

    workflow.setobj(
        obj=mgd.OutputChunks('timepoint'),
        value=timepoints,
    )

    workflow.subworkflow(
        name='ltm_scale',
        func=ltm.create_ltm_workflow,
        args=(
            mgd.InputFile('hmmcopy.h5', 'timepoint', fnames=hmmcopy),
            mgd.OutputFile(cn_matrix),
            mgd.OutputFile(output_gml),
            mgd.OutputFile(output_rooted_gml),
            mgd.OutputFile(cnv_annots_csv),
            mgd.OutputFile(cnv_tree_edges_csv),
            mgd.OutputFile(cnv_data_csv),
            mgd.OutputFile(output_rmd),
            config,
            args['root_id'],
            mgd.OutputFile(root_id_file),
            args['number_of_jobs'],
            args['ploidy'],
        ),
    )

    return workflow
def multi_sample_pipeline(args):
    data = helpers.load_yaml(args['input_yaml'])

    tumour_cell_bams = load_tumour_data(data)
    normal_sample_id, normal_libraries, normal_bams = load_normal_data(data)

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = create_multi_sample_workflow(
        normal_bams,
        tumour_cell_bams,
        helpers.load_config(args),
        destruct_dir=args['destruct_output'],
        lumpy_dir=args['lumpy_output'],
        haps_dir=args['haps_output'],
        varcall_dir=args["variants_output"],
        normal_sample_id=normal_sample_id)

    pyp.run(workflow)

    generate_meta_files(
        normal_sample_id, normal_libraries, tumour_cell_bams, args)
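# Minimal usage sketch (hypothetical wiring): multi_sample_pipeline above shows
# the run pattern shared by the workflow-returning entry points in this module:
# build a Pypeline from the parsed args, then hand it a workflow to execute.
def _example_run(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = split_bam_workflow(args)  # any entry point here that returns a workflow
    pyp.run(workflow)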
def split_bam_workflow(workflow, args):
    config = helpers.load_config(args)

    info_file = os.path.join(args["out_dir"], 'results', 'split_bam', 'info.yaml')

    split_bam_template = args["split_bam_template"]
    split_bai_template = args["split_bam_template"] + ".bai"

    by_reads = "{region}" not in split_bam_template
    splitkeyword = "region" if "{region}" in split_bam_template else "reads"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]
        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )
    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': 2,
                'num_retry': 3,
                'mem_retry_increment': 2,
                'pool_id': config['pools']['standard'],
                'ncpus': 1
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        args=(
            mgd.InputFile(args['wgs_bam']),
            mgd.InputFile(args['wgs_bam'] + ".bai"),
            mgd.OutputFile(
                "normal.split.bam", splitkeyword,
                template=split_bam_template, axes_origin=[]),
            mgd.OutputFile(
                "normal.split.bam.bai", splitkeyword,
                template=split_bai_template, axes_origin=[]),
            pypeliner.managed.TempInputObj(splitkeyword),
            config,
        ),
        kwargs={"by_reads": by_reads})

    regions = mgd.InputChunks('reads') if by_reads else pypeliner.managed.TempInputObj('region')

    # resolve output paths over whichever axis this run splits on; the 'region'
    # temp obj only exists in region mode, so use the axis-appropriate chunks
    workflow.transform(
        name="get_files",
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        args=(regions, split_bam_template, splitkeyword))

    metadata = {
        'split_bams': {
            'name': 'split_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': args['wgs_bam'],
            'results': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(
            mem=config['memory']['med'],
            pool_id=config['pools']['standard'],
        ),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def merge_bams_workflow(workflow, args):
    input_yaml = args["input_yaml"]
    output_template = args["merged_bam_template"]

    info_file = os.path.join(args["out_dir"], 'results', 'merge_bams', "info.yaml")

    config = helpers.load_config(args)

    bam_files, bai_files = helpers.get_bams(input_yaml)
    cellids = helpers.get_samples(input_yaml)

    wgs_bam_template = output_template
    wgs_bai_template = wgs_bam_template + ".bai"

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'], 'single_cell_pipeline'))

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name="wgs_merge_workflow",
        func=merge_bams.create_merge_bams_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                          extensions=['.bai']),
            mgd.OutputFile("merged_bam", "region", axes_origin=[],
                           template=wgs_bam_template, extensions=['.bai']),
            cellids,
            config,
            mgd.TempInputObj("region"),
        ))

    workflow.transform(
        name="get_files",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        args=(pypeliner.managed.TempInputObj('region'), wgs_bam_template, 'region'))

    inputs = {k: helpers.format_file_yaml(v) for k, v in bam_files.items()}

    metadata = {
        'merge_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': inputs,
            'results': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def germline_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['germline_calling']

    baseimage = config['docker']['single_cell_pipeline']
    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftoolsdocker = {'docker_image': config['docker']['vcftools']}
    samtoolsdocker = {'docker_image': config['docker']['samtools']}
    snpeffdocker = {'docker_image': config['docker']['snpeff']}

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'disk_retry_increment': 50,
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_bams = data['normal_bams']
    tumour_cells = data['tumour_cells']

    if not isinstance(normal_bams, dict):
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam', 'region', template=normal_bams)
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        # axis matches the 'region' chunks defined above
        workflow.set_filenames('normal_split.bam', 'region', fnames=normal_bams)

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'germline_calling')
    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw', 'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw', 'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal_split.bam", "region", extensions=['.bai']),
            config['ref_genome'],
            mgd.OutputFile(samtools_germline_vcf, extensions=['.tbi']),
            config,
        ),
        kwargs={
            'vcftools_docker': vcftoolsdocker,
            'samtools_docker': samtoolsdocker,
        })

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker': basedocker,
            'chromosomes': config['chromosomes']
        })

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output': False,
            'base_docker': basedocker,
            'vcftools_docker': vcftoolsdocker,
            'snpeff_docker': snpeffdocker,
        })

    workflow.subworkflow(
        name='read_counts',
        func="single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            mgd.InputFile('tumour.bam', 'cell_id', fnames=tumour_cells,
                          extensions=['.bai']),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
            config['memory'],
        ),
        kwargs={
            'table_name': '/germline_allele_counts',
        },
    )

    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    pyp.run(workflow)
def align_workflow(workflow, args):
    config = helpers.load_config(args)
    sampleinfo = helpers.get_sample_info(args['input_yaml'])
    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    lib = args["library_id"]

    outdir = os.path.join(args["out_dir"], "results", "alignment")
    info_file = os.path.join(outdir, "info.yaml")
    alignment_metrics_h5 = os.path.join(outdir, '{}_alignment_metrics.h5'.format(lib))
    plots_dir = os.path.join(outdir, 'plots')
    plot_metrics_output = os.path.join(plots_dir, '{}_plot_metrics.pdf'.format(lib))

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'], 'single_cell_pipeline'))

    if not args["metrics_only"]:
        fastq1_files, fastq2_files = helpers.get_fastqs(args['input_yaml'])
        instrumentinfo = helpers.get_instrument_info(args['input_yaml'])
        centerinfo = helpers.get_center_info(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id', 'lane'),
            value=list(fastq1_files.keys()),
        )

        workflow.subworkflow(
            name='alignment_workflow',
            func=align.create_alignment_workflow,
            args=(
                mgd.InputFile('fastq_1', 'cell_id', 'lane', fnames=fastq1_files,
                              axes_origin=[]),
                mgd.InputFile('fastq_2', 'cell_id', 'lane', fnames=fastq2_files,
                              axes_origin=[]),
                mgd.OutputFile('bam_markdups', 'cell_id', fnames=bam_files,
                               axes_origin=[]),
                mgd.OutputFile('bai_markdups', 'cell_id', fnames=bai_files,
                               axes_origin=[]),
                config['ref_genome'],
                config,
                args,
                instrumentinfo,
                centerinfo,
                sampleinfo,
                cellids,
            ),
        )
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=cellids,
        )

        workflow.subworkflow(
            name='metrics_workflow',
            func=alignment_metrics.create_alignment_metrics_workflow,
            args=(
                mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                              axes_origin=[]),
                mgd.InputFile('bai_markdups', 'cell_id', fnames=bai_files,
                              axes_origin=[]),
                mgd.OutputFile(alignment_metrics_h5),
                mgd.OutputFile(plot_metrics_output),
                config['ref_genome'],
                config,
                args,
                sampleinfo,
                cellids,
            ),
        )

    inputs = helpers.get_fastq_files(args["input_yaml"])
    outputs = {k: helpers.format_file_yaml(v) for k, v in bam_files.items()}

    metadata = {
        'alignment': {
            'name': 'alignment',
            'cell_batch_realign': args["realign"],
            'metrics_table': '/alignment/metrics',
            'gc_metrics_table': '/alignment/gc_metrics',
            'aligner': config["aligner"],
            'adapter': config["adapter"],
            'adapter2': config["adapter2"],
            'picardtools_wgsmetrics_params': config['picard_wgs_params'],
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': outputs,
            'input_datasets': inputs,
            'results': {
                'alignment_metrics': helpers.format_file_yaml(alignment_metrics_h5),
                'alignment_plots': helpers.format_file_yaml(plot_metrics_output),
            },
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'], **ctx),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def ltm_workflow(workflow, args):
    config = helpers.load_config(args)

    hmmcopy, timepoints = ltmutils.read_input_file(args['input_csv'])

    cn_matrix = os.path.join(args['out_dir'], 'cn_matrix.csv')
    output_gml = os.path.join(args['out_dir'], 'tree.gml')
    output_rooted_gml = os.path.join(args['out_dir'], 'rooted_tree.gml')

    # Outputs required for visualization with cellscape
    cnv_annots_csv = os.path.join(args['out_dir'], 'cnv_annots.csv')
    cnv_tree_edges_csv = os.path.join(args['out_dir'], 'cnv_tree_edges.csv')
    cnv_data_csv = os.path.join(args['out_dir'], 'cnv_data.csv')
    output_rmd = os.path.join(args['out_dir'], 'cellscape.Rmd')
    root_id_file = os.path.join(args['out_dir'], 'root_id.txt')

    workflow.setobj(
        obj=mgd.OutputChunks('timepoint'),
        value=timepoints,
    )

    workflow.subworkflow(
        name='ltm_scale',
        func=ltm.create_ltm_workflow,
        args=(
            mgd.InputFile('hmmcopy.h5', 'timepoint', fnames=hmmcopy),
            mgd.OutputFile(cn_matrix),
            mgd.OutputFile(output_gml),
            mgd.OutputFile(output_rooted_gml),
            mgd.OutputFile(cnv_annots_csv),
            mgd.OutputFile(cnv_tree_edges_csv),
            mgd.OutputFile(cnv_data_csv),
            mgd.OutputFile(output_rmd),
            config,
            args['root_id'],
            mgd.OutputFile(root_id_file),
            args['number_of_jobs'],
            args['ploidy'],
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'ltm', "info.yaml")

    results = {
        'ltm_cn_matrix': helpers.format_file_yaml(cn_matrix),
        'ltm_gml': helpers.format_file_yaml(output_gml),
        'ltm_rooted_gml': helpers.format_file_yaml(output_rooted_gml),
        'ltm_cnv_annots_csv': helpers.format_file_yaml(cnv_annots_csv),
        'ltm_cnv_tree_edges_csv': helpers.format_file_yaml(cnv_tree_edges_csv),
        'ltm_cnv_data_csv': helpers.format_file_yaml(cnv_data_csv),
        'ltm_output_rmd': helpers.format_file_yaml(output_rmd)
    }

    # assumed: the inputs recorded here are the per-timepoint hmmcopy files
    # read above
    input_datasets = {
        k: helpers.format_file_yaml(v) for k, v in hmmcopy.items()
    }

    metadata = {
        'LTM': {
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'cell_filters': config["good_cells"],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(
            mem=config['memory']['med'],
            pool_id=config['pools']['standard'],
        ),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def qc_workflow(args):
    config = helpers.load_config(args)

    sampleinfo = helpers.get_sample_info(args['input_yaml'])
    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, _ = helpers.get_bams(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow()

    annotation_only = args['annotation_only']
    alignment_dir = args["alignment_output"]
    hmmcopy_dir = args["hmmcopy_output"]
    annotation_dir = args["annotation_output"]

    if alignment_dir and not annotation_only:
        alignment_files = get_output_files(alignment_dir, 'alignment', lib)

        fastq1_files, fastq2_files = helpers.get_fastqs(args['input_yaml'])
        triminfo = helpers.get_trim_info(args['input_yaml'])
        centerinfo = helpers.get_center_info(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id', 'lane'),
            value=list(fastq1_files.keys()),
        )

        workflow.subworkflow(
            name='alignment_workflow',
            ctx={'docker_image': config['alignment']['docker']['single_cell_pipeline']},
            func=align.create_alignment_workflow,
            args=(
                mgd.InputFile('fastq_1', 'cell_id', 'lane', fnames=fastq1_files,
                              axes_origin=[]),
                mgd.InputFile('fastq_2', 'cell_id', 'lane', fnames=fastq2_files,
                              axes_origin=[]),
                mgd.OutputFile('bam_markdups', 'cell_id', fnames=bam_files,
                               axes_origin=[], extensions=['.bai']),
                mgd.OutputFile(alignment_files['alignment_metrics_csv']),
                mgd.OutputFile(alignment_files['gc_metrics_csv']),
                mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
                mgd.OutputFile(alignment_files['plot_metrics_output']),
                config['alignment']['ref_genome'],
                config['alignment'],
                triminfo,
                centerinfo,
                sampleinfo,
                cellids,
                mgd.OutputFile(alignment_files['alignment_metrics_tar']),
                lib,
            ),
            kwargs={'realign': args['realign']})

    if hmmcopy_dir and not annotation_only:
        hmmcopy_files = get_output_files(hmmcopy_dir, 'hmmcopy', lib)

        if not alignment_dir:
            workflow.setobj(
                obj=mgd.OutputChunks('cell_id'),
                value=list(bam_files.keys()),
            )

        workflow.subworkflow(
            name='hmmcopy_workflow',
            ctx={'docker_image': config['hmmcopy']['docker']['single_cell_pipeline']},
            func=hmmcopy.create_hmmcopy_workflow,
            args=(
                mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                              extensions=['.bai']),
                mgd.OutputFile(hmmcopy_files['reads_csvs']),
                mgd.OutputFile(hmmcopy_files['segs_csvs']),
                mgd.OutputFile(hmmcopy_files['metrics_csvs']),
                mgd.OutputFile(hmmcopy_files['params_csvs']),
                mgd.OutputFile(hmmcopy_files['igv_csvs']),
                mgd.OutputFile(hmmcopy_files['segs_pdf']),
                mgd.OutputFile(hmmcopy_files['bias_pdf']),
                mgd.OutputFile(hmmcopy_files['heatmap_pdf']),
                mgd.OutputFile(hmmcopy_files['metrics_pdf']),
                mgd.OutputFile(hmmcopy_files['kernel_density_pdf']),
                mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']),
                cellids,
                config['hmmcopy'],
                sampleinfo,
            ),
        )

    if annotation_dir:
        annotation_files = get_output_files(annotation_dir, 'annotation', lib)

        if not hmmcopy_dir or not alignment_dir:
            raise Exception(
                '--hmmcopy_output and --alignment_output are required to run annotation')

        alignment_files = get_output_files(alignment_dir, 'alignment', lib)
        hmmcopy_files = get_output_files(hmmcopy_dir, 'hmmcopy', lib)

        workflow.subworkflow(
            name='annotation_workflow',
            ctx={'docker_image': config['annotation']['docker']['single_cell_pipeline']},
            func=qc_annotation.create_qc_annotation_workflow,
            args=(
                mgd.InputFile(hmmcopy_files['metrics_csvs']),
                mgd.InputFile(hmmcopy_files['reads_csvs']),
                mgd.InputFile(alignment_files['alignment_metrics_csv']),
                mgd.InputFile(alignment_files['gc_metrics_csv']),
                mgd.InputFile(hmmcopy_files['segs_pdf']),
                mgd.OutputFile(annotation_files['merged_metrics_csvs']),
                mgd.OutputFile(annotation_files['qc_report']),
                mgd.OutputFile(annotation_files['corrupt_tree_newick']),
                mgd.OutputFile(annotation_files['consensus_tree_newick']),
                mgd.OutputFile(annotation_files['phylo_csv']),
                mgd.OutputFile(annotation_files['loci_rank_trees']),
                mgd.OutputFile(annotation_files['filtered_data']),
                mgd.OutputFile(annotation_files['corrupt_tree_pdf']),
                mgd.OutputFile(annotation_files['segs_pass']),
                mgd.OutputFile(annotation_files['segs_fail']),
                mgd.OutputFile(annotation_files['corrupt_heatmap_pdf']),
                mgd.OutputFile(annotation_files['heatmap_filt_pdf']),
                config['annotation'],
                lib,
            ),
            kwargs={'no_corrupt_tree': args['no_corrupt_tree']})

    return workflow
def breakpoint_calling_workflow(workflow, args):
    config = helpers.load_config(args)

    normal_bam_file = args['matched_normal']
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
    raw_data_directory = os.path.join(varcalls_dir, 'raw')
    breakpoints_filename = os.path.join(varcalls_dir, 'breakpoints.h5')
    ref_data_directory = '/refdata'

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='destruct',
        func="biowrappers.components.breakpoint_calling.destruct.destruct_pipeline",
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            config.get('destruct', {}),
            ref_data_directory,
            mgd.OutputFile(breakpoints_filename),
            raw_data_directory,
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'breakpoint_calling', "info.yaml")

    results = {
        'destruct_data': helpers.format_file_yaml(breakpoints_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v) for k, v in bam_files.items()
    }
    input_datasets = {'normal': normal_bam_file, 'tumour': input_datasets}

    metadata = {
        'breakpoint_calling': {
            'ref_data': ref_data_directory,
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def merge_bams_workflow(args):
    config = helpers.load_config(args)
    config = config['merge_bams']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    bam_files = tumour_cells if tumour_cells else normal_cells
    wgs_bams = tumour_wgs if tumour_cells else normal_wgs

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    if isinstance(wgs_bams, dict):
        # axis name matches the set_filenames call below ('region')
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(wgs_bams.keys()),
        )
        workflow.set_filenames("merged.bam", "region", fnames=wgs_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        workflow.set_filenames('merged.bam', 'region', template=wgs_bams)

    workflow.subworkflow(
        name="wgs_merge_workflow",
        func=merge_bams.create_merge_bams_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                          extensions=['.bai']),
            mgd.OutputFile("merged.bam", "region", axes_origin=[],
                           extensions=['.bai']),
            # 'region' is defined as chunks above, not as a temp object
            mgd.InputChunks('region'),
            config,
        ))

    workflow.transform(
        name="get_files",
        ctx={'mem': config['memory']['med']},
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        args=(mgd.InputChunks('region'), wgs_bams, 'region'))

    return workflow
def breakpoint_calling_workflow(args):
    run_destruct = args['destruct']
    run_lumpy = args['lumpy']

    if not any((run_destruct, run_lumpy)):
        run_destruct = True
        run_lumpy = True

    config = helpers.load_config(args)
    config = config['breakpoint_calling']

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_cells = data['tumour_cells']
    tumour_cells_id = data['tumour_cells_id']
    normal_bams = data['normal_wgs'] if data['normal_wgs'] else data['normal_cells']
    normal_id = data['normal_wgs_id'] if data['normal_wgs_id'] else data['normal_cells_id']

    calls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
    raw_data_directory = os.path.join(calls_dir, 'raw')
    breakpoints_filename = os.path.join(calls_dir, 'breakpoints.h5')
    breakpoints_lib_filename = os.path.join(calls_dir, 'breakpoints_lib.h5')
    cell_counts_filename = os.path.join(calls_dir, 'cell_counts.h5')
    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_cells.bam', 'normal_cell_id', fnames=normal_bams)
        normal_bam = mgd.InputFile('normal_cells.bam', 'normal_cell_id',
                                   extensions=['.bai'])
    else:
        normal_bam = mgd.InputFile(normal_bams, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            ctx={'docker_image': config['docker']['destruct']},
            func="single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells),
                config.get('destruct', {}),
                ref_data_directory,
                mgd.OutputFile(breakpoints_filename),
                mgd.OutputFile(breakpoints_lib_filename),
                mgd.OutputFile(cell_counts_filename),
                raw_data_directory,
            ),
        )

    if run_lumpy:
        varcalls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
        breakpoints_bed = os.path.join(varcalls_dir, 'lumpy_breakpoints.bed')
        breakpoints_csv = os.path.join(varcalls_dir, 'lumpy_breakpoints.csv.gz')
        breakpoints_evidence_csv = os.path.join(
            varcalls_dir, 'lumpy_breakpoints_evidence.csv.gz')

        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells,
                              extensions=['.bai']),
                normal_bam,
                mgd.OutputFile(breakpoints_bed),
                mgd.OutputFile(breakpoints_csv),
                mgd.OutputFile(breakpoints_evidence_csv),
            ),
            kwargs={
                'tumour_id': tumour_cells_id,
                'normal_id': normal_id
            })

    return workflow
def aneufinder_workflow(workflow, args):
    config = helpers.load_config(args)

    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, _ = helpers.get_bams(args['input_yaml'])

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    info_file = os.path.join(args["out_dir"], 'results', 'aneufinder', "info.yaml")

    output = os.path.join(args['out_dir'], 'results', "aneufinder")
    aneufinder_pdf_file = os.path.join(
        output, 'plots', '{}_reads.pdf'.format(args['library_id']))

    helpers.makedirs(output)
    results_filename = os.path.join(output, '{}_results.h5'.format(args['library_id']))

    workflow.subworkflow(
        name='aneufinder_workflow',
        func=aneufinder.create_aneufinder_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files),
            cellids,
            config,
            output,
            mgd.OutputFile(results_filename),
            mgd.OutputFile(aneufinder_pdf_file),
            args['library_id'],
        ),
    )

    results = {
        'aneufinder_plot': helpers.format_file_yaml(aneufinder_pdf_file),
        'aneufinder_data': helpers.format_file_yaml(results_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v) for k, v in bam_files.items()
    }

    metadata = {
        'aneufinder': {
            'reads_table': '/aneufinder/reads',
            'segments_table': '/aneufinder/segments/',
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def copy_number_calling_workflow(workflow, args):
    config = helpers.load_config(args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'pool_id': config['pools']['standard'],
    }
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    tumour_bam_files, tumour_bai_files = helpers.get_bams(args['tumour_yaml'])
    normal_bam_files, normal_bai_files = helpers.get_bams(args['normal_yaml'])

    tumour_cellids = helpers.get_samples(args['tumour_yaml'])
    normal_cellids = helpers.get_samples(args['normal_yaml'])

    if set(tumour_bam_files.keys()) != set(tumour_cellids):
        raise ValueError('tumour cell ids do not match tumour bam files')
    if set(normal_bam_files.keys()) != set(normal_cellids):
        raise ValueError('normal cell ids do not match normal bam files')

    copynumber_dir = os.path.join(args["out_dir"], "copynumber")
    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config['titan_params'].get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=tumour_cellids,
    )
    workflow.setobj(
        obj=mgd.OutputChunks('normal_cell_id'),
        value=normal_cellids,
    )

    workflow.transform(
        name="get_snp_positions_filename",
        ctx=ctx,
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
            remixt_config,
            config['titan_params']['ref_data_dir'],
            'snp_positions'
        ))

    workflow.transform(
        name="get_bam_max_fragment_length",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(remixt_config, 'bam_max_fragment_length'))

    workflow.transform(
        name="get_bam_max_soft_clipped",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(remixt_config, 'bam_max_soft_clipped'))

    workflow.transform(
        name="get_bam_check_proper_pair",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(remixt_config, 'bam_check_proper_pair'))

    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'tumour_cell_id', fnames=tumour_bam_files),
            mgd.InputFile('bam_markdups_index', 'tumour_cell_id', fnames=tumour_bai_files),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        ))

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('normal_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'normal_cell_id', fnames=normal_bam_files),
            mgd.InputFile('bam_markdups_index', 'normal_cell_id', fnames=normal_bai_files),
            mgd.TempOutputFile("normal.h5", "normal_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        ))

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "normal_cell_id"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            out_file,
            config,
            args,
            tumour_cellids,
            normal_cellids,
            cloneid,
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'copynumber_calling', "info.yaml")

    results = {
        'copynumber_data': helpers.format_file_yaml(out_file),
    }

    tumours = {k: helpers.format_file_yaml(v) for k, v in tumour_bam_files.items()}
    normals = {k: helpers.format_file_yaml(v) for k, v in normal_bam_files.items()}
    input_datasets = {'tumour': tumours, 'normal': normals}

    metadata = {
        'copynumber_calling': {
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def variant_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['variant_calling']

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_bams = data['tumour_wgs']
    normal_bams = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')
    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    baseimage = config['docker']['single_cell_pipeline']
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(normal_bams, dict) and isinstance(tumour_bams, dict):
        assert list(normal_bams.keys()) == list(tumour_bams.keys()), \
            'keys for tumour and normal bams should be the same'
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        # axis matches the 'region' chunks defined above
        workflow.set_filenames('normal_split.bam', 'region', fnames=normal_bams)
        workflow.set_filenames('tumour_split.bam', 'region', fnames=tumour_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam', 'region', template=normal_bams)
        assert '{region}' in tumour_bams, 'only supports a list of files or a template on regions'
        # the tumour template, not the normal one, belongs here
        workflow.set_filenames('tumour_split.bam', 'region', template=tumour_bams)

    workflow.subworkflow(
        func=create_variant_calling_workflow,
        name='create_varcall',
        args=(
            tumour_cells,
            mgd.InputFile('tumour_split.bam', 'region', extensions=['.bai']),
            mgd.InputFile('normal_split.bam', 'region', extensions=['.bai']),
            mgd.OutputFile(museq_vcf),
            mgd.OutputFile(strelka_snv_vcf),
            mgd.OutputFile(strelka_indel_vcf),
            mgd.OutputFile(snv_h5),
            mgd.OutputFile(meta_yaml),
            config,
            raw_data_dir,
        ),
    )

    return workflow
def copy_number_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['copy_number_calling']

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': config['docker']['single_cell_pipeline']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    assert '{region}' in normal_wgs

    copynumber_dir = os.path.join(args["out_dir"], "copynumber")
    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config.get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=config['memory']['low']),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=mgd.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.transform(
        name="get_snp_positions_filename",
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(remixt_config, config['ref_data_dir'], 'snp_positions'))

    workflow.transform(
        name="get_bam_max_fragment_length",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(remixt_config, 'bam_max_fragment_length'))

    workflow.transform(
        name="get_bam_max_soft_clipped",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(remixt_config, 'bam_max_soft_clipped'))

    workflow.transform(
        name="get_bam_check_proper_pair",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(remixt_config, 'bam_check_proper_pair'))

    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'tumour_cell_id', fnames=tumour_cells,
                          extensions=['.bai']),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config,
        ))

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('region',),
        ctx={'disk': 200},
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'region', template=normal_wgs,
                          extensions=['.bai']),
            mgd.TempOutputFile("normal.h5", "region"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config,
        ))

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "region"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            mgd.OutputFile(out_file),
            config,
            args,
            list(tumour_cells.keys()),
            mgd.InputChunks('region'),
            cloneid,
        ),
    )

    pyp.run(workflow)
def germline_calling_workflow(workflow, args):
    config = helpers.load_config(args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'pool_id': config['pools']['standard'],
    }
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])
    sampleids = helpers.get_samples(args['input_yaml'])

    normal_bam_template = args["input_template"]
    normal_bai_template = args["input_template"] + ".bai"

    if "{reads}" in normal_bam_template:
        raise ValueError(
            "input template for germline calling only supports region based splits")

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'germline_calling')
    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw', 'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw', 'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=ctx,
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal.split.bam", "region", template=normal_bam_template),
            mgd.InputFile("normal.split.bam.bai", "region", template=normal_bai_template),
            config['ref_genome'],
            mgd.OutputFile(samtools_germline_vcf, extensions=['.tbi']),
            config,
        ),
        kwargs={
            'chromosomes': config["chromosomes"],
            'base_docker': helpers.get_container_ctx(config['containers'], 'single_cell_pipeline'),
            'vcftools_docker': helpers.get_container_ctx(config['containers'], 'vcftools'),
            'samtools_docker': helpers.get_container_ctx(config['containers'], 'samtools'),
        })

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker': helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
        })

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        ctx=ctx,
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output': False,
            'base_docker': helpers.get_container_ctx(config['containers'], 'single_cell_pipeline'),
            'vcftools_docker': helpers.get_container_ctx(config['containers'], 'vcftools'),
            'snpeff_docker': helpers.get_container_ctx(config['containers'], 'snpeff'),
        })

    workflow.subworkflow(
        name='read_counts',
        func="single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            config,
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            mgd.InputFile('tumour.bam.bai', 'cell_id', fnames=bai_files),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
        ),
        kwargs={
            'table_name': '/germline_allele_counts',
            'docker_config': helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
        },
    )

    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        ctx=ctx,
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    info_file = os.path.join(args["out_dir"], 'results', 'germline_calling', "info.yaml")

    results = {
        'germline_data': helpers.format_file_yaml(germline_h5_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v) for k, v in bam_files.items()
    }

    metadata = {
        'germline_calling': {
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow
def infer_haps_workflow(workflow, args):
    config = helpers.load_config(args)

    remixt_config = config['titan_params'].get('extract_seqdata', {})

    singlecellimage = config['docker']['images']['single_cell_pipeline']
    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'image': singlecellimage['image'],
        'dockerize': config['docker']['dockerize'],
        'mounts': config['docker']['mounts'],
        'username': singlecellimage['username'],
        'password': singlecellimage['password'],
        'server': singlecellimage['server'],
    }

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results", "allele_counts.tsv")

    # assumed: remixt parameters are resolved from the remixt config section
    # and the titan ref data dir, matching copy_number_calling_workflow above
    ref_data_dir = config['titan_params']['ref_data_dir']
    snp_positions_filename = remixt.config.get_filename(
        remixt_config, ref_data_dir, 'snp_positions')
    bam_max_fragment_length = remixt.config.get_param(
        remixt_config, 'bam_max_fragment_length')
    bam_max_soft_clipped = remixt.config.get_param(
        remixt_config, 'bam_max_soft_clipped')
    bam_check_proper_pair = remixt.config.get_param(
        remixt_config, 'bam_check_proper_pair')

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=config['titan_params']['chromosomes'])

    if args['input_yaml']:
        bam_files, bai_files = helpers.get_bams(args['input_yaml'])
        cellids = helpers.get_samples(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=cellids,
        )

        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id',),
            func=extract_seqdata.create_extract_seqdata_workflow,
            args=(
                mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files),
                mgd.InputFile('bam_markdups_index', 'cell_id', fnames=bai_files),
                mgd.TempOutputFile("tumour.h5", "cell_id"),
                config,
                config['titan_params'].get('extract_seqdata', {}),
                config['titan_params']['ref_data_dir'],
                snp_positions_filename,
                bam_max_fragment_length,
                bam_max_soft_clipped,
                bam_check_proper_pair,
            ))

        workflow.transform(
            name='merge_all_seqdata',
            ctx=dict(mem=config["memory"]['high'],
                     pool_id=config['pools']['highmem'], **ctx),
            func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
            args=(
                mgd.TempOutputFile("seqdata_normal_all_cells_merged.h5"),
                mgd.TempInputFile("tumour.h5", "cell_id"),
                config["titan_params"]["chromosomes"],
            ),
        )
    else:
        workflow.subworkflow(
            name="extract_seqdata",
            func=extract_seqdata.create_extract_seqdata_workflow,
            args=(
                mgd.InputFile(args['input_bam']),
                mgd.InputFile(args['input_bam'] + '.bai'),
                mgd.TempOutputFile("seqdata_normal_all_cells_merged.h5"),
                config,
                config['titan_params'].get('extract_seqdata', {}),
                config['titan_params']['ref_data_dir'],
                snp_positions_filename,
                bam_max_fragment_length,
                bam_max_soft_clipped,
                bam_check_proper_pair,
            ),
            kwargs={'multiprocess': True})

    if args["normal"]:
        workflow.transform(
            name='infer_snp_genotype',
            axes=('chromosome',),
            ctx={'mem': 16},
            func='remixt.analysis.haplotype.infer_snp_genotype_from_normal',
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                mgd.TempInputFile("seqdata_normal_all_cells_merged.h5"),
                mgd.InputInstance('chromosome'),
                config,
            ),
        )
    else:
        workflow.transform(
            name='infer_snp_genotype',
            axes=('chromosome',),
            ctx={'mem': 16},
            func='remixt.analysis.haplotype.infer_snp_genotype_from_tumour',
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                {'sample': mgd.TempInputFile("seqdata_normal_all_cells_merged.h5")},
                mgd.InputInstance('chromosome'),
                config,
            ),
        )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16},
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haps.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            config,
            config['titan_params']['ref_data_dir'],
        ))

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16},
        func='remixt.utils.merge_tables',
        args=(
            mgd.OutputFile(haplotypes_filename),
            mgd.TempInputFile('haps.tsv', 'chromosome'),
        ))

    workflow.transform(
        name='create_segments',
        func='remixt.analysis.segment.create_segments',
        args=(
            mgd.TempOutputFile('segments.tsv'),
            config,
            config['titan_params']['ref_data_dir'],
        ),
    )

    workflow.transform(
        name='haplotype_allele_readcount',
        ctx={'mem': 20},
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(
            mgd.OutputFile(allele_counts_filename),
            mgd.TempInputFile('segments.tsv'),
            mgd.TempInputFile('tumour.h5', 'cell_id'),
            mgd.InputFile(haplotypes_filename),
            config,
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'infer_haps', "info.yaml")

    results = {
        'infer_haps_allele_counts': helpers.format_file_yaml(allele_counts_filename),
        'infer_haps_data': helpers.format_file_yaml(haplotypes_filename),
    }

    if args['input_yaml']:
        input_datasets = {
            k: helpers.format_file_yaml(v) for k, v in bam_files.items()
        }
    else:
        input_datasets = helpers.format_file_yaml(args['input_bam'])

    metadata = {
        'infer_haps': {
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(mgd.OutputFile(info_file), metadata))

    return workflow