def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('interval'),
        value=list(map(str, list(range(1, 23)) + ['X'])),
    )

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
        )
    )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
        )
    )

    return workflow
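# Illustrative only: a minimal sketch of how the run_MutationSeq factory could be
# driven as a standalone Pypeline run, following the same pattern ctDNA_workflow
# uses below. The config key and file paths are hypothetical placeholders, not
# part of the pipeline.
def run_mutationseq_standalone(args):
    config = helpers.load_yaml(args['config'])  # same helpers module used elsewhere in this file
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = run_MutationSeq(
        config,
        '/path/to/normal.bam',         # hypothetical normal bam
        '/path/to/tumour.bam',         # hypothetical tumour bam
        '/path/to/museq_merged.vcf',   # hypothetical merged output vcf
    )
    pyp.run(workflow)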
def call_copynumber(
        samples, config, tumours, normals, breakpoints, titan_raw_dir,
        remixt_results, remixt_raw_dir, titan_segments, titan_params, titan_markers
):
    breakpoints = {sampid: breakpoints[sampid] for sampid in samples}
    remixt_results = {sampid: remixt_results[sampid] for sampid in samples}
    titan_segments = {sampid: titan_segments[sampid] for sampid in samples}
    titan_params = {sampid: titan_params[sampid] for sampid in samples}
    titan_markers = {sampid: titan_markers[sampid] for sampid in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments', 'sample_id', fnames=titan_segments),
            mgd.OutputFile('titan_params', 'sample_id', fnames=titan_params),
            mgd.OutputFile('titan_markers', 'sample_id', fnames=titan_markers),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('breakpoints', 'sample_id', fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results', 'sample_id', fnames=remixt_results),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads'],
        ),
    )

    return workflow
def patient_workflow(config, patient_id, patient_input, output_file):
    workflow = pypeliner.workflow.Workflow()

    patient_bam_dir = config["bam_directory"] + patient_id
    patient_result_dir = config["results_dir"] + patient_id
    helpers.makedirs(patient_bam_dir)
    helpers.makedirs(patient_result_dir)

    input_args = helpers.create_input_args(patient_input, patient_bam_dir)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=input_args['all_samples'])

    workflow.subworkflow(
        name='align_samples',
        func=alignment.align_sample,
        axes=('sample_id',),
        args=(
            config,
            mgd.InputFile('fastq_1', 'sample_id', fnames=input_args['fastqs_r1']),
            mgd.InputFile('fastq_2', 'sample_id', fnames=input_args['fastqs_r2']),
            mgd.InputInstance('sample_id'),
            mgd.OutputFile('sample.bam', 'sample_id', fnames=input_args['all_bams']),
            mgd.OutputFile('sample.bam.bai', 'sample_id', fnames=input_args['all_bais']),
        ))

    workflow.subworkflow(
        name='run_analyses',
        func=analysis.partition_tumour,
        args=(
            config,
            input_args,
            patient_id,
            patient_result_dir,
            mgd.InputFile('sample.bam', 'sample_id', fnames=input_args['all_bams'], axes_origin=[]),
            mgd.InputFile('sample.bam.bai', 'sample_id', fnames=input_args['all_bais'], axes_origin=[]),
            mgd.OutputFile(output_file),
        ))

    return workflow
def ctDNA_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config'])
    for arg, value in args.items():
        config[arg] = value

    helpers.makedirs(config["bam_directory"])
    helpers.makedirs(config["results_dir"])

    inputs = helpers.load_yaml(args['input_yaml'])
    patients = list(inputs.keys())

    workflow.setobj(
        obj=mgd.OutputChunks('patient_id'),
        value=patients)

    workflow.transform(
        name='get_input_by_patient',
        func=helpers.get_input_by_patient,
        ret=mgd.TempOutputObj('patient_input', 'patient_id'),
        axes=('patient_id',),
        args=(
            inputs,
            mgd.InputInstance('patient_id'),
        ))

    workflow.subworkflow(
        name='patient_workflow',
        func=patient_workflow,
        axes=('patient_id',),
        args=(
            config,
            mgd.InputInstance('patient_id'),
            mgd.TempInputObj('patient_input', 'patient_id'),
            mgd.OutputFile(os.path.join(config['results_dir'], '{patient_id}.log'), 'patient_id'),
        ))

    pyp.run(workflow)
def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf, config):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bam.keys()),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med'], pool_id=config['pools']['highmem'], **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': helpers.get_container_ctx(config['containers'], 'mutationseq')}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med'], pool_id=config['pools']['standard'], **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow
def create_extract_seqdata_workflow(
        bam_filename,
        seqdata_filename,
        config,
        ref_data_dir,
):
    chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)
    snp_positions_filename = remixt.config.get_filename(config, ref_data_dir, 'snp_positions')
    bam_max_fragment_length = remixt.config.get_param(config, 'bam_max_fragment_length')
    bam_max_soft_clipped = remixt.config.get_param(config, 'bam_max_soft_clipped')
    bam_check_proper_pair = remixt.config.get_param(config, 'bam_check_proper_pair')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

    workflow.transform(
        name='create_chromosome_seqdata',
        axes=('chromosome',),
        ctx={'mem': 16},
        func=remixt.seqdataio.create_chromosome_seqdata,
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            mgd.InputFile(snp_positions_filename),
            mgd.InputInstance('chromosome'),
            bam_max_fragment_length,
            bam_max_soft_clipped,
            bam_check_proper_pair,
        ),
    )

    workflow.transform(
        name='merge_seqdata',
        ctx={'mem': 16},
        func=remixt.seqdataio.merge_seqdata,
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow
def align_samples(config, fastq1_inputs, fastq2_inputs, bam_outputs, outdir):
    samples = list(bam_outputs.keys())

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='align_samples',
        func=align_sample,
        axes=('sample_id',),
        args=(
            config,
            mgd.InputFile('input.r1.fastq.gz', 'sample_id', fnames=fastq1_inputs),
            mgd.InputFile('input.r2.fastq.gz', 'sample_id', fnames=fastq2_inputs),
            mgd.OutputFile('output.bam', 'sample_id', fnames=bam_outputs),
            mgd.InputInstance("sample_id"),
            outdir,
        ),
    )

    return workflow
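# Illustrative only: align_samples expects its fastq and bam arguments as plain
# dicts keyed by sample_id, which pypeliner then fans out over the 'sample_id'
# axis. The sample ids and paths below are hypothetical placeholders.
def example_align_samples_inputs():
    fastq1_inputs = {'SA001': '/data/SA001_R1.fastq.gz', 'SA002': '/data/SA002_R1.fastq.gz'}
    fastq2_inputs = {'SA001': '/data/SA001_R2.fastq.gz', 'SA002': '/data/SA002_R2.fastq.gz'}
    bam_outputs = {'SA001': '/results/SA001.bam', 'SA002': '/results/SA002.bam'}
    return fastq1_inputs, fastq2_inputs, bam_outputs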
def cohort_qc_pipeline(args):
    """Process maf, run classify copynumber, make plots.

    Args:
        args ([dict]): [pipeline arguments]
    """
    config = inpututils.load_config(args)
    config = config["cohort_qc"]

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    out_dir = args["out_dir"]
    api_key = args["API_key"]

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    # inputs
    cohort, germline_mafs, vcfs, hmmcopy = inpututils.load_cohort_qc_inputs(
        args["input_yaml"]
    )

    museq = {label: data["museq"] for label, data in vcfs.items()}
    strelka_snv = {label: data["strelka_snv"] for label, data in vcfs.items()}
    strelka_indel = {label: data["strelka_indel"] for label, data in vcfs.items()}
    hmmcopy_files = {label: data["hmmcopy"] for label, data in hmmcopy.items()}
    hmmcopy_metrics_files = {label: data["hmmcopy_metrics"] for label, data in hmmcopy.items()}

    # outputs
    cbiofile_paths = get_cbioportal_paths(os.path.join(out_dir, cohort))
    maftools_filepaths = get_maftools_paths(os.path.join(out_dir, cohort))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_label', 'library_label'),
        value=list(museq.keys()),
    )

    workflow.subworkflow(
        name="merge_somatic_mafs",
        func="single_cell.workflows.cohort_qc.merge_somatic_mafs",
        axes=('sample_label',),
        args=(
            mgd.InputInstance('sample_label'),
            config,
            mgd.InputFile('museq', 'sample_label', 'library_label', fnames=museq, axes_origin=[]),
            mgd.InputFile('strelka_snv', 'sample_label', 'library_label', fnames=strelka_snv, axes_origin=[]),
            mgd.InputFile('strelka_indel', 'sample_label', 'library_label', fnames=strelka_indel, axes_origin=[]),
            mgd.TempOutputFile('somatic_maf', 'sample_label')
        ),
    )

    workflow.subworkflow(
        name="classifycopynumber",
        func="single_cell.workflows.cohort_qc.cna_annotation_workflow",
        args=(
            config,
            mgd.InputFile('hmmcopy_dict', 'sample_label', 'library_label', fnames=hmmcopy_files, axes_origin=[]),
            mgd.InputFile('hmmcopy_metrics_dict', 'sample_label', 'library_label', fnames=hmmcopy_metrics_files, axes_origin=[]),
            mgd.OutputFile(cbiofile_paths["cna_table"]),
            mgd.OutputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(cbiofile_paths["segments"]),
            config["gtf"],
        ),
    )

    workflow.subworkflow(
        name="maf_annotation_workflow",
        func="single_cell.workflows.cohort_qc.preprocess_mafs_workflow",
        args=(
            config,
            mgd.InputFile('germline_mafs_dict', 'sample_label', fnames=germline_mafs, axes_origin=[]),
            mgd.TempInputFile('somatic_maf', 'sample_label', axes_origin=[]),
            mgd.OutputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.OutputFile(cbiofile_paths["annotated_somatic_maf"]),
            api_key
        ),
    )

    workflow.subworkflow(
        name="make_plots_and_report",
        func="single_cell.workflows.cohort_qc.create_cohort_oncoplot",
        args=(
            config,
            mgd.InputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.InputFile(cbiofile_paths["annotated_somatic_maf"]),
            mgd.InputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(maftools_filepaths["maftools_maf"]),
            mgd.OutputFile(maftools_filepaths["cohort_oncoplot"]),
            mgd.OutputFile(maftools_filepaths["report"]),
            cohort
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            list(cbiofile_paths.values()) + list(maftools_filepaths.values()),
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'cohort_qc'}
        }
    )

    pyp.run(workflow)
def process_cells_destruct(
        destruct_config, cell_bam_files,
        reads_1, reads_2, sample_1, sample_2, stats,
        tag=False
):
    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1}
    cells = list(cell_bam_files.keys())

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cells,
    )

    workflow.transform(
        name='bamdisc_and_numreads_cell',
        func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
        axes=('cell_id',),
        ctx={'io': 1, 'mem': 8},
        ret=mgd.TempOutputObj("numreads", "cell_id"),
        args=(
            destruct_config,
            mgd.InputFile('bam', 'cell_id', fnames=cell_bam_files),
            mgd.TempOutputFile('cell_stats', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.TempSpace('bamdisc_cell_tempspace', 'cell_id'),
        ),
    )

    workflow.transform(
        name='merge_read_counts',
        ret=mgd.TempOutputObj("readcounts"),
        func="single_cell.workflows.destruct_singlecell.tasks.merge_read_counts",
        ctx={'io': 1, 'mem': 8},
        args=(mgd.TempInputObj('numreads', 'cell_id'),))

    workflow.transform(
        name='reindex_reads',
        func="single_cell.workflows.destruct_singlecell.tasks.re_index_reads_both",
        ctx={'io': 1, 'mem': 8},
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            cells,
            mgd.TempInputObj('readcounts'),
        ),
        kwargs={'tag': tag})

    workflow.transform(
        name='merge_reads_r1',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_1),
        ),
    )

    workflow.transform(
        name='merge_reads_r2',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_2),
        ),
    )

    workflow.transform(
        name='merge_sample',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.resample_fastqs",
        args=(
            mgd.TempInputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.OutputFile(sample_1),
            mgd.OutputFile(sample_2),
            destruct_config['num_read_samples'],
        ),
    )

    workflow.transform(
        name='merge_stats',
        ctx={'io': 1, 'mem': 8},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_stats",
        args=(
            mgd.TempInputFile('cell_stats', 'cell_id'),
            mgd.OutputFile(stats),
        ),
    )

    return workflow
def create_split_workflow(normal_bam, normal_split_bam, regions, config, by_reads=False):
    normal_split_bam = {ival: normal_split_bam[ival] for ival in regions}
    one_split_job = config["one_split_job"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    # splitting by reads always runs on a single node
    if by_reads:
        workflow.transform(
            name='split_normal_bam',
            ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.split_bams.tasks.split_bam_file_by_reads",
            args=(
                mgd.InputFile(normal_bam, extensions=['.bai']),
                mgd.OutputFile("normal.split.bam", "region", fnames=normal_split_bam, axes_origin=[], extensions=['.bai']),
                mgd.TempSpace("bam_split_by_reads"),
                regions,
            ),
        )
    elif one_split_job:
        workflow.transform(
            name='split_normal_bam',
            ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.split_bams.tasks.split_bam_file_one_job",
            args=(
                mgd.InputFile(normal_bam, extensions=['.bai']),
                mgd.OutputFile("normal.split.bam", "region", fnames=normal_split_bam, axes_origin=[], extensions=['.bai']),
                regions,
                mgd.TempSpace("one_job_split_tempdir"),
            ),
            kwargs={"ncores": config["max_cores"]})
    else:
        workflow.transform(
            name='split_normal_bam',
            ctx={'mem': config['memory']['low'], 'ncpus': config['max_cores']},
            axes=('region',),
            func="single_cell.workflows.split_bams.tasks.split_bam_file",
            args=(
                mgd.InputFile(normal_bam, extensions=['.bai']),
                mgd.OutputFile("normal.split.bam", "region", fnames=normal_split_bam, extensions=['.bai']),
                mgd.InputInstance('region'),
            ))

    return workflow
def create_split_workflow(normal_bam, normal_bai, normal_split_bam, normal_split_bai, regions, config, by_reads=False):
    ctx = {'mem_retry_increment': 2}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    normal_split_bam = {ival: normal_split_bam[ival] for ival in regions}
    normal_split_bai = {ival: normal_split_bai[ival] for ival in regions}
    one_split_job = config["one_split_job"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    # splitting by reads always runs on a single node
    if by_reads:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'], pool_id=config['pools']['multicore'], ncpus=config['max_cores'], **ctx),
            func="single_cell.workflows.split_bams.tasks.split_bam_file_by_reads",
            args=(
                mgd.InputFile(normal_bam),
                mgd.InputFile(normal_bai),
                mgd.OutputFile("normal.split.bam", "region", fnames=normal_split_bam, axes_origin=[]),
                mgd.OutputFile("normal.split.bam.bai", "region", fnames=normal_split_bai, axes_origin=[]),
                mgd.TempSpace("bam_split_by_reads"),
                regions,
                helpers.get_container_ctx(config['containers'], 'samtools'),
            ),
        )
    elif one_split_job:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'], pool_id=config['pools']['multicore'], ncpus=config['max_cores'], **ctx),
            func="single_cell.workflows.split_bams.tasks.split_bam_file_one_job",
            args=(
                mgd.InputFile(normal_bam, extensions=['.bai']),
                mgd.OutputFile("normal.split.bam", "region", fnames=normal_split_bam, axes_origin=[], extensions=['.bai']),
                regions,
                helpers.get_container_ctx(config['containers'], 'samtools'),
            ),
            kwargs={"ncores": config["max_cores"]})
    else:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'], pool_id=config['pools']['standard'], ncpus=1, **ctx),
            axes=('region',),
            func="single_cell.workflows.split_bams.tasks.split_bam_file",
            args=(
                mgd.InputFile(normal_bam),
                mgd.InputFile(normal_bai),
                mgd.OutputFile("normal.split.bam", "region", fnames=normal_split_bam),
                mgd.OutputFile("normal.split.bam.bai", "region", fnames=normal_split_bai),
                mgd.InputInstance('region'),
                helpers.get_container_ctx(config['containers'], 'samtools'),
            ))

    return workflow
def create_titan_workflow(tumour_bam, normal_bam, targets, titan_raw_dir,
                          segments, params, markers, global_config, config,
                          intervals, sample_id):
    titan_outdir = os.path.join(titan_raw_dir, 'clusters_{numclusters}', 'ploidy_{ploidy}')
    igv_template = os.path.join(titan_outdir, 'igv_segs.txt')
    outfile_template = os.path.join(titan_outdir, 'titan_markers.txt')
    params_template = os.path.join(titan_outdir, 'titan_params.txt')
    segs_template = os.path.join(titan_outdir, 'titan_segs.txt')
    plots_template = os.path.join(titan_outdir, 'titan_plots.tar.gz')
    parsed_template = os.path.join(titan_outdir, 'titan_parsed.csv')
    museq_vcf = os.path.join(titan_raw_dir, 'museq.vcf')

    chunks = [(v['num_clusters'], v['ploidy']) for v in intervals]

    targets = mgd.InputFile(targets) if targets else None

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func=tasks.generate_intervals,
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        ret=mgd.OutputChunks('interval'),
        args=(config['reference_genome'], config['chromosomes']))

    workflow.transform(
        name='run_museq',
        ctx={'mem': global_config['memory']['high'], 'ncpus': global_config['threads'], 'walltime': '02:00'},
        func=tasks.run_museq,
        axes=('interval',),
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
            config['reference_genome'],
            mgd.InputInstance('interval'),
            config['museq_params'],
        ),
    )

    workflow.transform(
        name='merge_vcfs',
        ctx={'num_retry': 3, 'mem_retry_increment': 2, 'mem': global_config['memory']['high'], 'ncpus': 1},
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval'),
            mgd.OutputFile(museq_vcf),
        ))

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx={'mem': global_config['memory']['high'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.convert_museq_vcf2counts,
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            config,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx={'mem': global_config['memory']['high'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.run_readcounter,
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            config,
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx={'mem': global_config['memory']['high'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.run_readcounter,
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            config,
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.calc_correctreads_wig,
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            config,
        ),
    )

    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx={'mem': global_config['memory']['high'], 'ncpus': 1, 'walltime': '06:00'},
        func=tasks.run_titan,
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.OutputFile('titan_outfile', 'numclusters', 'ploidy', template=outfile_template),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.OutputFile('titan_params', 'numclusters', 'ploidy', template=params_template),
            config['titan_params'],
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
        ))

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.plot_titan,
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.InputFile('titan_params', 'numclusters', 'ploidy', template=params_template),
            mgd.OutputFile('titan_plots', 'numclusters', 'ploidy', template=plots_template),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            config,
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
        ),
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.calc_cnsegments_titan,
        args=(
            mgd.InputFile('titan_outfile', 'numclusters', 'ploidy', template=outfile_template),
            mgd.OutputFile('titan_igv', 'numclusters', 'ploidy', template=igv_template),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
        ),
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.annot_pygenes,
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.OutputFile('titan_segs.csv', 'numclusters', 'ploidy', template=segs_template),
            config,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.parse_titan,
        args=(
            mgd.InputFile('titan_segs.csv', 'numclusters', 'ploidy', template=segs_template),
            mgd.InputFile('titan_params', 'numclusters', 'ploidy', template=params_template),
            mgd.InputFile('titan_outfile', 'numclusters', 'ploidy', template=outfile_template),
            mgd.OutputFile('titan_parsed.csv', 'numclusters', 'ploidy', template=parsed_template),
            config['parse_titan'],
            sample_id,
        ),
    )

    workflow.transform(
        name='segments_h5',
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.merge_to_h5,
        args=(
            mgd.InputFile('titan_segs.csv', 'numclusters', 'ploidy', template=segs_template),
            mgd.OutputFile(segments),
            intervals,
        ),
    )

    workflow.transform(
        name='params_h5',
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.merge_to_h5,
        args=(
            mgd.InputFile('titan_params', 'numclusters', 'ploidy', template=params_template),
            mgd.OutputFile(params),
            intervals,
        ),
    )

    workflow.transform(
        name='markers_h5',
        ctx={'mem': global_config['memory']['low'], 'ncpus': 1, 'walltime': '02:00'},
        func=tasks.merge_to_h5,
        args=(
            mgd.InputFile('titan_outfile', 'numclusters', 'ploidy', template=outfile_template),
            mgd.OutputFile(markers),
            intervals,
        ),
        kwargs={'dtype': {'Chr': str}})

    return workflow
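# Illustrative only: `intervals` (config['cna_calling']['titan_intervals'] in the
# callers above) is iterated as a list of dicts with 'num_clusters' and 'ploidy'
# keys, which become the (numclusters, ploidy) axis chunks. The values below are
# hypothetical examples, not recommended settings.
EXAMPLE_TITAN_INTERVALS = [
    {'num_clusters': 1, 'ploidy': 2},
    {'num_clusters': 2, 'ploidy': 2},
    {'num_clusters': 1, 'ploidy': 4},
]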
def infer_haps(
        bam_file,
        haplotypes_filename,
        config,
        from_tumour=False,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}
    remixt_image = config['docker']['remixt']

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1, **baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # don't parallelize over chromosomes for per-cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id',),
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'docker_image': remixt_image},
            args=(
                mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                remixt_config,
                remixt_ref_data_dir,
            ),
            kwargs={'no_parallelism': True}
        )

        workflow.transform(
            name='merge_all_seqdata',
            func="remixt.seqdataio.merge_overlapping_seqdata",
            ctx={'docker_image': remixt_image},
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                config["chromosomes"]
            ),
        )
    else:
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150, 'docker_image': remixt_image},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if from_tumour:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.utils.merge_tables',
        args=(
            mgd.TempOutputFile('haplotypes_merged.tsv'),
            mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
        )
    )

    workflow.transform(
        name='finalize_csv',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('haplotypes_merged.tsv'),
            mgd.OutputFile(haplotypes_filename, extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['haplotypes']
        },
    )

    return workflow
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf, sample_id, reference,
        chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    cn_params = config.default_params('copynumber_calling')

    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='2:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(memory=15, walltime='96:00', ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='4:00'),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00'),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(memory=10, walltime='16:00', disk=200),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(memory=10, walltime='16:00', disk=200),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00'),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=15, walltime='72:00', ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=10, walltime='16:00'),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00'),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
def cna_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("target_list", 'sample_id', fnames=targets, axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id', axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id', axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id', axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id', axes_origin=[], fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id', axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf, config):
    museq_docker = {'docker_image': config['docker']['mutationseq']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1,
           'num_retry': 3, 'docker_image': config['docker']['single_cell_pipeline']}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bam.keys()),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med']),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam, extensions=['.bai']),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': museq_docker}
    )

    workflow.transform(
        name='finalise_region_vcfs',
        axes=('region',),
        ctx=dict(mem=config["memory"]['med']),
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.vcf.gz', 'region', extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_config': vcftools_docker}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med']),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf.gz', 'region', extensions=['.tbi', '.csi']),
            mgd.TempOutputFile('museq.vcf.gz', extensions=['.tbi', '.csi']),
        ),
        kwargs={
            'allow_overlap': True,
            'docker_config': vcftools_docker
        },
    )

    workflow.transform(
        name='finalise_vcf',
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        ctx=dict(mem=config["memory"]['med']),
        args=(
            mgd.TempInputFile('museq.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_config': vcftools_docker}
    )

    return workflow
def extract_allele_readcounts(
        haplotypes_filename,
        cell_bams,
        allele_counts_filename,
        config,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}
    remixt_image = config['docker']['remixt']

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    workflow = pypeliner.workflow.Workflow(ctx=baseimage)

    workflow.set_filenames('cell.bam', 'cell_id', fnames=cell_bams)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(cell_bams.keys()),
    )

    workflow.subworkflow(
        name='create_seqdata_readcounts',
        axes=('cell_id',),
        func='remixt.workflow.create_extract_seqdata_workflow',
        ctx={'docker_image': remixt_image},
        args=(
            mgd.InputFile('cell.bam', 'cell_id', extensions=['.bai']),
            mgd.TempOutputFile('seqdata.h5', 'cell_id', axes_origin=[]),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={'no_parallelism': True})

    # TODO Segments with bin width from single cell
    workflow.transform(
        name='create_segments',
        func='remixt.analysis.segment.create_segments',
        ctx={'mem': 16, 'docker_image': remixt_image},
        args=(
            mgd.TempOutputFile('segments.tsv'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='generate_haplotypes_tsv',
        func='single_cell.workflows.extract_allele_readcounts.tasks.convert_csv_to_tsv',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.TempOutputFile('haplotypes.tsv'),
        ))

    workflow.transform(
        name='haplotype_allele_readcount',
        axes=('cell_id',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(
            mgd.TempOutputFile('allele_counts.tsv', 'cell_id', axes_origin=[]),
            mgd.TempInputFile('segments.tsv'),
            mgd.TempInputFile('seqdata.h5', 'cell_id'),
            mgd.TempInputFile('haplotypes.tsv'),
            remixt_config,
        ),
    )

    workflow.transform(
        name='prep_readcount_csv',
        axes=('cell_id',),
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('allele_counts.tsv', 'cell_id'),
            mgd.TempOutputFile('allele_counts.csv.gz', 'cell_id', extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['readcount']
        },
    )

    workflow.transform(
        name='readcounts_cell_id_annotate',
        axes=('cell_id',),
        func='single_cell.utils.csvutils.add_col_from_dict',
        args=(
            mgd.TempInputFile('allele_counts.csv.gz', 'cell_id', extensions=['.yaml']),
            {'cell_id': mgd.InputInstance('cell_id')},
            mgd.TempOutputFile('allele_counts_annotate.csv.gz', 'cell_id', extensions=['.yaml']),
            dtypes()['readcount'],
        ),
    )

    workflow.transform(
        name='merge_allele_readcount',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('allele_counts_annotate.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.OutputFile(allele_counts_filename, extensions=['.yaml']),
        ),
        kwargs={'write_header': True},
    )

    return workflow
def create_infer_haps_workflow(
        seqdata_filenames,
        haps_filename,
        config,
        ref_data_dir,
        normal_id=None,
):
    chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

    if normal_id is not None:
        normal_seqdata_filename = seqdata_filenames[normal_id]

        workflow.transform(
            name='infer_snp_genotype_from_normal',
            axes=('chromosome',),
            ctx={'mem': 16},
            func=remixt.analysis.haplotype.infer_snp_genotype_from_normal,
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                mgd.InputFile(normal_seqdata_filename),
                mgd.InputInstance('chromosome'),
                config,
            ),
        )
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('tumour_id'),
            value=list(seqdata_filenames.keys()),
        )

        workflow.transform(
            name='infer_snp_genotype_from_tumour',
            axes=('chromosome',),
            ctx={'mem': 16},
            func=remixt.analysis.haplotype.infer_snp_genotype_from_tumour,
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                mgd.InputFile('tumour_seqdata', 'tumour_id', fnames=seqdata_filenames),
                mgd.InputInstance('chromosome'),
                config,
            ),
        )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16},
        func=remixt.analysis.haplotype.infer_haps,
        args=(
            mgd.TempOutputFile('haps.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            config,
            ref_data_dir,
        )
    )

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16},
        func=remixt.utils.merge_tables,
        args=(
            mgd.OutputFile(haps_filename),
            mgd.TempInputFile('haps.tsv', 'chromosome'),
        )
    )

    return workflow
def create_remixt_seqdata_workflow(
        breakpoint_filename,
        seqdata_filenames,
        results_filenames,
        raw_data_directory,
        config,
        ref_data_dir,
        normal_id=None,
):
    sample_ids = list(seqdata_filenames.keys())
    tumour_ids = list(seqdata_filenames.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    results_filenames = {tumour_id: results_filenames[tumour_id] for tumour_id in tumour_ids}

    segment_filename = os.path.join(raw_data_directory, 'segments.tsv')
    haplotypes_filename = os.path.join(raw_data_directory, 'haplotypes.tsv')
    counts_table_template = os.path.join(raw_data_directory, 'counts', 'sample_{tumour_id}.tsv')
    experiment_template = os.path.join(raw_data_directory, 'experiment', 'sample_{tumour_id}.pickle')
    ploidy_plots_template = os.path.join(raw_data_directory, 'ploidy_plots', 'sample_{tumour_id}.pdf')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    workflow.transform(
        name='create_segments',
        func=remixt.analysis.segment.create_segments,
        args=(
            mgd.OutputFile(segment_filename),
            config,
            ref_data_dir,
        ),
        kwargs={
            'breakpoint_filename': mgd.InputFile(breakpoint_filename),
        },
    )

    workflow.subworkflow(
        name='infer_haps_workflow',
        func=remixt.workflow.create_infer_haps_workflow,
        args=(
            mgd.InputFile('seqdata', 'sample_id', fnames=seqdata_filenames),
            mgd.OutputFile(haplotypes_filename),
            config,
            ref_data_dir,
        ),
        kwargs={
            'normal_id': normal_id,
        }
    )

    workflow.subworkflow(
        name='prepare_counts_workflow',
        func=remixt.workflow.create_prepare_counts_workflow,
        args=(
            mgd.InputFile(segment_filename),
            mgd.InputFile(haplotypes_filename),
            mgd.InputFile('seqdata', 'tumour_id', fnames=seqdata_filenames),
            mgd.TempOutputFile('rawcounts', 'tumour_id', axes_origin=[]),
            config,
        ),
    )

    workflow.subworkflow(
        name='calc_bias_workflow',
        axes=('tumour_id',),
        func=remixt.workflow.create_calc_bias_workflow,
        args=(
            mgd.InputFile('seqdata', 'tumour_id', fnames=seqdata_filenames),
            mgd.TempInputFile('rawcounts', 'tumour_id'),
            mgd.OutputFile('counts', 'tumour_id', template=counts_table_template),
            config,
            ref_data_dir,
        ),
    )

    workflow.transform(
        name='create_experiment',
        axes=('tumour_id',),
        ctx={'mem': 8},
        func=remixt.analysis.experiment.create_experiment,
        args=(
            mgd.InputFile('counts', 'tumour_id', template=counts_table_template),
            mgd.InputFile(breakpoint_filename),
            mgd.OutputFile('experiment', 'tumour_id', template=experiment_template),
        ),
    )

    workflow.transform(
        name='ploidy_analysis_plots',
        axes=('tumour_id',),
        ctx={'mem': 8},
        func=remixt.cn_plot.ploidy_analysis_plots,
        args=(
            mgd.InputFile('experiment', 'tumour_id', template=experiment_template),
            mgd.OutputFile('plots', 'tumour_id', template=ploidy_plots_template),
        ),
    )

    workflow.subworkflow(
        name='fit_model',
        axes=('tumour_id',),
        func=remixt.workflow.create_fit_model_workflow,
        args=(
            mgd.InputFile('experiment', 'tumour_id', template=experiment_template),
            mgd.OutputFile('results', 'tumour_id', fnames=results_filenames),
            config,
            ref_data_dir,
        ),
        kwargs={
            'tumour_id': mgd.InputInstance('tumour_id'),
        },
    )

    return workflow
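# Illustrative only: create_remixt_seqdata_workflow expects seqdata and results
# paths keyed by sample id, with normal_id naming the entry excluded from the
# tumour set. The ids and paths below are hypothetical placeholders.
EXAMPLE_REMIXT_SEQDATA_INPUTS = {
    'seqdata_filenames': {'normal': '/data/normal_seqdata.h5', 'tumour_a': '/data/tumour_a_seqdata.h5'},
    'results_filenames': {'tumour_a': '/results/tumour_a_results.h5'},
    'normal_id': 'normal',
}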
def create_museq_workflow(
        snv_vcf, museqportrait_pdf, reference, chromosomes,
        thousand_genomes=None, dbsnp=None, germline_refdata=None,
        tumour_bam=None, normal_bam=None, single_node=None):
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00', ncpus='8', disk=600),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            })
    else:
        workflow.transform(
            name=name,
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                params['museq_params'],
            ),
            kwargs={
                'tumour_bam': tumour_bam,
                'normal_bam': normal_bam,
                'docker_image': config.containers('mutationseq'),
            })

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(memory=5, walltime='8:00'),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    return workflow
def create_mutect_workflow(
        normal_bam, tumour_bam, snv_vcf, snv_maf, reference, reference_vep,
        chromosomes, normal_id, tumour_id, single_node=None):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutect.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00', ncpus=8, disk=600),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(
                mgd.TempSpace("run_mutect_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam),
            ),
        )
    else:
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(
                mgd.TempOutputFile('mutect.vcf', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam),
                mgd.TempSpace('mutect_temp', 'interval'),
            ),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(
        name="mutect_snv_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    return workflow
def infer_haps(
        bam_file,
        haplotypes_filename,
        allele_counts_filename,
        config,
        normal=False,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1, **baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # don't parallelize over chromosomes for per-cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id',),
            func='single_cell.workflows.extract_seqdata.create_extract_seqdata_workflow',
            args=(
                mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                config.get('extract_seqdata', {}),
                config['ref_data_dir'],
                config,
            ))

        workflow.transform(
            name='merge_all_seqdata',
            func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                config["chromosomes"],
            ),
        )
    else:
        # a single bam is probably whole genome, so parallelize over chromosomes
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if normal:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome',),
        ctx=dict(mem=16, **ctx),
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='merge_haps',
        ctx=dict(mem=16, **ctx),
        func='remixt.utils.merge_tables',
        args=(
            mgd.OutputFile(haplotypes_filename),
            mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
        ))

    workflow.transform(
        name='create_segments',
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.segment.create_segments',
        args=(
            mgd.TempOutputFile('segments.tsv'),
            remixt_config,
            config['ref_data_dir'],
        ),
    )

    workflow.transform(
        name='haplotype_allele_readcount',
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(
            mgd.OutputFile(allele_counts_filename),
            mgd.TempInputFile('segments.tsv'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputFile(haplotypes_filename),
            remixt_config,
        ),
    )

    return workflow
def partition_tumour(config, input_args, patient_id, results_dir, input_bams, input_bais, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('tumour_id'), value=input_args['tumour_samples'])
    workflow.setobj(obj=mgd.OutputChunks('normal_id'), value=input_args['normal_samples'])

    workflow.transform(
        name='merge_normal',
        func=tasks.merge_normal,
        args=(
            config,
            mgd.InputFile('normal.bam', 'normal_id', fnames=input_args['normal_bams'], axes_origin=[]),
            mgd.OutputFile(os.path.join(input_args['patient_bam_dir'], 'merged_normal.bam')),
            mgd.OutputFile(os.path.join(input_args['patient_bam_dir'], 'merged_normal.bam.bai')),
        ))

    workflow.subworkflow(
        name='analyze_tumour',
        func=analyze_tumour_normal,
        axes=('tumour_id',),
        args=(
            config,
            input_args,
            results_dir,
            mgd.InputFile(os.path.join(input_args['patient_bam_dir'], 'merged_normal.bam')),
            mgd.InputInstance('tumour_id'),
            mgd.InputFile('tumour.bam', 'tumour_id', fnames=input_bams),
            mgd.OutputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.snv.tsv'), 'tumour_id'),
            mgd.OutputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.indel.tsv'), 'tumour_id'),
            mgd.TempOutputFile('snv.vcf', 'tumour_id'),
            mgd.TempOutputFile('indel.vcf', 'tumour_id'),
        ))

    workflow.transform(
        name='annotate_snvs',
        func=tasks.annotate_outputs,
        axes=('tumour_id',),
        args=(
            config,
            mgd.TempSpace('snv_annotation_space', 'tumour_id'),
            mgd.TempInputFile('snv.vcf', 'tumour_id'),
            mgd.OutputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.snv.txt'), 'tumour_id'),
        ))

    workflow.transform(
        name='annotate_indels',
        func=tasks.annotate_outputs,
        axes=('tumour_id',),
        args=(
            config,
            mgd.TempSpace('indel_annotation_space', 'tumour_id'),
            mgd.TempInputFile('indel.vcf', 'tumour_id'),
            mgd.OutputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.indel.txt'), 'tumour_id'),
        ))

    workflow.transform(
        name='vcf_annotate_indels',
        func=tasks.vcf_annotate_outputs,
        axes=('tumour_id',),
        args=(
            config,
            mgd.TempSpace('indel_vcf_annotation_space', 'tumour_id'),
            mgd.TempInputFile('indel.vcf', 'tumour_id'),
            mgd.OutputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.indel.vcf'), 'tumour_id'),
        ))

    workflow.transform(
        name='vcf_annotate_snvs',
        func=tasks.vcf_annotate_outputs,
        axes=('tumour_id',),
        args=(
            config,
            mgd.TempSpace('snv_vcf_annotation_space', 'tumour_id'),
            mgd.TempInputFile('snv.vcf', 'tumour_id'),
            mgd.OutputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.snv.vcf'), 'tumour_id'),
        ))

    workflow.transform(
        name='log_patient_analysis',
        func=tasks.log_patient_analysis,
        args=(
            mgd.InputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.snv.tsv'), 'tumour_id', axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.indel.tsv'), 'tumour_id', axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.snv.txt'), 'tumour_id', axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.indel.txt'), 'tumour_id', axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.snv.vcf'), 'tumour_id', axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir, patient_id + '_{tumour_id}.indel.vcf'), 'tumour_id', axes_origin=[]),
            mgd.OutputFile(output_file),
        ))

    return workflow
def run_LoLoPicker(config, args, normal_bam, tumour_bam, output_file): workflow = pypeliner.workflow.Workflow() workflow.setobj(obj=mgd.OutputChunks('region', ), value=list(map(str, range(1, 23))) + ['X']) workflow.transform(name='create_axes_beds', axes=('region', ), func=tasks.create_axes_beds, args=(mgd.InputFile(config["bed_file"]), mgd.InputInstance('region'), mgd.TempOutputFile('region.bed', 'region'))) workflow.transform(name='LoLoPicker_somatic', axes=('region', ), func=tasks.LoLoPicker_somatic, args=(config, mgd.InputFile(tumour_bam), mgd.InputFile(normal_bam), mgd.TempInputFile('region.bed', 'region'), mgd.TempSpace('LoLoPicker_somatic_temp', 'region'), mgd.TempOutputFile("raw_somatic_variants.txt", 'region'))) workflow.transform(name='make_sample_list', func=tasks.make_sample_list, args=( args, mgd.TempOutputFile('samplelist.txt'), )) workflow.transform(name='LoLoPicker_control', axes=('region', ), func=tasks.LoLoPicker_control, args=(config, mgd.TempInputFile('samplelist.txt'), mgd.TempSpace('LoLoPicker_control_temp', 'region'), mgd.TempInputFile("raw_somatic_variants.txt", 'region'), mgd.TempOutputFile("control_stats.txt", 'region'))) workflow.transform(name='LoLoPicker_stats', axes=('region', ), func=tasks.LoLoPicker_stats, args=( mgd.TempSpace('LoLoPicker_stats_temp', 'region'), mgd.TempInputFile("raw_somatic_variants.txt", 'region'), mgd.TempInputFile("control_stats.txt", 'region'), mgd.TempOutputFile("stats_calls.txt", 'region'), )) workflow.transform(name='merge_LoLoPicker', func=tasks.merge_LoLoPicker, args=(mgd.TempSpace("merge_LoLo"), mgd.TempInputFile("stats_calls.txt", 'region', axes_origin=[]), mgd.OutputFile(output_file))) return workflow
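The create_axes_beds transform splits the configured bed file into one bed per 'region' chunk so each LoLoPicker_somatic job only scans its own chromosome. A minimal sketch of such a splitter, assuming a plain tab-separated bed with the chromosome in the first column; the real tasks.create_axes_beds is not shown here:

def create_axes_beds(bed_file, region, region_bed):
    # Hypothetical sketch: keep only intervals whose chromosome matches `region`,
    # tolerating both 'chr1' and '1' style names.
    with open(bed_file) as infile, open(region_bed, 'w') as outfile:
        for line in infile:
            chrom = line.split('\t')[0]
            if chrom == region or chrom == 'chr' + region:
                outfile.write(line)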
def breakpoint_calling_workflow(args): pyp = pypeliner.app.Pypeline(config=args) inputs = helpers.load_yaml(args['input_yaml']) meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml') input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml') tumours = helpers.get_values_from_input(inputs, 'tumour') normals = helpers.get_values_from_input(inputs, 'normal') samples = list(tumours.keys()) sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}') destruct_breakpoints = os.path.join( sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz') destruct_library = os.path.join(sv_outdir, '{sample_id}_destruct_library.csv.gz') destruct_raw_breakpoints = os.path.join( sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz') destruct_raw_library = os.path.join( sv_outdir, '{sample_id}_destruct_raw_library.csv.gz') destruct_reads = os.path.join(sv_outdir, '{sample_id}_destruct_reads.csv.gz') lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf') parsed_csv = os.path.join(sv_outdir, '{sample_id}_filtered_consensus_calls.csv.gz') svaba_vcf = os.path.join(sv_outdir, '{sample_id}_svaba.vcf') single_node = args['single_node'] refdir_paths = config.refdir_data(args['refdir'])['paths'] chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes'] workflow = pypeliner.workflow.Workflow() workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples) workflow.subworkflow( name='destruct', func=destruct_wgs.create_destruct_wgs_workflow, axes=('sample_id', ), args=(mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]), mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]), mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints), mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library), mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints), mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library), mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads), mgd.InputInstance('sample_id'), refdir_paths['reference'], refdir_paths['refdata_destruct'], refdir_paths['gtf'], refdir_paths['blacklist_destruct']), kwargs={'single_node': single_node}) workflow.subworkflow( name='lumpy', func=lumpy.create_lumpy_workflow, axes=('sample_id', ), args=(mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf), ), kwargs={ 'tumour_bam': mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]), 'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]), 'single_node': single_node }, ) if args['svaba']: workflow.subworkflow( name='svaba', func=svaba.create_svaba_workflow, axes=('sample_id', ), args=( mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]), mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]), mgd.OutputFile('svaba_vcf', 'sample_id', template=svaba_vcf), refdir_paths['reference'], ), ) workflow.subworkflow( name="consensus_calling", func=breakpoint_calling_consensus.create_consensus_workflow, axes=('sample_id', ), args=(mgd.InputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints), mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf), mgd.OutputFile('consensus_calls', 'sample_id', template=parsed_csv, extensions=['.yaml']), chromosomes), ) filenames = [ destruct_breakpoints, destruct_library, 
destruct_raw_breakpoints, destruct_raw_library, destruct_reads, lumpy_vcf, parsed_csv ] if args['svaba']: filenames.append(svaba_vcf) outputted_filenames = helpers.expand_list(filenames, samples, "sample_id") workflow.transform(name='generate_meta_files_results', func=helpers.generate_and_upload_metadata, args=(sys.argv[0:], args["out_dir"], outputted_filenames, mgd.OutputFile(meta_yaml)), kwargs={ 'input_yaml_data': helpers.load_yaml(args['input_yaml']), 'input_yaml': mgd.OutputFile(input_yaml_blob), 'metadata': { 'type': 'breakpoint_calling' } }) pyp.run(workflow)
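The metadata step needs concrete file paths rather than the '{sample_id}' templates used to declare the workflow, which is what helpers.expand_list provides. A minimal sketch under that assumption; the real helpers.expand_list may behave differently:

def expand_list(templates, samples, key):
    # Hypothetical sketch: render every '{sample_id}'-style template once per sample
    # so generate_and_upload_metadata sees the actual output filenames.
    return [template.format(**{key: sample}) for template in templates for sample in samples]

# e.g. expand_list(['{sample_id}_lumpy.vcf'], ['SA1', 'SA2'], 'sample_id')
# -> ['SA1_lumpy.vcf', 'SA2_lumpy.vcf']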
def lumpy_preprocess_cells(config, bam_files, merged_discordants, merged_splitters, hist_csv, mean_stdev_obj): ctx = { 'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1, } histogram_settings = dict(N=10000, skip=0, min_elements=100, mads=10, X=4, read_length=101) workflow = pypeliner.workflow.Workflow(ctx=ctx) workflow.setobj( obj=mgd.OutputChunks('cell_id'), value=list(bam_files.keys()), ) workflow.transform( name='process_tumour_cells', axes=('cell_id', ), ctx={ 'mem': 8, 'ncpus': 1 }, func='single_cell.workflows.lumpy.tasks.process_bam', args=( mgd.InputFile('tumour_bam', 'cell_id', fnames=bam_files, extensions=['.bai']), mgd.TempOutputFile('tumour.discordants.sorted.bam', 'cell_id'), mgd.TempOutputFile('tumour.splitters.sorted.bam', 'cell_id'), mgd.TempOutputFile('hist.csv', 'cell_id'), mgd.TempSpace("lumpy_tumour_processing", "cell_id"), ), kwargs=dict(tag=mgd.InputInstance('cell_id'), **histogram_settings), ) workflow.transform( name='merge_disc', ctx={ 'mem': 8, 'ncpus': 1 }, func='single_cell.workflows.lumpy.tasks.merge_bams', args=(mgd.TempInputFile('tumour.discordants.sorted.bam', 'cell_id'), mgd.OutputFile(merged_discordants), mgd.TempSpace("merge_disc_temp")), ) workflow.transform( name='merge_split', ctx={ 'mem': 8, 'ncpus': 1 }, func='single_cell.workflows.lumpy.tasks.merge_bams', args=(mgd.TempInputFile('tumour.splitters.sorted.bam', 'cell_id'), mgd.OutputFile(merged_splitters), mgd.TempSpace("merge_split_temp")), ) workflow.transform( name='merge_histo', ctx={ 'mem': 8, 'ncpus': 1 }, func='single_cell.workflows.lumpy.merge_histograms.merge_histograms', args=(mgd.TempInputFile('hist.csv', 'cell_id'), mgd.OutputFile(hist_csv), mgd.OutputFile(mean_stdev_obj)), ) return workflow
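The merge_histo transform pools the per-cell insert-size histograms into one CSV and a mean/stdev object that lumpy uses downstream. A sketch of that pooling, assuming the per-cell CSVs have 'fragment_size' and 'count' columns and that the inputs arrive as a {cell_id: path} dict; the real single_cell.workflows.lumpy.merge_histograms.merge_histograms may differ:

import pickle
import pandas as pd

def merge_histograms(hist_csvs, merged_csv, mean_stdev_obj):
    # Hypothetical sketch: sum per-cell fragment-size histograms, then derive a
    # pooled mean and standard deviation from the merged counts.
    paths = hist_csvs.values() if isinstance(hist_csvs, dict) else hist_csvs
    merged = pd.concat([pd.read_csv(path) for path in paths])
    merged = merged.groupby('fragment_size', as_index=False)['count'].sum()
    merged.to_csv(merged_csv, index=False)
    total = merged['count'].sum()
    mean = (merged['fragment_size'] * merged['count']).sum() / total
    var = (((merged['fragment_size'] - mean) ** 2) * merged['count']).sum() / total
    with open(mean_stdev_obj, 'wb') as out:
        pickle.dump({'mean': mean, 'stdev': var ** 0.5}, out)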
def alignment_workflow(args): inputs = helpers.load_yaml(args['input_yaml']) outdir = args['out_dir'] outputs = os.path.join(outdir, '{sample_id}', '{sample_id}.bam') metrics_output = os.path.join(outdir, '{sample_id}', '{sample_id}_metrics.csv.gz') prealignment_tar = os.path.join(outdir, '{sample_id}', '{sample_id}_fastqc.tar.gz') postalignment_tar = os.path.join(outdir, '{sample_id}', '{sample_id}_metrics.tar.gz') samples = list(inputs.keys()) fastqs_r1, fastqs_r2 = helpers.get_fastqs(inputs, samples, None) sample_info = helpers.get_sample_info(inputs) pyp = pypeliner.app.Pypeline(config=args) workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx( docker_image=config.containers('alignment'))) workflow.setobj( obj=mgd.OutputChunks('sample_id', 'lane_id'), value=list(fastqs_r1.keys()), ) workflow.subworkflow(name="prealign", func=pre_alignment.pre_alignment, axes=('sample_id', 'lane_id'), args=( mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r1), mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r2), mgd.Template('prealignment.tar', 'sample_id', template=prealignment_tar), )) workflow.subworkflow( name="align", func=alignment.alignment, args=( mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r1, axes_origin=[]), mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r2, axes_origin=[]), mgd.OutputFile('output.bam', 'sample_id', template=outputs, axes_origin=[]), args['refdir'], sample_info, ), ) workflow.subworkflow( name="postalign", func=post_alignment.post_alignment, axes=('sample_id', ), args=( mgd.InputFile('output.bam', 'sample_id', template=outputs), mgd.OutputFile('metrics.csv.gz', 'sample_id', template=metrics_output, extensions=['.yaml']), mgd.OutputFile('metrics.tar.gz', 'sample_id', template=postalignment_tar), mgd.InputInstance('sample_id'), args['refdir'], ), ) pyp.run(workflow)
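The setobj over list(fastqs_r1.keys()) works because helpers.get_fastqs returns dicts keyed by (sample_id, lane_id) tuples, so a single call seeds both axes. An illustrative example of that shape; the sample and lane names and paths below are placeholders, not values from this pipeline:

# Illustrative input shape only; real paths come from helpers.get_fastqs(inputs, samples, None).
fastqs_r1 = {
    ('SA123', 'L001'): '/data/SA123_L001_R1.fastq.gz',
    ('SA123', 'L002'): '/data/SA123_L002_R1.fastq.gz',
}
fastqs_r2 = {
    ('SA123', 'L001'): '/data/SA123_L001_R2.fastq.gz',
    ('SA123', 'L002'): '/data/SA123_L002_R2.fastq.gz',
}
# list(fastqs_r1.keys()) therefore populates the 'sample_id' and 'lane_id' axes together.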
def get_coverage_data( input_bam, output, refdir, chromosomes, mapping_qual, bins, single_node=False ): reference = config.refdir_data(refdir)['paths']['reference'] workflow = pypeliner.workflow.Workflow() if single_node: workflow.transform( name='generate_coverage_bed', func='wgs.workflows.sample_qc.tasks.generate_coverage_bed', ctx=helpers.get_default_ctx( memory=5 ), args=( reference, mgd.TempOutputFile('coverage_bed.bed'), chromosomes, bins, ) ) workflow.transform( name='samtools_coverage', func='wgs.workflows.sample_qc.tasks.samtools_coverage', ctx=helpers.get_default_ctx( memory=5 ), args=( mgd.InputFile(input_bam), mgd.TempInputFile('coverage_bed.bed'), mgd.TempOutputFile('per_interval.txt', 'chromosome'), mapping_qual, ), ) else: workflow.setobj( obj=mgd.OutputChunks('chromosome'), value=chromosomes ) workflow.transform( name='generate_coverage_bed', func='wgs.workflows.sample_qc.tasks.generate_coverage_bed', ctx=helpers.get_default_ctx( memory=5 ), axes=('chromosome',), args=( reference, mgd.TempOutputFile('coverage_bed.bed', 'chromosome'), mgd.InputInstance('chromosome'), bins, ) ) workflow.transform( name='samtools_coverage', func='wgs.workflows.sample_qc.tasks.samtools_coverage', ctx=helpers.get_default_ctx( memory=5 ), axes=('chromosome',), args=( mgd.InputFile(input_bam), mgd.TempInputFile('coverage_bed.bed', 'chromosome'), mgd.TempOutputFile('per_interval.txt', 'chromosome'), mapping_qual, ), ) workflow.transform( name='merge_data', func='wgs.utils.csvutils.concatenate_csv', ctx=helpers.get_default_ctx( memory=5 ), args=( mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]), mgd.OutputFile(output), ) ) return workflow
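In the parallel branch, generate_coverage_bed runs once per chromosome and tiles that chromosome into windows for samtools coverage. A sketch of that tiling, assuming the reference has a samtools .fai index alongside it and treating `bins` as the number of windows per chromosome (an assumption); the real wgs.workflows.sample_qc.tasks.generate_coverage_bed may define bins differently:

def generate_coverage_bed(reference, coverage_bed, chromosome, bins):
    # Hypothetical sketch: read chromosome lengths from the .fai index and emit
    # roughly equal-sized windows covering one chromosome.
    with open(reference + '.fai') as fai:
        lengths = {line.split('\t')[0]: int(line.split('\t')[1]) for line in fai}
    length = lengths[chromosome]
    step = max(1, length // bins)
    with open(coverage_bed, 'w') as bed:
        for start in range(0, length, step):
            bed.write('{}\t{}\t{}\n'.format(chromosome, start, min(start + step, length)))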
def create_hmmcopy_workflow( bam_file, reads, segs, metrics, params, igv_seg_filename, segs_pdf, bias_pdf, plot_heatmap_ec_output, plot_metrics_output, plot_kernel_density_output, hmmcopy_data_tar, cell_ids, hmmparams, sample_info ): chromosomes = hmmparams["chromosomes"] baseimage = hmmparams['docker']['single_cell_pipeline'] hmmcopy_docker = hmmparams['docker']['hmmcopy'] workflow = pypeliner.workflow.Workflow() workflow.setobj( obj=mgd.OutputChunks('cell_id'), value=cell_ids, ) workflow.setobj( obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]), value=sample_info) workflow.transform( name='run_hmmcopy', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy", axes=('cell_id',), args=( mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']), mgd.TempOutputFile('reads.csv.gz', 'cell_id', extensions=['.yaml']), mgd.TempOutputFile('segs.csv.gz', 'cell_id', extensions=['.yaml']), mgd.TempOutputFile('params.csv.gz', 'cell_id', extensions=['.yaml']), mgd.TempOutputFile('hmm_metrics.csv.gz', 'cell_id', extensions=['.yaml']), mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'), mgd.InputInstance('cell_id'), hmmparams, mgd.TempSpace('hmmcopy_temp', 'cell_id'), hmmcopy_docker ), ) workflow.transform( name='merge_reads', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.concatenate_csv", args=( mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), mgd.TempOutputFile('reads_merged.csv.gz', extensions=['.yaml']), ), kwargs={'low_memory': True} ) workflow.transform( name='add_mappability_bool', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.get_mappability_col", args=( mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']), mgd.OutputFile(reads, extensions=['.yaml']), ), ) workflow.transform( name='merge_segs', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.concatenate_csv", args=( mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), mgd.OutputFile(segs, extensions=['.yaml']), ), kwargs={'low_memory': True} ) workflow.transform( name='merge_metrics', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.concatenate_csv", args=( mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), mgd.TempOutputFile("hmm_metrics.csv.gz", extensions=['.yaml']), ), ) workflow.transform( name='merge_params', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.concatenate_csv", args=( mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), mgd.OutputFile(params, extensions=['.yaml']), ), ) workflow.transform( name='get_max_cn', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.get_max_cn", ret=mgd.TempOutputObj('max_cn'), args=( mgd.InputFile(reads, extensions=['.yaml']), ) ) workflow.transform( name='hmmcopy_plots', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy", axes=('cell_id',), args=( mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), 
mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']), hmmparams['ref_genome'], mgd.TempOutputFile('segments.png', 'cell_id', axes_origin=[]), mgd.TempOutputFile('bias.png', 'cell_id', axes_origin=[]), mgd.InputInstance('cell_id'), ), kwargs={ 'num_states': hmmparams['num_states'], 'sample_info': mgd.TempInputObj('sampleinfo', 'cell_id'), 'max_cn': mgd.TempInputObj("max_cn") } ) workflow.transform( name='annotate_metrics_with_info_and_clustering', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.add_clustering_order", args=( mgd.InputFile(reads, extensions=['.yaml']), mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']), mgd.OutputFile(metrics, extensions=['.yaml']), ), kwargs={ 'chromosomes': hmmparams["chromosomes"], 'sample_info': sample_info } ) workflow.transform( name='merge_hmm_copy_plots', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.merge_pdf", args=( [ mgd.TempInputFile('segments.png', 'cell_id'), mgd.TempInputFile('bias.png', 'cell_id'), ], [ mgd.OutputFile(segs_pdf), mgd.OutputFile(bias_pdf), ], mgd.InputFile(metrics, extensions=['.yaml']), None, mgd.TempSpace("hmmcopy_plot_merge_temp"), ['segments', 'bias'] ) ) workflow.transform( name='create_igv_seg', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.create_igv_seg", args=( mgd.InputFile(segs, extensions=['.yaml']), mgd.InputFile(metrics, extensions=['.yaml']), mgd.OutputFile(igv_seg_filename), hmmparams, ) ) workflow.transform( name='plot_metrics', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.plot_metrics", args=( mgd.InputFile(metrics, extensions=['.yaml']), mgd.OutputFile(plot_metrics_output), 'QC pipeline metrics', ) ) workflow.transform( name='plot_kernel_density', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density", args=( mgd.InputFile(metrics, extensions=['.yaml']), mgd.OutputFile(plot_kernel_density_output), ',', 'mad_neutral_state', 'QC pipeline metrics', ) ) workflow.transform( name='plot_heatmap_ec', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.workflows.hmmcopy.tasks.plot_pcolor", args=( mgd.InputFile(reads, extensions=['.yaml']), mgd.InputFile(metrics, extensions=['.yaml']), mgd.OutputFile(plot_heatmap_ec_output), ), kwargs={ 'plot_title': 'QC pipeline metrics', 'column_name': 'state', 'plot_by_col': 'experimental_condition', 'color_by_col': 'cell_call', 'chromosomes': chromosomes, 'max_cn': hmmparams['num_states'], 'scale_by_cells': False, 'mappability_threshold': hmmparams["map_cutoff"] } ) workflow.transform( name='merge_hmmcopy_data_tars', ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage}, func="single_cell.utils.helpers.tar_files", args=( mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]), mgd.OutputFile(hmmcopy_data_tar), mgd.TempSpace("merge_tarballs") ), ) return workflow
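get_max_cn returns a TempOutputObj consumed by the per-cell plotting step, so every segment plot shares one copy-number axis. A sketch of that task, assuming the merged reads table is a gzipped CSV with a 'state' column (an assumption about the real single_cell.workflows.hmmcopy.tasks.get_max_cn):

import pandas as pd

def get_max_cn(reads_filename):
    # Hypothetical sketch: scan the merged reads table for the largest HMM
    # copy-number state so all per-cell plots can use the same y-axis limit.
    reads = pd.read_csv(reads_filename, compression='gzip')
    return int(reads['state'].max())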
def create_aneufinder_workflow( bam_file, cell_ids, config, aneufinder_results_filename, aneufinder_pdf_filename, ): baseimage = config['docker']['single_cell_pipeline'] workflow = pypeliner.workflow.Workflow() workflow.setobj( obj=mgd.OutputChunks('cell_id'), value=cell_ids, ) workflow.transform( name='run_aneufinder_on_individual_cells', ctx={ 'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage }, func="single_cell.workflows.aneufinder.tasks.run_aneufinder", axes=('cell_id', ), args=( mgd.InputFile('bam_file', 'cell_id', fnames=bam_file), mgd.TempSpace('working_dir', 'cell_id'), mgd.InputInstance('cell_id'), mgd.TempOutputFile('segments.csv', 'cell_id'), mgd.TempOutputFile('reads.csv', 'cell_id'), mgd.TempOutputFile('dnacopy.pdf', 'cell_id'), ), kwargs={'docker_image': config['docker']['aneufinder']}) workflow.transform( name='merge_outputs', ctx={ 'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage }, func="single_cell.workflows.aneufinder.tasks.merge_outputs_to_hdf", args=( mgd.TempInputFile('reads.csv', 'cell_id'), mgd.TempInputFile('segments.csv', 'cell_id'), mgd.OutputFile(aneufinder_results_filename), mgd.TempSpace("aneufinder_merge"), )) workflow.transform(name='merge_aneufinder_pdfs', ctx={ 'mem': config['memory']['med'], 'ncpus': 1, 'docker_image': baseimage }, func="single_cell.workflows.aneufinder.tasks.merge_pdf", args=( [mgd.TempInputFile('dnacopy.pdf', 'cell_id')], [mgd.OutputFile(aneufinder_pdf_filename)], )) return workflow
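merge_aneufinder_pdfs concatenates the per-cell dnacopy plots into a single report. A sketch of such a merge task, assuming PyPDF2 is available and that the per-cell inputs arrive as a {cell_id: path} dict; the real single_cell.workflows.aneufinder.tasks.merge_pdf takes additional arguments in the hmmcopy workflow and may order pages differently:

from PyPDF2 import PdfMerger

def merge_pdf(in_file_lists, out_files):
    # Hypothetical sketch: for each (input set, output) pair, append every
    # per-cell PDF into one merged document.
    for in_files, out_file in zip(in_file_lists, out_files):
        merger = PdfMerger()
        paths = in_files.values() if isinstance(in_files, dict) else in_files
        for path in sorted(paths):
            merger.append(path)
        with open(out_file, 'wb') as out:
            merger.write(out)
        merger.close()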