def run_Strelka(config, normal_bam, tumour_bam, snv_output_file, indel_output_file):
    """Build a two-job workflow: prepare the BED file, then run Strelka.

    Args:
        config: Mapping-like configuration. ``config['bed_file']`` is read
            here; the whole object is also forwarded to ``tasks.run_strelka``.
        normal_bam: Path registered as the normal BAM input.
        tumour_bam: Path registered as the tumour BAM input.
        snv_output_file: Path registered as the first output of ``run_strelka``.
        indel_output_file: Path registered as the second output of
            ``run_strelka``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    wf = pypeliner.workflow.Workflow()

    # Job 1: produces the temp files 'bed.gz' and 'bed.gz.tbi'.
    bed_args = (
        mgd.TempSpace('bed_space'),
        mgd.InputFile(config['bed_file']),
        mgd.TempOutputFile('bed.gz'),
        mgd.TempOutputFile('bed.gz.tbi'),
    )
    wf.transform(
        name='configure_bed',
        func=tasks.configure_bed,
        args=bed_args,
    )

    # Job 2: consumes the two temp files produced by job 1.
    strelka_ctx = {'mem': 10, 'ncpus': 1, 'walltime': '08:00'}
    strelka_args = (
        config,
        mgd.InputFile(normal_bam),
        mgd.InputFile(tumour_bam),
        mgd.TempInputFile('bed.gz'),
        mgd.TempInputFile('bed.gz.tbi'),
        mgd.TempSpace('strelka_workspace'),
        mgd.OutputFile(snv_output_file),
        mgd.OutputFile(indel_output_file),
    )
    wf.transform(
        name='run_strelka',
        func=tasks.run_strelka,
        ctx=strelka_ctx,
        args=strelka_args,
    )

    return wf
def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    """Build a workflow that runs the FastQC task once per read file.

    Args:
        fastq_r1: Path of the R1 FASTQ input.
        fastq_r2: Path of the R2 FASTQ input.
        r1_html: Output path passed to the R1 job as its first output.
        r1_plot: Output path passed to the R1 job as its second output.
        r2_html: Output path passed to the R2 job as its first output.
        r2_plot: Output path passed to the R2 job as its second output.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    wf = pypeliner.workflow.Workflow()

    # (job name, fastq input, html output, plot output, temp-space name)
    per_read = (
        ("fastqc_r1", fastq_r1, r1_html, r1_plot, 'fastqc_R1'),
        ("fastqc_r2", fastq_r2, r2_html, r2_plot, 'fastqc_R2'),
    )

    for job_name, fastq, html, plot, scratch in per_read:
        wf.transform(
            name=job_name,
            func='wgs.workflows.alignment.tasks.run_fastqc',
            # A fresh context is requested for every job, as in the
            # hand-unrolled form.
            ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
            args=(
                mgd.InputFile(fastq),
                mgd.OutputFile(html),
                mgd.OutputFile(plot),
                mgd.TempSpace(scratch),
            ),
        )

    return wf
def create_optitype_workflow(bam_file, hla_type_file, is_rna=False, threads=1):
    """Build a workflow that pulls chromosome-6 reads and runs OptiType.

    Args:
        bam_file: Path of the input BAM.
        hla_type_file: Output path written by ``tasks.run_optitype``.
        is_rna: Forwarded to ``tasks.run_optitype`` as ``is_rna``.
        threads: Used both in the job context and as the ``threads`` kwarg
            of ``tasks.run_optitype``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    # The contig name depends on whether the BAM uses the 'chr' prefix.
    chrom_str = 'chr6' if check_chr_prefix(bam_file) else '6'

    sandbox = soil.utils.workflow.get_sandbox(
        ['optitype', 'razers3', 'samtools'])
    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # samtools view | samtools collate | samtools bam2fq, producing two
    # temp FASTQ files.
    extract_cmd = (
        'samtools', 'view', '-bh', '-f', '2', '-F', '4',
        mgd.InputFile(bam_file),
        chrom_str,
        '|',
        'samtools', 'collate', '-O', '-',
        mgd.TempSpace('chr6_collate_temp'),
        '|',
        'samtools', 'bam2fq',
        '-1', mgd.TempOutputFile('chr6_reads_1.fq'),
        '-2', mgd.TempOutputFile('chr6_reads_2.fq'),
        '-',
    )
    wf.commandline(name='extract_chr6', args=extract_cmd)

    optitype_ctx = {
        'mem': 24,
        'mem_retry_increment': 8,
        'num_retry': 3,
        'threads': threads,
    }
    optitype_args = (
        mgd.TempInputFile('chr6_reads_1.fq'),
        mgd.TempInputFile('chr6_reads_2.fq'),
        mgd.OutputFile(hla_type_file),
        mgd.TempSpace('optitype_temp'),
    )
    wf.transform(
        name='optitype',
        func=tasks.run_optitype,
        ctx=optitype_ctx,
        args=optitype_args,
        kwargs={'is_rna': is_rna, 'threads': threads},
    )

    return wf
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):
    """Build an align -> sort -> index workflow for a FASTQ pair.

    Args:
        fastq_file_1: Path of the first FASTQ input.
        fastq_file_2: Path of the second FASTQ input.
        ref_genome_dir: Passed unmanaged to ``tasks.align``.
        out_bam_file: Path of the sorted BAM; the index is written to
            ``out_bam_file + '.bai'``.
        add_xs_tag: Forwarded to ``tasks.align``.
        align_threads: Thread count for the ``star_align`` job.
        read_group_info: Forwarded to ``tasks.align``.
        sort_threads: Thread count for the ``sort`` job.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])
    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Alignment: writes the temp file 'aligned.bam'.
    align_ctx = {
        'mem': 32,
        'mem_retry_increment': 16,
        'num_retry': 3,
        'threads': align_threads,
    }
    align_kwargs = {
        'add_xs_tag': add_xs_tag,
        'read_group_info': read_group_info,
        'threads': align_threads,
    }
    wf.transform(
        name='star_align',
        func=tasks.align,
        ctx=align_ctx,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            ref_genome_dir,
            mgd.TempOutputFile('aligned.bam'),
            mgd.TempSpace('align_tmp'),
        ),
        kwargs=align_kwargs,
    )

    # Sorting: temp BAM -> final BAM.
    sort_ctx = {
        'mem': 32,
        'mem_retry_increment': 16,
        'num_retry': 3,
        'threads': sort_threads,
    }
    wf.transform(
        name='sort',
        func=soil.wrappers.sambamba.tasks.sort,
        ctx=sort_ctx,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('sort_tmp'),
        ),
        kwargs={'threads': sort_threads},
    )

    # Indexing of the final BAM.
    index_cmd = (
        'samtools',
        'index',
        mgd.InputFile(out_bam_file),
        mgd.OutputFile(out_bam_file + '.bai'),
    )
    wf.commandline(name='index', args=index_cmd)

    return wf
def create_vcf2maf_workflow(vcf_file, maf_file, reference, tumour_id=None, normal_id=None):
    """Build a workflow that converts a VCF to MAF and then updates its ids.

    Args:
        vcf_file: Path of the input VCF.
        maf_file: Path of the final MAF written by the ``update_ids`` job.
        reference: Passed unmanaged to ``run_vcf2maf``.
        tumour_id: Forwarded to both jobs.
        normal_id: Forwarded to both jobs.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    wf = pypeliner.workflow.Workflow()

    # Step 1: VCF -> intermediate temp MAF.
    convert_args = (
        mgd.InputFile(vcf_file),
        mgd.TempOutputFile('maf_file.maf'),
        mgd.TempSpace('vcf2maf_temp'),
        reference,
    )
    id_kwargs = {'tumour_id': tumour_id, 'normal_id': normal_id}
    wf.transform(
        name='vcf2maf',
        func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
        args=convert_args,
        kwargs=id_kwargs,
    )

    # Step 2: intermediate MAF -> final MAF (ids passed positionally here).
    update_args = (
        mgd.TempInputFile('maf_file.maf'),
        tumour_id,
        normal_id,
        mgd.OutputFile(maf_file),
    )
    wf.transform(
        name='update_ids',
        func='wgs.workflows.vcf2maf.tasks.update_ids',
        args=update_args,
    )

    return wf
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    """Build a workflow that runs MutationSeq per interval and merges the VCFs.

    The ``interval`` axis is populated with the strings ``'1'`` .. ``'22'``
    followed by ``'X'``.

    Args:
        config: Forwarded unchanged to ``tasks.run_museq``.
        normal_bam: Path of the normal BAM input.
        tumour_bam: Path of the tumour BAM input.
        output_file: Path of the merged VCF written by ``tasks.merge_vcfs``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    workflow = pypeliner.workflow.Workflow()

    # Build the interval list without ``range(...) + [...]``: on Python 3 a
    # range object cannot be concatenated with a list (TypeError).
    intervals = [str(chrom) for chrom in range(1, 23)] + ['X']

    workflow.setobj(
        obj=mgd.OutputChunks('interval',),
        value=intervals,
    )

    # One job per interval; each writes its own temp VCF and log.
    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
        ),
    )

    # Single merge job consuming all per-interval VCFs.
    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
        ),
    )

    return workflow
def create_svaba_workflow(
        tumour_bam,
        normal_bam,
        svaba_vcf,
        reference,
):
    """Build a single-job workflow that runs the svaba task.

    Args:
        tumour_bam: Path of the tumour BAM input.
        normal_bam: Path of the normal BAM input.
        svaba_vcf: The only non-temporary output of the job; it sits in the
            argument slot between ``somatic.indel.vcf.gz`` and the
            ``unfiltered.*`` temp outputs.
        reference: Passed unmanaged to the task.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    wf = pypeliner.workflow.Workflow()

    # NOTE(review): ncpus is given as the string '8' while the task's
    # ``ncores`` kwarg is the int 8 -- confirm get_default_ctx expects a str.
    job_ctx = helpers.get_default_ctx(
        memory=10, walltime='72:00', ncpus='8', disk=300)

    job_args = (
        mgd.InputFile(tumour_bam),
        mgd.InputFile(normal_bam),
        mgd.TempOutputFile('germline.indel.vcf.gz'),
        mgd.TempOutputFile('germline.sv.vcf.gz'),
        mgd.TempOutputFile('somatic.indel.vcf.gz'),
        mgd.OutputFile(svaba_vcf),
        mgd.TempOutputFile('unfiltered.germline.indel.vcf.gz'),
        mgd.TempOutputFile('unfiltered.germline.sv.vcf.gz'),
        mgd.TempOutputFile('unfiltered.somatic.indel.vcf.gz'),
        mgd.TempOutputFile('unfiltered.somatic.sv.vcf.gz'),
        reference,
        mgd.TempSpace('svaba_tempdir_full'),
    )

    wf.transform(
        name='run_svaba',
        func='wgs.workflows.svaba.tasks.run_svaba',
        ctx=job_ctx,
        args=job_args,
        kwargs={'ncores': 8},
    )

    return wf
def create_extract_seqdata_workflow(
        bam_filename,
        seqdata_filename,
        remixt_config,
        remixt_ref_data_dir,
        config,
):
    """Build a single-job workflow that creates seqdata from a BAM.

    Args:
        bam_filename: Path of the input BAM (registered with a ``.bai``
            extension).
        seqdata_filename: Output path written by the task.
        remixt_config: Passed unmanaged to the task.
        remixt_ref_data_dir: Passed unmanaged to the task.
        config: Mapping from which ``['docker']['single_cell_pipeline']``,
            ``['memory']['high']`` and ``['chromosomes']`` are read.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    job_ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'docker_image': config['docker']['single_cell_pipeline'],
        'mem': config["memory"]['high'],
    }

    wf = pypeliner.workflow.Workflow()

    # Note the argument order: the output file comes before the input BAM.
    job_args = (
        mgd.OutputFile(seqdata_filename),
        mgd.InputFile(bam_filename, extensions=['.bai']),
        mgd.TempSpace("extract_seqdata_temp"),
        remixt_config,
        remixt_ref_data_dir,
    )

    wf.transform(
        name='create_cell_seqdata',
        func="single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata",
        ctx=job_ctx,
        args=job_args,
        kwargs={'chromosomes': config['chromosomes']},
    )

    return wf
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 tag=False):
    """Build the destruct preprocessing workflow for the normal sample.

    When ``normal_bam_files`` is a ``str`` a single ``bamdisc_normal`` job is
    registered. Otherwise it is treated as a mapping keyed by cell id and a
    ``process_cells_destruct`` subworkflow is registered instead.

    Args:
        normal_bam_files: Either one BAM path (``str``) or a mapping of
            cell id -> BAM path.
        normal_stats: Output path for the stats file.
        normal_reads_1: Output path for the first reads file.
        normal_reads_2: Output path for the second reads file.
        normal_sample_1: Output path for the first sample file.
        normal_sample_2: Output path for the second sample file.
        ref_data_directory: Passed to ``destruct.defaultconfig.get_config``.
        destruct_config: Passed to ``destruct.defaultconfig.get_config``.
        tag: Forwarded to the subworkflow; unused in the single-BAM branch.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    wf = pypeliner.workflow.Workflow()

    # Resolve the destruct configuration into a temp object for later jobs.
    wf.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ret=mgd.TempOutputObj("destruct_config"),
        args=(ref_data_directory, destruct_config),
    )

    if not isinstance(normal_bam_files, str):
        # Per-cell BAMs: one chunk per key of the mapping.
        wf.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )
        per_cell_args = (
            mgd.TempInputObj("destruct_config"),
            mgd.InputFile('bam', 'normal_cell_id', fnames=normal_bam_files),
            mgd.OutputFile(normal_reads_1),
            mgd.OutputFile(normal_reads_2),
            mgd.OutputFile(normal_sample_1),
            mgd.OutputFile(normal_sample_2),
            mgd.OutputFile(normal_stats),
        )
        wf.subworkflow(
            name='process_normal_cells',
            func=process_cells_destruct,
            args=per_cell_args,
            kwargs={'tag': tag},
        )
        return wf

    # Single BAM: note that the stats output precedes the reads here,
    # whereas it comes last in the per-cell branch above.
    single_bam_args = (
        mgd.TempInputObj("destruct_config"),
        mgd.InputFile(normal_bam_files),
        mgd.OutputFile(normal_stats),
        mgd.OutputFile(normal_reads_1),
        mgd.OutputFile(normal_reads_2),
        mgd.OutputFile(normal_sample_1),
        mgd.OutputFile(normal_sample_2),
        mgd.TempSpace('bamdisc_normal_tempspace'),
    )
    wf.transform(
        name='bamdisc_normal',
        func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
        ctx={'io': 1, 'mem': 8, 'disk': 200},
        args=single_bam_args,
    )
    return wf
def pre_alignment(fastq_r1, fastq_r2, metrics_tar):
    """Build a workflow that runs FastQC on both reads and tars the reports.

    The ``tar`` job bundles the four temp reports in the order
    ``R1.html``, ``R1.pdf``, ``R2.html``, ``R2.pdf``.

    Args:
        fastq_r1: Path of the R1 FASTQ input.
        fastq_r2: Path of the R2 FASTQ input.
        metrics_tar: Output path of the tar archive.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    # NOTE(review): ``config`` is not a parameter; it is resolved from module
    # scope and must expose ``containers(name)`` -- confirm against imports.
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.TempOutputFile('R1.html'),
            mgd.TempOutputFile('R1.pdf'),
            mgd.TempSpace('fastqc_R1'),
        ),
        kwargs={
            'docker_image': config.containers("fastqc"),
        },
    )

    workflow.transform(
        name="fastqc_r2",
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.TempOutputFile('R2.html'),
            mgd.TempOutputFile('R2.pdf'),
            mgd.TempSpace('fastqc_R2'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        },
    )

    # Fix: the file list previously named the R2 reports twice, so the R1
    # reports produced by 'fastqc_r1' were never consumed or archived.
    report_files = [
        mgd.TempInputFile('R1.html'),
        mgd.TempInputFile('R1.pdf'),
        mgd.TempInputFile('R2.html'),
        mgd.TempInputFile('R2.pdf'),
    ]

    # NOTE(review): this job declares a 'sample_id' axis although no chunks
    # for that axis are set in this workflow and none of its managed files
    # use it -- left unchanged here; confirm whether the axis is intended.
    workflow.transform(
        name='tar',
        func='alignment.utils.helpers.make_tar_from_files',
        axes=('sample_id', ),
        args=(
            mgd.OutputFile(metrics_tar),
            report_files,
            mgd.TempSpace('wgs_metrics'),
        ),
    )

    return workflow
def create_patient_workflow(pseudo_bulk_group,
                            mafs,
                            sample_all_snv_csvs,
                            mutationreport,
                            merged_maf,
                            high_impact_maf,
                            merged_snvs,
                            merged_high_impact_snvs):
    """Build the per-patient merge / filter / report workflow.

    Registers five jobs: merge MAFs, filter the merged MAF, merge SNV
    tables, filter the merged SNVs, and create the mutation report.

    Args:
        pseudo_bulk_group: Passed unmanaged to the report task.
        mafs: Passed unmanaged to ``merge_mafs``.
        sample_all_snv_csvs: Passed unmanaged to ``merge_snvs``.
        mutationreport: Output path of the report.
        merged_maf: Output path of ``merge_mafs``.
        high_impact_maf: Output path of ``filter_merged_maf``.
        merged_snvs: Output path of ``merge_snvs``.
        merged_high_impact_snvs: Output path of ``filter_snvs``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    default_ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
    }
    wf = pypeliner.workflow.Workflow(ctx=default_ctx)

    # --- MAF branch ------------------------------------------------------
    wf.transform(
        name='merge_mafs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_mafs',
        args=(mafs, mgd.OutputFile(merged_maf)),
        kwargs={"id_colname": True},
    )
    wf.transform(
        name='filter_merged_maf',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_maf_for_high_impact',
        args=(mgd.InputFile(merged_maf), mgd.OutputFile(high_impact_maf)),
    )

    # --- SNV branch ------------------------------------------------------
    wf.transform(
        name='merge_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_snvs',
        args=(sample_all_snv_csvs, mgd.OutputFile(merged_snvs)),
        kwargs={"id_colname": True},
    )
    wf.transform(
        name='filter_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_snvs_for_high_impact',
        args=(
            mgd.InputFile(merged_snvs),
            mgd.OutputFile(merged_high_impact_snvs),
        ),
    )

    # --- Report: consumes both MAFs and the filtered SNVs ------------------
    report_args = (
        pseudo_bulk_group,
        mgd.InputFile(merged_maf),
        mgd.InputFile(high_impact_maf),
        mgd.InputFile(merged_high_impact_snvs),
        mgd.OutputFile(mutationreport),
        mgd.TempSpace("mutationreport"),
    )
    wf.transform(
        name='mutationreport',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.create_mutation_report',
        args=report_args,
    )

    return wf
def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
    """Build a workflow that merges per-cell BAMs into per-region BAMs.

    ``config["one_split_job"]`` selects between one ``merge_bams`` job that
    handles all regions and a ``split_merge_tumour`` job that runs over the
    ``region`` axis.

    Args:
        input_bams: Mapping of cell id -> BAM path.
        merged_bams: Mapping of region -> output BAM path; only the entries
            for ``regions`` are used.
        regions: Regions that define the ``region`` axis.
        config: Mapping providing ``one_split_job`` and, for the single-job
            branch, ``memory['med']`` and ``max_cores``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    # Restrict the outputs to the requested regions (KeyError if one is absent).
    merged_bams = {region: merged_bams[region] for region in regions}

    wf = pypeliner.workflow.Workflow()

    wf.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )
    wf.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    one_split_job = config["one_split_job"]

    if not one_split_job:
        # One job instance per region.
        wf.transform(
            name='split_merge_tumour',
            axes=('region',),
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            args=(
                mgd.InputFile(
                    'tumour_cells.bam', 'cell_id',
                    extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile(
                    'tumour_regions.bam', 'region',
                    axes_origin=[], extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
            ),
        )
        return wf

    # A single job that receives every region at once.
    wf.transform(
        name='merge_bams',
        func="single_cell.workflows.merge_bams.tasks.merge_bams",
        ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']},
        args=(
            mgd.InputFile(
                'bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
            mgd.OutputFile(
                'merged.bam', "region",
                fnames=merged_bams, axes_origin=[], extensions=['.bai']),
            regions,
            mgd.TempSpace("merge_bams_tempdir"),
        ),
        kwargs={"ncores": config["max_cores"]},
    )
    return wf
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes=None,
                            split_size=int(1e7)):
    """Build a per-region somatic calling workflow with a final VCF merge.

    Args:
        normal_bam_file: Path of the normal BAM; also used to derive regions.
        tumour_bam_file: Path of the tumour BAM.
        ref_genome_fasta_file: Path of the reference FASTA.
        out_file: Path of the concatenated VCF.
        chromosomes: Forwarded to ``utils.get_bam_regions``.
        split_size: Forwarded to ``utils.get_bam_regions``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    region_map = utils.get_bam_regions(
        normal_bam_file, split_size, chromosomes=chromosomes)

    wf = pypeliner.workflow.Workflow()

    # The per-region objects define the 'regions' axis.
    wf.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=region_map,
    )

    call_args = (
        mgd.InputFile(normal_bam_file),
        mgd.InputFile(tumour_bam_file),
        mgd.InputFile(ref_genome_fasta_file),
        mgd.TempOutputFile('region.vcf.gz', 'regions'),
        mgd.TempInputObj('config', 'regions'),
        mgd.TempSpace('varscan_tmp', 'regions'),
    )
    wf.transform(
        name='run_somatic',
        func=tasks.run_somatic,
        axes=('regions', ),
        ctx={'mem': 6, 'mem_retry_increment': 2, 'num_retry': 3},
        args=call_args,
    )

    # Runs once (empty axes) over all per-region VCFs.
    wf.transform(
        name='merge',
        func=vcf_tasks.concatenate_vcf,
        axes=(),
        ctx={'mem': 2, 'mem_retry_increment': 2, 'num_retry': 3},
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return wf
def circos_plot(titan_calls, remixt_calls, sample_id, breakpoints,
                circos_plot_remixt, circos_plot_titan):
    """Build a workflow that prepares Titan/ReMixT calls and draws circos plots.

    Args:
        titan_calls: Path of the Titan calls input.
        remixt_calls: Path of the ReMixT calls input.
        sample_id: Passed unmanaged to the ReMixT prep and the plotting task.
        breakpoints: Passed unmanaged to the plotting task.
        circos_plot_remixt: First output path of the plotting task.
        circos_plot_titan: Second output path of the plotting task.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    wf = pypeliner.workflow.Workflow()

    titan_args = (
        mgd.InputFile(titan_calls),
        mgd.TempOutputFile("titan_prepped"),
    )
    wf.transform(
        name='prep_titan',
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=titan_args,
    )

    remixt_args = (
        mgd.InputFile(remixt_calls),
        sample_id,
        mgd.TempOutputFile("remixt_prepped"),
    )
    wf.transform(
        name='prep_remixt',
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=remixt_args,
    )

    # Consumes both prepared temp files.
    plot_args = (
        mgd.TempInputFile("titan_prepped"),
        mgd.TempInputFile("remixt_prepped"),
        sample_id,
        breakpoints,
        mgd.OutputFile(circos_plot_remixt),
        mgd.OutputFile(circos_plot_titan),
        mgd.TempSpace("circos"),
    )
    wf.transform(
        name='circos_plot',
        func='wgs.workflows.sample_qc.tasks.circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=plot_args,
    )

    return wf
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_fasta_file,
                          out_bam_file,
                          threads=1):
    """Build an align -> mark-duplicates -> index workflow for a FASTQ pair.

    Alignment is delegated to
    ``soil.wrappers.bwa.workflows.create_align_workflow`` as a subworkflow.

    Args:
        fastq_file_1: Path of the first FASTQ input.
        fastq_file_2: Path of the second FASTQ input.
        ref_genome_fasta_file: Path of the reference FASTA.
        out_bam_file: Path of the duplicate-marked BAM; the index is written
            to ``out_bam_file + '.bai'``.
        threads: Used for the subworkflow's ``align_threads`` and
            ``sort_threads`` and for the ``mark_dups`` job.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])
    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Alignment subworkflow: writes the temp file 'aligned.bam'.
    align_args = (
        mgd.InputFile(fastq_file_1),
        mgd.InputFile(fastq_file_2),
        mgd.InputFile(ref_genome_fasta_file),
        mgd.TempOutputFile('aligned.bam'),
    )
    wf.subworkflow(
        name='align',
        func=soil.wrappers.bwa.workflows.create_align_workflow,
        args=align_args,
        kwargs={'align_threads': threads, 'sort_threads': threads},
    )

    # Duplicate marking: temp BAM -> final BAM.
    markdup_args = (
        mgd.TempInputFile('aligned.bam'),
        mgd.OutputFile(out_bam_file),
        mgd.TempSpace('mark_dups_tmp'),
    )
    wf.transform(
        name='mark_dups',
        func=soil.wrappers.sambamba.tasks.markdups,
        args=markdup_args,
        kwargs={'threads': threads},
    )

    # Indexing of the final BAM.
    wf.commandline(
        name='index',
        args=(
            'samtools',
            'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return wf
def create_destruct_workflow(
        bam_filenames,
        breakpoint_table,
        breakpoint_library_table,
        breakpoint_read_table,
        config,
        ref_data_dir,
        raw_data_dir=None,
):
    """Build the destruct workflow starting from BAM files.

    Discordant reads and stats are extracted per library with
    ``destruct_bamdiscordantfastq`` and handed to
    ``create_destruct_fastq_workflow`` as a subworkflow.

    Args:
        bam_filenames: Mapping of library -> BAM path.
        breakpoint_table: Output path forwarded to the subworkflow.
        breakpoint_library_table: Output path forwarded to the subworkflow.
        breakpoint_read_table: Output path forwarded to the subworkflow.
        config: Passed to ``destruct.defaultconfig.get_config`` together with
            ``ref_data_dir``; the returned config is used from then on.
        ref_data_dir: Reference data directory.
        raw_data_dir: When not ``None``, the extracted reads/stats are kept as
            regular files under this directory instead of temp files.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    # Optionally cache raw reads for quicker rerun
    if raw_data_dir is None:
        stats_file = mgd.TempFile('stats.txt', 'bylibrary')
        reads_1_file = mgd.TempFile('reads1.fq.gz', 'bylibrary')
        reads_2_file = mgd.TempFile('reads2.fq.gz', 'bylibrary')
        sample_1_file = mgd.TempFile('sample1.fq.gz', 'bylibrary')
        sample_2_file = mgd.TempFile('sample2.fq.gz', 'bylibrary')
    else:
        stats_file = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_stats.txt'),
            'bylibrary')
        reads_1_file = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads1.fq.gz'),
            'bylibrary')
        reads_2_file = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads2.fq.gz'),
            'bylibrary')
        sample_1_file = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample1.fq.gz'),
            'bylibrary')
        sample_2_file = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample2.fq.gz'),
            'bylibrary')

    config = destruct.defaultconfig.get_config(ref_data_dir, config)

    wf = pypeliner.workflow.Workflow()

    # Set the library ids
    wf.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(bam_filenames.keys()),
    )

    # Retrieve discordant reads and stats from bam files
    bamdisc_cmd = (
        'destruct_bamdiscordantfastq',
        '-r',
        '-c', config['bam_max_soft_clipped'],
        '-f', config['bam_max_fragment_length'],
        '-b', mgd.InputFile('bam', 'bylibrary', fnames=bam_filenames),
        '-s', stats_file.as_output(),
        '--fastq1', reads_1_file.as_output(),
        '--fastq2', reads_2_file.as_output(),
        '-t', mgd.TempSpace('bamdisc.tempspace', 'bylibrary'),
        '-n', config['num_read_samples'],
        '--sample1', sample_1_file.as_output(),
        '--sample2', sample_2_file.as_output(),
    )
    wf.commandline(
        name='bamdisc',
        axes=('bylibrary', ),
        ctx={'io': 1, 'mem': 8},
        args=bamdisc_cmd,
    )

    # The fastq-level workflow consumes what 'bamdisc' produced.
    fastq_args = (
        reads_1_file.as_input(),
        reads_2_file.as_input(),
        sample_1_file.as_input(),
        sample_2_file.as_input(),
        stats_file.as_input(),
        mgd.OutputFile(breakpoint_table),
        mgd.OutputFile(breakpoint_library_table),
        mgd.OutputFile(breakpoint_read_table),
        config,
        ref_data_dir,
    )
    wf.subworkflow(
        name='destruct_fastq',
        func=create_destruct_fastq_workflow,
        args=fastq_args,
        kwargs={'raw_data_dir': raw_data_dir},
    )

    return wf
def create_destruct_fastq_workflow(
        fastq1_filenames,
        fastq2_filenames,
        sample1_filenames,
        sample2_filenames,
        stats_filenames,
        breakpoint_table,
        breakpoint_library_table,
        breakpoint_read_table,
        config,
        ref_data_dir,
        raw_data_dir=None,
):
    """Build the destruct workflow starting from per-library FASTQ files.

    Jobs are registered in this order: read stats, sample alignment and
    score stats, FASTQ splitting and realignment, merging of alignments,
    clustering, breakpoint prediction, realignment to breakpoints,
    likelihood calculation, set cover, selection, and tabulation.

    Args:
        fastq1_filenames: Mapping of library -> first discordant reads file.
        fastq2_filenames: Mapping of library -> second discordant reads file.
        sample1_filenames: Mapping of library -> first sampled reads file.
        sample2_filenames: Mapping of library -> second sampled reads file.
        stats_filenames: Mapping of library -> stats file.
        breakpoint_table: Output path written by the ``tabulate`` job.
        breakpoint_library_table: Output path written by the ``tabulate`` job.
        breakpoint_read_table: Output path written by the ``sortreads`` job.
        config: Mapping of destruct settings, read by key throughout.
        ref_data_dir: Not referenced in this function body.
        raw_data_dir: Not referenced in this function body.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    wf = pypeliner.workflow.Workflow()

    per_library = ('bylibrary', )
    per_split = ('bylibrary', 'byread')

    # Option block shared by both bowtie invocations below.
    bowtie_flags = (
        '--chunkmbs', '512',
        '-k', '1000',
        '-m', '1000',
        '--strata',
        '--best',
        '-S',
    )

    # Set the library ids
    wf.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(fastq1_filenames.keys()),
    )

    wf.transform(
        name='readstats',
        func='destruct.tasks.read_stats',
        axes=per_library,
        ctx=lowmem,
        ret=mgd.TempOutputObj('stats', 'bylibrary'),
        args=(
            mgd.InputFile('stats.txt', 'bylibrary', fnames=stats_filenames),
            config['fragment_length_num_stddevs'],
        ),
    )

    # Align a sample of reads and calculate alignment statistics
    wf.transform(
        name='prepseed_sample',
        func='destruct.tasks.prepare_seed_fastq',
        axes=per_library,
        ctx=medmem,
        args=(
            mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames),
            mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames),
            36,
            mgd.TempOutputFile('sample.seed', 'bylibrary'),
        ),
    )

    sample_align_cmd = (
        'bowtie',
        config['genome_fasta'],
        mgd.TempInputFile('sample.seed', 'bylibrary'),
    ) + bowtie_flags + (
        '|',
        'destruct_aligntrue',
        '-a', '-',
        '-1', mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames),
        '-2', mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames),
        '-r', config['genome_fasta'],
        '-g', config['gap_score'],
        '-x', config['mismatch_score'],
        '-m', config['match_score'],
        '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
        '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
        '-s', mgd.TempOutputFile('samples.align.true', 'bylibrary'),
    )
    wf.commandline(
        name='bwtrealign_sample',
        axes=per_library,
        ctx=medmem,
        args=sample_align_cmd,
    )

    wf.transform(
        name='scorestats',
        func='destruct.score_stats.create_score_stats',
        axes=per_library,
        ctx=medmem,
        args=(
            mgd.TempInputFile('samples.align.true', 'bylibrary'),
            config['match_score'],
            mgd.TempOutputFile('score.stats', 'bylibrary'),
        ),
    )

    # Split discordant fastqs and align
    wf.transform(
        name='splitfastq1',
        func='destruct.tasks.split_fastq',
        axes=per_library,
        ctx=lowmem,
        args=(
            mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads1', 'bylibrary', 'byread'),
        ),
    )

    # Unlike 'splitfastq1', the output here is declared with axes_origin=[].
    wf.transform(
        name='splitfastq2',
        func='destruct.tasks.split_fastq',
        axes=per_library,
        ctx=lowmem,
        args=(
            mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads2', 'bylibrary', 'byread', axes_origin=[]),
        ),
    )

    wf.transform(
        name='prepseed',
        func='destruct.tasks.prepare_seed_fastq',
        axes=per_split,
        ctx=medmem,
        args=(
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            36,
            mgd.TempOutputFile('reads.seed', 'bylibrary', 'byread'),
        ),
    )

    realign_cmd = (
        'bowtie',
        config['genome_fasta'],
        mgd.TempInputFile('reads.seed', 'bylibrary', 'byread'),
    ) + bowtie_flags + (
        '|',
        'destruct_realign2',
        '-l', mgd.TempInputObj('library_id', 'bylibrary'),
        '-a', '-',
        '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
        '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
        '-r', config['genome_fasta'],
        '-g', config['gap_score'],
        '-x', config['mismatch_score'],
        '-m', config['match_score'],
        '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
        '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
        '--tchimer', config['chimeric_threshold'],
        '--talign', config['alignment_threshold'],
        '--pchimer', config['chimeric_prior'],
        '--tvalid', config['readvalid_threshold'],
        '-z', mgd.TempInputFile('score.stats', 'bylibrary'),
        '--span', mgd.TempOutputFile('spanning.alignments', 'bylibrary', 'byread'),
        '--split', mgd.TempOutputFile('split.alignments', 'bylibrary', 'byread'),
    )
    wf.commandline(
        name='bwtrealign',
        axes=per_split,
        ctx=medmem,
        args=realign_cmd,
    )

    # Collapse the 'byread' axis of the spanning alignments, then filter.
    wf.transform(
        name='merge_spanning_1',
        func='destruct.tasks.merge_files_by_line',
        axes=per_library,
        ctx=lowmem,
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('spanning.alignments_1', 'bylibrary'),
        ),
    )

    wf.commandline(
        name='filterreads',
        axes=per_library,
        ctx=lowmem,
        args=(
            'destruct_filterreads',
            '-n', '2',
            '-a', mgd.TempInputFile('spanning.alignments_1', 'bylibrary'),
            '-r', config['satellite_regions'],
            '>', mgd.TempOutputFile('spanning.alignments', 'bylibrary'),
        ),
    )

    wf.transform(
        name='merge_split_1',
        func='destruct.tasks.merge_files_by_line',
        axes=per_library,
        ctx=lowmem,
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('split.alignments', 'bylibrary'),
        ),
    )

    # Collapse the 'bylibrary' axis for both alignment kinds.
    wf.transform(
        name='merge_spanning_2',
        func='destruct.tasks.merge_alignment_files',
        ctx=lowmem,
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary'),
            mgd.TempOutputFile('spanning.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    wf.transform(
        name='merge_split_2',
        func='destruct.tasks.merge_alignment_files',
        ctx=lowmem,
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary'),
            mgd.TempOutputFile('split.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    # Cluster spanning reads
    wf.setobj(
        obj=mgd.TempOutputObj('chrom.args', 'bychromarg'),
        value=destruct.tasks.generate_chromosome_args(config['chromosomes']),
    )

    wf.transform(
        name='write_stats_table',
        func='destruct.tasks.write_stats_table',
        ctx=lowmem,
        args=(
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.TempInputObj('stats', 'bylibrary'),
            mgd.TempOutputFile('libstats.tsv'),
        ),
    )

    wf.commandline(
        name='cluster',
        axes=('bychromarg', ),
        ctx=medmem,
        args=(
            'destruct_mclustermatepairs',
            '-a', mgd.TempInputFile('spanning.alignments'),
            '-s', mgd.TempInputFile('libstats.tsv'),
            '-c', mgd.TempOutputFile('clusters', 'bychromarg'),
            mgd.TempInputObj('chrom.args', 'bychromarg'),
            '--clustmin', config['cluster_readcount_threshold'],
            '--fragmax', config['fragment_length_max'],
        ),
    )

    # Predict breakpoints from split reads
    wf.transform(
        name='predict_breaks',
        func='destruct.predict_breaks.predict_breaks',
        axes=('bychromarg', ),
        ctx=medmem,
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('spanning.alignments'),
            mgd.TempInputFile('split.alignments'),
            mgd.TempOutputFile('breakpoints_2', 'bychromarg'),
        ),
    )

    wf.transform(
        name='merge_clusters',
        func='destruct.tasks.merge_clusters',
        ctx=lowmem,
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('breakpoints_2', 'bychromarg'),
            mgd.TempOutputFile('clusters'),
            mgd.TempOutputFile('breakpoints_2'),
            mgd.TempOutputFile('merge_clusters.debug'),
        ),
    )

    # Realign reads to breakpoints
    wf.commandline(
        name='realigntobreaks',
        axes=per_split,
        ctx=medmem,
        args=(
            'destruct_realigntobreaks2',
            '-r', config['genome_fasta'],
            '-b', mgd.TempInputFile('breakpoints_2'),
            '-c', mgd.TempInputFile('clusters'),
            '-g', config['gap_score'],
            '-x', config['mismatch_score'],
            '-m', config['match_score'],
            '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--span', mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '--realignments', mgd.TempOutputFile('realignments', 'bylibrary', 'byread'),
        ),
    )

    # Calculate likelihoods based on realignments
    wf.transform(
        name='calculate_realignment_likelihoods',
        func='destruct.predict_breaks.calculate_realignment_likelihoods',
        axes=per_split,
        ctx=medmem,
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempInputFile('realignments', 'bylibrary', 'byread'),
            mgd.TempInputFile('score.stats', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary', 'byread'),
            config['match_score'],
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_mean'),
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_stddev'),
        ),
    )

    # Two-stage merge: first over 'byread', then over 'bylibrary'.
    wf.transform(
        name='merge_likelihoods_1',
        func='destruct.tasks.merge_sorted_files_by_line',
        axes=per_library,
        ctx=lowmem,
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary', 'byread'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary'),
            mgd.TempSpace('merge_likelihoods_1_temp', 'bylibrary'),
            '1',
        ),
    )

    wf.transform(
        name='merge_likelihoods_2',
        func='destruct.tasks.merge_sorted_files_by_line',
        ctx=lowmem,
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2'),
            mgd.TempSpace('merge_likelihoods_2_temp'),
            '1',
        ),
    )

    # Set cover for multi mapping reads
    wf.transform(
        name='calc_weights',
        func='destruct.predict_breaks.calculate_cluster_weights',
        ctx=medmem,
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('cluster_weights'),
        ),
    )

    wf.commandline(
        name='setcover',
        ctx=medmem,
        args=(
            'destruct_setcover',
            '-c', mgd.TempInputFile('clusters'),
            '-w', mgd.TempInputFile('cluster_weights'),
            '-a', mgd.TempOutputFile('clusters_setcover'),
        ),
    )

    # Select cluster based on setcover
    wf.transform(
        name='select_clusters',
        func='destruct.predict_breaks.select_clusters',
        ctx=medmem,
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('breakpoints_1'),
            mgd.TempInputFile('likelihoods_2'),
            mgd.TempOutputFile('likelihoods_1'),
        ),
    )

    # Select prediction based on max likelihood
    wf.transform(
        name='select_predictions',
        func='destruct.predict_breaks.select_predictions',
        ctx=himem,
        args=(
            mgd.TempInputFile('breakpoints_1'),
            mgd.TempOutputFile('breakpoints'),
            mgd.TempInputFile('likelihoods_1'),
            mgd.TempOutputFile('likelihoods'),
            config['mate_score_threshold'],
            config['template_length_min_threshold'],
            config['min_alignment_log_likelihood'],
        ),
    )

    # Optionally tabulate supporting reads
    wf.transform(
        name='tabreads',
        func='destruct.tasks.tabulate_reads',
        ctx=medmem,
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames),
            mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames),
            mgd.TempOutputFile('breakreads.table.unsorted'),
        ),
    )

    wf.commandline(
        name='sortreads',
        ctx=medmem,
        args=(
            'sort', '-n',
            mgd.TempInputFile('breakreads.table.unsorted'),
            '>',
            mgd.OutputFile(breakpoint_read_table),
        ),
    )

    # Tabulate results
    wf.transform(
        name='tabulate',
        func='destruct.tasks.tabulate_results',
        ctx=himem,
        args=(
            mgd.TempInputFile('breakpoints'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            config['genome_fasta'],
            config['gtf_filename'],
            config['dgv_filename'],
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
        ),
    )

    return wf
def create_search_workflow(in_fasta_file,
                           in_mzml_file,
                           out_file,
                           add_decoys=True,
                           fixed_mods=None,
                           max_mods=1,
                           precursor_mass_tolerance='20ppm',
                           search_mem=5,
                           split_size=1000,
                           variable_mods=None):
    """Build an MS-GF+ search workflow over a split mzML file.

    Jobs: index the database, split the mzML, search each split, convert
    each result to TSV, merge, convert to the final format, and clean up.

    Args:
        in_fasta_file: Path of the FASTA database.
        in_mzml_file: Path of the mzML input.
        out_file: Output path written by the ``clean_up`` job.
        add_decoys: Forwarded to the ``index_db`` job only; the search job
            is always given ``add_decoys=False``.
        fixed_mods: Forwarded to the search job.
        max_mods: Forwarded to the search job.
        precursor_mass_tolerance: Forwarded to the search job.
        search_mem: Forwarded to the search job as ``mem``; the job context
            requests ``search_mem + 3``.
        split_size: Forwarded to the mzML splitting job.
        variable_mods: Forwarded to the search job.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    sandbox = soil.utils.workflow.get_sandbox(['msgf_plus', 'proteowizard'])
    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    wf.transform(
        name='index_db',
        func=tasks.build_index_sentinel,
        ctx={'mem': 4, 'mem_retry_increment': 8, 'num_retry': 3},
        args=(
            mgd.InputFile(in_fasta_file),
            mgd.TempOutputFile('db.sentinel'),
        ),
        kwargs={'add_decoys': add_decoys},
    )

    # Defines the 'split' axis through its output.
    wf.transform(
        name='split_mzml_file',
        func=soil.wrappers.proteowizard.tasks.split_mzml_file,
        args=(
            mgd.InputFile(in_mzml_file),
            mgd.TempOutputFile('spec_data.mzml', 'split'),
            mgd.TempSpace('split_tmp'),
        ),
        kwargs={'split_size': split_size},
    )

    search_kwargs = {
        'add_decoys': False,
        'fixed_mods': fixed_mods,
        'max_mods': max_mods,
        'mem': search_mem,
        'precursor_mass_tolerance': precursor_mass_tolerance,
        'variable_mods': variable_mods,
    }
    wf.transform(
        name='run_msgf_plus',
        func=tasks.run_search_sentinel,
        axes=('split', ),
        ctx={'mem': search_mem + 3, 'mem_retry_increment': 4, 'num_retry': 3},
        args=(
            mgd.TempInputFile('db.sentinel'),
            mgd.TempInputFile('spec_data.mzml', 'split'),
            mgd.TempOutputFile('search.mzid', 'split'),
            mgd.TempSpace('msgf_tmp', 'split'),
        ),
        kwargs=search_kwargs,
    )

    wf.transform(
        name='convert_to_tsv',
        func=tasks.convert_mzid_to_tsv,
        axes=('split', ),
        ctx={'mem': 8, 'mem_retry_increment': 4, 'num_retry': 3},
        args=(
            mgd.TempInputFile('search.mzid', 'split'),
            mgd.TempOutputFile('search.tsv', 'split'),
        ),
    )

    # Collapses the 'split' axis.
    wf.transform(
        name='merge_results',
        func=tasks.merge_results,
        args=(
            mgd.TempInputFile('search.tsv', 'split'),
            mgd.TempOutputFile('merged.tsv'),
        ),
    )

    wf.transform(
        name='convert_output',
        func=tasks.convert_msgf_to_final,
        args=(
            mgd.TempInputFile('merged.tsv'),
            mgd.TempOutputFile('final.tsv.gz'),
        ),
    )

    wf.transform(
        name='clean_up',
        func=tasks.clean_up,
        args=(
            mgd.TempInputFile('db.sentinel'),
            mgd.TempInputFile('final.tsv.gz'),
            mgd.OutputFile(out_file),
        ),
    )

    return wf
def create_percolator_workflow(in_fasta_file,
                               in_mzml_file,
                               out_file,
                               fixed_mods=None,
                               max_mods=1,
                               split_size=1000,
                               variable_mods=None):
    """Build a target/decoy MS-GF+ search workflow rescored with Percolator.

    Besides ``out_file`` the workflow also writes a second output whose path
    is ``out_file.replace('.tsv', '.msgf.tsv.gz')``.

    Args:
        in_fasta_file: Path of the target FASTA database.
        in_mzml_file: Path of the mzML input.
        out_file: Output path written by the ``clean_up_decoy`` job.
        fixed_mods: Forwarded to both search jobs.
        max_mods: Forwarded to both search jobs.
        split_size: Forwarded to the mzML splitting job.
        variable_mods: Forwarded to both search jobs.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['msgf_plus', 'percolator', 'proteowizard'])
    wf = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    def retry_ctx(mem):
        # A fresh dict per job so no two jobs share one context object.
        return {'mem': mem, 'mem_retry_increment': 4, 'num_retry': 3}

    def search_options():
        # Keyword arguments common to the decoy and target searches.
        return {
            'add_decoys': False,
            'add_features': True,
            'fixed_mods': fixed_mods,
            'max_mods': max_mods,
            'variable_mods': variable_mods,
        }

    # --- Databases -------------------------------------------------------
    wf.transform(
        name='build_decoy_db',
        func=tasks.build_decoy_db,
        args=(
            mgd.InputFile(in_fasta_file),
            mgd.TempOutputFile('decoy.no_index.fasta'),
        ),
        kwargs={'decoy_prefix': 'XXX_'},
    )

    wf.transform(
        name='index_decoy_db',
        func=tasks.build_index,
        args=(
            mgd.TempInputFile('decoy.no_index.fasta'),
            mgd.TempOutputFile('decoy.fasta'),
        ),
        kwargs={'add_decoys': False},
    )

    wf.transform(
        name='index_target_db',
        func=tasks.build_index,
        args=(
            mgd.InputFile(in_fasta_file),
            mgd.TempOutputFile('target.fasta'),
        ),
        kwargs={'add_decoys': False},
    )

    # --- Spectra ---------------------------------------------------------
    wf.transform(
        name='split_mzml_file',
        func=soil.wrappers.proteowizard.tasks.split_mzml_file,
        args=(
            mgd.InputFile(in_mzml_file),
            mgd.TempOutputFile('spec_data.mzml', 'split'),
            mgd.TempSpace('split_tmp'),
        ),
        kwargs={'split_size': split_size},
    )

    # --- Searches (per split) --------------------------------------------
    wf.transform(
        name='run_msgf_plus_decoy',
        func=tasks.run_search,
        axes=('split', ),
        ctx=retry_ctx(8),
        args=(
            mgd.TempInputFile('decoy.fasta'),
            mgd.TempInputFile('spec_data.mzml', 'split'),
            mgd.TempOutputFile('decoy_search_results.mzid', 'split'),
            mgd.TempSpace('msgf_decoy_tmp', 'split'),
        ),
        kwargs=search_options(),
    )

    wf.transform(
        name='run_msgf_plus_target',
        func=tasks.run_search,
        axes=('split', ),
        ctx=retry_ctx(8),
        args=(
            mgd.TempInputFile('target.fasta'),
            mgd.TempInputFile('spec_data.mzml', 'split'),
            mgd.TempOutputFile('target_search_results.mzid', 'split'),
            mgd.TempSpace('msgf_target_tmp', 'split'),
        ),
        kwargs=search_options(),
    )

    # --- Plain MS-GF+ output ---------------------------------------------
    wf.transform(
        name='convert_to_tsv_decoy',
        func=tasks.convert_mzid_to_tsv,
        axes=('split', ),
        ctx=retry_ctx(8),
        args=(
            mgd.TempInputFile('decoy_search_results.mzid', 'split'),
            mgd.TempOutputFile('decoy_search.tsv', 'split'),
        ),
    )

    wf.transform(
        name='convert_to_tsv_target',
        func=tasks.convert_mzid_to_tsv,
        axes=('split', ),
        ctx=retry_ctx(8),
        args=(
            mgd.TempInputFile('target_search_results.mzid', 'split'),
            mgd.TempOutputFile('target_search.tsv', 'split'),
        ),
    )

    tsv_inputs = [
        mgd.TempInputFile('decoy_search.tsv', 'split'),
        mgd.TempInputFile('target_search.tsv', 'split'),
    ]
    wf.transform(
        name='merge_results',
        func=tasks.merge_results,
        args=(tsv_inputs, mgd.TempOutputFile('merged.tsv')),
    )

    msgf_out_file = out_file.replace('.tsv', '.msgf.tsv.gz')
    wf.transform(
        name='convert_output',
        func=tasks.convert_msgf_to_final,
        args=(
            mgd.TempInputFile('merged.tsv'),
            mgd.OutputFile(msgf_out_file),
        ),
    )

    # --- Percolator ------------------------------------------------------
    wf.transform(
        name='run_msgf2pin',
        func=soil.wrappers.percolator.tasks.convert_msgf_to_pin,
        ctx=retry_ctx(4),
        args=(
            mgd.TempInputFile('decoy_search_results.mzid', 'split'),
            mgd.TempInputFile('target_search_results.mzid', 'split'),
            mgd.TempOutputFile('percolator_input.tsv'),
            mgd.TempSpace('msgf2pin_tmp'),
        ),
    )

    wf.transform(
        name='run_percolator',
        func=soil.wrappers.percolator.tasks.run_percolator,
        ctx=retry_ctx(8),
        args=(
            mgd.TempInputFile('percolator_input.tsv'),
            mgd.TempOutputFile('final.tsv'),
        ),
    )

    # --- Final output ----------------------------------------------------
    db_files = [
        mgd.TempInputFile('decoy.fasta'),
        mgd.TempInputFile('target.fasta'),
    ]
    wf.transform(
        name='clean_up_decoy',
        func=tasks.clean_up,
        args=(
            db_files,
            mgd.TempInputFile('final.tsv'),
            mgd.OutputFile(out_file),
        ),
    )

    return wf
def infer_haps(
        bam_file,
        haplotypes_filename,
        allele_counts_filename,
        config,
        normal=False,
):
    """Build the haplotype inference and allele read-count workflow.

    Seqdata is extracted from ``bam_file`` (per cell and then merged when a
    dict of bams is given, directly otherwise).  SNP genotypes and haplotypes
    are then inferred per chromosome, the haplotype tables are merged into
    ``haplotypes_filename``, segments are created, and haplotype allele read
    counts are written to ``allele_counts_filename``.

    Args:
        bam_file: Either a single bam path, or a dict whose keys become the
            ``cell_id`` axis and which is passed as ``fnames`` for the
            per-cell bam inputs.
        haplotypes_filename: Output path of the merged haplotype table.
        allele_counts_filename: Output path of the allele read counts.
        config: Mapping read for ``docker.single_cell_pipeline``,
            ``extract_seqdata`` (optional), ``ref_data_dir`` and
            ``chromosomes``; also handed whole to the genotyping task.
        normal: Selects ``infer_snp_genotype_from_normal`` when true,
            ``infer_snp_genotype_from_tumour`` otherwise.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    pipeline_image = config['docker']['single_cell_pipeline']
    seqdata_config = config.get('extract_seqdata', {})
    ref_data_dir = config['ref_data_dir']
    chromosome_list = config['chromosomes']

    base_ctx = dict(
        mem_retry_increment=2,
        disk_retry_increment=50,
        ncpus=1,
        docker_image=pipeline_image,
    )

    def mem16_ctx():
        # Fresh dict per job: the workflow defaults plus a 16 unit memory request.
        return dict(mem=16, **base_ctx)

    workflow = pypeliner.workflow.Workflow(ctx=base_ctx)

    if not isinstance(bam_file, dict):
        # if its a single bam, then its probably whole genome
        # so parallelize over chromosomes
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                seqdata_config,
                ref_data_dir,
            ),
        )
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # dont parallelize over chromosomes for per cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id', ),
            func='single_cell.workflows.extract_seqdata.create_extract_seqdata_workflow',
            args=(
                mgd.InputFile(
                    'bam_markdups', 'cell_id',
                    fnames=bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                config.get('extract_seqdata', {}),
                ref_data_dir,
                config,
            ),
        )

        workflow.transform(
            name='merge_all_seqdata',
            func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                chromosome_list,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosome_list,
    )

    genotype_func = (
        'remixt.analysis.haplotype.infer_snp_genotype_from_normal'
        if normal else
        'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'
    )

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome', ),
        ctx=mem16_ctx(),
        func=genotype_func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome', ),
        ctx=mem16_ctx(),
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            seqdata_config,
            ref_data_dir,
        ),
    )

    workflow.transform(
        name='merge_haps',
        ctx=mem16_ctx(),
        func='remixt.utils.merge_tables',
        args=(
            mgd.OutputFile(haplotypes_filename),
            mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
        ),
    )

    workflow.transform(
        name='create_segments',
        ctx=mem16_ctx(),
        func='remixt.analysis.segment.create_segments',
        args=(
            mgd.TempOutputFile('segments.tsv'),
            seqdata_config,
            ref_data_dir,
        ),
    )

    # Reads back the merged haplotypes written by the merge_haps step.
    workflow.transform(
        name='haplotype_allele_readcount',
        ctx=mem16_ctx(),
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(
            mgd.OutputFile(allele_counts_filename),
            mgd.TempInputFile('segments.tsv'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputFile(haplotypes_filename),
            seqdata_config,
        ),
    )

    return workflow
def run_LoLoPicker(config, args, normal_bam, tumour_bam, output_file):
    """Build the LoLoPicker workflow, parallelised over a ``region`` axis.

    The ``region`` axis holds the labels ``'1'`` .. ``'22'`` and ``'X'``.
    For every region a bed file is created from ``config["bed_file"]`` and
    the somatic, control and stats tasks are run; the per-region stats calls
    are then merged into ``output_file``.

    Args:
        config: Pipeline configuration; ``config["bed_file"]`` is read here
            and the whole object is forwarded to the somatic and control
            tasks.
        args: Forwarded unchanged to ``tasks.make_sample_list``.
        normal_bam: Path of the normal bam (managed input).
        tumour_bam: Path of the tumour bam (managed input).
        output_file: Path of the merged output (managed output).

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    workflow = pypeliner.workflow.Workflow()

    # ``range(1, 23) + ['X']`` only works on Python 2, where range() returns
    # a list; on Python 3 it raises TypeError.  Building the string labels
    # first gives the same list on both versions.
    regions = [str(chrom) for chrom in range(1, 23)] + ['X']

    workflow.setobj(obj=mgd.OutputChunks('region', ), value=regions)

    workflow.transform(
        name='create_axes_beds',
        axes=('region', ),
        func=tasks.create_axes_beds,
        args=(
            mgd.InputFile(config["bed_file"]),
            mgd.InputInstance('region'),
            mgd.TempOutputFile('region.bed', 'region'),
        ),
    )

    workflow.transform(
        name='LoLoPicker_somatic',
        axes=('region', ),
        func=tasks.LoLoPicker_somatic,
        args=(
            config,
            mgd.InputFile(tumour_bam),
            mgd.InputFile(normal_bam),
            mgd.TempInputFile('region.bed', 'region'),
            mgd.TempSpace('LoLoPicker_somatic_temp', 'region'),
            mgd.TempOutputFile("raw_somatic_varants.txt", 'region'),
        ),
    )

    workflow.transform(
        name='make_sample_list',
        func=tasks.make_sample_list,
        args=(
            args,
            mgd.TempOutputFile('samplelist.txt'),
        ),
    )

    workflow.transform(
        name='LoLoPicker_control',
        axes=('region', ),
        func=tasks.LoLoPicker_control,
        args=(
            config,
            mgd.TempInputFile('samplelist.txt'),
            mgd.TempSpace('LoLoPicker_control_temp', 'region'),
            mgd.TempInputFile("raw_somatic_varants.txt", 'region'),
            mgd.TempOutputFile("control_stats.txt", 'region'),
        ),
    )

    workflow.transform(
        name='LoLoPicker_stats',
        axes=('region', ),
        func=tasks.LoLoPicker_stats,
        args=(
            mgd.TempSpace('LoLoPicker_stats_temp', 'region'),
            mgd.TempInputFile("raw_somatic_varants.txt", 'region'),
            mgd.TempInputFile("control_stats.txt", 'region'),
            mgd.TempOutputFile("stats_calls.txt", 'region'),
        ),
    )

    workflow.transform(
        name='merge_LoLoPicker',
        func=tasks.merge_LoLoPicker,
        args=(
            mgd.TempSpace("merge_LoLo"),
            mgd.TempInputFile("stats_calls.txt", 'region', axes_origin=[]),
            mgd.OutputFile(output_file),
        ),
    )

    return workflow
def analyze_tumour_normal(config, input_args, results_dir, normal_bam,
                          tumour_sample, tumour_bam, snv_tsv, indel_tsv,
                          snv_vcf, indel_vcf):
    """Run every caller on one tumour/normal pair and union the results.

    A per-sample directory ``<results_dir>/<tumour_sample>`` is created up
    front (via ``helpers.makedirs``) and receives the raw output of each
    caller sub-workflow.  The SNV outputs are collected into a result
    dictionary and unioned into ``snv_tsv`` / ``snv_vcf``; the Strelka and
    VarScan indel outputs are unioned into ``indel_tsv`` / ``indel_vcf``.

    Args:
        config: Pipeline configuration, forwarded to every sub-workflow and
            to both union tasks.
        input_args: Forwarded to the LoLoPicker sub-workflow.
        results_dir: Directory under which the per-sample directory is made.
        normal_bam: Path of the normal bam.
        tumour_sample: Tumour sample name; names the per-sample directory.
        tumour_bam: Path of the tumour bam.
        snv_tsv: Output path of the unioned SNV table.
        indel_tsv: Output path of the unioned indel table.
        snv_vcf: Output path of the unioned SNV vcf.
        indel_vcf: Output path of the unioned indel vcf.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    workflow = pypeliner.workflow.Workflow()

    matched_results_dir = os.path.join(results_dir, tumour_sample)

    # NOTE: the directory is created while the workflow is being built,
    # not when it is run.
    helpers.makedirs(matched_results_dir)

    # Raw per-caller outputs, all inside the per-sample directory.
    deepsnv_tsv = os.path.join(matched_results_dir, 'deepSNV_out.tsv')
    varscan_snv_vcf = os.path.join(matched_results_dir, 'VarScan_out.vcf')
    varscan_indel_vcf = os.path.join(
        matched_results_dir, 'VarScan_indel_out.vcf')
    museq_vcf = os.path.join(matched_results_dir, 'museq_out.vcf')
    strelka_snv_vcf = os.path.join(matched_results_dir, 'strelka_out.vcf')
    strelka_indel_vcf = os.path.join(
        matched_results_dir, 'strelka_indel_out.vcf')
    lolopicker_tsv = os.path.join(matched_results_dir, 'LoLoPicker_out.tsv')

    workflow.subworkflow(
        name='run_deepSNV',
        func=deepSNV.run_deepSNV,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.OutputFile(deepsnv_tsv),
        ),
    )

    workflow.subworkflow(
        name='run_VarScan',
        func=VarScan.run_VarScan,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.OutputFile(varscan_snv_vcf),
            mgd.OutputFile(varscan_indel_vcf),
        ),
    )

    workflow.subworkflow(
        name='run_MutationSeq',
        func=MutationSeq.run_MutationSeq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.OutputFile(museq_vcf),
        ),
    )

    workflow.subworkflow(
        name='run_Strelka',
        func=Strelka.run_Strelka,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.OutputFile(strelka_snv_vcf),
            mgd.OutputFile(strelka_indel_vcf),
        ),
    )

    workflow.subworkflow(
        name='run_LoLoPicker',
        func=LoLoPicker.run_LoLoPicker,
        args=(
            config,
            input_args,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.OutputFile(lolopicker_tsv),
        ),
    )

    # The task's return value is stored as a temporary managed object.
    workflow.transform(
        name='create_result_dict',
        func=union.create_result_dict,
        ret=mgd.TempOutputObj('result_dict'),
        args=(
            mgd.InputFile(deepsnv_tsv),
            mgd.InputFile(varscan_snv_vcf),
            mgd.InputFile(museq_vcf),
            mgd.InputFile(strelka_snv_vcf),
            mgd.InputFile(lolopicker_tsv),
        ),
    )

    workflow.transform(
        name='union_results',
        func=union.union_results,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.TempInputObj('result_dict'),
            mgd.TempSpace('union_space'),
            mgd.OutputFile(snv_tsv),
            mgd.OutputFile(snv_vcf),
        ),
    )

    workflow.transform(
        name='union_indels',
        func=union.union_indels,
        args=(
            config,
            mgd.InputFile(strelka_indel_vcf),
            mgd.InputFile(varscan_indel_vcf),
            mgd.OutputFile(indel_tsv),
            mgd.OutputFile(indel_vcf),
        ),
    )

    return workflow
def create_delly_wrapper_workflow(bam_filenames, output_filename, raw_data_dir, control_id=None, ref_genome_fasta_file=None, delly_excl_chrom=None):
    """Build the delly calling workflow for a set of libraries.

    Each bam (and its ``.bai``) is first symlinked into ``raw_data_dir`` as
    ``<lib_id>.bam`` / ``<lib_id>.bam.bai``; this happens while the workflow
    is being built.  Delly is then called once per ``sv_type`` chunk,
    optionally filtered against a control, and the per-type bcf files are
    concatenated and converted to ``output_filename``.

    Args:
        bam_filenames: Dict mapping library id to bam path.
        output_filename: Path of the final converted output.
        raw_data_dir: Directory in which the bam/bai symlinks are created.
        control_id: Library id of the control.  When ``None`` the somatic
            filtering step is skipped and the raw calls are concatenated.
        ref_genome_fasta_file: Reference genome fasta path.
        delly_excl_chrom: Forwarded unchanged to the delly call task.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    bams = []
    for lib_id, bam_filename in bam_filenames.items():
        bams.append(destruct.benchmark.wrappers.utils.symlink(
            bam_filename,
            link_name='{0}.bam'.format(lib_id),
            link_directory=raw_data_dir))
        # The index is linked next to the bam under the matching name.
        destruct.benchmark.wrappers.utils.symlink(
            bam_filename + '.bai',
            link_name='{0}.bam.bai'.format(lib_id),
            link_directory=raw_data_dir)

    workflow = pypeliner.workflow.Workflow()

    # The task's return value defines the chunks of the sv_type axis.
    workflow.transform(
        name='get_sv_types',
        func=destruct.benchmark.wrappers.delly.tasks.get_sv_types,
        ret=pypeliner.managed.OutputChunks('sv_type'),
        args=(
            mgd.InputFile(ref_genome_fasta_file),
        ),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type',),
        ctx={'mem': 64, 'num_retry': 2, 'mem_retry_factor': 2},
        func=destruct.benchmark.wrappers.delly.tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    if control_id is None:
        concat_input = mgd.TempInputFile('out.bcf', 'sv_type')

    else:
        workflow.transform(
            name='delly_filter_somatic',
            axes=('sv_type',),
            ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
            func=destruct.benchmark.wrappers.delly.tasks.run_delly_filter,
            args=(
                mgd.Instance('sv_type'),
                # Pass a real list: on Python 3 ``dict.keys()`` is a view
                # object, which cannot be pickled as a job argument.
                list(bam_filenames),
                control_id,
                mgd.TempSpace('samples.tsv'),
                ref_genome_fasta_file,
                mgd.TempInputFile('out.bcf', 'sv_type'),
                mgd.TempOutputFile('somatic.bcf', 'sv_type'),
            ),
        )

        concat_input = mgd.TempInputFile('somatic.bcf', 'sv_type')

    workflow.transform(
        name='concatenate_vcf',
        func=destruct.benchmark.wrappers.tasks.concatenate_bcf,
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        args=(
            concat_input,
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=destruct.benchmark.wrappers.delly.tasks.convert_vcf,
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        }
    )

    return workflow
def create_allele_counts_workflow(normal_bam_file,
                                  tumour_bam_file,
                                  dbsnp_vcf_file,
                                  ref_genome_fasta_file,
                                  allele_counts_file,
                                  chromosomes='autosomes'):
    """Build the workflow that counts tumour alleles at normal het SNPs.

    SNPs are called on the normal bam with the platypus single-sample
    sub-workflow, passed through three ``SnpSift`` command lines
    (``annotate``, ``varType``, ``filter``), split into chunks, counted in
    the tumour bam per chunk, and merged into ``allele_counts_file``.

    Args:
        normal_bam_file: Normal bam path; used for chromosome lookup and as
            the SNP calling input.
        tumour_bam_file: Tumour bam path; input of the allele counting step.
        dbsnp_vcf_file: VCF path given to ``SnpSift annotate``.
        ref_genome_fasta_file: Reference fasta path for SNP calling.
        allele_counts_file: Output path of the merged counts.
        chromosomes: Selector handed to
            ``soil.utils.genome.load_bam_chromosome_lengths``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    selected_chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, chromosomes)

    sandbox = soil.utils.workflow.get_sandbox(['snpsift'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    def retry_ctx():
        # Fresh resource request for each step that uses it.
        return {'mem': 6, 'mem_retry_increment': 4, 'num_retry': 3}

    het_snp_filter = (
        "isHet(GEN[0]) & ((exists ID) & ( ID =~ 'rs' )) & (exists SNP)")

    workflow.subworkflow(
        name='call_snps',
        func=soil.wrappers.platypus.workflows.create_single_sample_workflow,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('normal.vcf.gz'),
        ),
        kwargs={
            'chromosomes': selected_chromosomes,
            'split_size': int(1e7),
        },
    )

    workflow.commandline(
        name='annotate_dbsnp_status',
        ctx=retry_ctx(),
        args=(
            'SnpSift',
            'annotate',
            mgd.InputFile(dbsnp_vcf_file),
            mgd.TempInputFile('normal.vcf.gz'),
            '>',
            mgd.TempOutputFile('normal.dbsnp.vcf'),
        ),
    )

    workflow.commandline(
        name='annotate_variant_type',
        ctx=retry_ctx(),
        args=(
            'SnpSift',
            'varType',
            mgd.TempInputFile('normal.dbsnp.vcf'),
            '>',
            mgd.TempOutputFile('normal.dbsnp.vartype.vcf'),
        ),
    )

    workflow.commandline(
        name='filter_het_snps',
        ctx=retry_ctx(),
        args=(
            'SnpSift',
            'filter',
            het_snp_filter,
            mgd.TempInputFile('normal.dbsnp.vartype.vcf'),
            '>',
            mgd.TempOutputFile('het.snps.vcf'),
        ),
    )

    # Introduces the 'split' axis used by the counting step below.
    workflow.transform(
        name='split_vcf',
        ctx=retry_ctx(),
        func=tasks.split_vcf,
        args=(
            mgd.TempInputFile('het.snps.vcf'),
            mgd.TempOutputFile('split.vcf', 'split'),
            mgd.TempSpace('split_tmp'),
        ),
        kwargs={'split_size': int(1e4)},
    )

    workflow.transform(
        name='get_allele_counts',
        axes=('split', ),
        func=tasks.get_snv_allele_counts_for_vcf_targets,
        args=(
            mgd.InputFile(tumour_bam_file),
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('split.tsv', 'split'),
        ),
    )

    workflow.transform(
        name='merge_counts',
        func=tasks.merge_counts,
        args=(
            mgd.TempInputFile('split.tsv', 'split'),
            mgd.OutputFile(allele_counts_file),
        ),
    )

    return workflow
def create_titan_workflow(normal_bam_file,
                          tumour_bam_file,
                          dbsnp_vcf_file,
                          mappability_file,
                          ref_genome_fasta_file,
                          out_file,
                          exome_bed_file=None,
                          sample='Tumour',
                          threads=1):
    """Build the TITAN workflow.

    Allele counts come from ``create_allele_counts_workflow``.  Four wig
    files (normal, tumour, gc, mappability) are produced with
    ``readCounter`` / ``gcCounter`` / ``mapCounter`` and combined into a
    coverage file.  ``tasks.run_titan`` is then run once per ``param_idx``
    chunk, and the runs are summarised and assembled into ``out_file``.

    Args:
        normal_bam_file: Normal bam path.
        tumour_bam_file: Tumour bam path.
        dbsnp_vcf_file: dbSNP vcf path, forwarded to the allele count
            sub-workflow.
        mappability_file: Input path given to ``mapCounter``.
        ref_genome_fasta_file: Reference fasta path.
        out_file: Final output path.
        exome_bed_file: Passed as ``target_file`` to the coverage task;
            ``is_exome`` is set for the TITAN task when it is not ``None``.
        sample: Forwarded to ``tasks.run_titan``.
        threads: Used both in the job context and as a task keyword.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['hmmcopy', 'hmmcopy_utils', 'titan'])

    # The sandbox returned above is extended in place.
    sandbox.channels.append('conda-forge')
    sandbox.packages.extend(['pandas', 'rpy2'])

    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, 'autosomes')

    chromosome_arg = ','.join(chromosomes)

    # (step name, counting tool, input path, temp wig output)
    wig_steps = (
        ('build_normal_wig', 'readCounter', normal_bam_file, 'normal.wig'),
        ('build_tumour_wig', 'readCounter', tumour_bam_file, 'tumour.wig'),
        ('build_gc_wig', 'gcCounter', ref_genome_fasta_file, 'gc.wig'),
        ('build_mappability_wig', 'mapCounter', mappability_file,
         'mappability.wig'),
    )

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(
        obj=mgd.TempOutputObj('init_params', 'param_idx'),
        value=tasks.create_intialization_parameters(),
    )

    workflow.subworkflow(
        name='get_allele_counts',
        func=create_allele_counts_workflow,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(dbsnp_vcf_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('allele_counts.tsv'),
        ),
        kwargs={'chromosomes': 'autosomes'},
    )

    for step_name, tool, in_path, wig_name in wig_steps:
        workflow.commandline(
            name=step_name,
            args=(
                tool,
                '-c',
                chromosome_arg,
                mgd.InputFile(in_path),
                '>',
                mgd.TempOutputFile(wig_name),
            ),
        )

    workflow.transform(
        name='build_coverage_file',
        func=tasks.build_coverage_file,
        args=(
            mgd.TempInputFile('normal.wig'),
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('gc.wig'),
            mgd.TempInputFile('mappability.wig'),
            mgd.TempOutputFile('coverage.wig'),
        ),
        kwargs={'target_file': exome_bed_file},
    )

    workflow.transform(
        name='run_titan',
        axes=('param_idx', ),
        ctx={
            'mem': 8,
            'mem_retry_increment': 4,
            'num_retry': 3,
            'threads': threads,
        },
        func=tasks.run_titan,
        args=(
            mgd.TempInputFile('coverage.wig'),
            mgd.TempInputFile('allele_counts.tsv'),
            mgd.TempInputObj('init_params', 'param_idx'),
            mgd.TempOutputFile('run.tar.gz', 'param_idx'),
            mgd.TempSpace('titan_tmp', 'param_idx'),
        ),
        kwargs={
            'is_exome': (exome_bed_file is not None),
            'sample': sample,
            'threads': threads,
        },
    )

    workflow.transform(
        name='build_run_stats_file',
        func=tasks.build_run_stats_file,
        args=(
            mgd.TempInputFile('run.tar.gz', 'param_idx'),
            mgd.TempInputObj('init_params', 'param_idx'),
            mgd.TempOutputFile('stats.tsv'),
        ),
    )

    workflow.transform(
        name='build_output',
        func=tasks.build_final_results_file,
        args=(
            mgd.TempInputFile('coverage.wig'),
            mgd.TempInputFile('allele_counts.tsv'),
            mgd.TempInputFile('run.tar.gz', 'param_idx'),
            mgd.TempInputFile('stats.tsv'),
            mgd.OutputFile(out_file),
            mgd.TempSpace('build_results'),
        ),
    )

    return workflow
def infer_haps(
        bam_file,
        haplotypes_filename,
        config,
        from_tumour=False,
):
    """Build the haplotype inference workflow.

    Seqdata is extracted from ``bam_file`` (per cell and then merged when a
    dict of bams is given, directly otherwise).  SNP genotypes and haplotypes
    are inferred per chromosome, the per-chromosome tables are merged, and
    the merged table is rewritten to ``haplotypes_filename`` with a header.

    Args:
        bam_file: Either a single bam path, or a dict whose keys become the
            ``cell_id`` axis and which is passed as ``fnames`` for the
            per-cell bam inputs.
        haplotypes_filename: Output path of the final haplotype csv (tracked
            together with its ``.yaml`` companion).
        config: Mapping read for ``docker.single_cell_pipeline``,
            ``docker.remixt``, ``extract_seqdata`` (optional),
            ``ref_data_dir`` and ``chromosomes``; also handed whole to the
            genotyping task.  It is not modified.
        from_tumour: Selects ``infer_snp_genotype_from_tumour`` when true,
            ``infer_snp_genotype_from_normal`` otherwise.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}

    remixt_image = config['docker']['remixt']

    # Work on a shallow copy: the chromosome list is injected below, and
    # writing it into the dict returned by config.get() would silently
    # modify the caller's config['extract_seqdata'].
    remixt_config = dict(config.get('extract_seqdata', {}))
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1, **baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # dont parallelize over chromosomes for per cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id',),
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'docker_image': remixt_image},
            args=(
                mgd.InputFile(
                    'bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']
                ),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                remixt_config,
                remixt_ref_data_dir,
            ),
            kwargs={'no_parallelism': True}
        )
        workflow.transform(
            name='merge_all_seqdata',
            func="remixt.seqdataio.merge_overlapping_seqdata",
            ctx={'docker_image': remixt_image},
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                config["chromosomes"]
            ),
        )
    else:
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150, 'docker_image': remixt_image},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if from_tumour:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.utils.merge_tables',
        args=(
            mgd.TempOutputFile('haplotypes_merged.tsv'),
            mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
        )
    )

    # No docker_image override here, so this step uses the workflow default.
    workflow.transform(
        name='finalize_csv',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('haplotypes_merged.tsv'),
            mgd.OutputFile(haplotypes_filename, extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['haplotypes']
        },
    )

    return workflow
def create_qc_annotation_workflow(
        hmmcopy_metrics,
        hmmcopy_reads,
        alignment_metrics,
        gc_metrics,
        segs_tar,
        merged_metrics,
        qc_report,
        corrupt_tree,
        consensus_tree,
        phylo_csv,
        rank_trees,
        filtered_data,
        corrupt_tree_pdf,
        pass_segs,
        fail_segs,
        corrupt_tree_heatmap_output,
        plot_heatmap_ec_filt_output,
        config,
        library_id,
        no_corrupt_tree=False,
):
    """Build the QC annotation workflow.

    The hmmcopy and alignment metrics are run through the cell cycle
    classifier, the quality step and a metrics merge.  From the merged
    metrics the workflow produces the QC report, the pass/fail segment plot
    tarballs and a filtered heatmap.  The merged metrics are then finalised
    into ``merged_metrics``: directly when ``no_corrupt_tree`` is true, or
    after running the corrupt tree sub-workflow and adding the tree order
    when it is false (which also produces the corrupt tree heatmap).

    Args:
        hmmcopy_metrics: hmmcopy metrics csv path (with ``.yaml`` companion).
        hmmcopy_reads: hmmcopy reads csv path.
        alignment_metrics: Alignment metrics csv path.
        gc_metrics: GC metrics csv path (with ``.yaml`` companion).
        segs_tar: Input tarball for the pass/fail segment plot filter.
        merged_metrics: Output path of the final annotated metrics.
        qc_report: Output path of the QC report.
        corrupt_tree, consensus_tree, phylo_csv, rank_trees, filtered_data,
        corrupt_tree_pdf: Output paths handed to the corrupt tree
            sub-workflow (unused when ``no_corrupt_tree`` is true).
        pass_segs: Output path for the passing segment plots.
        fail_segs: Output path for the failing segment plots.
        corrupt_tree_heatmap_output: Output path of the tree-ordered heatmap.
        plot_heatmap_ec_filt_output: Output path of the filtered heatmap.
        config: Mapping read for ``docker``, ``memory``,
            ``classifier_training_data``, ``reference_gc``, ``good_cells``,
            ``chromosomes``, ``num_states`` and ``map_cutoff``.
        library_id: Forwarded to the corrupt tree sub-workflow.
        no_corrupt_tree: Skip the corrupt tree steps when true.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.transform(
        name='cell_cycle_classifier',
        func="single_cell.workflows.qc_annotation.tasks.cell_cycle_classifier",
        args=(
            mgd.InputFile(hmmcopy_reads),
            mgd.InputFile(hmmcopy_metrics, extensions=['.yaml']),
            mgd.InputFile(alignment_metrics),
            mgd.TempOutputFile('cell_state_classifier.csv.gz', extensions=['.yaml']),
            mgd.TempSpace('tempdata_cell_cycle'),
        ),
        kwargs={'docker_image': config['docker']['cell_cycle_classifier']}
    )

    workflow.transform(
        name="add_quality",
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.workflows.qc_annotation.tasks.add_quality",
        args=(
            mgd.TempInputFile('cell_state_classifier.csv.gz', extensions=['.yaml']),
            mgd.InputFile(alignment_metrics, extensions=['.yaml']),
            mgd.TempOutputFile("hmmcopy_quality_metrics.csv.gz", extensions=['.yaml']),
            config['classifier_training_data'],
            mgd.TempSpace("hmmcopy_classify_tempdir"),
        ),
    )

    workflow.transform(
        name='merge_alignment_hmmcopy_metrics',
        ctx={'mem': config['memory']['med'], 'ncpus': 1},
        func="single_cell.workflows.qc_annotation.tasks.merge_metrics",
        args=(
            mgd.TempInputFile("hmmcopy_quality_metrics.csv.gz", extensions=['.yaml']),
            # Keyword was misspelled 'extenstions'; it must be 'extensions'
            # for the .yaml companion to be tracked, as on every other use.
            mgd.InputFile(alignment_metrics, extensions=['.yaml']),
            mgd.TempOutputFile('merged_metrics.csv.gz', extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='generate_qc_report',
        func="single_cell.workflows.qc_annotation.tasks.generate_qc_report",
        args=(
            mgd.TempSpace("QC_report_singlecellpipeline"),
            config['reference_gc'],
            mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
            mgd.InputFile(gc_metrics, extensions=['.yaml']),
            mgd.OutputFile(qc_report),
        ),
    )

    workflow.transform(
        name='filter_segs_plots',
        func="single_cell.workflows.qc_annotation.tasks.filter_plot_tar",
        args=(
            mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
            mgd.InputFile(segs_tar),
            mgd.OutputFile(pass_segs),
            mgd.OutputFile(fail_segs),
            mgd.TempSpace("filter_seg_plots"),
            config['good_cells'],
        ),
    )

    workflow.transform(
        name='plot_heatmap_ec_filtered',
        func="single_cell.workflows.qc_annotation.tasks.plot_pcolor",
        args=(
            mgd.InputFile(hmmcopy_reads, extensions=['.yaml']),
            mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_filt_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': config['chromosomes'],
            'max_cn': config['num_states'],
            'scale_by_cells': False,
            'cell_filters': config["good_cells"],
            'mappability_threshold': config["map_cutoff"]
        }
    )

    if no_corrupt_tree:
        # Without the tree, the finalised csv is the final metrics output.
        workflow.transform(
            name='finalize_metrics',
            ctx={'mem': config['memory']['med'], 'ncpus': 1, 'num_retry': 1},
            func="single_cell.utils.csvutils.finalize_csv",
            args=(
                mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
                mgd.OutputFile(merged_metrics, extensions=['.yaml']),
            ),
        )
    else:
        # With the tree, the finalised csv is an intermediate that the tree
        # ordering step turns into the final metrics output.
        workflow.transform(
            name='finalize_metrics',
            ctx={'mem': config['memory']['med'], 'ncpus': 1, 'num_retry': 1},
            func="single_cell.utils.csvutils.finalize_csv",
            args=(
                mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
                mgd.TempOutputFile('merged_metrics_with_header.csv.gz', extensions=['.yaml']),
            ),
        )

        workflow.subworkflow(
            name='corrupt_tree',
            func='single_cell.workflows.corrupt_tree.create_corrupt_tree_workflow',
            args=(
                mgd.TempInputFile('merged_metrics_with_header.csv.gz', extensions=['.yaml']),
                mgd.InputFile(hmmcopy_reads),
                mgd.OutputFile(corrupt_tree),
                mgd.OutputFile(consensus_tree),
                mgd.OutputFile(phylo_csv),
                mgd.OutputFile(rank_trees),
                mgd.OutputFile(filtered_data),
                mgd.OutputFile(corrupt_tree_pdf),
                library_id,
                config,
            ),
        )

        workflow.transform(
            name="add_corrupt_tree_order",
            ctx={'mem': config['memory']['med'], 'ncpus': 1},
            func="single_cell.workflows.qc_annotation.tasks.add_corrupt_tree_order",
            args=(
                mgd.InputFile(corrupt_tree),
                mgd.TempInputFile('merged_metrics_with_header.csv.gz', extensions=['.yaml']),
                mgd.OutputFile(merged_metrics, extensions=['.yaml']),
            ),
        )

        workflow.transform(
            name='plot_heatmap_corrupt_tree',
            func="single_cell.workflows.qc_annotation.tasks.plot_pcolor",
            args=(
                mgd.InputFile(hmmcopy_reads, extensions=['.yaml']),
                mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
                mgd.OutputFile(corrupt_tree_heatmap_output),
            ),
            kwargs={
                'plot_title': 'QC pipeline metrics',
                'column_name': 'state',
                'plot_by_col': 'experimental_condition',
                'color_by_col': 'cell_call',
                'chromosomes': config['chromosomes'],
                'max_cn': config['num_states'],
                'scale_by_cells': False,
                'corrupt_tree': mgd.InputFile(corrupt_tree),
            }
        )

    return workflow
def create_remixt_workflow(
        tumour_path,
        normal_path,
        breakpoints,
        sample_id,
        remixt_results_filename,
        remixt_brk_cn_csv,
        remixt_cn_csv,
        remixt_minor_modes_csv,
        remixt_mix_csv,
        remixt_read_depth_csv,
        remixt_stats_csv,
        remixt_refdata,
        reference,
        single_node=False,
):
    """Build the remixt copy number workflow.

    A filtered breakpoint file is prepared first (empty when ``breakpoints``
    is ``None``).  Remixt then runs either as a single local task
    (``single_node``) or as the ``remixt.workflow`` bam sub-workflow, and its
    results file is parsed into six csv outputs.

    Args:
        tumour_path: Tumour bam path (tracked with its ``.bai``).
        normal_path: Normal bam path (tracked with its ``.bai``).
        breakpoints: Breakpoint file path, or ``None`` for no breakpoints.
        sample_id: Tumour sample id; the normal id is ``sample_id + 'N'``.
        remixt_results_filename: Output path of the remixt results file.
        remixt_brk_cn_csv, remixt_cn_csv, remixt_minor_modes_csv,
        remixt_mix_csv, remixt_read_depth_csv, remixt_stats_csv: Output csv
            paths, matched in this order with the keys ``/brk_cn``, ``/cn``,
            ``/minor_modes``, ``/mix``, ``/read_depth`` and ``/stats``.
        remixt_refdata: Forwarded to the remixt task / sub-workflow.
        reference: Reference fasta path; ``reference + '.fai'`` is used as
            the index path.
        single_node: Run remixt as one local task instead of a sub-workflow.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    workflow_ctx = {'docker_image': config.containers('wgs')}
    remixt_params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=workflow_ctx)

    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is not None:
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(
                mgd.InputFile(breakpoints),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
                remixt_params['min_num_reads'],
            ),
        )
    else:
        # No breakpoints supplied: write out an empty breakpoint file so the
        # downstream steps have the same input either way.
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )

        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )

    if not single_node:
        normal_id = sample_id + 'N'

        bam_inputs = {
            sample_id: mgd.InputFile(tumour_path, extensions=['.bai']),
            normal_id: mgd.InputFile(normal_path, extensions=['.bai']),
        }
        results_outputs = {
            sample_id: mgd.OutputFile(remixt_results_filename),
        }

        workflow.subworkflow(
            name='remixt',
            func="remixt.workflow.create_remixt_bam_workflow",
            ctx={
                'docker_image': config.containers('remixt'),
                'walltime': '48:00',
            },
            args=(
                mgd.TempInputFile('filtered_breakpoints.csv'),
                bam_inputs,
                results_outputs,
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
            kwargs={
                'normal_id': normal_id,
            },
        )
    else:
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(
                memory=15, walltime='120:00', ncpus=8),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )

    # Output csv paths, in the same order as the keys listed below.
    csv_paths = (
        remixt_brk_cn_csv,
        remixt_cn_csv,
        remixt_minor_modes_csv,
        remixt_mix_csv,
        remixt_read_depth_csv,
        remixt_stats_csv,
    )
    csv_outputs = [
        mgd.OutputFile(csv_path, extensions=['.yaml'])
        for csv_path in csv_paths
    ]
    table_keys = [
        '/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth', '/stats']

    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(
            mgd.InputFile(remixt_results_filename),
            csv_outputs,
            table_keys,
            mgd.TempSpace('tempdir_parse'),
        ),
    )

    return workflow
def create_hmmcopy_workflow(
        bam_file, reads, segs, metrics, params, igv_seg_filename,
        segs_pdf, bias_pdf, plot_heatmap_ec_output, plot_metrics_output,
        plot_kernel_density_output, hmmcopy_data_tar,
        cell_ids, hmmparams, sample_info
):
    """Build the per-cell hmmcopy workflow and its merge/plot steps.

    hmmcopy runs once per ``cell_id``.  The per-cell reads, segs, metrics and
    params tables are concatenated, the metrics are annotated, and the plots,
    the IGV seg file and a tarball of the per-cell data are produced.

    Args:
        bam_file: Passed as ``fnames`` for the per-cell bam inputs (tracked
            with ``.bai``).
        reads, segs, metrics, params: Output csv paths of the merged tables
            (each tracked with a ``.yaml`` companion).
        igv_seg_filename: Output path of the IGV seg file.
        segs_pdf, bias_pdf: Output paths of the merged per-cell plots.
        plot_heatmap_ec_output: Output path of the heatmap.
        plot_metrics_output: Output path of the metrics plot.
        plot_kernel_density_output: Output path of the kernel density plot.
        hmmcopy_data_tar: Output path of the tarball of per-cell data.
        cell_ids: Values of the ``cell_id`` axis.
        hmmparams: Mapping read for ``chromosomes``, ``docker``, ``memory``,
            ``ref_genome``, ``num_states`` and ``map_cutoff``; also passed
            whole to the hmmcopy and IGV tasks.
        sample_info: Stored as the per-cell ``sampleinfo`` object and also
            passed to the metrics annotation task.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    chromosomes = hmmparams["chromosomes"]

    pipeline_image = hmmparams['docker']['single_cell_pipeline']
    hmmcopy_image = hmmparams['docker']['hmmcopy']

    def med_ctx():
        # Every step uses the same request; return a fresh dict each time.
        return {
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': pipeline_image,
        }

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info,
    )

    # --- per-cell hmmcopy run -------------------------------------------
    workflow.transform(
        name='run_hmmcopy',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.InputFile(
                'bam_markdups', 'cell_id',
                fnames=bam_file, extensions=['.bai']),
            mgd.TempOutputFile(
                'reads.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile(
                'segs.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile(
                'params.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile(
                'hmm_metrics.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            hmmparams,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            hmmcopy_image,
        ),
    )

    # --- concatenate the per-cell tables --------------------------------
    workflow.transform(
        name='merge_reads',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile(
                'reads.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile('reads_merged.csv.gz', extensions=['.yaml']),
        ),
        kwargs={'low_memory': True},
    )

    workflow.transform(
        name='add_mappability_bool',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.get_mappability_col",
        args=(
            mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(reads, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_segs',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile(
                'segs.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(segs, extensions=['.yaml']),
        ),
        kwargs={'low_memory': True},
    )

    workflow.transform(
        name='merge_metrics',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile(
                'hmm_metrics.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_params',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile(
                'params.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
        ),
    )

    # The task's return value is stored as the 'max_cn' temp object.
    workflow.transform(
        name='get_max_cn',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.get_max_cn",
        ret=mgd.TempOutputObj('max_cn'),
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
        ),
    )

    # --- per-cell plots ---------------------------------------------------
    workflow.transform(
        name='hmmcopy_plots',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile(
                'reads.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile(
                'segs.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile(
                'params.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile(
                'hmm_metrics.csv.gz', 'cell_id',
                axes_origin=[], extensions=['.yaml']),
            hmmparams['ref_genome'],
            mgd.TempOutputFile('segments.png', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('bias.png', 'cell_id', axes_origin=[]),
            mgd.InputInstance('cell_id'),
        ),
        kwargs={
            'num_states': hmmparams['num_states'],
            'sample_info': mgd.TempInputObj('sampleinfo', 'cell_id'),
            'max_cn': mgd.TempInputObj("max_cn"),
        },
    )

    # --- annotated metrics and everything derived from them --------------
    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.add_clustering_order",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(metrics, extensions=['.yaml']),
        ),
        kwargs={
            'chromosomes': hmmparams["chromosomes"],
            'sample_info': sample_info,
        },
    )

    per_cell_pngs = [
        mgd.TempInputFile('segments.png', 'cell_id'),
        mgd.TempInputFile('bias.png', 'cell_id'),
    ]
    merged_pdfs = [
        mgd.OutputFile(segs_pdf),
        mgd.OutputFile(bias_pdf),
    ]

    workflow.transform(
        name='merge_hmm_copy_plots',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
        args=(
            per_cell_pngs,
            merged_pdfs,
            mgd.InputFile(metrics, extensions=['.yaml']),
            None,
            mgd.TempSpace("hmmcopy_plot_merge_temp"),
            ['segments', 'bias'],
        ),
    )

    workflow.transform(
        name='create_igv_seg',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.InputFile(segs, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(igv_seg_filename),
            hmmparams,
        ),
    )

    workflow.transform(
        name='plot_metrics',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics_output),
            'QC pipeline metrics',
        ),
    )

    workflow.transform(
        name='plot_kernel_density',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_kernel_density_output),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
        ),
    )

    workflow.transform(
        name='plot_heatmap_ec',
        ctx=med_ctx(),
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'mappability_threshold': hmmparams["map_cutoff"],
        },
    )

    # --- bundle the per-cell data tarballs --------------------------------
    workflow.transform(
        name='merge_hmmcopy_data_tars',
        ctx=med_ctx(),
        func="single_cell.utils.helpers.tar_files",
        args=(
            mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]),
            mgd.OutputFile(hmmcopy_data_tar),
            mgd.TempSpace("merge_tarballs"),
        ),
    )

    return workflow
def partition_tumour(config, input_args, patient_id, results_dir, input_bams, input_bais, output_file):
    """Build the per-patient workflow that processes every tumour sample.

    The workflow registers, in order:

    1. the ``tumour_id`` and ``normal_id`` chunk axes, taken from
       ``input_args['tumour_samples']`` and ``input_args['normal_samples']``;
    2. ``merge_normal``: merges the BAMs in ``input_args['normal_bams']`` into
       ``merged_normal.bam`` (plus ``.bai``) under
       ``input_args['patient_bam_dir']``;
    3. ``analyze_tumour``: an ``analyze_tumour_normal`` subworkflow per
       ``tumour_id``, producing ``.snv.tsv`` / ``.indel.tsv`` results and
       temporary ``snv.vcf`` / ``indel.vcf`` files;
    4. four annotation transforms per ``tumour_id`` writing ``.snv.txt``,
       ``.indel.txt``, ``.indel.vcf`` and ``.snv.vcf`` results;
    5. ``log_patient_analysis``: consumes all per-tumour result files and
       writes ``output_file``.

    Args:
        config: Passed through unchanged to the tasks and the subworkflow.
        input_args: Mapping read for the keys ``tumour_samples``,
            ``normal_samples``, ``normal_bams`` and ``patient_bam_dir``.
        patient_id: String prefix of every per-tumour result file name.
        results_dir: Directory in which the result files are declared.
        input_bams: ``fnames`` for the per-tumour ``tumour.bam`` input.
        input_bais: Accepted for interface compatibility; not referenced in
            this function.
        output_file: Output of the final ``log_patient_analysis`` step.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """

    def _patient_bam(filename):
        # Location of a merged-normal artefact inside the patient BAM directory.
        return os.path.join(input_args['patient_bam_dir'], filename)

    def _result(suffix):
        # Per-tumour result path template. The '{tumour_id}' placeholder is
        # left in the string as-is; presumably pypeliner fills it in per
        # chunk -- confirm against the pypeliner docs.
        return os.path.join(results_dir, patient_id + suffix)

    per_tumour = ('tumour_id', )

    workflow = pypeliner.workflow.Workflow()

    # Declare the two chunk axes from the sample lists in input_args.
    for axis_name, samples_key in (
            ('tumour_id', 'tumour_samples'),
            ('normal_id', 'normal_samples'),
    ):
        workflow.setobj(
            obj=mgd.OutputChunks(axis_name, ),
            value=input_args[samples_key],
        )

    # Merge all normal BAMs into a single BAM (and index) for the patient.
    workflow.transform(
        name='merge_normal',
        func=tasks.merge_normal,
        args=(
            config,
            mgd.InputFile(
                'normal.bam',
                'normal_id',
                fnames=input_args['normal_bams'],
                axes_origin=[],
            ),
            mgd.OutputFile(_patient_bam('merged_normal.bam')),
            mgd.OutputFile(_patient_bam('merged_normal.bam.bai')),
        ),
    )

    # Run the tumour/normal analysis once per tumour sample.
    workflow.subworkflow(
        name='analyze_tumour',
        func=analyze_tumour_normal,
        axes=per_tumour,
        args=(
            config,
            input_args,
            results_dir,
            mgd.InputFile(_patient_bam('merged_normal.bam')),
            mgd.InputInstance('tumour_id'),
            mgd.InputFile('tumour.bam', 'tumour_id', fnames=input_bams),
            mgd.OutputFile(_result('_{tumour_id}.snv.tsv'), 'tumour_id'),
            mgd.OutputFile(_result('_{tumour_id}.indel.tsv'), 'tumour_id'),
            mgd.TempOutputFile('snv.vcf', 'tumour_id'),
            mgd.TempOutputFile('indel.vcf', 'tumour_id'),
        ),
    )

    # The four annotation steps differ only in name, task, temp space,
    # source VCF and result suffix; rows are kept in registration order.
    annotation_steps = (
        ('annotate_snvs', tasks.annotate_outputs,
         'snv_annotation_space', 'snv.vcf', '_{tumour_id}.snv.txt'),
        ('annotate_indels', tasks.annotate_outputs,
         'indel_annotation_space', 'indel.vcf', '_{tumour_id}.indel.txt'),
        ('vcf_annotate_indels', tasks.vcf_annotate_outputs,
         'indel_vcf_annotation_space', 'indel.vcf', '_{tumour_id}.indel.vcf'),
        ('vcf_annotate_snvs', tasks.vcf_annotate_outputs,
         'snv_vcf_annotation_space', 'snv.vcf', '_{tumour_id}.snv.vcf'),
    )
    for step_name, step_func, space_name, vcf_name, out_suffix in annotation_steps:
        workflow.transform(
            name=step_name,
            func=step_func,
            axes=per_tumour,
            args=(
                config,
                mgd.TempSpace(space_name, 'tumour_id'),
                mgd.TempInputFile(vcf_name, 'tumour_id'),
                mgd.OutputFile(_result(out_suffix), 'tumour_id'),
            ),
        )

    # Final step: gather every per-tumour result (merged over the tumour_id
    # axis via axes_origin=[]) and write the patient-level log.
    logged_suffixes = (
        '_{tumour_id}.snv.tsv',
        '_{tumour_id}.indel.tsv',
        '_{tumour_id}.snv.txt',
        '_{tumour_id}.indel.txt',
        '_{tumour_id}.snv.vcf',
        '_{tumour_id}.indel.vcf',
    )
    log_inputs = tuple(
        mgd.InputFile(_result(suffix), 'tumour_id', axes_origin=[])
        for suffix in logged_suffixes
    )
    workflow.transform(
        name='log_patient_analysis',
        func=tasks.log_patient_analysis,
        args=log_inputs + (mgd.OutputFile(output_file), ),
    )

    return workflow