def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(
        name='read',
        func=read_stuff,
        ret=mgd.TempOutputObj('input_data'),
        args=(mgd.InputFile(input_filename),))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'),))

    # Write the object to an output file
    workflow.transform(
        name='write',
        func=write_stuff,
        args=(
            mgd.TempInputObj('output_data'),
            mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(
        name='sub_workflow_2',
        func=create_workflow_2,
        args=(
            mgd.TempInputFile('output_file'),
            mgd.OutputFile(output_filename)))

    return workflow
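# Usage sketch (not part of the original module): a workflow built by
# create_workflow_1 is normally handed to pypeliner's scheduler for
# execution. The driver below follows the same pypeliner.app.Pypeline
# pattern used by ctDNA_workflow later in this listing; the argparse
# boilerplate and the 'input.txt'/'output.txt' paths are hypothetical.
import argparse

import pypeliner
import pypeliner.app

if __name__ == '__main__':
    argparser = argparse.ArgumentParser()
    pypeliner.app.add_arguments(argparser)
    args = vars(argparser.parse_args())

    pyp = pypeliner.app.Pypeline(config=args)
    pyp.run(create_workflow_1('input.txt', 'output.txt'))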
def _create_download_cosmic_workflow(ref_data_version,
                                     out_file,
                                     user,
                                     password,
                                     host='sftp-cancer.sanger.ac.uk',
                                     local_download=False):

    host_base_path = '/files/{}/cosmic/v83/VCF'.format(ref_data_version.lower())

    coding_host_path = '/'.join([host_base_path, 'CosmicCodingMuts.vcf.gz'])

    non_coding_host_path = '/'.join([host_base_path, 'CosmicNonCodingVariants.vcf.gz'])

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('coding_host_path'), value=coding_host_path)

    workflow.setobj(obj=mgd.TempOutputObj('non_coding_host_path'), value=non_coding_host_path)

    workflow.subworkflow(
        name='download_coding',
        func=_create_download_cosmic_file_subworkflow,
        args=(
            host,
            mgd.TempInputObj('coding_host_path'),
            user,
            password,
            mgd.TempOutputFile('coding.vcf.gz'),
        ),
        kwargs={'local_download': local_download})

    workflow.subworkflow(
        name='download_non_coding',
        func=_create_download_cosmic_file_subworkflow,
        args=(
            host,
            mgd.TempInputObj('non_coding_host_path'),
            user,
            password,
            mgd.TempOutputFile('non_coding.vcf.gz'),
        ),
        kwargs={'local_download': local_download})

    workflow.transform(
        name='merge_files',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [mgd.TempInputFile('coding.vcf.gz'), mgd.TempInputFile('non_coding.vcf.gz')],
            mgd.OutputFile(out_file),
        ),
        kwargs={
            'allow_overlap': True,
            'index_file': mgd.OutputFile(out_file + '.tbi'),
        })

    return workflow
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 config,
                                 tag=False):

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ctx={'docker_image': config['docker']['destruct'], 'disk': 200},
        ret=mgd.TempOutputObj("destruct_config"),
        args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.subworkflow(
            name='process_individual_cells',
            func=process_cells_destruct,
            args=(
                mgd.TempInputObj("destruct_config"),
                config,
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.OutputFile(normal_stats),
            ),
            kwargs={'tag': tag})
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )

        workflow.subworkflow(
            name='process_individual_cells',
            func=process_cells_destruct,
            args=(
                mgd.TempInputObj("destruct_config"),
                config,
                mgd.InputFile('bam', 'normal_cell_id', fnames=normal_bam_files),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.OutputFile(normal_stats),
            ),
            kwargs={'tag': tag})

    return workflow
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 tag=False):

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ret=mgd.TempOutputObj("destruct_config"),
        args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.transform(
            name='bamdisc_normal',
            func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
            ctx={'io': 1, 'mem': 8, 'disk': 200},
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_stats),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.TempSpace('bamdisc_normal_tempspace'),
            ))
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )

        workflow.subworkflow(
            name='process_normal_cells',
            func=process_cells_destruct,
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile('bam', 'normal_cell_id', fnames=normal_bam_files),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.OutputFile(normal_stats),
            ),
            kwargs={'tag': tag})

    return workflow
def _create_download_decompress_concat_workflow(urls, out_file, local_download=False):
    workflow = pypeliner.workflow.Workflow()

    local_files = []

    for i, url in enumerate(urls):
        local_files.append(mgd.TempFile('file_{}'.format(i)))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(i)), value=url)

        workflow.subworkflow(
            name='download_file_{}'.format(i),
            func=_create_download_decompress_workflow,
            args=(
                mgd.TempInputObj('url_{}'.format(i)),
                local_files[i].as_output(),
            ),
            kwargs={'local_download': local_download})

    concat_args = ['cat', ] + [x.as_input() for x in local_files] + ['>', mgd.OutputFile(out_file)]

    workflow.commandline(name='concat', args=concat_args)

    return workflow
def _create_download_decompress_workflow(url, local_path, local_download=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(mgd.TempOutputObj('url'), value=url)

    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('url'),
            mgd.TempOutputFile('download'),
        ),
    )

    workflow.transform(
        name='decompress',
        func=tasks.decompress,
        args=(
            mgd.TempInputFile('download'),
            mgd.OutputFile(local_path),
        ))

    return workflow
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools', 'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        )
    )

    workflow.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        )
    )

    workflow.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
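# Note: low_mem_ctx and med_mem_ctx are referenced here (and med_mem_ctx /
# low_mem_ctx again in the strelka somatic workflow below) but are not
# defined in this excerpt. Plausible module-level definitions, modeled on
# the explicit ctx dicts used elsewhere in this listing, might look like
# the following; the exact memory values are assumptions, not the
# original constants.
low_mem_ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}
med_mem_ctx = {'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2}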
def create_eagle_ref_data_workflow(vcf_url_template, out_file, local_download=False):
    chrom_map_file = soil.utils.package_data.load_data_file('ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    chrom_map = chrom_map[chrom_map['ncbi'].isin([str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(
        name='download_vcf_files',
        axes=('chrom',),
        ctx={'local': local_download},
        func=soil.ref_data.tasks.download,
        args=(
            mgd.TempInputObj('vcf_url', 'chrom'),
            mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    workflow.transform(
        name='write_chrom_map',
        func=tasks.write_chrom_map_file,
        args=(
            mgd.InputFile(chrom_map_file),
            mgd.TempOutputFile('chrom_map.tsv')))

    workflow.transform(
        name='rename_chroms',
        axes=('chrom',),
        func=soil.wrappers.bcftools.tasks.rename_chroms,
        args=(
            mgd.TempInputFile('chrom_map.tsv'),
            mgd.TempInputFile('raw.vcf.gz', 'chrom'),
            mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(
        name='concat_vcfs',
        func=soil.wrappers.bcftools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('renamed.bcf', 'chrom'),
            mgd.OutputFile(out_file)),
        kwargs={'bcf_output': True})

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            mgd.InputFile(out_file),
            '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
def create_vcf_mappability_annotation_workflow(
        mappability_file,
        vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7),
):
    ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='get_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        ctx=ctx,
        func='biowrappers.components.variant_calling.utils.get_vcf_regions',
        args=(
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            split_size,
        ),
        kwargs={
            'chromosomes': chromosomes,
        },
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        ctx=ctx,
        func='biowrappers.components.variant_calling.mappability.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions')
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        ctx=ctx,
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes=None,
                            split_size=int(1e7)):

    regions = utils.get_bam_regions(normal_bam_file, split_size, chromosomes=chromosomes)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'), value=regions)

    workflow.transform(
        name='run_somatic',
        axes=('regions',),
        ctx={'mem': 6, 'mem_retry_increment': 2, 'num_retry': 3},
        func=tasks.run_somatic,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempSpace('varscan_tmp', 'regions'),
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={'mem': 2, 'mem_retry_increment': 2, 'num_retry': 3},
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
def create_snv_allele_counts_workflow(
        bam_file,
        out_file,
        table_name,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        report_non_variant_positions=True,
        report_zero_count_positions=False,
        split_size=int(1e7)):

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('regions_obj', 'regions'),
        value=biowrappers.components.variant_calling.utils.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.transform(
        name='get_counts',
        axes=('regions',),
        ctx=med_ctx,
        func='biowrappers.components.snv_allele_counts.tasks.get_snv_allele_counts_for_region',
        args=(
            mgd.InputFile(bam_file),
            mgd.TempOutputFile('counts.h5', 'regions'),
            mgd.TempInputObj('regions_obj', 'regions'),
            table_name
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'report_non_variant_positions': report_non_variant_positions,
            'report_zero_count_positions': report_zero_count_positions
        }
    )

    workflow.transform(
        name='concatenate_counts',
        ctx=med_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            mgd.TempInputFile('counts.h5', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow
def create_fit_model_workflow(
        experiment_filename,
        results_filename,
        config,
        ref_data_dir,
        tumour_id=None,
):
    config = remixt.config.get_sample_config(config, tumour_id)

    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 16})

    workflow.transform(
        name='init',
        func=remixt.analysis.pipeline.init,
        ret=mgd.TempOutputObj('init_params', 'init_id'),
        args=(
            mgd.TempOutputFile('init_results'),
            mgd.InputFile(experiment_filename),
            config,
        ),
    )

    workflow.transform(
        name='fit',
        axes=('init_id',),
        func=remixt.analysis.pipeline.fit_task,
        args=(
            mgd.TempOutputFile('fit_results', 'init_id'),
            mgd.InputFile(experiment_filename),
            mgd.TempInputObj('init_params', 'init_id'),
            config,
        ),
    )

    workflow.transform(
        name='collate',
        func=remixt.analysis.pipeline.collate,
        args=(
            mgd.OutputFile(results_filename),
            mgd.InputFile(experiment_filename),
            mgd.TempInputFile('init_results'),
            mgd.TempInputFile('fit_results', 'init_id'),
            config,
        ),
    )

    return workflow
def _create_download_vep_plugins_workflow(urls, out_dir, local_download=False):
    workflow = pypeliner.workflow.Workflow()

    for i, url in enumerate(urls):
        out_file = os.path.join(out_dir, os.path.basename(url))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(i)), value=url)

        workflow.transform(
            name='download_file_{}'.format(i),
            ctx={'local': local_download},
            func=tasks.download,
            args=(
                mgd.TempInputObj('url_{}'.format(i)),
                mgd.OutputFile(out_file),
            ))

    return workflow
def create_mappability_annotation_workflow(
        in_vcf_file,
        out_csv_file,
        mappability_file,
        split_size=1e4
):
    workflow = pypeliner.workflow.Workflow(
        ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.workflows.mappability_annotation.tasks.get_vcf_regions",
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        args=(
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            int(split_size),
        ),
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        func='single_cell.workflows.mappability_annotation.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions', extensions=['.yaml'])
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml'])
        )
    )

    return workflow
def ctDNA_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config'])
    for arg, value in args.iteritems():
        config[arg] = value

    helpers.makedirs(config["bam_directory"])
    helpers.makedirs(config["results_dir"])

    inputs = helpers.load_yaml(args['input_yaml'])
    patients = inputs.keys()

    workflow.setobj(obj=mgd.OutputChunks('patient_id',), value=patients)

    workflow.transform(
        name='get_input_by_patient',
        func=helpers.get_input_by_patient,
        ret=mgd.TempOutputObj('patient_input', 'patient_id'),
        axes=('patient_id',),
        args=(
            inputs,
            mgd.InputInstance('patient_id'),
        ))

    workflow.subworkflow(
        name='patient_workflow',
        func=patient_workflow,
        axes=('patient_id',),
        args=(
            config,
            mgd.InputInstance('patient_id'),
            mgd.TempInputObj('patient_input', 'patient_id'),
            mgd.OutputFile(
                os.path.join(config['results_dir'], '{patient_id}.log'),
                'patient_id'),
        ))

    pyp.run(workflow)
def download_external_files(config):
    download_keys = [x for x in config if 'url' in config[x]]

    urls = dict(zip(
        download_keys,
        [config[x]['url'] for x in download_keys],
    ))

    downloaded_files = dict(zip(
        urls.keys(),
        [config[x]['local_path'] for x in urls.keys()],
    ))

    workflow = Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('url', 'files'),
        value=urls,
    )

    workflow.subworkflow(
        name='download',
        func=create_download_workflow,
        axes=('files',),
        args=(
            mgd.TempInputObj('url', 'files'),
            mgd.TempOutputFile('download.file', 'files'),
        ),
    )

    workflow.transform(
        name='unzip',
        axes=('files',),
        func=tasks.unzip,
        args=(
            mgd.TempInputFile('download.file', 'files'),
            mgd.OutputFile('unzipped', 'files', fnames=downloaded_files),
        ),
    )

    return workflow
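# Illustrative call (not from the original module): download_external_files
# expects a mapping from entry name to a dict holding at least 'url' and
# 'local_path'; entries without a 'url' key are skipped. The entry names,
# URLs, and paths below are hypothetical.
example_config = {
    'ref_genome': {
        'url': 'https://example.org/data/ref_genome.fa.gz',
        'local_path': '/refdata/ref_genome.fa',
    },
    'snp_positions': {
        'url': 'https://example.org/data/snp_positions.tsv.gz',
        'local_path': '/refdata/snp_positions.tsv',
    },
}

workflow = download_external_files(example_config)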
def create_destruct_fastq_workflow( fastq1_filenames, fastq2_filenames, sample1_filenames, sample2_filenames, stats_filenames, breakpoint_table, breakpoint_library_table, breakpoint_read_table, config, ref_data_dir, raw_data_dir=None, ): workflow = pypeliner.workflow.Workflow() # Set the library ids workflow.setobj( obj=mgd.TempOutputObj('library_id', 'bylibrary'), value=destruct.tasks.create_library_ids(fastq1_filenames.keys()), ) workflow.transform( name='readstats', axes=('bylibrary', ), ctx=lowmem, func='destruct.tasks.read_stats', ret=mgd.TempOutputObj('stats', 'bylibrary'), args=( mgd.InputFile('stats.txt', 'bylibrary', fnames=stats_filenames), config['fragment_length_num_stddevs'], ), ) # Align a sample of reads and calculate alignment statistics workflow.transform( name='prepseed_sample', axes=('bylibrary', ), ctx=medmem, func='destruct.tasks.prepare_seed_fastq', args=( mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames), mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames), 36, mgd.TempOutputFile('sample.seed', 'bylibrary'), ), ) workflow.commandline( name='bwtrealign_sample', axes=('bylibrary', ), ctx=medmem, args=( 'bowtie', config['genome_fasta'], mgd.TempInputFile('sample.seed', 'bylibrary'), '--chunkmbs', '512', '-k', '1000', '-m', '1000', '--strata', '--best', '-S', '|', 'destruct_aligntrue', '-a', '-', '-1', mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames), '-2', mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames), '-r', config['genome_fasta'], '-g', config['gap_score'], '-x', config['mismatch_score'], '-m', config['match_score'], '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'), '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'), '-s', mgd.TempOutputFile('samples.align.true', 'bylibrary'), ), ) workflow.transform( name='scorestats', axes=('bylibrary', ), ctx=medmem, func='destruct.score_stats.create_score_stats', args=( mgd.TempInputFile('samples.align.true', 'bylibrary'), config['match_score'], mgd.TempOutputFile('score.stats', 'bylibrary'), ), ) # Split discordant fastqs and align workflow.transform( name='splitfastq1', axes=('bylibrary', ), ctx=lowmem, func='destruct.tasks.split_fastq', args=( mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames), int(config['reads_per_split']), mgd.TempOutputFile('reads1', 'bylibrary', 'byread'), ), ) workflow.transform( name='splitfastq2', axes=('bylibrary', ), ctx=lowmem, func='destruct.tasks.split_fastq', args=( mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames), int(config['reads_per_split']), mgd.TempOutputFile('reads2', 'bylibrary', 'byread', axes_origin=[]), ), ) workflow.transform( name='prepseed', axes=('bylibrary', 'byread'), ctx=medmem, func='destruct.tasks.prepare_seed_fastq', args=( mgd.TempInputFile('reads1', 'bylibrary', 'byread'), mgd.TempInputFile('reads2', 'bylibrary', 'byread'), 36, mgd.TempOutputFile('reads.seed', 'bylibrary', 'byread'), ), ) workflow.commandline( name='bwtrealign', axes=('bylibrary', 'byread'), ctx=medmem, args=( 'bowtie', config['genome_fasta'], mgd.TempInputFile('reads.seed', 'bylibrary', 'byread'), '--chunkmbs', '512', '-k', '1000', '-m', '1000', '--strata', '--best', '-S', '|', 'destruct_realign2', '-l', mgd.TempInputObj('library_id', 'bylibrary'), '-a', '-', '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'), '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'), '-r', config['genome_fasta'], '-g', config['gap_score'], '-x', 
config['mismatch_score'], '-m', config['match_score'], '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'), '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'), '--tchimer', config['chimeric_threshold'], '--talign', config['alignment_threshold'], '--pchimer', config['chimeric_prior'], '--tvalid', config['readvalid_threshold'], '-z', mgd.TempInputFile('score.stats', 'bylibrary'), '--span', mgd.TempOutputFile('spanning.alignments', 'bylibrary', 'byread'), '--split', mgd.TempOutputFile('split.alignments', 'bylibrary', 'byread'), ), ) workflow.transform( name='merge_spanning_1', axes=('bylibrary', ), ctx=lowmem, func='destruct.tasks.merge_files_by_line', args=( mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'), mgd.TempOutputFile('spanning.alignments_1', 'bylibrary'), ), ) workflow.commandline( name='filterreads', axes=('bylibrary', ), ctx=lowmem, args=( 'destruct_filterreads', '-n', '2', '-a', mgd.TempInputFile('spanning.alignments_1', 'bylibrary'), '-r', config['satellite_regions'], '>', mgd.TempOutputFile('spanning.alignments', 'bylibrary'), ), ) workflow.transform( name='merge_split_1', axes=('bylibrary', ), ctx=lowmem, func='destruct.tasks.merge_files_by_line', args=( mgd.TempInputFile('split.alignments', 'bylibrary', 'byread'), mgd.TempOutputFile('split.alignments', 'bylibrary'), ), ) workflow.transform( name='merge_spanning_2', ctx=lowmem, func='destruct.tasks.merge_alignment_files', args=( mgd.TempInputFile('spanning.alignments', 'bylibrary'), mgd.TempOutputFile('spanning.alignments'), mgd.TempInputObj('library_id', 'bylibrary'), ), ) workflow.transform( name='merge_split_2', ctx=lowmem, func='destruct.tasks.merge_alignment_files', args=( mgd.TempInputFile('split.alignments', 'bylibrary'), mgd.TempOutputFile('split.alignments'), mgd.TempInputObj('library_id', 'bylibrary'), ), ) # Cluster spanning reads workflow.setobj( obj=mgd.TempOutputObj('chrom.args', 'bychromarg'), value=destruct.tasks.generate_chromosome_args(config['chromosomes']), ) workflow.transform( name='write_stats_table', ctx=lowmem, func='destruct.tasks.write_stats_table', args=( mgd.TempInputObj('library_id', 'bylibrary'), mgd.TempInputObj('stats', 'bylibrary'), mgd.TempOutputFile('libstats.tsv'), ), ) workflow.commandline( name='cluster', axes=('bychromarg', ), ctx=medmem, args=( 'destruct_mclustermatepairs', '-a', mgd.TempInputFile('spanning.alignments'), '-s', mgd.TempInputFile('libstats.tsv'), '-c', mgd.TempOutputFile('clusters', 'bychromarg'), mgd.TempInputObj('chrom.args', 'bychromarg'), '--clustmin', config['cluster_readcount_threshold'], '--fragmax', config['fragment_length_max'], ), ) # Predict breakpoints from split reads workflow.transform( name='predict_breaks', axes=('bychromarg', ), ctx=medmem, func='destruct.predict_breaks.predict_breaks', args=( mgd.TempInputFile('clusters', 'bychromarg'), mgd.TempInputFile('spanning.alignments'), mgd.TempInputFile('split.alignments'), mgd.TempOutputFile('breakpoints_2', 'bychromarg'), ), ) workflow.transform( name='merge_clusters', ctx=lowmem, func='destruct.tasks.merge_clusters', args=( mgd.TempInputFile('clusters', 'bychromarg'), mgd.TempInputFile('breakpoints_2', 'bychromarg'), mgd.TempOutputFile('clusters'), mgd.TempOutputFile('breakpoints_2'), mgd.TempOutputFile('merge_clusters.debug'), ), ) # Realign reads to breakpoints workflow.commandline( name='realigntobreaks', axes=('bylibrary', 'byread'), ctx=medmem, args=( 'destruct_realigntobreaks2', '-r', config['genome_fasta'], '-b', 
mgd.TempInputFile('breakpoints_2'), '-c', mgd.TempInputFile('clusters'), '-g', config['gap_score'], '-x', config['mismatch_score'], '-m', config['match_score'], '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'), '--span', mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'), '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'), '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'), '--realignments', mgd.TempOutputFile('realignments', 'bylibrary', 'byread'), ), ) # Calculate likelihoods based on realignments workflow.transform( name='calculate_realignment_likelihoods', axes=('bylibrary', 'byread'), ctx=medmem, func='destruct.predict_breaks.calculate_realignment_likelihoods', args=( mgd.TempInputFile('breakpoints_2'), mgd.TempInputFile('realignments', 'bylibrary', 'byread'), mgd.TempInputFile('score.stats', 'bylibrary'), mgd.TempOutputFile('likelihoods_2', 'bylibrary', 'byread'), config['match_score'], mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_mean'), mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_stddev'), ), ) workflow.transform( name='merge_likelihoods_1', axes=('bylibrary', ), ctx=lowmem, func='destruct.tasks.merge_sorted_files_by_line', args=( mgd.TempInputFile('likelihoods_2', 'bylibrary', 'byread'), mgd.TempOutputFile('likelihoods_2', 'bylibrary'), mgd.TempSpace('merge_likelihoods_1_temp', 'bylibrary'), '1', ), ) workflow.transform( name='merge_likelihoods_2', ctx=lowmem, func='destruct.tasks.merge_sorted_files_by_line', args=( mgd.TempInputFile('likelihoods_2', 'bylibrary'), mgd.TempOutputFile('likelihoods_2'), mgd.TempSpace('merge_likelihoods_2_temp'), '1', ), ) # Set cover for multi mapping reads workflow.transform( name='calc_weights', ctx=medmem, func='destruct.predict_breaks.calculate_cluster_weights', args=( mgd.TempInputFile('breakpoints_2'), mgd.TempOutputFile('cluster_weights'), ), ) workflow.commandline( name='setcover', ctx=medmem, args=( 'destruct_setcover', '-c', mgd.TempInputFile('clusters'), '-w', mgd.TempInputFile('cluster_weights'), '-a', mgd.TempOutputFile('clusters_setcover'), ), ) # Select cluster based on setcover workflow.transform( name='select_clusters', ctx=medmem, func='destruct.predict_breaks.select_clusters', args=( mgd.TempInputFile('clusters_setcover'), mgd.TempInputFile('breakpoints_2'), mgd.TempOutputFile('breakpoints_1'), mgd.TempInputFile('likelihoods_2'), mgd.TempOutputFile('likelihoods_1'), ), ) # Select prediction based on max likelihood workflow.transform( name='select_predictions', ctx=himem, func='destruct.predict_breaks.select_predictions', args=( mgd.TempInputFile('breakpoints_1'), mgd.TempOutputFile('breakpoints'), mgd.TempInputFile('likelihoods_1'), mgd.TempOutputFile('likelihoods'), config['mate_score_threshold'], config['template_length_min_threshold'], config['min_alignment_log_likelihood'], ), ) # Optionally tabulate supporting reads workflow.transform( name='tabreads', ctx=medmem, func='destruct.tasks.tabulate_reads', args=( mgd.TempInputFile('clusters_setcover'), mgd.TempInputFile('likelihoods'), mgd.TempInputObj('library_id', 'bylibrary'), mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames), mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames), mgd.TempOutputFile('breakreads.table.unsorted'), ), ) workflow.commandline( name='sortreads', ctx=medmem, args=( 'sort', '-n', mgd.TempInputFile('breakreads.table.unsorted'), '>', mgd.OutputFile(breakpoint_read_table), ), ) # Tabulate results workflow.transform( name='tabulate', 
ctx=himem, func='destruct.tasks.tabulate_results', args=( mgd.TempInputFile('breakpoints'), mgd.TempInputFile('likelihoods'), mgd.TempInputObj('library_id', 'bylibrary'), config['genome_fasta'], config['gtf_filename'], config['dgv_filename'], mgd.OutputFile(breakpoint_table), mgd.OutputFile(breakpoint_library_table), ), ) return workflow
def create_remixt_workflow( tumour_path, normal_path, breakpoints, sample_id, remixt_results_filename, remixt_brk_cn_csv, remixt_cn_csv, remixt_minor_modes_csv, remixt_mix_csv, remixt_read_depth_csv, remixt_stats_csv, remixt_refdata, reference, single_node=False, ): ctx = {'docker_image': config.containers('wgs')} params = config.default_params('copynumber_calling')['remixt'] workflow = pypeliner.workflow.Workflow(ctx=ctx) remixt_config = { 'genome_fasta': reference, 'genome_fai': reference + '.fai', } if breakpoints is None: workflow.setobj( obj=mgd.TempOutputObj('emptybreakpoints'), value=[], ) workflow.transform( name='write_empty_breakpoints', func='wgs.workflows.remixt.tasks.write_empty_breakpoints', args=( mgd.TempInputObj('emptybreakpoints'), mgd.TempOutputFile('filtered_breakpoints.csv'), ), ) else: workflow.transform( name='filter_breakpoints', func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints', ctx=helpers.get_default_ctx(memory=4, walltime='4:00'), args=(mgd.InputFile(breakpoints), mgd.TempOutputFile('filtered_breakpoints.csv'), params['min_num_reads'])) if single_node: workflow.transform( name='remixt', func='wgs.workflows.remixt.tasks.run_remixt_local', ctx=helpers.get_default_ctx(memory=15, walltime='120:00', ncpus=8), args=( mgd.TempSpace("remixt_temp"), mgd.TempInputFile('filtered_breakpoints.csv'), mgd.InputFile(tumour_path, extensions=['.bai']), mgd.InputFile(normal_path, extensions=['.bai']), sample_id, mgd.OutputFile(remixt_results_filename), mgd.TempSpace('remixt_raw_dir'), remixt_config, remixt_refdata, ), ) else: workflow.subworkflow(name='remixt', func="remixt.workflow.create_remixt_bam_workflow", ctx={ 'docker_image': config.containers('remixt'), 'walltime': '48:00' }, args=( mgd.TempInputFile('filtered_breakpoints.csv'), { sample_id: mgd.InputFile(tumour_path, extensions=['.bai']), sample_id + 'N': mgd.InputFile(normal_path, extensions=['.bai']) }, { sample_id: mgd.OutputFile(remixt_results_filename) }, mgd.TempSpace('remixt_raw_dir'), remixt_config, remixt_refdata, ), kwargs={ 'normal_id': sample_id + 'N', }) workflow.transform( name='parse_remixt', func='wgs.workflows.remixt.tasks.parse_remixt_file', args=(mgd.InputFile(remixt_results_filename), [ mgd.OutputFile(remixt_brk_cn_csv, extensions=['.yaml']), mgd.OutputFile(remixt_cn_csv, extensions=['.yaml']), mgd.OutputFile(remixt_minor_modes_csv, extensions=['.yaml']), mgd.OutputFile(remixt_mix_csv, extensions=['.yaml']), mgd.OutputFile(remixt_read_depth_csv, extensions=['.yaml']), mgd.OutputFile(remixt_stats_csv, extensions=['.yaml']), ], ['/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth', '/stats'], mgd.TempSpace('tempdir_parse'))) return workflow
def analyze_tumour_normal(config, input_args, results_dir, normal_bam, tumour_sample, tumour_bam, snv_tsv, indel_tsv, snv_vcf, indel_vcf): workflow = pypeliner.workflow.Workflow() matched_results_dir = os.path.join(results_dir, tumour_sample) helpers.makedirs(matched_results_dir) workflow.subworkflow(name='run_deepSNV', func=deepSNV.run_deepSNV, args=(config, mgd.InputFile(normal_bam), mgd.InputFile(tumour_bam), mgd.OutputFile( os.path.join(matched_results_dir, 'deepSNV_out.tsv')))) workflow.subworkflow(name='run_VarScan', func=VarScan.run_VarScan, args=( config, mgd.InputFile(normal_bam), mgd.InputFile(tumour_bam), mgd.OutputFile( os.path.join(matched_results_dir, 'VarScan_out.vcf')), mgd.OutputFile( os.path.join(matched_results_dir, 'VarScan_indel_out.vcf')), )) workflow.subworkflow(name='run_MutationSeq', func=MutationSeq.run_MutationSeq, args=( config, mgd.InputFile(normal_bam), mgd.InputFile(tumour_bam), mgd.OutputFile( os.path.join(matched_results_dir, 'museq_out.vcf')), )) workflow.subworkflow(name='run_Strelka', func=Strelka.run_Strelka, args=(config, mgd.InputFile(normal_bam), mgd.InputFile(tumour_bam), mgd.OutputFile( os.path.join(matched_results_dir, 'strelka_out.vcf')), mgd.OutputFile( os.path.join(matched_results_dir, 'strelka_indel_out.vcf')))) workflow.subworkflow(name='run_LoLoPicker', func=LoLoPicker.run_LoLoPicker, args=( config, input_args, mgd.InputFile(normal_bam), mgd.InputFile(tumour_bam), mgd.OutputFile( os.path.join(matched_results_dir, 'LoLoPicker_out.tsv')), )) workflow.transform( name='create_result_dict', func=union.create_result_dict, ret=mgd.TempOutputObj('result_dict'), args=( mgd.InputFile(os.path.join(matched_results_dir, 'deepSNV_out.tsv')), mgd.InputFile(os.path.join(matched_results_dir, 'VarScan_out.vcf')), mgd.InputFile(os.path.join(matched_results_dir, 'museq_out.vcf')), mgd.InputFile(os.path.join(matched_results_dir, 'strelka_out.vcf')), mgd.InputFile( os.path.join(matched_results_dir, 'LoLoPicker_out.tsv')), )) workflow.transform(name='union_results', func=union.union_results, args=( config, mgd.InputFile(normal_bam), mgd.InputFile(tumour_bam), mgd.TempInputObj('result_dict'), mgd.TempSpace('union_space'), mgd.OutputFile(snv_tsv), mgd.OutputFile(snv_vcf), )) workflow.transform(name='union_indels', func=union.union_indels, args=( config, mgd.InputFile( os.path.join(matched_results_dir, 'strelka_indel_out.vcf')), mgd.InputFile( os.path.join(matched_results_dir, 'VarScan_indel_out.vcf')), mgd.OutputFile(indel_tsv), mgd.OutputFile(indel_vcf), )) return workflow
def create_somatic_workflow(normal_bam_file, tumour_bam_file, ref_genome_fasta_file, out_file, chromosomes='default', is_exome=False, split_size=int(1e7)): sandbox = soil.utils.workflow.get_sandbox( ['bcftools', 'samtools', 'strelka']) workflow = pypeliner.workflow.Workflow(default_ctx=med_mem_ctx, default_sandbox=sandbox) workflow.setobj(obj=mgd.TempOutputObj('config', 'regions'), value=soil.utils.genome.get_bam_regions( normal_bam_file, split_size, chromosomes=chromosomes)) workflow.setobj(obj=mgd.TempOutputObj('chrom_names', 'chrom_axis'), value=get_chromosomes(normal_bam_file, chromosomes=chromosomes)) workflow.transform( name='count_fasta_bases', func=soil.wrappers.strelka.tasks.count_fasta_bases, args=( mgd.InputFile(ref_genome_fasta_file), mgd.TempOutputFile('ref_base_counts.tsv'), ), ) workflow.transform( name='get_genome_size', ctx={'local': True}, func=get_known_genome_size, ret=mgd.TempOutputObj('genome_size'), args=( mgd.InputFile(tumour_bam_file), mgd.TempInputFile('ref_base_counts.tsv'), chromosomes, ), sandbox=None, ) workflow.transform( name='get_chromosome_depths', axes=('chrom_axis', ), func=soil.wrappers.strelka.tasks.get_chromosome_depth, args=( mgd.TempInputObj('chrom_names', 'chrom_axis'), mgd.InputFile(normal_bam_file), mgd.InputFile(ref_genome_fasta_file), mgd.TempOutputFile('chrom_depth.txt', 'chrom_axis'), ), ) workflow.transform( name='merge_chromosome_depths', func=soil.wrappers.strelka.tasks.merge_chromosome_depth, args=( mgd.TempInputFile('chrom_depth.txt', 'chrom_axis'), mgd.TempOutputFile('chrom_depth_merged.txt'), ), sandbox=None, ) workflow.transform(name='call_genome_segment', axes=('regions', ), func=soil.wrappers.strelka.tasks.call_genome_segment, args=( mgd.TempInputFile('chrom_depth_merged.txt'), mgd.InputFile(normal_bam_file), mgd.InputFile(tumour_bam_file), mgd.InputFile(ref_genome_fasta_file), mgd.TempOutputFile('indels.vcf', 'regions'), mgd.TempOutputFile('snvs.vcf', 'regions'), mgd.TempSpace('call_genome_segment_tmp', 'regions'), mgd.TempInputObj('config', 'regions'), mgd.TempInputObj('genome_size'), ), kwargs={ 'is_exome': is_exome, }) workflow.transform( name='merge_indels', func=soil.wrappers.samtools.tasks.concatenate_vcf, args=( mgd.TempInputFile('indels.vcf', 'regions'), mgd.TempOutputFile('indels.vcf.gz'), ), ) workflow.transform( name='merge_snvs', func=soil.wrappers.samtools.tasks.concatenate_vcf, args=( mgd.TempInputFile('snvs.vcf', 'regions'), mgd.TempOutputFile('snvs.vcf.gz'), ), ) workflow.transform( name='merge_all', func=soil.wrappers.samtools.tasks.concatenate_vcf, args=( [ mgd.TempInputFile('indels.vcf.gz'), mgd.TempInputFile('snvs.vcf.gz') ], mgd.TempOutputFile('merged.vcf.gz'), ), kwargs={ 'allow_overlap': True, }, ) workflow.commandline(name='filter_vcf', ctx=low_mem_ctx, args=( 'bcftools', 'view', '-O', 'z', '-f', '.,PASS', '-o', mgd.OutputFile(out_file), mgd.TempInputFile('merged.vcf.gz'), )) workflow.transform(name='index_vcf', ctx=low_mem_ctx, func=soil.wrappers.samtools.tasks.index_vcf, args=( mgd.InputFile(out_file), mgd.OutputFile(out_file + '.tbi'), )) return workflow
def process_cells_destruct(destruct_config, cell_bam_files, reads_1, reads_2, sample_1, sample_2, stats, tag=False): ctx = { 'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1, } cells = list(cell_bam_files.keys()) workflow = pypeliner.workflow.Workflow(ctx=ctx) workflow.setobj( obj=mgd.OutputChunks('cell_id'), value=cells, ) workflow.transform( name='bamdisc_and_numreads_cell', func= "single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads", axes=('cell_id', ), ctx={ 'io': 1, 'mem': 8 }, ret=mgd.TempOutputObj("numreads", "cell_id"), args=( destruct_config, mgd.InputFile('bam', 'cell_id', fnames=cell_bam_files), mgd.TempOutputFile('cell_stats', 'cell_id'), mgd.TempOutputFile('cell_reads_1.fastq.gz', 'cell_id'), mgd.TempOutputFile('cell_reads_2.fastq.gz', 'cell_id'), mgd.TempOutputFile('cell_sample_1.fastq.gz', 'cell_id'), mgd.TempOutputFile('cell_sample_2.fastq.gz', 'cell_id'), mgd.TempSpace('bamdisc_cell_tempspace', 'cell_id'), ), ) workflow.transform( name='merge_read_counts', ret=mgd.TempOutputObj("readcounts"), func= "single_cell.workflows.destruct_singlecell.tasks.merge_read_counts", ctx={ 'io': 1, 'mem': 8 }, args=(mgd.TempInputObj('numreads', 'cell_id'), )) workflow.transform( name='reindex_reads', func= "single_cell.workflows.destruct_singlecell.tasks.re_index_reads_both", ctx={ 'io': 1, 'mem': 8 }, axes=('cell_id', ), args=( mgd.TempInputFile('cell_reads_1.fastq.gz', 'cell_id'), mgd.TempOutputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'), mgd.TempInputFile('cell_reads_2.fastq.gz', 'cell_id'), mgd.TempOutputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'), mgd.InputInstance('cell_id'), cells, mgd.TempInputObj('readcounts'), ), kwargs={'tag': tag}) workflow.transform( name='merge_reads_r1', ctx={ 'io': 1, 'mem': 8, 'disk': 100 }, func= "single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs", args=( mgd.TempInputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'), mgd.OutputFile(reads_1), ), ) workflow.transform( name='merge_reads_r2', ctx={ 'io': 1, 'mem': 8, 'disk': 100 }, func= "single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs", args=( mgd.TempInputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'), mgd.OutputFile(reads_2), ), ) workflow.transform( name='merge_sample', ctx={ 'io': 1, 'mem': 8, 'disk': 100 }, func="single_cell.workflows.destruct_singlecell.tasks.resample_fastqs", args=( mgd.TempInputFile('cell_sample_1.fastq.gz', 'cell_id'), mgd.TempInputFile('cell_sample_2.fastq.gz', 'cell_id'), mgd.OutputFile(sample_1), mgd.OutputFile(sample_2), destruct_config['num_read_samples'], ), ) workflow.transform( name='merge_stats', ctx={ 'io': 1, 'mem': 8 }, func="single_cell.workflows.destruct_singlecell.tasks.merge_stats", args=( mgd.TempInputFile('cell_stats', 'cell_id'), mgd.OutputFile(stats), ), ) return workflow
def create_multiple_lane_align_workflow(fastq_files_1,
                                        fastq_files_2,
                                        ref_genome_dir,
                                        out_bam_file,
                                        add_xs_tag=False,
                                        align_threads=1,
                                        merge_threads=1,
                                        read_group_info=None,
                                        sort_threads=1):

    if read_group_info is None:
        read_group_info = {}

        for key in fastq_files_1:
            read_group_info[key] = None

    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('read_group_info', 'lane'), value=read_group_info)

    workflow.subworkflow(
        name='align',
        axes=('lane',),
        func=create_align_workflow,
        args=(
            mgd.InputFile('R1.fq.gz', 'lane', fnames=fastq_files_1),
            mgd.InputFile('R2.fq.gz', 'lane', fnames=fastq_files_2),
            ref_genome_dir,
            mgd.TempOutputFile('lane.bam', 'lane'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'align_threads': align_threads,
            'read_group_info': mgd.TempInputObj('read_group_info', 'lane'),
            'sort_threads': sort_threads,
        })

    workflow.transform(
        name='markdups_and_merge',
        axes=(),
        ctx={'mem': 24, 'mem_retry_increment': 8, 'num_retry': 3, 'threads': merge_threads},
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('lane.bam', 'lane'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('markdup_tmp'),
        ),
        kwargs={
            'threads': merge_threads,
        })

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ))

    return workflow
def create_somatic_calling_workflow(samples, tumours, normals, museq_vcf, museq_maf, museq_paired_pdf, strelka_snv_vcf, strelka_snv_maf, strelka_indel_vcf, strelka_indel_maf, mutect_vcf, mutect_maf, somatic_consensus_maf, refdir, normal_ids, tumour_ids, single_node=False, is_exome=False): strelka_snv_vcf = dict([(sampid, strelka_snv_vcf[sampid]) for sampid in samples]) strelka_indel_vcf = dict([(sampid, strelka_indel_vcf[sampid]) for sampid in samples]) strelka_snv_maf = dict([(sampid, strelka_snv_maf[sampid]) for sampid in samples]) strelka_indel_maf = dict([(sampid, strelka_indel_maf[sampid]) for sampid in samples]) museq_vcf = dict([(sampid, museq_vcf[sampid]) for sampid in samples]) museq_maf = dict([(sampid, museq_maf[sampid]) for sampid in samples]) museq_paired_pdf = dict([(sampid, museq_paired_pdf[sampid]) for sampid in samples]) mutect_vcf = dict([(sampid, mutect_vcf[sampid]) for sampid in samples]) mutect_maf = dict([(sampid, mutect_maf[sampid]) for sampid in samples]) somatic_consensus_maf = dict([(sampid, somatic_consensus_maf[sampid]) for sampid in samples]) chromosomes = config.refdir_data(refdir)['params']['chromosomes'] paths_refdir = config.refdir_data(refdir)['paths'] workflow = pypeliner.workflow.Workflow() workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples) workflow.setobj(obj=mgd.TempOutputObj('normal_id', 'sample_id', axes_origin=[]), value={v: normal_ids[v] for v in samples}) workflow.setobj(obj=mgd.TempOutputObj('tumour_id', 'sample_id', axes_origin=[]), value={v: tumour_ids[v] for v in samples}) workflow.subworkflow( name="mutationseq_paired", func='wgs.workflows.mutationseq.create_museq_workflow', axes=('sample_id', ), args=( mgd.OutputFile('museq_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=museq_vcf), mgd.OutputFile('museq_snv_ann.maf', 'sample_id', fnames=museq_maf), mgd.OutputFile('museq_paired_pdf', 'sample_id', fnames=museq_paired_pdf), paths_refdir['reference'], paths_refdir['reference_vep'], chromosomes, ), kwargs={ 'normal_id': mgd.TempInputObj('normal_id', 'sample_id'), 'tumour_id': mgd.TempInputObj('tumour_id', 'sample_id'), 'tumour_bam': mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]), 'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]), 'single_node': single_node, }) workflow.subworkflow( name="strelka", func='wgs.workflows.strelka.create_strelka_workflow', axes=('sample_id', ), args=( mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']), mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']), mgd.OutputFile('strelka_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_snv_vcf), mgd.OutputFile('strelka_snv_ann.maf', 'sample_id', fnames=strelka_snv_maf), mgd.OutputFile('strelka_indel_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_indel_vcf), mgd.OutputFile('strelka_indel_ann.maf', 'sample_id', fnames=strelka_indel_maf), paths_refdir['reference'], paths_refdir['reference_vep'], chromosomes, mgd.TempInputObj('normal_id', 'sample_id'), mgd.TempInputObj('tumour_id', 'sample_id'), ), kwargs={ 'single_node': single_node, 'is_exome': is_exome }, ) workflow.subworkflow( name="mutect", func='wgs.workflows.mutect.create_mutect_workflow', axes=('sample_id', ), args=( mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']), mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']), 
mgd.OutputFile('mutect_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=mutect_vcf), mgd.OutputFile('mutect_snv_ann.maf', 'sample_id', fnames=mutect_maf), paths_refdir['reference'], paths_refdir['reference_vep'], chromosomes, mgd.TempInputObj('normal_id', 'sample_id'), mgd.TempInputObj('tumour_id', 'sample_id'), ), kwargs={ 'single_node': single_node, }, ) workflow.subworkflow( name="somatic_consensus", func= 'wgs.workflows.somatic_calling_consensus.create_somatic_consensus_workflow', axes=('sample_id', ), args=( mgd.InputFile('mutect_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=mutect_vcf), mgd.InputFile('strelka_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_snv_vcf), mgd.InputFile('strelka_indel_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=strelka_indel_vcf), mgd.InputFile('museq_snv_ann.vcf.gz', 'sample_id', extensions=['.csi', '.tbi'], fnames=museq_vcf), mgd.OutputFile("somatic_consensus.maf", 'sample_id', fnames=somatic_consensus_maf), chromosomes, paths_refdir['reference_vep'], mgd.TempInputObj('normal_id', 'sample_id'), mgd.TempInputObj('tumour_id', 'sample_id'), ), ) return workflow
def realign_bam_files(inputs, outputs, metrics_output, metrics_tar,
                      refdir, samples, single_node=False,
                      ignore_bamtofastq_exception=False, picard_mem=8):

    inputs = dict([(sample, inputs[sample]) for sample in samples])
    outputs = dict([(sample, outputs[sample]) for sample in samples])
    outputs_tdf = dict([(sample, outputs[sample] + '.tdf') for sample in samples])
    metrics_output = dict([(sample, metrics_output[sample]) for sample in samples])
    metrics_tar = dict([(sample, metrics_tar[sample]) for sample in samples])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.transform(
        name='bam_to_fastq',
        ctx=helpers.get_default_ctx(walltime='96:00', disk=500),
        func="wgs.workflows.realignment.tasks.split_by_rg",
        axes=('sample_id',),
        args=(
            mgd.InputFile('input.bam', 'sample_id', fnames=inputs),
            mgd.TempOutputFile("inputdata_read1.fastq.gz", 'sample_id', "readgroup"),
            mgd.TempOutputFile("inputdata_read2.fastq.gz", 'sample_id', "readgroup", axes_origin=[]),
            mgd.TempSpace("bamtofastq", 'sample_id'),
            ignore_bamtofastq_exception))

    workflow.transform(
        name='get_sample_info',
        func="wgs.workflows.realignment.tasks.get_read_group",
        axes=('sample_id',),
        ret=mgd.TempOutputObj('sample_info', 'sample_id'),
        args=(mgd.InputFile('input.bam', 'sample_id', fnames=inputs),))

    workflow.subworkflow(
        name='align_samples',
        func=alignment.align_samples,
        args=(
            mgd.TempInputFile("inputdata_read1.fastq.gz", "sample_id", "readgroup", axes_origin=[]),
            mgd.TempInputFile("inputdata_read2.fastq.gz", "sample_id", "readgroup", axes_origin=[]),
            mgd.OutputFile('output.bam', 'sample_id', fnames=outputs, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('output_metrics.csv', 'sample_id', fnames=metrics_output, extensions=['.yaml'], axes_origin=[]),
            mgd.OutputFile('output_metrics.tar', 'sample_id', fnames=metrics_tar, axes_origin=[]),
            mgd.OutputFile('output.bam.tdf', 'sample_id', fnames=outputs_tdf, axes_origin=[]),
            mgd.TempInputObj('sample_info', 'sample_id', axes_origin=[]),
            refdir),
        kwargs={
            'single_node': single_node,
            'picard_mem': picard_mem
        })

    return workflow
def create_ref_panel_phase_workflow(genetic_map_file, ref_file, target_file, out_file):
    """ Run EAGLE using a reference panel.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'eagle'])

    workflow = pypeliner.workflow.Workflow(default_ctx=default_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=mgd.TempOutputObj('chrom', 'chrom'),
        value=get_chromosomes(target_file)
    )

    workflow.transform(
        name='split_ref',
        axes=('chrom',),
        func=tasks.get_chrom_variant_file,
        args=(
            mgd.TempInputObj('chrom', 'chrom'),
            mgd.InputFile(ref_file),
            mgd.TempOutputFile('ref.bcf', 'chrom')
        )
    )

    workflow.transform(
        name='split_target',
        axes=('chrom',),
        func=tasks.get_chrom_variant_file,
        args=(
            mgd.TempInputObj('chrom', 'chrom'),
            mgd.InputFile(target_file),
            mgd.TempOutputFile('target.bcf', 'chrom')
        )
    )

    workflow.transform(
        name='run_eagle',
        axes=('chrom',),
        func=tasks.run_eagle,
        args=(
            mgd.InputFile(genetic_map_file),
            mgd.TempInputFile('ref.bcf', 'chrom'),
            mgd.TempInputFile('target.bcf', 'chrom'),
            mgd.TempOutputFile('phased.bcf', 'chrom'),
            mgd.TempSpace('eagle_tmp', 'chrom')
        )
    )

    workflow.transform(
        name='concat_results',
        func=tasks.concat_results,
        args=(
            mgd.TempInputFile('phased.bcf', 'chrom'),
            mgd.OutputFile(out_file)
        )
    )

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            '-t',
            '-o', mgd.OutputFile(out_file + '.tbi'),
            mgd.InputFile(out_file)
        )
    )

    return workflow
def create_resample_simulation_workflow(
        sim_defs,
        mixture_filename,
        source_filename,
        normal_filename,
        tumour_filename,
        breakpoint_filename,
        config,
        ref_data_dir,
):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(
        obj=mgd.TempOutputObj('sim_defs'),
        value=sim_defs,
    )

    workflow.transform(
        name='simulate_germline_alleles',
        ctx={'mem': 8},
        func=remixt.simulations.pipeline.simulate_germline_alleles,
        args=(
            mgd.TempOutputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
            config,
            ref_data_dir,
        ),
    )

    workflow.transform(
        name='resample_normal_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_normal_data,
        args=(
            mgd.OutputFile(normal_filename),
            mgd.InputFile(source_filename),
            mgd.InputFile(mixture_filename),
            mgd.TempInputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
        ),
    )

    workflow.transform(
        name='resample_tumour_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_tumour_data,
        args=(
            mgd.OutputFile(tumour_filename),
            mgd.InputFile(source_filename),
            mgd.InputFile(mixture_filename),
            mgd.TempInputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
        ),
    )

    workflow.transform(
        name='write_breakpoints',
        func=remixt.simulations.pipeline.write_breakpoints,
        args=(
            mgd.OutputFile(breakpoint_filename),
            mgd.InputFile(mixture_filename),
        ),
    )

    return workflow
def create_calc_bias_workflow(
        tumour_seqdata_filename,
        segment_filename,
        segment_length_filename,
        config,
        ref_data_dir,
):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.transform(
        name='calc_fragment_stats',
        ctx={'mem': 16},
        func=remixt.analysis.stats.calculate_fragment_stats,
        ret=mgd.TempOutputObj('fragstats'),
        args=(
            mgd.InputFile(tumour_seqdata_filename),
            config,
        )
    )

    workflow.transform(
        name='sample_gc',
        ctx={'mem': 16},
        func=remixt.analysis.gcbias.sample_gc,
        args=(
            mgd.TempOutputFile('gcsamples.tsv'),
            mgd.InputFile(tumour_seqdata_filename),
            mgd.TempInputObj('fragstats').prop('fragment_mean'),
            config,
            ref_data_dir,
        )
    )

    workflow.transform(
        name='gc_lowess',
        ctx={'mem': 16},
        func=remixt.analysis.gcbias.gc_lowess,
        args=(
            mgd.TempInputFile('gcsamples.tsv'),
            mgd.TempOutputFile('gcloess.tsv'),
            mgd.TempOutputFile('gctable.tsv'),
        )
    )

    workflow.transform(
        name='split_segments',
        func=remixt.utils.split_table,
        args=(
            mgd.TempOutputFile('segments.tsv', 'segment_rows_idx'),
            mgd.InputFile(segment_filename),
            100,
        ),
    )

    workflow.transform(
        name='gc_map_bias',
        axes=('segment_rows_idx',),
        ctx={'mem': 16},
        func=remixt.analysis.gcbias.gc_map_bias,
        args=(
            mgd.TempInputFile('segments.tsv', 'segment_rows_idx'),
            mgd.TempInputObj('fragstats').prop('fragment_mean'),
            mgd.TempInputObj('fragstats').prop('fragment_stddev'),
            mgd.TempInputFile('gcloess.tsv'),
            mgd.TempOutputFile('biases.tsv', 'segment_rows_idx'),
            config,
            ref_data_dir,
        )
    )

    workflow.transform(
        name='merge_biases',
        func=remixt.utils.merge_tables,
        args=(
            mgd.TempOutputFile('biases.tsv'),
            mgd.TempInputFile('biases.tsv', 'segment_rows_idx'),
        ),
    )

    workflow.transform(
        name='biased_length',
        func=remixt.analysis.gcbias.biased_length,
        args=(
            mgd.OutputFile(segment_length_filename),
            mgd.TempInputFile('biases.tsv'),
        ),
    )

    return workflow
def create_titan_workflow(normal_bam_file, tumour_bam_file, dbsnp_vcf_file, mappability_file, ref_genome_fasta_file, out_file, exome_bed_file=None, sample='Tumour', threads=1): sandbox = soil.utils.workflow.get_sandbox( ['hmmcopy', 'hmmcopy_utils', 'titan']) sandbox.channels.append('conda-forge') sandbox.packages.extend(['pandas', 'rpy2']) chromosomes = soil.utils.genome.load_bam_chromosome_lengths( normal_bam_file, 'autosomes') workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox) workflow.setobj(obj=mgd.TempOutputObj('init_params', 'param_idx'), value=tasks.create_intialization_parameters()) workflow.subworkflow(name='get_allele_counts', func=create_allele_counts_workflow, args=(mgd.InputFile(normal_bam_file), mgd.InputFile(tumour_bam_file), mgd.InputFile(dbsnp_vcf_file), mgd.InputFile(ref_genome_fasta_file), mgd.TempOutputFile('allele_counts.tsv')), kwargs={'chromosomes': 'autosomes'}) workflow.commandline(name='build_normal_wig', args=('readCounter', '-c', ','.join(chromosomes), mgd.InputFile(normal_bam_file), '>', mgd.TempOutputFile('normal.wig'))) workflow.commandline(name='build_tumour_wig', args=('readCounter', '-c', ','.join(chromosomes), mgd.InputFile(tumour_bam_file), '>', mgd.TempOutputFile('tumour.wig'))) workflow.commandline(name='build_gc_wig', args=('gcCounter', '-c', ','.join(chromosomes), mgd.InputFile(ref_genome_fasta_file), '>', mgd.TempOutputFile('gc.wig'))) workflow.commandline(name='build_mappability_wig', args=('mapCounter', '-c', ','.join(chromosomes), mgd.InputFile(mappability_file), '>', mgd.TempOutputFile('mappability.wig'))) workflow.transform(name='build_coverage_file', func=tasks.build_coverage_file, args=(mgd.TempInputFile('normal.wig'), mgd.TempInputFile('tumour.wig'), mgd.TempInputFile('gc.wig'), mgd.TempInputFile('mappability.wig'), mgd.TempOutputFile('coverage.wig')), kwargs={'target_file': exome_bed_file}) workflow.transform(name='run_titan', axes=('param_idx', ), ctx={ 'mem': 8, 'mem_retry_increment': 4, 'num_retry': 3, 'threads': threads }, func=tasks.run_titan, args=(mgd.TempInputFile('coverage.wig'), mgd.TempInputFile('allele_counts.tsv'), mgd.TempInputObj('init_params', 'param_idx'), mgd.TempOutputFile('run.tar.gz', 'param_idx'), mgd.TempSpace('titan_tmp', 'param_idx')), kwargs={ 'is_exome': (exome_bed_file is not None), 'sample': sample, 'threads': threads }) workflow.transform(name='build_run_stats_file', func=tasks.build_run_stats_file, args=(mgd.TempInputFile('run.tar.gz', 'param_idx'), mgd.TempInputObj('init_params', 'param_idx'), mgd.TempOutputFile('stats.tsv'))) workflow.transform(name='build_output', func=tasks.build_final_results_file, args=(mgd.TempInputFile('coverage.wig'), mgd.TempInputFile('allele_counts.tsv'), mgd.TempInputFile('run.tar.gz', 'param_idx'), mgd.TempInputFile('stats.tsv'), mgd.OutputFile(out_file), mgd.TempSpace('build_results'))) return workflow
def create_hmmcopy_workflow(
        bam_file, reads, segs, metrics, params, igv_seg_filename, segs_pdf,
        bias_pdf, plot_heatmap_ec_output, plot_metrics_output,
        plot_kernel_density_output, hmmcopy_data_tar, cell_ids, hmmparams,
        sample_info
):
    chromosomes = hmmparams["chromosomes"]
    baseimage = hmmparams['docker']['single_cell_pipeline']
    hmmcopy_docker = hmmparams['docker']['hmmcopy']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info)

    # Run HMMcopy per cell, producing per-cell reads, segments, parameters,
    # metrics and a tarball of the raw HMMcopy output.
    workflow.transform(
        name='run_hmmcopy',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']),
            mgd.TempOutputFile('reads.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('segs.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('params.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_metrics.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            hmmparams,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            hmmcopy_docker
        ),
    )

    # Concatenate the per-cell tables across the cell_id axis.
    workflow.transform(
        name='merge_reads',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile('reads_merged.csv.gz', extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='add_mappability_bool',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_mappability_col",
        args=(
            mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(reads, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_segs',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(segs, extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='merge_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_params',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='get_max_cn',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_max_cn",
        ret=mgd.TempOutputObj('max_cn'),
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
        )
    )

    # Per-cell segment and GC-bias plots.
    workflow.transform(
        name='hmmcopy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            hmmparams['ref_genome'],
            mgd.TempOutputFile('segments.png', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('bias.png', 'cell_id', axes_origin=[]),
            mgd.InputInstance('cell_id'),
        ),
        kwargs={
            'num_states': hmmparams['num_states'],
            'sample_info': mgd.TempInputObj('sampleinfo', 'cell_id'),
            'max_cn': mgd.TempInputObj("max_cn")
        }
    )

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.add_clustering_order",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(metrics, extensions=['.yaml']),
        ),
        kwargs={
            'chromosomes': hmmparams["chromosomes"],
            'sample_info': sample_info
        }
    )

    workflow.transform(
        name='merge_hmm_copy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
        args=(
            [
                mgd.TempInputFile('segments.png', 'cell_id'),
                mgd.TempInputFile('bias.png', 'cell_id'),
            ],
            [
                mgd.OutputFile(segs_pdf),
                mgd.OutputFile(bias_pdf),
            ],
            mgd.InputFile(metrics, extensions=['.yaml']),
            None,
            mgd.TempSpace("hmmcopy_plot_merge_temp"),
            ['segments', 'bias']
        )
    )

    workflow.transform(
        name='create_igv_seg',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.InputFile(segs, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(igv_seg_filename),
            hmmparams,
        )
    )

    workflow.transform(
        name='plot_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics_output),
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_kernel_density',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_kernel_density_output),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_heatmap_ec',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'mappability_threshold': hmmparams["map_cutoff"]
        }
    )

    workflow.transform(
        name='merge_hmmcopy_data_tars',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.utils.helpers.tar_files",
        args=(
            mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]),
            mgd.OutputFile(hmmcopy_data_tar),
            mgd.TempSpace("merge_tarballs")
        ),
    )

    return workflow
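# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the pipeline): the factory above
# only declares jobs; nothing executes until the returned workflow is handed
# to a pypeliner scheduler. The paths, cell ids and the minimal hmmparams
# dictionary below are placeholders inferred from the keys the factory reads;
# real configurations carry many more settings, and the Pypeline config shown
# is one common local-execution setup, not the pipeline's actual CLI.
# ---------------------------------------------------------------------------
import pypeliner


def run_hmmcopy_example():
    cell_ids = ['CELL-R03-C44', 'CELL-R03-C45']  # hypothetical cell ids
    bam_files = {cell: '/data/bams/{}.bam'.format(cell) for cell in cell_ids}
    sample_info = {cell: {'sample_id': 'SAMPLE'} for cell in cell_ids}

    hmmparams = {
        'chromosomes': [str(c) for c in range(1, 23)] + ['X', 'Y'],
        'docker': {'single_cell_pipeline': 'scp:latest', 'hmmcopy': 'hmmcopy:latest'},
        'memory': {'med': 6},
        'ref_genome': '/refdata/GRCh37-lite.fa',
        'num_states': 12,
        'map_cutoff': 0.9,
    }

    workflow = create_hmmcopy_workflow(
        bam_files, 'reads.csv.gz', 'segs.csv.gz', 'metrics.csv.gz',
        'params.csv.gz', 'hmmcopy.seg', 'segs.pdf', 'bias.pdf',
        'heatmap_ec.pdf', 'metrics.pdf', 'kde.pdf', 'hmmcopy_data.tar.gz',
        cell_ids, hmmparams, sample_info)

    # Run locally; cluster backends are selected via the config dict instead.
    pyp = pypeliner.app.Pypeline(config={'tmpdir': './tmp', 'submit': 'local', 'maxjobs': 4})
    pyp.run(workflow)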
def create_destruct_workflow(
        normal_stats, normal_reads_1, normal_reads_2, normal_sample_1,
        normal_sample_2, tumour_stats, tumour_reads_1, tumour_reads_2,
        tumour_sample_1, tumour_sample_2, destruct_config, ref_data_directory,
        breakpoints_filename, breakpoints_library_filename,
        cell_counts_filename, raw_data_directory, normal_sample_id='normal',
        tumour_sample_id='tumour', tumour_library_id='tumour'
):
    tumour_sample_id = '_'.join([tumour_sample_id, tumour_library_id])

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ret=mgd.TempOutputObj("destruct_config"),
        args=(ref_data_directory, destruct_config)
    )

    # Call breakpoints with destruct on the normal and tumour fastqs.
    workflow.subworkflow(
        name='destruct',
        func="destruct.workflow.create_destruct_fastq_workflow",
        ctx={'disk': 200},
        args=(
            {
                normal_sample_id: mgd.InputFile(normal_reads_1),
                tumour_sample_id: mgd.InputFile(tumour_reads_1),
            },
            {
                normal_sample_id: mgd.InputFile(normal_reads_2),
                tumour_sample_id: mgd.InputFile(tumour_reads_2),
            },
            {
                normal_sample_id: mgd.InputFile(normal_sample_1),
                tumour_sample_id: mgd.InputFile(tumour_sample_1),
            },
            {
                normal_sample_id: mgd.InputFile(normal_sample_2),
                tumour_sample_id: mgd.InputFile(tumour_sample_2),
            },
            {
                normal_sample_id: mgd.InputFile(normal_stats),
                tumour_sample_id: mgd.InputFile(tumour_stats),
            },
            mgd.TempOutputFile('breakpoint_table'),
            mgd.TempOutputFile('breakpoint_library_table'),
            mgd.TempOutputFile('breakpoint_read_table'),
            mgd.TempInputObj("destruct_config"),
            ref_data_directory,
        ),
        kwargs={
            'raw_data_dir': raw_data_directory,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func="biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints",
        args=(
            pypeliner.managed.TempInputFile('breakpoint_table'),
            pypeliner.managed.TempInputFile('breakpoint_library_table'),
            [normal_sample_id],
            pypeliner.managed.TempOutputFile('breakpoints_filename.csv'),
            pypeliner.managed.TempOutputFile('breakpoints_library_filename.csv'),
        ),
    )

    workflow.transform(
        name='filter_breakpoint_reads',
        ctx={'mem': 8},
        func="single_cell.workflows.destruct_singlecell.tasks.filter_reads_file",
        args=(
            mgd.TempInputFile('breakpoint_read_table'),
            pypeliner.managed.TempInputFile('breakpoints_filename.csv'),
            mgd.TempOutputFile('breakpoint_read_table_filtered'),
        ),
    )

    workflow.transform(
        name='extract_cell_counts',
        ctx={'mem': 8},
        func="single_cell.workflows.destruct_singlecell.tasks.extract_cell_counts",
        args=(
            mgd.TempInputFile('breakpoint_read_table_filtered'),
            mgd.TempOutputFile('cell_counts_filename.csv'),
        ),
    )

    # Convert the raw csv outputs to the pipeline's compressed csv format.
    workflow.transform(
        name='prep_cell_counts',
        ctx={'mem': 8, 'ncpus': 1},
        func="single_cell.utils.csvutils.prep_csv_files",
        args=(
            mgd.TempInputFile('cell_counts_filename.csv'),
            mgd.TempOutputFile("cell_counts_prep.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_cell_counts',
        ctx={'mem': 8, 'ncpus': 1},
        func="single_cell.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("cell_counts_prep.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(cell_counts_filename, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='prep_breakpoints',
        ctx={'mem': 8, 'ncpus': 1},
        func="single_cell.utils.csvutils.prep_csv_files",
        args=(
            pypeliner.managed.TempInputFile('breakpoints_filename.csv'),
            pypeliner.managed.TempOutputFile("breakpoints_filename_prep.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_breakpoints',
        ctx={'mem': 8, 'ncpus': 1},
        func="single_cell.utils.csvutils.finalize_csv",
        args=(
            pypeliner.managed.TempInputFile("breakpoints_filename_prep.csv.gz", extensions=['.yaml']),
            pypeliner.managed.OutputFile(breakpoints_filename, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='prep_breakpoints_library',
        ctx={'mem': 8, 'ncpus': 1},
        func="single_cell.utils.csvutils.prep_csv_files",
        args=(
            pypeliner.managed.TempInputFile('breakpoints_library_filename.csv'),
            pypeliner.managed.TempOutputFile('breakpoints_library_prep.csv.gz', extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_breakpoints_library',
        ctx={'mem': 8, 'ncpus': 1},
        func="single_cell.utils.csvutils.finalize_csv",
        args=(
            pypeliner.managed.TempInputFile('breakpoints_library_prep.csv.gz', extensions=['.yaml']),
            pypeliner.managed.OutputFile(breakpoints_library_filename, extensions=['.yaml']),
        ),
    )

    return workflow