def create_eagle_ref_data_workflow(vcf_url_template, out_file, local_download=False):
    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    chrom_map = chrom_map[chrom_map['ncbi'].isin([str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(
        name='download_vcf_files',
        axes=('chrom',),
        ctx={'local': local_download},
        func=soil.ref_data.tasks.download,
        args=(
            mgd.TempInputObj('vcf_url', 'chrom'),
            mgd.TempOutputFile('raw.vcf.gz', 'chrom'),
        ))

    workflow.transform(
        name='write_chrom_map',
        func=tasks.write_chrom_map_file,
        args=(
            mgd.InputFile(chrom_map_file),
            mgd.TempOutputFile('chrom_map.tsv'),
        ))

    workflow.transform(
        name='rename_chroms',
        axes=('chrom',),
        func=soil.wrappers.bcftools.tasks.rename_chroms,
        args=(
            mgd.TempInputFile('chrom_map.tsv'),
            mgd.TempInputFile('raw.vcf.gz', 'chrom'),
            mgd.TempOutputFile('renamed.bcf', 'chrom'),
        ))

    workflow.transform(
        name='concat_vcfs',
        func=soil.wrappers.bcftools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('renamed.bcf', 'chrom'),
            mgd.OutputFile(out_file),
        ),
        kwargs={'bcf_output': True})

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            mgd.InputFile(out_file),
            '-o', mgd.OutputFile(out_file + '.csi'),
        ))

    return workflow

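# Note on the 'chrom' axis above (an observation, not original source text):
# Series.to_dict() keys by the DataFrame index, so the chunk ids in vcf_urls
# are the surviving row indices of chrom_map rather than chromosome names.
# A hypothetical illustration of the resulting axis mapping:
#
#   vcf_urls == {0: vcf_url_template.format(chrom='1'),
#                1: vcf_url_template.format(chrom='2'),
#                ...}
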
def create_variant_counting_workflow(
        vcfs,
        tumour_cell_bams,
        results_h5,
        config,
):
    """ Count variant reads for multiple sets of variants across cells.
    """
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=(
            [mgd.InputFile(vcf) for vcf in vcfs],
            mgd.TempOutputFile('all.snv.vcf'),
        ))

    workflow.transform(
        name='finalise_snvs',
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('all.snv.vcf'),
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi']),
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(config['containers'], 'vcftools')
        })

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=tumour_cell_bams),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.OutputFile(results_h5),
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
        },
    )

    return workflow

def create_consensus_workflow(destruct_breakpoints, lumpy_vcf, output, chromosomes):
    params = config.default_params('breakpoint_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_lumpy',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
        args=(
            mgd.InputFile(lumpy_vcf),
            mgd.TempOutputFile('lumpy.csv'),
            params['parse_lumpy'],
        ),
        kwargs={'chromosomes': chromosomes})

    workflow.transform(
        name='parse_destruct',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
        args=(
            mgd.InputFile(destruct_breakpoints),
            mgd.TempOutputFile('destruct.csv'),
            params['parse_destruct'],
        ),
        kwargs={'chromosomes': chromosomes})

    workflow.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(
            mgd.TempInputFile('destruct.csv'),
            mgd.TempInputFile('lumpy.csv'),
            mgd.OutputFile(output),
            params['consensus'],
        ),
    )

    return workflow

def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    # list(range(...)) is required here: range + list concatenation fails in Python 3.
    workflow.setobj(
        obj=mgd.OutputChunks('interval'),
        value=list(map(str, list(range(1, 23)) + ['X'])))

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
        )
    )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
        )
    )

    return workflow

def _create_download_decompress_workflow(url, local_path, local_download=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(mgd.TempOutputObj('url'), value=url)

    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('url'),
            mgd.TempOutputFile('download'),
        ),
    )

    workflow.transform(
        name='decompress',
        func=tasks.decompress,
        args=(
            mgd.TempInputFile('download'),
            mgd.OutputFile(local_path),
        ))

    return workflow

def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(
        name='read',
        func=read_stuff,
        ret=mgd.TempOutputObj('input_data'),
        args=(mgd.InputFile(input_filename),))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'),))

    # Write the object to an output file
    workflow.transform(
        name='write',
        func=write_stuff,
        args=(
            mgd.TempInputObj('output_data'),
            mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(
        name='sub_workflow_2',
        func=create_workflow_2,
        args=(
            mgd.TempInputFile('output_file'),
            mgd.OutputFile(output_filename)))

    return workflow

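# A minimal driver sketch (an assumption, not part of the original source):
# pypeliner workflows are typically executed through pypeliner.app.Pypeline,
# which resolves the plain-function references (read_stuff, do_stuff,
# write_stuff) used by the transforms above. Those task functions live
# elsewhere in the original repo; the paths and tmpdir here are hypothetical.
#
#   import pypeliner.app
#
#   pyp = pypeliner.app.Pypeline(modules=[__name__], config={'tmpdir': './tmp'})
#   pyp.run(create_workflow_1('input.txt', 'output.txt'))
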
def create_vcf2maf_workflow(vcf_file, maf_file, reference, tumour_id=None, normal_id=None):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='vcf2maf',
        func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('maf_file.maf'),
            mgd.TempSpace('vcf2maf_temp'),
            reference),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id
        })

    workflow.transform(
        name='update_ids',
        func='wgs.workflows.vcf2maf.tasks.update_ids',
        args=(
            mgd.TempInputFile('maf_file.maf'),
            tumour_id,
            normal_id,
            mgd.OutputFile(maf_file),
        ))

    return workflow

def create_lumpy_workflow(config, normal_bam, tumour_cell_bams,
                          lumpy_breakpoints_csv, lumpy_breakpoints_evidence,
                          lumpy_breakpoints_bed):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            normal_bam,
            config,
            mgd.TempOutputFile('normal.discordants.sorted.bam'),
            mgd.TempOutputFile('normal.splitters.sorted.bam'),
            mgd.TempOutputFile('hist_normal_formatted.csv'),
            mgd.TempOutputFile('normal_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=tumour_cell_bams),
            config,
            mgd.TempOutputFile('tumour.discordants.sorted.bam'),
            mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            mgd.TempOutputFile('hist_tumour_formatted.csv'),
            mgd.TempOutputFile('tumour_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        func='single_cell.workflows.lumpy.lumpy_calling_workflow',
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam'),
            mgd.TempInputFile('tumour.splitters.sorted.bam'),
            mgd.TempInputFile('hist_tumour_formatted.csv'),
            mgd.TempInputFile('tumour_mean_stdev.yaml'),
            mgd.OutputFile(lumpy_breakpoints_bed),
            mgd.OutputFile(lumpy_breakpoints_csv, extensions=['.yaml']),
            mgd.OutputFile(lumpy_breakpoints_evidence, extensions=['.yaml']),
        ),
    )

    return workflow

def create_fit_model_workflow(
        experiment_filename,
        results_filename,
        config,
        ref_data_dir,
        tumour_id=None,
):
    config = remixt.config.get_sample_config(config, tumour_id)

    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 16})

    workflow.transform(
        name='init',
        func=remixt.analysis.pipeline.init,
        ret=mgd.TempOutputObj('init_params', 'init_id'),
        args=(
            mgd.TempOutputFile('init_results'),
            mgd.InputFile(experiment_filename),
            config,
        ),
    )

    workflow.transform(
        name='fit',
        axes=('init_id',),
        func=remixt.analysis.pipeline.fit_task,
        args=(
            mgd.TempOutputFile('fit_results', 'init_id'),
            mgd.InputFile(experiment_filename),
            mgd.TempInputObj('init_params', 'init_id'),
            config,
        ),
    )

    workflow.transform(
        name='collate',
        func=remixt.analysis.pipeline.collate,
        args=(
            mgd.OutputFile(results_filename),
            mgd.InputFile(experiment_filename),
            mgd.TempInputFile('init_results'),
            mgd.TempInputFile('fit_results', 'init_id'),
            config,
        ),
    )

    return workflow

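# Sketch of the split contract assumed by the 'init' transform above: the job
# has no 'init_id' axis of its own but returns TempOutputObj('init_params',
# 'init_id'), so pypeliner expects remixt.analysis.pipeline.init to return a
# dict keyed by the new chunk ids, e.g. (hypothetical values):
#
#   {0: {...first init params...}, 1: {...second init params...}}
#
# Each key becomes one 'init_id' chunk, fanning out the downstream 'fit' jobs,
# which 'collate' then merges back over the axis.
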
def create_basic_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['mixcr'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(
        name='align',
        ctx={
            'mem': 32,
            'mem_retry_increment': 8,
            'num_retry': 3,
            'threads': threads
        },
        args=(
            'mixcr', 'align',
            '-f',
            '-t', threads,
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.TempOutputFile('alignments.vdjca')))

    workflow.commandline(
        name='assemble',
        ctx={
            'mem': 16,
            'mem_retry_increment': 8,
            'num_retry': 3,
            'threads': threads
        },
        args=(
            'mixcr', 'assemble',
            '-f',
            '-t', 1,
            mgd.TempInputFile('alignments.vdjca'),
            mgd.TempOutputFile('clones.clns')))

    workflow.commandline(
        name='export',
        ctx={
            'mem': 16,
            'mem_retry_increment': 8,
            'num_retry': 3
        },
        args=(
            'mixcr', 'exportClones',
            '-f',
            mgd.TempInputFile('clones.clns'),
            mgd.TempOutputFile('results.tsv')))

    workflow.commandline(
        name='compress',
        args=(
            'gzip', '-c',
            mgd.TempInputFile('results.tsv'),
            '>', mgd.OutputFile(out_file)))

    return workflow

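# Note on the 'compress' job above (an observation, not original source text):
# pypeliner's commandline jobs treat the bare '>' argument as a stdout
# redirect into the file that follows it, so the gzip output lands in the
# managed out_file. An equivalent transform-based sketch, using a hypothetical
# helper task, would be:
#
#   workflow.transform(name='compress',
#                      func=tasks.gzip_file,  # hypothetical helper
#                      args=(mgd.TempInputFile('results.tsv'),
#                            mgd.OutputFile(out_file)))
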
def create_snpeff_annotation_workflow(db,
                                      data_dir,
                                      target_vcf_file,
                                      out_file,
                                      classic_mode=True,
                                      split_size=int(1e3),
                                      table_name='snpeff'):
    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    workflow = Workflow()

    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(target_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split')),
        kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='run_snpeff',
        axes=('split',),
        ctx=dict(mem=8, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.run_snpeff',
        args=(
            db,
            data_dir,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('snpeff.vcf', 'split')),
        kwargs={
            'classic_mode': classic_mode,
        })

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.convert_vcf_to_table',
        args=(
            mgd.TempInputFile('snpeff.vcf', 'split'),
            mgd.TempOutputFile('snpeff.csv.gz', 'split'),
            table_name))

    workflow.transform(
        name='concatenate_tables',
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(
            mgd.TempInputFile('snpeff.csv.gz', 'split'),
            mgd.OutputFile(out_file)))

    return workflow

def circos_plot(titan_calls, remixt_calls, sample_id, breakpoints,
                circos_plot_remixt, circos_plot_titan):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='prep_titan',
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempOutputFile('titan_prepped'),
        )
    )

    workflow.transform(
        name='prep_remixt',
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.InputFile(remixt_calls),
            sample_id,
            mgd.TempOutputFile('remixt_prepped'),
        )
    )

    workflow.transform(
        name='circos_plot',
        func='wgs.workflows.sample_qc.tasks.circos',
        ctx=helpers.get_default_ctx(memory=5),
        args=(
            mgd.TempInputFile('titan_prepped'),
            mgd.TempInputFile('remixt_prepped'),
            sample_id,
            breakpoints,
            mgd.OutputFile(circos_plot_remixt),
            mgd.OutputFile(circos_plot_titan),
            mgd.TempSpace('circos')
        )
    )

    return workflow

def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='star_align',
        ctx={
            'mem': 32,
            'mem_retry_increment': 16,
            'num_retry': 3,
            'threads': align_threads
        },
        func=tasks.align,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            ref_genome_dir,
            mgd.TempOutputFile('aligned.bam'),
            mgd.TempSpace('align_tmp'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'read_group_info': read_group_info,
            'threads': align_threads,
        })

    workflow.transform(
        name='sort',
        ctx={
            'mem': 32,
            'mem_retry_increment': 16,
            'num_retry': 3,
            'threads': sort_threads
        },
        func=soil.wrappers.sambamba.tasks.sort,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('sort_tmp'),
        ),
        kwargs={'threads': sort_threads})

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ))

    return workflow

def create_destruct_wrapper_workflow(bam_filenames,
                                     output_filename,
                                     raw_data_dir,
                                     control_id=None,
                                     config=None,
                                     ref_data_dir=None):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=list(bam_filenames.keys()),
    )

    workflow.subworkflow(
        name='run_destruct',
        func=destruct.workflow.create_destruct_workflow,
        args=(
            mgd.InputFile('bam', 'sample_id', fnames=bam_filenames),
            mgd.TempOutputFile('breakpoint_table'),
            mgd.TempOutputFile('breakpoint_library_table'),
            mgd.TempOutputFile('breakpoint_read_table'),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    workflow.transform(
        name='post_process',
        func=destruct.benchmark.wrappers.destruct.tasks.destruct_postprocess,
        args=(
            mgd.TempInputFile('breakpoint_table'),
            mgd.TempInputFile('breakpoint_library_table'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        })

    return workflow

def pre_alignment(fastq_r1, fastq_r2, metrics_tar):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='fastqc_r1',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.TempOutputFile('R1.html'),
            mgd.TempOutputFile('R1.pdf'),
            mgd.TempSpace('fastqc_R1'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        })

    workflow.transform(
        name='fastqc_r2',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r2),
            mgd.TempOutputFile('R2.html'),
            mgd.TempOutputFile('R2.pdf'),
            mgd.TempSpace('fastqc_R2'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        })

    # The tar job runs once, so it takes no axes; it collects the fastqc
    # outputs from both reads (the original listed the R2 files twice and
    # referenced an undefined 'sample_id' axis).
    workflow.transform(
        name='tar',
        func='alignment.utils.helpers.make_tar_from_files',
        args=(
            mgd.OutputFile(metrics_tar),
            [
                mgd.TempInputFile('R1.html'),
                mgd.TempInputFile('R1.pdf'),
                mgd.TempInputFile('R2.html'),
                mgd.TempInputFile('R2.pdf'),
            ],
            mgd.TempSpace('wgs_metrics')))

    return workflow

def create_vcf_mappability_annotation_workflow(
        mappability_file,
        vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7),
):
    ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='get_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        ctx=ctx,
        func='biowrappers.components.variant_calling.utils.get_vcf_regions',
        args=(
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            split_size,
        ),
        kwargs={
            'chromosomes': chromosomes,
        },
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        ctx=ctx,
        func='biowrappers.components.variant_calling.mappability.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions')
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        ctx=ctx,
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow

def create_trinuc_annotation_workflow(
        in_vcf_file,
        out_csv_file,
        ref_genome,
        split_size=int(1e4),
):
    workflow = pypeliner.workflow.Workflow(ctx={
        'num_retry': 3,
        'mem_retry_increment': 2
    })

    workflow.transform(
        name='split_vcf',
        func='single_cell.utils.vcfutils.split_vcf',
        args=(
            mgd.InputFile(in_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split')),
        kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        func='single_cell.workflows.trinuc_annotation.tasks.get_tri_nucelotide_context',
        args=(
            ref_genome,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']),
        ))

    workflow.transform(
        name='merge_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml'])))

    return workflow

def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes=None,
                            split_size=int(1e7)):
    regions = utils.get_bam_regions(normal_bam_file, split_size, chromosomes=chromosomes)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=regions)

    workflow.transform(
        name='run_somatic',
        axes=('regions',),
        ctx={
            'mem': 6,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=tasks.run_somatic,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempSpace('varscan_tmp', 'regions'),
        ),
    )

    workflow.transform(
        name='merge',
        ctx={
            'mem': 2,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow

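# Hypothetical shape of the 'regions' chunks produced by utils.get_bam_regions
# above (an assumption; the exact format depends on the utils implementation,
# but tasks like these typically consume samtools-style region strings):
#
#   {0: '1:1-10000000', 1: '1:10000001-20000000', 2: '2:1-10000000', ...}
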
def create_vcf_tric_nucleotide_annotation_workflow(
        ref_genome_fasta_file,
        vcf_file,
        out_file,
        split_size=int(1e4),
        table_name='tri_nucleotide_context'):
    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('split.vcf', 'split')),
        kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.tri_nucleotide_context.tasks.get_tri_nucelotide_context',
        args=(
            ref_genome_fasta_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split'),
            table_name))

    workflow.transform(
        name='merge_tables',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(
            mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split'),
            mgd.OutputFile(out_file)))

    return workflow

def create_workflow_2(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    workflow.transform(
        name='dofilestuff1',
        func=do_file_stuff,
        args=(
            mgd.InputFile(input_filename),
            mgd.TempOutputFile('intermediate1'),
            'a'))

    workflow.transform(
        name='dofilestuff2',
        func=do_file_stuff,
        args=(
            mgd.TempInputFile('intermediate1'),
            mgd.OutputFile(output_filename),
            'b'))

    return workflow

def create_snv_allele_counts_workflow(
        bam_file,
        out_file,
        table_name,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        report_non_variant_positions=True,
        report_zero_count_positions=False,
        split_size=int(1e7)):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('regions_obj', 'regions'),
        value=biowrappers.components.variant_calling.utils.get_bam_regions(
            bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.transform(
        name='get_counts',
        axes=('regions',),
        ctx=med_ctx,
        func='biowrappers.components.snv_allele_counts.tasks.get_snv_allele_counts_for_region',
        args=(
            mgd.InputFile(bam_file),
            mgd.TempOutputFile('counts.h5', 'regions'),
            mgd.TempInputObj('regions_obj', 'regions'),
            table_name
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'report_non_variant_positions': report_non_variant_positions,
            'report_zero_count_positions': report_zero_count_positions
        }
    )

    workflow.transform(
        name='concatenate_counts',
        ctx=med_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            mgd.TempInputFile('counts.h5', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow

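# med_ctx is referenced above but defined at module level in the original
# source; a plausible definition, with hypothetical resource values consistent
# with the other workflows in this file, would be:
#
#   med_ctx = {'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2}
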
def run_VarScan(config, normal_bam, tumour_bam, snp_output_file, indel_output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_normal_mpileup',
        func=tasks.generate_mpileup,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.TempOutputFile('normal.pileup'),
        )
    )

    workflow.transform(
        name='generate_tumour_mpileup',
        func=tasks.generate_mpileup,
        args=(
            config,
            mgd.InputFile(tumour_bam),
            mgd.TempOutputFile('tumour.pileup'),
        )
    )

    workflow.transform(
        name='run_varscan_somatic',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '08:00'},
        func=tasks.run_varscan_somatic,
        args=(
            config,
            mgd.TempInputFile('normal.pileup'),
            mgd.TempInputFile('tumour.pileup'),
            mgd.OutputFile(snp_output_file),
            mgd.OutputFile(indel_output_file),
        )
    )

    return workflow

def create_cohort_oncoplot(config, merged_germline, merged_somatic, maftools_cna,
                           maftools_maf, oncoplot, report, cohort):
    """Create an oncoplot from a CNA table and germline/somatic data.

    Args:
        config (dict): config
        merged_germline (str): path to merged germline file
        merged_somatic (str): path to merged somatic file
        maftools_cna (str): path to merged CNA data
        maftools_maf (str): path to output prepped maftools input maf
        oncoplot (str): path to output oncoplot
        report (str): path to output report
        cohort (str): cohort label

    Returns:
        pypeliner.workflow.Workflow: the constructed workflow
    """
    ctx = {
        'mem': config['memory']['low'],
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    non_synonymous_labels = config['non_synonymous_labels']

    workflow.transform(
        name='postprocess_maf',
        func='single_cell.workflows.cohort_qc.tasks.prepare_maf_for_maftools',
        args=(
            mgd.InputFile(merged_germline),
            mgd.InputFile(merged_somatic),
            mgd.OutputFile(maftools_maf),
            non_synonymous_labels,
            mgd.TempOutputFile('vcNames')),
    )

    workflow.transform(
        name='make_oncoplot',
        func='single_cell.workflows.cohort_qc.tasks.make_oncoplot',
        args=(
            mgd.InputFile(maftools_maf),
            mgd.InputFile(maftools_cna),
            mgd.OutputFile(oncoplot),
            mgd.TempInputFile('vcNames')),
    )

    workflow.transform(
        name='create_report',
        func='single_cell.workflows.cohort_qc.tasks.create_report',
        args=(
            cohort,
            mgd.InputFile(oncoplot),
            report),
    )

    return workflow

def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf, config):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bam.keys()),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': helpers.get_container_ctx(config['containers'], 'mutationseq')}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow

def create_extract_seqdata_workflow(
        bam_filename,
        seqdata_filename,
        remixt_config,
        remixt_ref_data_dir,
        config,
        multiprocess=False,
):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='create_chromosome_seqdata',
        ctx=dict(mem=config['memory']['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func='single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata',
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={
            'multiprocess': multiprocess,
            'ncores': config['max_cores']
        })

    workflow.transform(
        name='merge_seqdata',
        ctx=dict(mem=config['memory']['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func='remixt.seqdataio.merge_seqdata',
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow

def create_mappability_annotation_workflow(
        in_vcf_file,
        out_csv_file,
        mappability_file,
        split_size=1e4
):
    workflow = pypeliner.workflow.Workflow(
        ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}
    )

    workflow.transform(
        name='get_regions',
        func='single_cell.workflows.mappability_annotation.tasks.get_vcf_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        args=(
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            int(split_size),
        ),
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        func='single_cell.workflows.mappability_annotation.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions', extensions=['.yaml'])
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml'])
        )
    )

    return workflow

def create_extract_seqdata_workflow(
        bam_filename,
        seqdata_filename,
        config,
        ref_data_dir,
):
    chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)
    snp_positions_filename = remixt.config.get_filename(config, ref_data_dir, 'snp_positions')

    bam_max_fragment_length = remixt.config.get_param(config, 'bam_max_fragment_length')
    bam_max_soft_clipped = remixt.config.get_param(config, 'bam_max_soft_clipped')
    bam_check_proper_pair = remixt.config.get_param(config, 'bam_check_proper_pair')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

    workflow.transform(
        name='create_chromosome_seqdata',
        axes=('chromosome',),
        ctx={'mem': 16},
        func=remixt.seqdataio.create_chromosome_seqdata,
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            mgd.InputFile(snp_positions_filename),
            mgd.InputInstance('chromosome'),
            bam_max_fragment_length,
            bam_max_soft_clipped,
            bam_check_proper_pair,
        ),
    )

    workflow.transform(
        name='merge_seqdata',
        ctx={'mem': 16},
        func=remixt.seqdataio.merge_seqdata,
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow

def create_workflow_2(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='dofilestuff1',
        func='pypeliner.tests.tasks.do_file_stuff',
        args=(
            mgd.InputFile(input_filename),
            mgd.TempOutputFile('intermediate1'),
            'a'))

    workflow.transform(
        name='dofilestuff2',
        func='pypeliner.tests.tasks.do_file_stuff',
        args=(
            mgd.TempInputFile('intermediate1'),
            mgd.OutputFile(output_filename),
            'b'))

    return workflow

def create_align_workflow(fastq_file_1, fastq_file_2, ref_genome_fasta_file,
                          out_bam_file, threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='align',
        func=soil.wrappers.bwa.workflows.create_align_workflow,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('aligned.bam')),
        kwargs={
            'align_threads': threads,
            'sort_threads': threads
        })

    workflow.transform(
        name='mark_dups',
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('mark_dups_tmp')),
        kwargs={'threads': threads})

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ))

    return workflow

def download_external_files(config):
    download_keys = [x for x in config if 'url' in config[x]]

    urls = dict(zip(
        download_keys,
        [config[x]['url'] for x in download_keys],
    ))

    downloaded_files = dict(zip(
        urls.keys(),
        [config[x]['local_path'] for x in urls.keys()],
    ))

    workflow = Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('url', 'files'),
        value=urls,
    )

    workflow.subworkflow(
        name='download',
        func=create_download_workflow,
        axes=('files',),
        args=(
            mgd.TempInputObj('url', 'files'),
            mgd.TempOutputFile('download.file', 'files'),
        ),
    )

    workflow.transform(
        name='unzip',
        axes=('files',),
        func=tasks.unzip,
        args=(
            mgd.TempInputFile('download.file', 'files'),
            mgd.OutputFile('unzipped', 'files', fnames=downloaded_files),
        ),
    )

    return workflow

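# A hypothetical config illustrating the shape download_external_files
# expects: entries carrying a 'url' key are downloaded and unzipped to their
# 'local_path'; entries without a 'url' are skipped. All values here are
# made up for illustration.
#
#   config = {
#       'ref_genome': {
#           'url': 'http://example.org/ref.fa.gz',
#           'local_path': '/ref_data/ref.fa',
#       },
#       'already_local': {
#           'local_path': '/ref_data/annotations.gtf',
#       },
#   }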