def call_copynumber(
        samples, config, tumours, normals, breakpoints,
        titan_raw_dir, remixt_results, remixt_raw_dir,
        titan_segments, titan_params, titan_markers):
    """Build a workflow running TITAN and ReMixT copy number calling per sample.

    :param samples: sample ids to process; all per-sample maps below are
        restricted to these keys.
    :param config: pipeline config dict; reads ``globals`` and ``cna_calling``.
    :param tumours: map of sample id -> tumour bam path.
    :param normals: map of sample id -> normal bam path.
    :param breakpoints: map of sample id -> breakpoints file (ReMixT input).
    :param titan_raw_dir: per-sample template for TITAN raw output dir.
    :param remixt_results: map of sample id -> ReMixT results output path.
    :param remixt_raw_dir: per-sample template for ReMixT raw data dir.
    :param titan_segments: map of sample id -> TITAN segments output path.
    :param titan_params: map of sample id -> TITAN params output path.
    :param titan_markers: map of sample id -> TITAN markers output path.
    :return: configured pypeliner workflow.
    """
    # Restrict the per-sample output maps to the requested samples.
    # NOTE(review): tumours/normals are intentionally(?) not filtered the same
    # way -- mgd only resolves the sample ids present on the axis, so extra
    # keys are harmless, but confirm this asymmetry is deliberate.
    breakpoints = {sampid: breakpoints[sampid] for sampid in samples}
    remixt_results = {sampid: remixt_results[sampid] for sampid in samples}
    titan_segments = {sampid: titan_segments[sampid] for sampid in samples}
    titan_params = {sampid: titan_params[sampid] for sampid in samples}
    titan_markers = {sampid: titan_markers[sampid] for sampid in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments', 'sample_id', fnames=titan_segments),
            mgd.OutputFile('titan_params', 'sample_id', fnames=titan_params),
            mgd.OutputFile('titan_markers', 'sample_id', fnames=titan_markers),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('breakpoints', 'sample_id', fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results', 'sample_id', fnames=remixt_results),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    return workflow
def conversion_workflow(args):
    """Build a workflow converting per-cell microscope TIFF images to PNG.

    Reads cell ids and input image paths from ``args['input_yaml']``, writes
    one ``{cell_id}.png`` per cell into ``args['out_dir']`` plus a
    ``metadata.yaml``/``input.yaml`` pair describing the run.

    :param args: parsed command-line options; uses ``out_dir`` and ``input_yaml``.
    :return: configured pypeliner workflow.
    """
    docker = docker_containers()

    converted_dir = args["out_dir"]

    cell_ids, cfse_images, livedead_images = get_cell_images(
        args['input_yaml'])

    # One output PNG per cell, keyed on the cell_id axis.
    converted_image_template = os.path.join(converted_dir, '{cell_id}.png')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': docker['microscope_image_converter']})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    # Convert the livedead/cfse TIFF pair into a single merged PNG per cell.
    workflow.transform(
        name='convert',
        func='microscope_image_converter.tasks.convert',
        axes=('cell_id', ),
        args=(
            mgd.InputFile('livedead.tif', 'cell_id', fnames=livedead_images),
            mgd.InputFile('cfse.tif', 'cell_id', fnames=cfse_images),
            mgd.OutputFile('converted.png', 'cell_id',
                           template=converted_image_template, axes_origin=[]),
        ),
    )

    converted_meta = os.path.join(converted_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(converted_dir, 'input.yaml')

    # Write run metadata (command line, inputs, output listing) next to results.
    workflow.transform(
        name='generate_meta_files_results',
        func='microscope_image_converter.tasks.generate_and_upload_metadata',
        args=(sys.argv[0:],
              converted_dir,
              mgd.Template('converted.png', 'cell_id',
                           template=converted_image_template),
              mgd.OutputFile(converted_meta)),
        kwargs={
            'input_yaml_data': load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'cell_ids': cell_ids,
                'type': 'dlp_microscope_merged',
            }
        })

    return workflow
def alignment_workflow(args):
    """Run pre-alignment QC, alignment, and post-alignment QC per sample.

    Reads per-sample, per-lane fastq pairs from ``args['input_yaml']`` and
    writes, under ``args['out_dir']/{sample_id}/``: the aligned bam, a
    metrics csv, and fastqc/metrics tarballs. Executes the pipeline directly
    via ``pyp.run`` rather than returning a workflow.

    :param args: parsed command-line options; uses ``input_yaml``, ``out_dir``
        and ``refdir``.
    """
    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']

    # Output path templates, parameterized on sample_id.
    outputs = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics_output = os.path.join(outdir, '{sample_id}', '{sample_id}_metrics.csv.gz')
    prealignment_tar = os.path.join(outdir, '{sample_id}', '{sample_id}_fastqc.tar.gz')
    postalignment_tar = os.path.join(outdir, '{sample_id}', '{sample_id}_metrics.tar.gz')

    samples = list(inputs.keys())
    fastqs_r1, fastqs_r2 = helpers.get_fastqs(inputs, samples, None)
    sample_info = helpers.get_sample_info(inputs)

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('alignment')))

    # fastq maps are keyed on (sample_id, lane_id) tuples.
    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    # Per-lane fastqc before alignment.
    workflow.subworkflow(name="prealign",
                         func=pre_alignment.pre_alignment,
                         axes=('sample_id', 'lane_id'),
                         args=(
                             mgd.InputFile('input.r1.fastq.gz', 'sample_id',
                                           'lane_id', fnames=fastqs_r1),
                             mgd.InputFile('input.r2.fastq.gz', 'sample_id',
                                           'lane_id', fnames=fastqs_r2),
                             mgd.Template('prealignment.tar', 'sample_id',
                                          template=prealignment_tar),
                         ))

    # Alignment itself iterates lanes internally (no axes here); it merges
    # lanes down to one bam per sample.
    workflow.subworkflow(
        name="align",
        func=alignment.alignment,
        args=(
            mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id',
                          fnames=fastqs_r1, axes_origin=[]),
            mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id',
                          fnames=fastqs_r2, axes_origin=[]),
            mgd.OutputFile('output.bam', 'sample_id',
                           template=outputs, axes_origin=[]),
            args['refdir'],
            sample_info,
        ),
    )

    # Per-sample metrics on the merged bam.
    workflow.subworkflow(
        name="postalign",
        func=post_alignment.post_alignment,
        axes=('sample_id', ),
        args=(
            mgd.InputFile('output.bam', 'sample_id', template=outputs),
            mgd.OutputFile('metrics.csv.gz', 'sample_id',
                           template=metrics_output, extensions=['.yaml']),
            mgd.OutputFile('metrics.tar.gz', 'sample_id',
                           template=postalignment_tar),
            mgd.InputInstance('sample_id'),
            args['refdir'],
        ),
    )

    pyp.run(workflow)
def destruct_multi_sample_workflow(
        normal_bam, tumour_bam_files, destruct_config, config,
        destruct_ref_data_dir, breakpoints_csv, breakpoints_library_csv,
        cell_counts_csv, raw_data_dir,
        normal_sample_id='normal',
):
    """Run destruct breakpoint calling for many tumour cell libraries vs one normal.

    :param normal_bam: normal bam (managed file or path) used by every run.
    :param tumour_bam_files: map of (sample_id, library_id, cell_id) -> cell bam.
    :param destruct_config: destruct tool configuration.
    :param config: pipeline config; reads ``config['docker']['destruct']``.
    :param destruct_ref_data_dir: destruct reference data directory.
    :param breakpoints_csv: callback (sample_id, library_id) -> output csv path.
    :param breakpoints_library_csv: callback -> per-library breakpoint csv path.
    :param cell_counts_csv: callback -> per-cell read count csv path.
    :param raw_data_dir: template for per-(sample, library) raw output dirs.
    :param normal_sample_id: id tag used for the normal sample.
    :return: configured pypeliner workflow.
    """
    ctx = {'docker_image': config['docker']['destruct']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_bam_files.keys()),
    )

    # Unique (sample_id, library_id) pairs; outputs are produced per pair,
    # not per cell. Sorted for deterministic ordering.
    keys = sorted({(sample_id, library_id)
                   for (sample_id, library_id, _) in tumour_bam_files.keys()})

    # Resolve the filename callbacks into concrete per-pair path maps.
    breakpoints_csv = {key: breakpoints_csv(*key) for key in keys}
    breakpoints_library_csv = {key: breakpoints_library_csv(*key) for key in keys}
    cell_counts_csv = {key: cell_counts_csv(*key) for key in keys}

    workflow.set_filenames('tumour_cells.bam', 'sample_id', 'library_id',
                           'cell_id', fnames=tumour_bam_files)
    workflow.set_filenames('breakpoints.csv', 'sample_id', 'library_id',
                           fnames=breakpoints_csv)
    workflow.set_filenames('breakpoints_library.csv', 'sample_id', 'library_id',
                           fnames=breakpoints_library_csv)
    workflow.set_filenames('cell_counts.csv', 'sample_id', 'library_id',
                           fnames=cell_counts_csv)

    # The normal is preprocessed once and shared across all tumour runs.
    workflow.subworkflow(
        name='normal_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(
            normal_bam,
            mgd.TempOutputFile('normal_stats'),
            mgd.TempOutputFile('normal_reads_1.fastq.gz'),
            mgd.TempOutputFile('normal_reads_2.fastq.gz'),
            mgd.TempOutputFile('normal_sample_1.fastq.gz'),
            mgd.TempOutputFile('normal_sample_2.fastq.gz'),
            destruct_ref_data_dir,
            destruct_config,
        ),
    )

    # Tumour cells are preprocessed per (sample, library); tag=True keeps
    # per-cell provenance in the extracted reads.
    workflow.subworkflow(
        name='tumour_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.InputFile('tumour_cells.bam', 'sample_id', 'library_id',
                          'cell_id', extensions=['.bai']),
            mgd.TempOutputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_reads_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_reads_2.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_sample_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_sample_2.fastq.gz', 'sample_id', 'library_id'),
            destruct_ref_data_dir,
            destruct_config,
        ),
        kwargs={'tag': True})

    workflow.subworkflow(
        name='run_destruct',
        func='single_cell.workflows.destruct_singlecell.create_destruct_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.TempInputFile('normal_stats'),
            mgd.TempInputFile('normal_reads_1.fastq.gz'),
            mgd.TempInputFile('normal_reads_2.fastq.gz'),
            mgd.TempInputFile('normal_sample_1.fastq.gz'),
            mgd.TempInputFile('normal_sample_2.fastq.gz'),
            mgd.TempInputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_reads_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_reads_2.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_sample_1.fastq.gz', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_sample_2.fastq.gz', 'sample_id', 'library_id'),
            destruct_config,
            destruct_ref_data_dir,
            mgd.OutputFile('breakpoints.csv', 'sample_id', 'library_id'),
            mgd.OutputFile('breakpoints_library.csv', 'sample_id', 'library_id'),
            mgd.OutputFile('cell_counts.csv', 'sample_id', 'library_id'),
            mgd.Template(raw_data_dir, 'sample_id', 'library_id'),
        ),
        kwargs={
            'tumour_sample_id': mgd.Instance('sample_id'),
            'tumour_library_id': mgd.Instance('library_id'),
            'normal_sample_id': normal_sample_id,
        },
    )

    return workflow
def cna_calling_workflow(args):
    """Run TITAN and ReMixT copy number calling per sample and execute the pipeline.

    Reads tumour/normal bams, target lists and breakpoints from
    ``args['input_yaml']``, writes results under
    ``args['out_dir']/copynumber/{sample_id}/``.

    :param args: parsed command-line options; uses ``config_file``,
        ``input_yaml`` and ``out_dir``.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    # Materialize as a list: the keys view would stay tied to the dict and is
    # handed to pypeliner as the chunk list (matches the other workflows here).
    samples = list(tumours.keys())

    # Per-sample output layout.
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("target_list", 'sample_id', fnames=targets,
                          axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id',
                           axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id',
                           axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id',
                           axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours,
                          extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id',
                          axes_origin=[], fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id',
                           axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
def alignment_workflow(args):
    """Build the single-cell alignment workflow for one library.

    Aligns per-cell, per-lane fastq pairs into per-cell bams (genome and MT),
    produces alignment/GC/fastqc/plot metrics, and writes ``metadata.yaml``
    files for both the metrics outputs and the cell bams.

    :param args: parsed command-line options; uses ``library_id``, ``out_dir``,
        ``bams_dir``, ``trim``, ``sequencing_center`` and ``input_yaml``.
    :return: configured pypeliner workflow.
    """
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]

    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    trim = args['trim']
    center = args['sequencing_center']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    # fastq maps are keyed on (cell_id, lane) tuples.
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    # One genome bam and one mitochondrial bam per cell.
    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted(set([v[1] for v in fastq1_files.keys()]))
    cells = sorted(set([v[0] for v in fastq1_files.keys()]))

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane',
                          fnames=fastq1_files, axes_origin=[]),
            mgd.InputFile('fastq_2', 'cell_id', 'lane',
                          fnames=fastq2_files, axes_origin=[]),
            mgd.OutputFile('bam_markdups', 'cell_id',
                           template=bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups', 'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[], extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
            trim,
            center,
        ),
    )

    # Metadata for the metrics outputs.
    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              alignment_dir,
              list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment'
            }
        })

    # Metadata for the per-cell bams; the 'template' kwarg lets the task
    # reconstruct the filename pattern from cell_id chunks.
    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              bams_dir,
              mgd.Template('aligned.bam', 'cell_id',
                           template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams'
            },
            'template': (mgd.InputChunks('cell_id'),
                         bam_files_template, 'cell_id'),
        })

    return workflow
def merge_bams_workflow(args):
    """Merge per-cell bams into per-region pseudo-WGS bams.

    Splits the reference into regions, strips heavily soft-clipped reads from
    each cell bam, merges cells per region, and writes a ``metadata.yaml``
    describing the region bams.

    :param args: parsed command-line options; uses ``input_yaml``, ``out_dir``
        and ``softclipped_reads_threshold``.
    :return: configured pypeliner workflow.
    """
    config = inpututils.load_config(args)
    config = config['merge_bams']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    bam_files = inpututils.load_merge_cell_bams(args['input_yaml'])

    merge_out_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    # Region chunks come from splitting the reference genome.
    workflow.transform(
        name="get_regions",
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    # Drop reads whose soft-clipped fraction exceeds the threshold before
    # merging, per cell.
    workflow.transform(
        name="remove_softclipped_reads",
        func="single_cell.utils.pysamutils.remove_softclipped_reads",
        axes=('cell_id', ),
        args=(mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                            extensions=['.bai']),
              mgd.TempOutputFile('bam_rm_softclipped.bam', 'cell_id',
                                 extensions=['.bai']),
              args['softclipped_reads_threshold']))

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.TempInputFile('bam_rm_softclipped.bam',
                                               'cell_id',
                                               extensions=['.bai']),
                             mgd.OutputFile("merged.bam", "region",
                                            axes_origin=[],
                                            extensions=['.bai'],
                                            template=merge_out_template),
                             mgd.InputChunks("region"),
                             config,
                         ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              args['out_dir'],
              mgd.Template('bam_filenames', 'region',
                           template=merge_out_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'template': (mgd.InputChunks('region'),
                         merge_out_template, 'region'),
            'metadata': {
                'type': 'pseudowgs_regionbams',
                'cell_ids': list(bam_files.keys())
            }
        })

    return workflow
def create_topiary_workflow(hla_alleles, in_file, out_file,
                            copy_pyensembl_cache_dir=False, iedb_dir=None,
                            genome='GRCh37', predictor='netmhc',
                            pyensembl_cache_dir=None):
    """ Run topiary.

    Predicts MHC binding for peptides of lengths 8-11 around each variant,
    one topiary run per peptide length, then merges into one output table.

    Parameters
    ----------
    hla_alleles: list
        List of HLA alleles i.e. A*02:01.
    in_file: str
        Path to VCF file with variants.
    out_file: str
        Path where output will be written in tsv format.
    copy_pyensembl_cache_dir: bool
        Whether to copy the pyensembl cache locally before running.
    iedb_dir: str
        Path to a local IEDB installation; also used to filter the allele
        list to those the predictor supports.
    genome: str
        Reference genome name passed to topiary.
    predictor: str
        MHC binding predictor to use.
    pyensembl_cache_dir: str
        Path to an existing pyensembl cache directory.

    Returns
    -------
    pypeliner.workflow.Workflow
        Workflow writing the merged predictions to ``out_file``.
    """
    sandbox = soil.utils.workflow.get_sandbox([
        'topiary',
    ])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('raw_hla_alleles'), value=hla_alleles)

    # One chunk per peptide length.
    workflow.setobj(obj=mgd.OutputChunks('pep_len'), value=[8, 9, 10, 11])

    # Keep only alleles the chosen predictor supports.
    workflow.transform(name='filter_hla_alleles',
                       func=tasks.filter_hla_alleles,
                       args=(mgd.TempInputObj('raw_hla_alleles'), ),
                       kwargs={
                           'iedb_dir': iedb_dir,
                           'predictor': predictor,
                       },
                       ret=mgd.TempOutputObj('hla_alleles'))

    workflow.transform(name='run_topiary',
                       axes=('pep_len', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.run_topiary,
                       args=(mgd.TempInputObj('hla_alleles'),
                             mgd.InputFile(in_file),
                             mgd.TempOutputFile('raw.tsv', 'pep_len')),
                       kwargs={
                           'copy_pyensembl_cache_dir': copy_pyensembl_cache_dir,
                           'iedb_dir': iedb_dir,
                           'genome': genome,
                           'peptide_length': mgd.Template('{pep_len}', 'pep_len'),
                           'predictor': predictor,
                           'pyensembl_cache_dir': pyensembl_cache_dir
                       })

    # axes=() merges the per-length results into a single output file.
    workflow.transform(name='reformat_output',
                       axes=(),
                       func=tasks.reformat_output,
                       args=(mgd.TempInputFile('raw.tsv', 'pep_len'),
                             mgd.OutputFile(out_file)))

    return workflow
def wgs_workflow(args):
    """Top-level WGS pipeline: optional alignment, variant calling, breakpoint
    calling, and copy number calling (TITAN + ReMixT) per tumour/normal pair.

    Reads inputs from ``args['input_yaml']``, writes under ``args['out_dir']``
    into ``alignment/``, ``variants/``, ``breakpoints/`` and ``copynumber/``
    subdirectories, then executes the pipeline via ``pyp.run``.

    :param args: parsed command-line options; uses ``config_file``,
        ``input_yaml``, ``out_dir`` and ``alignment``.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    # Materialize as a list: a keys view would stay tied to the dict; the
    # other workflows in this file consistently use list(...).
    samples = list(tumours.keys())

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if args['alignment']:
        tumour_fastqs_r1, tumour_fastqs_r2 = get_fastqs(inputs, samples, 'tumour')
        normal_fastqs_r1, normal_fastqs_r2 = get_fastqs(inputs, samples, 'normal')

        normal_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{norm_sample_id}', '{norm_lane}', 'normal'
        )
        tumour_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{tum_sample_id}', '{tum_lane}', 'tumour'
        )

        # Produces the tumour/normal bams consumed by every stage below.
        workflow.subworkflow(
            name='wgs_alignment_paired_lanes',
            func=paired_alignment,
            args=(
                config,
                mgd.OutputFile("tumour.bam", 'sample_id', fnames=tumours,
                               extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile("normal.bam", 'sample_id', fnames=normals,
                               extensions=['.bai'], axes_origin=[]),
                samples,
                tumour_fastqs_r1,
                tumour_fastqs_r2,
                normal_fastqs_r1,
                normal_fastqs_r2,
                normal_alignment_template,
                tumour_alignment_template,
            )
        )

    # Variant calling outputs, per sample.
    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(museq_dir, '{sample_id}', 'paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(museq_dir, '{sample_id}', 'single_museqportrait.pdf')

    workflow.subworkflow(
        name='variant_calling',
        func=call_variants,
        args=(
            samples,
            config,
            mgd.OutputFile('parsed_snv_csv', 'sample_id',
                           template=parsed_snv_csv, axes_origin=[]),
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq', 'sample_id',
                           template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss', 'sample_id',
                           template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv', 'sample_id',
                           template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel', 'sample_id',
                           template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id',
                           template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id',
                           template=museq_single_pdf, axes_origin=[]),
        )
    )

    # Breakpoint calling outputs, per sample.
    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(sv_outdir, 'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')

    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id',
                           template=destruct_raw_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_raw_library', 'sample_id',
                           template=destruct_raw_library, axes_origin=[]),
            mgd.OutputFile('destruct_breakpoints', 'sample_id',
                           template=destruct_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_library', 'sample_id',
                           template=destruct_library, axes_origin=[]),
            mgd.OutputFile('destruct_reads', 'sample_id',
                           template=destruct_reads, axes_origin=[]),
            mgd.OutputFile('lumpy_vcf', 'sample_id',
                           template=lumpy_vcf, axes_origin=[]),
            mgd.OutputFile('parsed_csv', 'sample_id',
                           template=parsed_csv, axes_origin=[])
        )
    )

    # Copy number outputs, per sample.
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours,
                          extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile("target_list", 'sample_id', fnames=targets,
                          axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id',
                           axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id',
                           axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id',
                           axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours,
                          extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id',
                          axes_origin=[], template=destruct_breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id',
                           axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
def split_bam_workflow(args):
    """Split a WGS bam into per-region bams and write output metadata.

    :param args: parsed command-line options; uses ``input_yaml`` and
        ``out_dir``.
    :return: configured pypeliner workflow.
    """
    config = inpututils.load_config(args)
    config = config['split_bam']

    bam_file = inpututils.load_split_wgs_input(args['input_yaml'])

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    # Region list is computed from the reference and carried as a temp obj
    # (not axis chunks) -- the split subworkflow consumes it directly.
    workflow.transform(
        name="get_regions",
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1
        },
        args=(
            mgd.InputFile(bam_file),
            mgd.OutputFile("normal.split.bam", 'region',
                           template=split_bam_template, axes_origin=[]),
            pypeliner.managed.TempInputObj('region'),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:],
              args['out_dir'],
              mgd.Template('bam_filenames', 'region',
                           template=split_bam_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'wgs_regionbams'
            },
            'template': (mgd.TempInputObj('region'),
                         split_bam_template, 'region'),
        })

    return workflow
def paired_alignment(config, tumours, normals, samples, tumour_fastqs_r1,
                     tumour_fastqs_r2, normal_fastqs_r1, normal_fastqs_r2,
                     outdir_template_normal, outdir_template_tumour):
    """Align tumour and normal fastqs per lane, then merge lanes per sample.

    :param config: pipeline config; reads ``globals`` and ``alignment``.
    :param tumours: map of sample id -> tumour output bam path.
    :param normals: map of sample id -> normal output bam path.
    :param samples: sample ids to process.
    :param tumour_fastqs_r1: map (sample_id, lane) -> tumour R1 fastq.
    :param tumour_fastqs_r2: map (sample_id, lane) -> tumour R2 fastq.
    :param normal_fastqs_r1: map (sample_id, lane) -> normal R1 fastq.
    :param normal_fastqs_r2: map (sample_id, lane) -> normal R2 fastq.
    :param outdir_template_normal: per-(sample, lane) normal output dir template.
    :param outdir_template_tumour: per-(sample, lane) tumour output dir template.
    :return: configured pypeliner workflow.
    """
    # Restrict the output maps to the requested samples and derive the
    # matching .bai index paths.
    tumours = {sampid: tumours[sampid] for sampid in samples}
    normals = {sampid: normals[sampid] for sampid in samples}
    tumours_index = {sampid: tumours[sampid] + '.bai' for sampid in samples}
    normals_index = {sampid: normals[sampid] + '.bai' for sampid in samples}

    workflow = pypeliner.workflow.Workflow()

    global_config = config['globals']
    config = config['alignment']

    # Materialize key views as lists before handing them to pypeliner,
    # matching the convention used by the other workflows in this file.
    workflow.setobj(
        obj=mgd.OutputChunks('tum_sample_id', 'tum_lane'),
        value=list(tumour_fastqs_r1.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('norm_sample_id', 'norm_lane'),
        value=list(normal_fastqs_r1.keys()),
    )

    workflow.subworkflow(
        name='align_tumours',
        func=alignment.align_sample,
        axes=('tum_sample_id', 'tum_lane'),
        args=(config,
              mgd.InputFile('input.r1.fastq.gz', 'tum_sample_id', 'tum_lane',
                            fnames=tumour_fastqs_r1),
              mgd.InputFile('input.r2.fastq.gz', 'tum_sample_id', 'tum_lane',
                            fnames=tumour_fastqs_r2),
              mgd.TempOutputFile('tumour.bam', 'tum_sample_id', 'tum_lane'),
              mgd.Template(outdir_template_tumour, 'tum_sample_id', 'tum_lane'),
              [
                  mgd.InputInstance('tum_sample_id'),
                  mgd.InputInstance('tum_lane')
              ]),
    )

    workflow.transform(name='merge_tumour_lanes',
                       ctx={
                           'mem': global_config['memory']['med'],
                           'ncpus': 1
                       },
                       func="wgs.workflows.alignment.tasks.merge_bams",
                       axes=('tum_sample_id', ),
                       args=(mgd.TempInputFile('tumour.bam', 'tum_sample_id',
                                               'tum_lane'),
                             mgd.OutputFile('output.bam', 'tum_sample_id',
                                            fnames=tumours),
                             mgd.OutputFile('output.bam.bai', 'tum_sample_id',
                                            fnames=tumours_index),
                             None))

    workflow.subworkflow(
        name='align_normals',
        func=alignment.align_sample,
        axes=('norm_sample_id', 'norm_lane'),
        args=(config,
              mgd.InputFile('input.r1.fastq.gz', 'norm_sample_id', 'norm_lane',
                            fnames=normal_fastqs_r1),
              mgd.InputFile('input.r2.fastq.gz', 'norm_sample_id', 'norm_lane',
                            fnames=normal_fastqs_r2),
              mgd.TempOutputFile('normal.bam', 'norm_sample_id', 'norm_lane'),
              mgd.Template(outdir_template_normal, 'norm_sample_id', 'norm_lane'),
              [
                  mgd.InputInstance('norm_sample_id'),
                  mgd.InputInstance('norm_lane')
              ]),
    )

    workflow.transform(name='merge_normal_lanes',
                       ctx={
                           'mem': global_config['memory']['med'],
                           'ncpus': 1
                       },
                       func="wgs.workflows.alignment.tasks.merge_bams",
                       axes=('norm_sample_id', ),
                       args=(mgd.TempInputFile('normal.bam', 'norm_sample_id',
                                               'norm_lane'),
                             mgd.OutputFile('output.bam', 'norm_sample_id',
                                            fnames=normals),
                             mgd.OutputFile('output.bam.bai', 'norm_sample_id',
                                            fnames=normals_index),
                             None))

    return workflow
def variant_calling_multi_sample_workflow(
        config, normal_wgs_bam, tumour_cell_bams, varcall_dir, museq_vcf,
        strelka_snvs, strelka_indels, museq_csv, strelka_csv, cosmic_csv,
        dbsnp_csv, mappability_csv, snpeff_csv, trinuc_csv, snv_counts):
    """Call and annotate SNVs per (sample, library) against a shared normal.

    Splits the normal (cell bams are merged, a wgs bam is split) and the
    tumour cells into region bams, runs variant calling per (sample, library),
    merges museq/strelka VCFs across all pairs, then counts variant alleles
    per cell.

    :param config: pipeline config; reads ``split_bam``, ``merge_bams``,
        ``variant_calling`` and ``multi_sample`` sections.
    :param normal_wgs_bam: either a single wgs bam path or a map of
        normal cell id -> bam (then cells are merged per region).
    :param tumour_cell_bams: map (sample_id, library_id, cell_id) -> cell bam.
    :param varcall_dir: output directory for raw data and region bams.
    :param museq_vcf ... snv_counts: callbacks (sample_id, library_id) -> output
        path for the corresponding result file.
    :return: configured pypeliner workflow.
    """
    # Unique (sample_id, library_id) pairs; outputs are per pair, not per cell.
    keys = sorted({(sample_id, library_id)
                   for (sample_id, library_id, _) in tumour_cell_bams.keys()})

    def _resolve(make_path):
        # Resolve a filename callback into a concrete per-pair path map.
        return {key: make_path(*key) for key in keys}

    museq_vcf = _resolve(museq_vcf)
    strelka_snvs = _resolve(strelka_snvs)
    strelka_indels = _resolve(strelka_indels)
    museq_csv = _resolve(museq_csv)
    strelka_csv = _resolve(strelka_csv)
    cosmic_csv = _resolve(cosmic_csv)
    dbsnp_csv = _resolve(dbsnp_csv)
    mappability_csv = _resolve(mappability_csv)
    snpeff_csv = _resolve(snpeff_csv)
    trinuc_csv = _resolve(trinuc_csv)
    snv_counts = _resolve(snv_counts)

    variant_calling_raw_data_template = os.path.join(
        varcall_dir, 'variant_calling_rawdata',
        '{sample_id}_{library_id}_variant_calling')

    normal_region_bam_template = os.path.join(varcall_dir,
                                              'normal_region_bams',
                                              'normal_{region}.bam')
    tumour_region_bam_template = os.path.join(
        varcall_dir, 'tumour_region_bams',
        '{sample_id}_{library_id}_{region}.bam')

    vcftools_image = {
        'docker_image': config['variant_calling']['docker']['vcftools']
    }

    workflow = pypeliner.workflow.Workflow(default_ctx={
        'docker_image': config['multi_sample']['docker']['single_cell_pipeline']
    })

    workflow.transform(name='get_regions',
                       ret=mgd.TempOutputObj("get_regions"),
                       func=refgenome.get_split_regions,
                       args=(config["split_bam"]["split_size"],
                             config["split_bam"]["ref_genome"]))

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=mgd.TempInputObj('get_regions'),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'region'),
        axes=(
            'sample_id',
            'library_id',
        ),
        value=mgd.TempInputObj('get_regions'),
    )

    if isinstance(normal_wgs_bam, dict):
        # Normal given as per-cell bams: merge the cells into region bams.
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_wgs_bam.keys()),
        )
        workflow.set_filenames('normal_cells.bam', 'normal_cell_id',
                               fnames=normal_wgs_bam)
        workflow.subworkflow(name="merge_normal_cells",
                             func=merge_bams.create_merge_bams_workflow,
                             args=(
                                 mgd.InputFile('normal_cells.bam',
                                               'normal_cell_id',
                                               extensions=['.bai']),
                                 mgd.OutputFile(
                                     'normal_regions.bam', 'region',
                                     axes_origin=[],
                                     extensions=['.bai'],
                                     template=normal_region_bam_template),
                                 mgd.TempInputObj('get_regions'),
                                 config['merge_bams'],
                             ))
    else:
        # Normal given as one wgs bam: split it into region bams.
        workflow.subworkflow(name="split_normal",
                             func=split_bams.create_split_workflow,
                             args=(
                                 mgd.InputFile(normal_wgs_bam,
                                               extensions=['.bai']),
                                 mgd.OutputFile(
                                     'normal_regions.bam', 'region',
                                     extensions=['.bai'],
                                     axes_origin=[],
                                     template=normal_region_bam_template),
                                 # FIX: the regions temp obj is named
                                 # 'get_regions' (see the transform above and
                                 # the merge branch); the original referenced a
                                 # nonexistent temp obj named 'region'.
                                 pypeliner.managed.TempInputObj('get_regions'),
                                 config['split_bam'],
                             ),
                             kwargs={"by_reads": False})

    workflow.subworkflow(name="split_merge_tumour",
                         axes=(
                             'sample_id',
                             'library_id',
                         ),
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.InputFile('tumour_all_cells.bam', 'sample_id',
                                           'library_id', 'cell_id',
                                           fnames=tumour_cell_bams,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.OutputFile(
                                 'tumour_regions.bam', 'sample_id',
                                 'library_id', 'region',
                                 axes_origin=[],
                                 extensions=['.bai'],
                                 template=tumour_region_bam_template),
                             mgd.TempInputObj('get_regions'),
                             config['merge_bams'],
                         ))

    workflow.subworkflow(
        name='variant_calling',
        func=create_variant_calling_workflow,
        axes=(
            'sample_id',
            'library_id',
        ),
        args=(
            mgd.InputFile('tumour_all_cells.bam', 'sample_id', 'library_id',
                          'cell_id', extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.InputFile('tumour_regions.bam', 'sample_id', 'library_id',
                          'region', extensions=['.bai'],
                          template=tumour_region_bam_template),
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai'],
                          template=normal_region_bam_template),
            mgd.OutputFile('museq.vcf', 'sample_id', 'library_id',
                           extensions=['.tbi', '.csi'], fnames=museq_vcf),
            mgd.OutputFile('strelka_snv.vcf', 'sample_id', 'library_id',
                           extensions=['.tbi', '.csi'], fnames=strelka_snvs),
            mgd.OutputFile('strelka_indel.vcf', 'sample_id', 'library_id',
                           extensions=['.tbi', '.csi'], fnames=strelka_indels),
            mgd.OutputFile('museq.csv.gz', 'sample_id', 'library_id',
                           extensions=['.yaml'], fnames=museq_csv),
            mgd.OutputFile('strelka.csv.gz', 'sample_id', 'library_id',
                           extensions=['.yaml'], fnames=strelka_csv),
            mgd.OutputFile('cosmic.csv.gz', 'sample_id', 'library_id',
                           extensions=['.yaml'], fnames=cosmic_csv),
            mgd.OutputFile('dbsnp.csv.gz', 'sample_id', 'library_id',
                           extensions=['.yaml'], fnames=dbsnp_csv),
            mgd.OutputFile('mappability.csv.gz', 'sample_id', 'library_id',
                           extensions=['.yaml'], fnames=mappability_csv),
            mgd.OutputFile('snpeff.csv.gz', 'sample_id', 'library_id',
                           extensions=['.yaml'], fnames=snpeff_csv),
            mgd.OutputFile('trinuc.csv.gz', 'sample_id', 'library_id',
                           extensions=['.yaml'], fnames=trinuc_csv),
            config['variant_calling'],
            mgd.Template(variant_calling_raw_data_template,
                         'sample_id', 'library_id'),
        ),
    )

    # Merge the per-pair VCFs into one file per caller for counting.
    workflow.transform(
        name='merge_museq_snvs',
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.InputFile('museq.vcf', 'sample_id', 'library_id',
                          axes_origin=[], extensions=['.tbi', '.csi'],
                          fnames=museq_vcf),
            mgd.TempOutputFile('museq.vcf.gz', extensions=['.tbi', '.csi']),
        ),
        kwargs={
            'allow_overlap': True,
            'docker_config': vcftools_image,
        },
    )

    workflow.transform(
        name='merge_strelka_snvs',
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.InputFile('strelka_snv.vcf', 'sample_id', 'library_id',
                          axes_origin=[], extensions=['.tbi', '.csi'],
                          fnames=strelka_snvs),
            mgd.TempOutputFile('strelka_snv.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
        kwargs={
            'allow_overlap': True,
            'docker_config': vcftools_image,
        },
    )

    # Count supporting reads for the merged call set in each pair's cells.
    workflow.subworkflow(
        name='variant_counting',
        func=create_variant_counting_workflow,
        axes=(
            'sample_id',
            'library_id',
        ),
        args=(
            [
                mgd.TempInputFile('museq.vcf.gz',
                                  extensions=['.tbi', '.csi']),
                mgd.TempInputFile('strelka_snv.vcf.gz',
                                  extensions=['.tbi', '.csi']),
            ],
            mgd.InputFile('tumour_all_cells.bam', 'sample_id', 'library_id',
                          'cell_id', extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.OutputFile('snv_counts.csv.gz', 'sample_id', 'library_id',
                           fnames=snv_counts),
            config['variant_calling'],
        ),
    )

    return workflow
def alignment_workflow(args):
    """Build and run the per-sample fastq alignment pipeline.

    Reads the input yaml for per-sample/per-lane fastq pairs, aligns each
    sample via the ``align_samples`` subworkflow, then records a metadata
    yaml describing all produced files. Runs the pipeline; returns None.
    """
    input_data = helpers.load_yaml(args['input_yaml'])
    out_dir = args['out_dir']

    # File/template paths: '{sample_id}' placeholders are expanded per sample.
    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')
    bam_template = os.path.join(out_dir, '{sample_id}', '{sample_id}.bam')
    tdf_template = os.path.join(out_dir, '{sample_id}', '{sample_id}.bam.tdf')
    metrics_csv_template = os.path.join(out_dir, '{sample_id}', '{sample_id}_metrics.csv')
    metrics_tar_template = os.path.join(out_dir, '{sample_id}', '{sample_id}_metrics.tar.gz')

    sample_ids = list(input_data.keys())
    fastqs_r1, fastqs_r2 = helpers.get_fastqs(input_data, sample_ids, None)
    sample_info = helpers.get_sample_info(input_data)

    pipeline = pypeliner.app.Pypeline(config=args)
    wf = pypeliner.workflow.Workflow()

    # Chunks are keyed by (sample_id, lane_id) pairs taken from the R1 fastqs.
    wf.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    wf.subworkflow(
        name="align_samples",
        func=alignment.align_samples,
        args=(
            mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r1),
            mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r2),
            mgd.Template('output.bam', 'sample_id', template=bam_template),
            mgd.Template('metrics.txt', 'sample_id', template=metrics_csv_template),
            mgd.Template('metrics.tar', 'sample_id', template=metrics_tar_template),
            mgd.Template('output.bam.tdf', 'sample_id', template=tdf_template),
            sample_info,
            args['refdir'],
        ),
        kwargs={
            'single_node': args['single_node'],
            'picard_mem': args['picard_mem'],
        },
    )

    # Expand every per-sample template into concrete filenames for metadata.
    result_files = helpers.expand_list(
        [bam_template, tdf_template, metrics_csv_template, metrics_tar_template],
        sample_ids,
        "sample_id",
    )

    wf.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            out_dir,
            result_files,
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': input_data,
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'alignment'},
        },
    )

    pipeline.run(wf)
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells. """
    vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input(
        args['input_yaml'])

    out_dir = args['out_dir']

    # Per-(sample, library) counts file; the relative template is also
    # recorded in the output metadata below.
    counts_template = '{sample_id}_{library_id}_counts.csv.gz'
    counts_output_template = os.path.join(out_dir, counts_template)
    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    # Only the variant-calling section of the config is needed here.
    config = inpututils.load_config(args)['variant_calling']

    wf = pypeliner.workflow.Workflow()

    # Chunks are keyed by (sample_id, library_id, cell_id) from the cell bams.
    wf.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    # Merge all input vcfs into a single target set of SNV positions.
    merge_inputs = [mgd.InputFile(vcf_file) for vcf_file in vcf_files]
    wf.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=(
            merge_inputs,
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("merge_vcf_temp"),
        ),
    )

    # Count alleles per cell at the merged SNV positions, one subworkflow
    # per (sample_id, library_id) pair.
    wf.subworkflow(
        name='count_alleles',
        axes=('sample_id', 'library_id'),
        func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            mgd.InputFile(
                'tumour_cells.bam', 'sample_id', 'library_id', 'cell_id',
                extensions=['.bai'], fnames=tumour_cell_bams, axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(
                'counts.csv.gz', 'sample_id', 'library_id',
                template=counts_output_template),
            mgd.Instance('sample_id'),
            mgd.Instance('library_id'),
            config['memory'],
        ),
    )

    wf.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            out_dir,
            mgd.Template(
                'counts.csv.gz', 'sample_id', 'library_id',
                template=counts_output_template),
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping',
                'counts': {
                    'template': counts_template,
                    'instances': sample_library,
                },
            },
        },
    )

    return wf
def copynumber_calling_workflow(args):
    """Build and run the copy-number calling pipeline (TITAN and/or ReMixT).

    Reads tumour/normal bam pairs (plus target lists and breakpoints) from the
    input yaml, runs the requested caller subworkflow(s) per sample, and
    writes a metadata yaml covering all produced files. Runs the pipeline;
    returns None.
    """
    pyp = pypeliner.app.Pypeline(config=args)

    run_titan = args['titan']
    run_remixt = args['remixt']
    # If neither caller was requested explicitly, run both.
    if not run_titan and not run_remixt:
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    # Per-sample input files keyed by sample id.
    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    # TITAN output path templates ('{sample_id}' expanded per sample).
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_outfile = os.path.join(titan_raw_dir, '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir, '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir, '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(titan_raw_dir, '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    # ReMixT output path templates.
    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')
    remixt_raw_dir = os.path.join(remixt_outdir, '{sample_id}_raw_dir')
    remixt_brk_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_stats.csv.gz')

    # NOTE(review): `config` here appears to be a module-level config helper
    # (not the local variable name used in sibling workflows) — confirm import.
    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id', fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5', 'sample_id', template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv', 'sample_id', template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv', 'sample_id', template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv', 'sample_id', template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv', 'sample_id', template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv', 'sample_id', template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv', 'sample_id', template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                mgd.Template('rawdir', 'sample_id', template=remixt_raw_dir),
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']}
        )

    if run_titan:
        workflow.subworkflow(
            name='titan',
            func=titan.create_titan_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai']),
                mgd.InputFile("target_list", 'sample_id', fnames=targets),
                mgd.OutputFile('outfile', 'sample_id', template=titan_outfile),
                mgd.OutputFile('params', 'sample_id', template=titan_params),
                mgd.OutputFile('segs', 'sample_id', template=titan_segs),
                mgd.OutputFile('igv_segs', 'sample_id', template=titan_igv_segs),
                mgd.OutputFile('parsed', 'sample_id', template=titan_parsed),
                mgd.OutputFile('plots', 'sample_id', template=titan_plots),
                mgd.OutputFile('tar_outputs', 'sample_id', template=titan_tar_outputs),
                mgd.OutputFile('museq.vcf', 'sample_id', template=museq_vcf),
                mgd.InputInstance('sample_id'),
                refdir_paths['reference'],
                chromosomes,
                refdir_paths['het_positions_titan'],
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf'],
            ),
            kwargs={'single_node': args['single_node']}
        )

    # Collect the templates for every file the enabled caller(s) produced,
    # so the metadata step can list the concrete per-sample outputs.
    filenames = []
    if run_remixt:
        filenames += [
            remixt_outfile,
            remixt_raw_dir,
            remixt_brk_cn_csv,
            remixt_cn_csv,
            remixt_minor_modes_csv,
            remixt_mix_csv,
            remixt_read_depth_csv,
            remixt_stats_csv
        ]
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'copynumber_calling'}
        }
    )

    pyp.run(workflow)