def realign_bam_workflow(args):
    """Build and run the per-sample BAM realignment pipeline.

    Reads the sample -> input-bam mapping from ``args['input_yaml']``, runs
    the ``realign_bams`` subworkflow for every sample, and finishes by
    writing a metadata.yaml describing the produced files.

    :param args: parsed pipeline options; this function reads 'out_dir',
        'input_yaml', 'refdir' and 'single_node'.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    # Fix: close the yaml file deterministically instead of leaking the
    # handle (previously an anonymous open() was passed to safe_load).
    with open(args['input_yaml']) as input_yaml_file:
        yamldata = yaml.safe_load(input_yaml_file)

    samples = list(yamldata.keys())
    input_bams = {sample: yamldata[sample]['input'] for sample in samples}

    # Per-sample output templates; '{sample_id}' is expanded by pypeliner.
    output_bams = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics = os.path.join(outdir, '{sample_id}', '{sample_id}.txt')
    metrics_tar = os.path.join(outdir, '{sample_id}', '{sample_id}.tar')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name="realign",
        func=realign_bams,
        ctx=helpers.get_default_ctx(),
        args=(
            samples,
            mgd.InputFile("input.bam", 'sample_id', fnames=input_bams,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.bam", 'sample_id', template=output_bams,
                           extensions=['.bai', '.tdf'], axes_origin=[]),
            # NOTE(review): declaring a '.bai' companion for the .txt and
            # .tar outputs looks copy-pasted from the bam outputs — confirm
            # realign_bams really writes these sidecar files.
            mgd.OutputFile("realigned.txt", 'sample_id', template=metrics,
                           extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.tar", 'sample_id', template=metrics_tar,
                           extensions=['.bai'], axes_origin=[]),
            args['refdir'],
        ),
        kwargs={'single_node': args['single_node']}
    )

    outputted_filenames = helpers.expand_list(
        [output_bams, metrics, metrics_tar], samples, 'sample_id')

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'realignment'}
        }
    )

    pyp.run(workflow)
def breakpoint_calling_workflow(args):
    """Build and run the structural-variant (breakpoint) calling pipeline.

    Per tumour/normal pair: runs destruct and lumpy (and optionally svaba),
    then a consensus-calling subworkflow over the destruct and lumpy
    results, and finally records pipeline metadata.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml')
    input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml')

    # sample_id -> bam path maps; samples are keyed off the tumour map.
    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    # Per-sample output templates; '{sample_id}' is expanded by pypeliner.
    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz')
    destruct_library = os.path.join(sv_outdir, '{sample_id}_destruct_library.csv.gz')
    destruct_raw_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz')
    destruct_raw_library = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_library.csv.gz')
    destruct_reads = os.path.join(sv_outdir, '{sample_id}_destruct_reads.csv.gz')
    lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, '{sample_id}_filtered_consensus_calls.csv.gz')
    svaba_vcf = os.path.join(sv_outdir, '{sample_id}_svaba.vcf')

    single_node = args['single_node']

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='destruct',
        func=destruct_wgs.create_destruct_wgs_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                            extensions=['.bai'], axes_origin=[]),
              mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                            extensions=['.bai'], axes_origin=[]),
              mgd.OutputFile('destruct_raw_breakpoints', 'sample_id',
                             template=destruct_raw_breakpoints),
              mgd.OutputFile('destruct_raw_library', 'sample_id',
                             template=destruct_raw_library),
              mgd.OutputFile('destruct_breakpoints', 'sample_id',
                             template=destruct_breakpoints),
              mgd.OutputFile('destruct_library',
                             'sample_id', template=destruct_library),
              mgd.OutputFile('destruct_reads', 'sample_id',
                             template=destruct_reads),
              mgd.InputInstance('sample_id'),
              refdir_paths['reference'],
              refdir_paths['refdata_destruct'],
              refdir_paths['gtf'],
              refdir_paths['blacklist_destruct']),
        kwargs={'single_node': single_node})

    workflow.subworkflow(
        name='lumpy',
        func=lumpy.create_lumpy_workflow,
        axes=('sample_id', ),
        args=(mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf), ),
        kwargs={
            'tumour_bam': mgd.InputFile("tumour.bam", 'sample_id',
                                        fnames=tumours, extensions=['.bai'],
                                        axes_origin=[]),
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id',
                                        fnames=normals, extensions=['.bai'],
                                        axes_origin=[]),
            'single_node': single_node
        },
    )

    # svaba is optional; it does not feed the consensus step below.
    if args['svaba']:
        workflow.subworkflow(
            name='svaba',
            func=svaba.create_svaba_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                              extensions=['.bai'], axes_origin=[]),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                              extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('svaba_vcf', 'sample_id', template=svaba_vcf),
                refdir_paths['reference'],
            ),
        )

    # Consensus over destruct + lumpy outputs only.
    workflow.subworkflow(
        name="consensus_calling",
        func=breakpoint_calling_consensus.create_consensus_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile('destruct_breakpoints', 'sample_id',
                            template=destruct_breakpoints),
              mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
              mgd.OutputFile('consensus_calls', 'sample_id',
                             template=parsed_csv, extensions=['.yaml']),
              chromosomes),
    )

    filenames = [
        destruct_breakpoints,
        destruct_library,
        destruct_raw_breakpoints,
        destruct_raw_library,
        destruct_reads,
        lumpy_vcf,
        parsed_csv
    ]
    if args['svaba']:
        filenames.append(svaba_vcf)

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    # NOTE(review): func is passed as a callable here, unlike the dotted
    # string 'wgs.utils.helpers.generate_and_upload_metadata' used by the
    # other workflows in this module — confirm this is intentional.
    workflow.transform(name='generate_meta_files_results',
                       func=helpers.generate_and_upload_metadata,
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                               helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'breakpoint_calling'
                           }
                       })

    pyp.run(workflow)
def variant_calling_workflow(args):
    """Build and run the variant-calling pipeline.

    If every sample has a tumour bam, runs the full paired tumour/normal
    ``call_variants`` subworkflow; otherwise falls back to
    ``call_germlines_only`` on the normals. Finishes by recording pipeline
    metadata for the files the chosen branch actually produces.
    """
    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    # Per-sample output templates; '{sample_id}' is expanded by pypeliner.
    var_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_museq_single_annotated.vcf.gz')
    samtools_germline_vcf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_samtools_germline.vcf.gz')
    samtools_roh = os.path.join(var_dir, '{sample_id}', '{sample_id}_roh.csv')
    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_single_museqportrait.pdf')

    somatic_csv = os.path.join(var_dir, '{sample_id}',
                               '{sample_id}_consensus_somatic.csv.gz')
    somatic_snpeff = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_consensus_somatic_snpeff.csv.gz')
    somatic_ma = os.path.join(var_dir, '{sample_id}',
                              '{sample_id}_consensus_somatic_ma.csv.gz')
    somatic_ids = os.path.join(var_dir, '{sample_id}',
                               '{sample_id}_consensus_somatic_ids.csv.gz')

    indel_csv = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel.csv.gz')
    indel_snpeff = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_indel_snpeff.csv.gz')
    indel_ma = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_ma.csv.gz')
    indel_ids = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_ids.csv.gz')

    germline_csv = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_germline.csv.gz')
    germline_snpeff = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_germline_snpeff.csv.gz')
    germline_ma = os.path.join(var_dir, '{sample_id}',
                               '{sample_id}_germline_ma.csv.gz')
    germline_ids = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_germline_ids.csv.gz')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    # Germline-only mode when any sample lacks a tumour bam.
    germline_only = not all(tumours.values())

    if germline_only:
        workflow.subworkflow(
            name='variant_calling',
            func=call_germlines_only,
            args=(samples,
                  mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                                extensions=['.bai'], axes_origin=[]),
                  mgd.OutputFile('museq_ss', 'sample_id',
                                 template=museq_ss_vcf, axes_origin=[]),
                  mgd.OutputFile('samtools_germline', 'sample_id',
                                 template=samtools_germline_vcf,
                                 axes_origin=[]),
                  mgd.OutputFile('samtools_roh', 'sample_id',
                                 template=samtools_roh, axes_origin=[]),
                  mgd.OutputFile('museq_single_pdf', 'sample_id',
                                 template=museq_single_pdf, axes_origin=[]),
                  args['refdir']),
            kwargs={'single_node': args['single_node']})
    else:
        workflow.subworkflow(
            name='variant_calling',
            func=call_variants,
            args=(
                samples,
                mgd.OutputFile('somatic_csv', 'sample_id',
                               template=somatic_csv, axes_origin=[]),
                mgd.OutputFile('somatic_snpeff', 'sample_id',
                               template=somatic_snpeff, axes_origin=[]),
                mgd.OutputFile('somatic_ma', 'sample_id',
                               template=somatic_ma, axes_origin=[]),
                mgd.OutputFile('somatic_ids', 'sample_id',
                               template=somatic_ids, axes_origin=[]),
                mgd.OutputFile('indel_csv', 'sample_id',
                               template=indel_csv, axes_origin=[]),
                mgd.OutputFile('indel_snpeff', 'sample_id',
                               template=indel_snpeff, axes_origin=[]),
                mgd.OutputFile('indel_ma', 'sample_id',
                               template=indel_ma, axes_origin=[]),
                mgd.OutputFile('indel_ids', 'sample_id',
                               template=indel_ids, axes_origin=[]),
                mgd.OutputFile('germline_csv', 'sample_id',
                               template=germline_csv, axes_origin=[]),
                mgd.OutputFile('germline_snpeff', 'sample_id',
                               template=germline_snpeff, axes_origin=[]),
                mgd.OutputFile('germline_ma', 'sample_id',
                               template=germline_ma, axes_origin=[]),
                mgd.OutputFile('germline_ids', 'sample_id',
                               template=germline_ids, axes_origin=[]),
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                              extensions=['.bai'], axes_origin=[]),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                              extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('museq', 'sample_id',
                               template=museq_vcf, axes_origin=[]),
                mgd.OutputFile('museq_ss', 'sample_id',
                               template=museq_ss_vcf, axes_origin=[]),
                mgd.OutputFile('samtools_germline', 'sample_id',
                               template=samtools_germline_vcf, axes_origin=[]),
                mgd.OutputFile('roh_calls', 'sample_id',
                               template=samtools_roh, axes_origin=[]),
                mgd.OutputFile('strelka_snv', 'sample_id',
                               template=strelka_snv_vcf, axes_origin=[]),
                mgd.OutputFile('strelka_indel', 'sample_id',
                               template=strelka_indel_vcf, axes_origin=[]),
                mgd.OutputFile('museq_paired_pdf', 'sample_id',
                               template=museq_paired_pdf, axes_origin=[]),
                mgd.OutputFile('museq_single_pdf', 'sample_id',
                               template=museq_single_pdf, axes_origin=[]),
                args['refdir'],
            ),
            kwargs={
                'single_node': args['single_node'],
                'is_exome': args['is_exome'],
            })

    # Fix: only record the files the chosen branch actually produces.
    # Previously the full somatic/paired file list was written into the
    # metadata even in germline-only mode.
    if germline_only:
        filenames = [
            museq_ss_vcf, samtools_germline_vcf, samtools_roh,
            museq_single_pdf
        ]
    else:
        filenames = [
            somatic_csv, somatic_snpeff, somatic_ma, somatic_ids, indel_csv,
            indel_snpeff, indel_ma, indel_ids, germline_csv, germline_snpeff,
            germline_ma, germline_ids, museq_vcf, museq_ss_vcf,
            strelka_snv_vcf, strelka_indel_vcf, museq_paired_pdf,
            museq_single_pdf
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], outputted_filenames,
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'variant_calling'
            }
        })

    pyp.run(workflow)
def somatic_calling_workflow(args):
    """Build and run the somatic variant-calling pipeline.

    Runs the combined somatic-calling subworkflow (museq, strelka, mutect,
    plus a consensus maf) per tumour/normal pair, then records pipeline
    metadata.
    """
    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    # sample_id -> bam/id maps; samples are keyed off the tumour map.
    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    tumour_ids = helpers.get_values_from_input(inputs, 'tumour_id')
    normal_ids = helpers.get_values_from_input(inputs, 'normal_id')

    # Per-sample output templates; '{sample_id}' is expanded by pypeliner.
    var_dir = os.path.join(args['out_dir'], 'somatic')

    museq_vcf = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_maf = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_museq_paired_annotated.maf')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_paired_museqportrait.pdf')

    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_snv_maf = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_strelka_snv_annotated.maf')
    strelka_indel_vcf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    strelka_indel_maf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.maf')

    mutect_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_mutect.vcf.gz')
    mutect_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_mutect.maf')

    consensus_somatic_maf = os.path.join(var_dir, '{sample_id}',
                                         '{sample_id}_consensus_somatic.maf')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    workflow.subworkflow(name='variant_calling',
                         func=somatic_calling.create_somatic_calling_workflow,
                         args=(
                             samples,
                             mgd.InputFile("tumour.bam", 'sample_id',
                                           fnames=tumours,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.InputFile("normal.bam", 'sample_id',
                                           fnames=normals,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.OutputFile('museq_vcf', 'sample_id',
                                            template=museq_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_maf', 'sample_id',
                                            template=museq_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_paired_pdf', 'sample_id',
                                            template=museq_paired_pdf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_snv_vcf', 'sample_id',
                                            template=strelka_snv_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_snv_maf', 'sample_id',
                                            template=strelka_snv_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_indel_vcf', 'sample_id',
                                            template=strelka_indel_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_indel_maf', 'sample_id',
                                            template=strelka_indel_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('mutect_vcf', 'sample_id',
                                            template=mutect_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('mutect_maf', 'sample_id',
                                            template=mutect_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('consensus_somatic_maf',
                                            'sample_id',
                                            template=consensus_somatic_maf,
                                            axes_origin=[]),
                             args['refdir'],
                             normal_ids,
                             tumour_ids,
                         ),
                         kwargs={
                             'single_node': args['single_node'],
                             'is_exome': args['is_exome'],
                         })

    filenames = [
        museq_vcf, museq_maf, museq_paired_pdf, strelka_snv_vcf,
        strelka_snv_maf, strelka_indel_vcf, strelka_indel_maf, mutect_vcf,
        mutect_maf, consensus_somatic_maf
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args['out_dir'],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                               helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           # NOTE(review): 'variant_calling' looks copy-pasted
                           # from variant_calling_workflow — should this be
                           # 'somatic_calling'? Confirm before changing.
                           'metadata': {
                               'type': 'variant_calling'
                           }
                       })

    pyp.run(workflow)
def sample_qc_workflow(args):
    """Build and run the sample QC pipeline.

    In normal-only mode, runs coverage/genome-wide QC on the normals; in
    paired mode additionally uses tumour bams, titan/remixt copy number and
    breakpoint calls, and draws circos plots. Finishes by recording
    pipeline metadata.
    """
    inputs = helpers.load_yaml(args['input_yaml'])
    normal_only = args['normal_only']

    samples = list(inputs.keys())

    # inputs
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']
    # make_inputs builds the per-sample file maps used below; see its
    # definition for the available keys ("normal", "roh", "germline", ...).
    files = make_inputs(inputs, normal_only=normal_only)

    # outputs — per-sample templates, '{sample_id}' expanded by pypeliner.
    out_dir = args['out_dir']

    normal_coverage = os.path.join(out_dir, '{sample_id}',
                                   '{sample_id}_normal_coverage.tsv')
    genome_wide_plot = os.path.join(out_dir, '{sample_id}',
                                    '{sample_id}_genome_wide.pdf')

    # Tumour-dependent outputs only exist in paired mode.
    if not normal_only:
        circos_plot_remixt = os.path.join(out_dir, '{sample_id}',
                                          '{sample_id}_circos_remixt.pdf')
        circos_plot_titan = os.path.join(out_dir, '{sample_id}',
                                         '{sample_id}_circos_titan.pdf')
        tumour_coverage = os.path.join(out_dir, '{sample_id}',
                                       '{sample_id}_tumour_coverage.tsv')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if normal_only:
        workflow.subworkflow(
            name="normal_sample_qc",
            func=sample_qc.create_sample_qc_workflow_normal_only,
            ctx=helpers.get_default_ctx(),
            axes=('sample_id', ),
            args=(mgd.InputInstance('sample_id'),
                  args["refdir"],
                  mgd.InputFile('normal.bam', 'sample_id',
                                fnames=files["normal"]),
                  mgd.InputFile('roh', 'sample_id', fnames=files["roh"]),
                  mgd.InputFile('germline_calls', 'sample_id',
                                fnames=files["germline"]),
                  mgd.OutputFile('genome_wide_plot.pdf', 'sample_id',
                                 template=genome_wide_plot),
                  mgd.OutputFile('normcov', 'sample_id',
                                 template=normal_coverage),
                  chromosomes,
                  args['bins'],
                  args['mapping_qual_threshold']),
            # kwargs={'single_node': args['single_node']}
        )
        outputted_filenames = helpers.expand_list(
            [normal_coverage, genome_wide_plot], samples, "sample_id")
    else:
        workflow.subworkflow(
            name="sample_qc",
            func=sample_qc.create_sample_qc_workflow,
            ctx=helpers.get_default_ctx(),
            axes=('sample_id', ),
            args=(mgd.InputInstance('sample_id'),
                  args["refdir"],
                  mgd.InputFile('normal.bam', 'sample_id',
                                fnames=files["normal"]),
                  # NOTE(review): key is "tumor" while the rest of this
                  # module spells "tumour" — presumably matches make_inputs;
                  # verify against its definition.
                  mgd.InputFile('tumour.bam',
                                'sample_id', fnames=files["tumor"]),
                  mgd.InputFile('titan', 'sample_id', fnames=files["titan"]),
                  mgd.InputFile('remixt', 'sample_id',
                                fnames=files["remixt"]),
                  mgd.InputFile('breakpoints_consensus', 'sample_id',
                                fnames=files["breakpoints"]),
                  mgd.InputFile('roh', 'sample_id', fnames=files["roh"]),
                  mgd.InputFile('germline_calls', 'sample_id',
                                fnames=files["germline"]),
                  mgd.InputFile('somatic_calls', 'sample_id',
                                fnames=files["somatic"]),
                  mgd.OutputFile('genome_wide_plot.pdf', 'sample_id',
                                 template=genome_wide_plot),
                  mgd.OutputFile('normcov', 'sample_id',
                                 template=normal_coverage),
                  mgd.OutputFile('tumcov', 'sample_id',
                                 template=tumour_coverage),
                  chromosomes,
                  args['bins'],
                  args['mapping_qual_threshold']),
            kwargs={'single_node': args['single_node']})

        workflow.subworkflow(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00',
                                        disk=400),
            axes=('sample_id', ),
            func=sample_qc.circos_plot,
            args=(
                mgd.InputFile('titan', 'sample_id', fnames=files["titan"]),
                mgd.InputFile('remixt', 'sample_id', fnames=files["remixt"]),
                mgd.InputInstance("sample_id"),
                mgd.InputFile('breakpoints_consensus', 'sample_id',
                              fnames=files["breakpoints"]),
                mgd.OutputFile('circos_remixt', 'sample_id',
                               template=circos_plot_remixt),
                mgd.OutputFile('circos_titan', 'sample_id',
                               template=circos_plot_titan),
            ),
        )
        outputted_filenames = helpers.expand_list([
            circos_plot_remixt, circos_plot_titan, normal_coverage,
            tumour_coverage, genome_wide_plot
        ], samples, "sample_id")

    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                               helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'sample_qc'
                           }
                       })

    pyp.run(workflow)
def single_sample_copynumber_calling_workflow(args):
    """Build and run hmmcopy-based copy-number calling on single samples.

    One hmmcopy subworkflow per bam listed in the input yaml, followed by
    a metadata-generation step describing the produced files.
    """
    pipeline = pypeliner.app.Pypeline(config=args)

    yamldata = helpers.load_yaml(args['input_yaml'])

    output_dir = args['out_dir']
    meta_yaml = os.path.join(output_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(output_dir, 'input.yaml')

    # One entry per sample: sample_id -> bam path.
    bam_files = helpers.get_values_from_input(yamldata, 'bam')
    sample_ids = list(bam_files.keys())

    # Output templates; '{sample_id}' is filled in per axis chunk.
    hmmcopy_dir = os.path.join(
        args['out_dir'], 'copynumber', '{sample_id}', 'hmmcopy')
    plot_dir = os.path.join(hmmcopy_dir, 'plots')

    bias_pdf = os.path.join(plot_dir, '{sample_id}_bias.pdf')
    correction_pdf = os.path.join(plot_dir, '{sample_id}_correction.pdf')
    hmmcopy_pdf = os.path.join(plot_dir, '{sample_id}_hmmcopy.pdf')
    correction_table = os.path.join(
        hmmcopy_dir, '{sample_id}_correctreads_with_state.txt')
    pygenes = os.path.join(hmmcopy_dir, '{sample_id}_hmmcopy.seg.pygenes')

    ref_paths = config.refdir_data(args['refdir'])['paths']
    chroms = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=sample_ids)

    workflow.subworkflow(
        name='hmmcopy',
        func=hmmcopy.create_hmmcopy_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile("sample.bam", 'sample_id', fnames=bam_files,
                            extensions=['.bai']),
              mgd.InputInstance('sample_id'),
              mgd.OutputFile('bias', 'sample_id', template=bias_pdf),
              mgd.OutputFile('correction', 'sample_id',
                             template=correction_pdf),
              mgd.OutputFile('hmmcopy', 'sample_id', template=hmmcopy_pdf),
              mgd.OutputFile('correction_table', 'sample_id',
                             template=correction_table),
              mgd.OutputFile('pygenes', 'sample_id', template=pygenes),
              chroms,
              ref_paths['map_wig'],
              ref_paths['gc_wig'],
              ref_paths['gtf']),
    )

    result_templates = [
        bias_pdf,
        correction_pdf,
        hmmcopy_pdf,
        correction_table,
        pygenes,
    ]
    expanded_outputs = helpers.expand_list(
        result_templates, sample_ids, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"], expanded_outputs,
                             mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                               helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'single_sample_copynumber_calling'
                           }
                       })

    pipeline.run(workflow)
def copynumber_calling_workflow(args):
    """Build and run the copy-number calling pipeline.

    Runs any combination of remixt, titan and hmmcopy (normal + tumour)
    subworkflows per tumour/normal pair, selected by the corresponding
    flags in ``args``; if no flag is set, all three are run. Finishes by
    recording pipeline metadata for the produced files.
    """
    pyp = pypeliner.app.Pypeline(config=args)

    run_hmmcopy = args['hmmcopy']
    run_titan = args['titan']
    run_remixt = args['remixt']

    # No explicit selection means run everything.
    if not run_hmmcopy and not run_titan and not run_remixt:
        run_hmmcopy = True
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    # Per-sample output templates; '{sample_id}' is expanded by pypeliner.
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_outfile = os.path.join(titan_raw_dir, '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir, '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir, '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(titan_raw_dir,
                                     '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    hmmcopy_normal_raw_dir = os.path.join(cna_outdir, 'hmmcopy_normal')
    normal_bias_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    normal_correction_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    normal_hmmcopy_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    normal_correction_table = os.path.join(
        hmmcopy_normal_raw_dir, '{sample_id}_correctreads_with_state.txt')
    normal_pygenes = os.path.join(hmmcopy_normal_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    hmmcopy_tumour_raw_dir = os.path.join(cna_outdir, 'hmmcopy_tumour')
    tumour_bias_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    tumour_correction_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    tumour_hmmcopy_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    tumour_correction_table = os.path.join(
        hmmcopy_tumour_raw_dir, '{sample_id}_correctreads_with_state.txt')
    tumour_pygenes = os.path.join(hmmcopy_tumour_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')
    remixt_brk_cn_csv = os.path.join(remixt_outdir,
                                     '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir,
                                    '{sample_id}_remixt_stats.csv.gz')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                              extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                              extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id', fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5', 'sample_id',
                               template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv', 'sample_id',
                               template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv', 'sample_id',
                               template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv', 'sample_id',
                               template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv', 'sample_id',
                               template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv', 'sample_id',
                               template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv', 'sample_id',
                               template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']})

    if run_titan:
        workflow.subworkflow(name='titan',
                             func=titan.create_titan_workflow,
                             axes=('sample_id', ),
                             args=(
                                 mgd.InputFile("tumour.bam", 'sample_id',
                                               fnames=tumours,
                                               extensions=['.bai']),
                                 mgd.InputFile("normal.bam", 'sample_id',
                                               fnames=normals,
                                               extensions=['.bai']),
                                 mgd.InputFile("target_list", 'sample_id',
                                               fnames=targets),
                                 mgd.OutputFile('outfile', 'sample_id',
                                                template=titan_outfile),
                                 mgd.OutputFile('params', 'sample_id',
                                                template=titan_params),
                                 mgd.OutputFile('segs', 'sample_id',
                                                template=titan_segs),
                                 mgd.OutputFile('igv_segs', 'sample_id',
                                                template=titan_igv_segs),
                                 mgd.OutputFile('parsed', 'sample_id',
                                                template=titan_parsed),
                                 mgd.OutputFile('plots', 'sample_id',
                                                template=titan_plots),
                                 mgd.OutputFile('tar_outputs', 'sample_id',
                                                template=titan_tar_outputs),
                                 mgd.OutputFile('museq.vcf', 'sample_id',
                                                template=museq_vcf),
                                 mgd.InputInstance('sample_id'),
                                 refdir_paths['reference'],
                                 chromosomes,
                                 refdir_paths['het_positions_titan'],
                                 refdir_paths['map_wig'],
                                 refdir_paths['gc_wig'],
                                 refdir_paths['gtf'],
                             ),
                             kwargs={'single_node': args['single_node']})

    if run_hmmcopy:
        workflow.subworkflow(
            name='hmmcopy_normal',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('normal_bias', 'sample_id',
                                 template=normal_bias_pdf),
                  mgd.OutputFile('normal_correction', 'sample_id',
                                 template=normal_correction_pdf),
                  mgd.OutputFile('normal_hmmcopy', 'sample_id',
                                 template=normal_hmmcopy_pdf),
                  mgd.OutputFile('normal_correction_table', 'sample_id',
                                 template=normal_correction_table),
                  mgd.OutputFile('normal_pygenes', 'sample_id',
                                 template=normal_pygenes),
                  chromosomes,
                  refdir_paths['map_wig'],
                  refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )
        workflow.subworkflow(
            name='hmmcopy_tumour',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('tumour_bias', 'sample_id',
                                 template=tumour_bias_pdf),
                  mgd.OutputFile('tumour_correction', 'sample_id',
                                 template=tumour_correction_pdf),
                  mgd.OutputFile('tumour_hmmcopy', 'sample_id',
                                 template=tumour_hmmcopy_pdf),
                  mgd.OutputFile('tumour_correction_table', 'sample_id',
                                 template=tumour_correction_table),
                  mgd.OutputFile('tumour_pygenes', 'sample_id',
                                 template=tumour_pygenes),
                  chromosomes,
                  refdir_paths['map_wig'],
                  refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )

    filenames = []
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]
    if run_hmmcopy:
        filenames += [
            normal_bias_pdf, normal_correction_pdf, normal_hmmcopy_pdf,
            normal_correction_table, normal_pygenes, tumour_bias_pdf,
            tumour_correction_pdf, tumour_hmmcopy_pdf,
            tumour_correction_table, tumour_pygenes
        ]
    # Fix: previously the remixt outputs were never added here, so the
    # generated metadata omitted them even when remixt was run.
    if run_remixt:
        filenames += [
            remixt_outfile, remixt_brk_cn_csv, remixt_cn_csv,
            remixt_minor_modes_csv, remixt_mix_csv, remixt_read_depth_csv,
            remixt_stats_csv
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                               helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'copynumber_calling'
                           }
                       })

    pyp.run(workflow)
def alignment_workflow(args):
    """Build and run the fastq alignment pipeline.

    Aligns per-lane fastq pairs into one bam per sample via the
    ``align_samples`` subworkflow, then records pipeline metadata.
    """
    input_data = helpers.load_yaml(args['input_yaml'])

    output_dir = args['out_dir']
    meta_yaml = os.path.join(output_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(output_dir, 'input.yaml')

    # Per-sample output templates; '{sample_id}' is filled in per chunk.
    bam_template = os.path.join(output_dir, '{sample_id}', '{sample_id}.bam')
    tdf_template = os.path.join(output_dir, '{sample_id}',
                                '{sample_id}.bam.tdf')
    metrics_template = os.path.join(output_dir, '{sample_id}',
                                    '{sample_id}_metrics.csv')
    metrics_tar_template = os.path.join(output_dir, '{sample_id}',
                                        '{sample_id}_metrics.tar.gz')

    sample_ids = list(input_data.keys())
    fastqs_r1, fastqs_r2 = helpers.get_fastqs(input_data, sample_ids, None)
    sample_info = helpers.get_sample_info(input_data)

    pipeline = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    # Chunks are (sample_id, lane_id) pairs taken from the fastq map.
    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    workflow.subworkflow(name="align_samples",
                         func=alignment.align_samples,
                         args=(
                             mgd.InputFile('input.r1.fastq.gz', 'sample_id',
                                           'lane_id', fnames=fastqs_r1),
                             mgd.InputFile('input.r2.fastq.gz', 'sample_id',
                                           'lane_id', fnames=fastqs_r2),
                             mgd.Template('output.bam', 'sample_id',
                                          template=bam_template),
                             mgd.Template('metrics.txt', 'sample_id',
                                          template=metrics_template),
                             mgd.Template('metrics.tar', 'sample_id',
                                          template=metrics_tar_template),
                             mgd.Template('output.bam.tdf', 'sample_id',
                                          template=tdf_template),
                             sample_info,
                             args['refdir']),
                         kwargs={
                             'single_node': args['single_node'],
                             'picard_mem': args['picard_mem']
                         })

    result_templates = [
        bam_template,
        tdf_template,
        metrics_template,
        metrics_tar_template,
    ]
    expanded_outputs = helpers.expand_list(
        result_templates, sample_ids, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], output_dir, expanded_outputs,
                             mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data': input_data,
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'alignment'
                           }
                       })

    pipeline.run(workflow)
def germline_calling_workflow(args):
    """Build and run the germline variant-calling pipeline.

    Runs the combined germline-calling subworkflow (museq single-sample,
    samtools, freebayes, rtg, plus a consensus maf and ROH calls) per
    normal bam, then records pipeline metadata.
    """
    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    # sample_id -> normal bam / normal id maps.
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(normals.keys())
    normal_ids = helpers.get_values_from_input(inputs, 'normal_id')

    # Per-sample output templates; '{sample_id}' is expanded by pypeliner.
    var_dir = os.path.join(args['out_dir'], 'germline')

    museq_ss_vcf = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_museq_single_annotated.vcf.gz')
    museq_ss_maf = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_museq_single_annotated.maf')
    museq_single_pdf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_single_museqportrait.pdf')

    samtools_germline_vcf = os.path.join(var_dir, '{sample_id}',
                                         '{sample_id}_samtools_germline.vcf.gz')
    samtools_germline_maf = os.path.join(var_dir, '{sample_id}',
                                         '{sample_id}_samtools_germline.maf')
    samtools_roh = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_roh.csv.gz')

    freebayes_germline_vcf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_freebayes_germline.vcf.gz')
    freebayes_germline_maf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_freebayes_germline.maf')

    rtg_germline_vcf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_rtg_germline.vcf.gz')
    rtg_germline_maf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_rtg_germline.maf')

    consensus_germline_maf = os.path.join(var_dir, '{sample_id}',
                                          '{sample_id}_consensus_germline.maf')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx()
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    workflow.subworkflow(
        name='germline_calling',
        func=germline_calling.create_germline_calling_workflow,
        args=(
            samples,
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq_ss_vcf', 'sample_id',
                           template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss_maf', 'sample_id',
                           template=museq_ss_maf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id',
                           template=museq_single_pdf, axes_origin=[]),
            mgd.OutputFile('samtools_germline_vcf', 'sample_id',
                           template=samtools_germline_vcf, axes_origin=[]),
            mgd.OutputFile('samtools_germline_maf', 'sample_id',
                           template=samtools_germline_maf, axes_origin=[]),
            mgd.OutputFile('samtools_roh', 'sample_id',
                           template=samtools_roh, axes_origin=[]),
            mgd.OutputFile('freebayes_germline_vcf', 'sample_id',
                           template=freebayes_germline_vcf, axes_origin=[]),
            mgd.OutputFile('freebayes_germline_maf', 'sample_id',
                           template=freebayes_germline_maf, axes_origin=[]),
            mgd.OutputFile('rtg_germline_vcf', 'sample_id',
                           template=rtg_germline_vcf, axes_origin=[]),
            mgd.OutputFile('rtg_germline_maf', 'sample_id',
                           template=rtg_germline_maf, axes_origin=[]),
            mgd.OutputFile('consensus_germline_maf', 'sample_id',
                           template=consensus_germline_maf, axes_origin=[]),
            args['refdir'],
            normal_ids
        ),
        kwargs={
            'single_node': args['single_node'],
        }
    )

    filenames = [
        museq_ss_vcf, museq_ss_maf, museq_single_pdf, samtools_germline_vcf,
        samtools_germline_maf, samtools_roh, freebayes_germline_vcf,
        freebayes_germline_maf, rtg_germline_vcf, rtg_germline_maf,
        consensus_germline_maf
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            # NOTE(review): 'variant_calling' looks copy-pasted from
            # variant_calling_workflow — should this be 'germline_calling'?
            # Confirm before changing.
            'metadata': {'type': 'variant_calling'}
        }
    )

    pyp.run(workflow)
def cohort_qc_workflow(args):
    """Run cohort-level QC: CNA annotation, maf preprocessing, and an HTML report.

    Inputs are keyed by (cohort_label, sample_label) tuples; outputs are
    produced once per cohort. Fixes over the previous version: per-cohort
    output paths are built once from the unique cohort labels via a helper
    (instead of four duplicate-collapsing dict comprehensions), and the
    cohort-label set uses a set comprehension.
    """
    pypeline = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    inputs = helpers.load_qc_input_yaml_flat(args['input_yaml'])
    out_dir = args["out_dir"]
    api_key = args["API_key"]

    metadata = helpers.load_yaml(os.path.join(args["refdir"], "metadata.yaml"))
    gtf = os.path.join(args["refdir"], metadata["paths"]["gtf"])

    # Per-(cohort, sample) inputs, keyed by the full label tuple.
    germline_mafs = {label: data["germline_maf"] for label, data in inputs.items()}
    somatic_mafs = {label: data["somatic_maf"] for label, data in inputs.items()}
    remixt_data = {label: data["remixt"] for label, data in inputs.items()}

    # Unique cohort labels (first element of each label tuple).
    cohort_labels = sorted({label[0] for label in inputs})

    def _cohort_paths(filename):
        # One output path per cohort, all under <out_dir>/<cohort>/.
        return {cohort: os.path.join(out_dir, cohort, filename)
                for cohort in cohort_labels}

    report_path = _cohort_paths("report.html")
    cna_table = _cohort_paths("cna_table.tsv")
    segmental_copynumber = _cohort_paths("segmental_copynumber.tsv")
    cohort_maf_oncogenic_filtered = _cohort_paths("cohort_oncogenic_filtered.maf")

    workflow.setobj(
        obj=mgd.OutputChunks('cohort_label', 'sample_label'),
        value=list(inputs.keys()),
    )

    workflow.subworkflow(
        name="classifycopynumber",
        func="wgs.workflows.cohort_qc.cna_annotation_workflow",
        axes=("cohort_label",),
        args=(
            mgd.InputFile('remixt_dict', 'cohort_label', 'sample_label',
                          fnames=remixt_data, axes_origin=[]),
            mgd.TempOutputFile('cna_maftools_table', 'cohort_label'),
            mgd.OutputFile('segmental_copynumber', 'cohort_label',
                           fnames=segmental_copynumber),
            mgd.OutputFile('cna_table_cbio', 'cohort_label', fnames=cna_table),
            gtf,
        ),
    )

    workflow.subworkflow(
        name="maf_annotation_workflow",
        func="wgs.workflows.cohort_qc.preprocess_mafs_workflow",
        axes=("cohort_label",),
        args=(
            mgd.InputFile('germline_mafs_dict', 'cohort_label', 'sample_label',
                          fnames=germline_mafs, axes_origin=[]),
            mgd.InputFile('somatic_mafs_dict', 'cohort_label', 'sample_label',
                          fnames=somatic_mafs, axes_origin=[]),
            mgd.OutputFile('cohort_maf_oncogenic_filtered', 'cohort_label',
                           fnames=cohort_maf_oncogenic_filtered),
            api_key,
        ),
    )

    workflow.subworkflow(
        name="make_plots_and_report",
        func="wgs.workflows.cohort_qc.create_cohort_qc_report",
        axes=("cohort_label",),
        args=(
            mgd.InputInstance("cohort_label",),
            out_dir,
            mgd.InputFile('cohort_maf_oncogenic_filtered', 'cohort_label',
                          fnames=cohort_maf_oncogenic_filtered),
            mgd.TempInputFile('cna_maftools_table', 'cohort_label'),
            mgd.OutputFile('report_path', 'cohort_label', fnames=report_path),
        ),
    )

    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    outputted_filenames = helpers.expand_list(
        [segmental_copynumber, cna_table, cohort_maf_oncogenic_filtered, report_path],
        cohort_labels,
        "cohort_label",
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args["out_dir"], outputted_filenames,
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'sample_qc'},
        },
    )

    pypeline.run(workflow)
def postprocessing_workflow(args):
    """Generate per-sample circos and genome-wide plots from upstream results.

    Consumes previously-computed titan/remixt/breakpoint/ROH/variant outputs
    listed in the input yaml and runs the ``postprocessing`` subworkflow once
    per sample. Fix over the previous version: the input yaml is opened with a
    context manager instead of a bare ``open()`` call, so the file handle is
    closed deterministically.
    """
    with open(args['input_yaml']) as infile:
        yamldata = yaml.safe_load(infile)

    samples = list(yamldata.keys())

    # Per-sample maps of upstream result files, keyed by sample id.
    normals = {sample: yamldata[sample]['normal_bam'] for sample in samples}
    tumours = {sample: yamldata[sample]['tumour_bam'] for sample in samples}
    titan = {sample: yamldata[sample]['titan'] for sample in samples}
    remixt = {sample: yamldata[sample]['remixt'] for sample in samples}
    breakpoints_consensus = {
        sample: yamldata[sample]['breakpoints_consensus'] for sample in samples
    }
    roh = {sample: yamldata[sample]['roh'] for sample in samples}
    germline_calls = {
        sample: yamldata[sample]['germline_calls'] for sample in samples
    }
    somatic_calls = {
        sample: yamldata[sample]['somatic_calls'] for sample in samples
    }

    out_dir = args['out_dir']
    # NOTE(review): every other workflow writes 'metadata.yaml'; confirm the
    # divergent 'pipeline_metadata.yaml' name here is intentional.
    meta_yaml = os.path.join(out_dir, 'pipeline_metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    # Templated plot output paths, one directory per sample.
    circos_plot_remixt = os.path.join(out_dir, '{sample_id}',
                                      '{sample_id}_circos_remixt.pdf')
    circos_plot_titan = os.path.join(out_dir, '{sample_id}',
                                     '{sample_id}_circos_titan.pdf')
    genome_wide_plot = os.path.join(out_dir, '{sample_id}',
                                    '{sample_id}_genome_wide.pdf')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="postprocessing",
        func=postprocessing.create_postprocessing_workflow,
        ctx=helpers.get_default_ctx(),
        axes=('sample_id',),
        args=(
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals),
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours),
            titan,
            remixt,
            breakpoints_consensus,
            roh,
            germline_calls,
            somatic_calls,
            mgd.OutputFile('circos_plot_remixt.pdf', 'sample_id',
                           template=circos_plot_remixt),
            mgd.OutputFile('circos_plot_titan.pdf', 'sample_id',
                           template=circos_plot_titan),
            mgd.OutputFile('genome_wide_plot.pdf', 'sample_id',
                           template=genome_wide_plot),
            args['refdir'],
            mgd.InputInstance('sample_id'),
        ),
        kwargs={'single_node': args['single_node']},
    )

    outputted_filenames = helpers.expand_list(
        [circos_plot_remixt, circos_plot_titan, genome_wide_plot],
        samples,
        "sample_id",
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args["out_dir"], outputted_filenames,
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'postprocessing'},
        },
    )

    pyp.run(workflow)