def alignment_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    fastqs_r1 = helpers.get_values_from_input(inputs, 'fastq1')
    fastqs_r2 = helpers.get_values_from_input(inputs, 'fastq2')
    outputs = helpers.get_values_from_input(inputs, 'bam')
    outdir = args['out_dir']

    workflow.subworkflow(
        name="align_samples",
        func=alignment.align_samples,
        args=(config, fastqs_r1, fastqs_r2, outputs, outdir)
    )

    pyp.run(workflow)
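
# Illustrative only: a minimal sketch of the args dict alignment_workflow reads.
# The keys below ('config_file', 'input_yaml', 'out_dir') are the ones accessed in
# the function above; in practice args is built by the package's command-line
# parser and also carries pypeliner's own scheduler/run options, not shown here.
def _example_alignment_args():
    return {
        'config_file': 'config.yaml',   # pipeline config consumed by helpers.load_yaml
        'input_yaml': 'input.yaml',     # per-sample fastq1/fastq2/bam entries
        'out_dir': 'results/alignment'  # root directory for all alignment outputs
    }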
def breakpoint_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml')
    input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz')
    destruct_library = os.path.join(sv_outdir, '{sample_id}_destruct_library.csv.gz')
    destruct_raw_breakpoints = os.path.join(sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz')
    destruct_raw_library = os.path.join(sv_outdir, '{sample_id}_destruct_raw_library.csv.gz')
    destruct_reads = os.path.join(sv_outdir, '{sample_id}_destruct_reads.csv.gz')
    lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, '{sample_id}_filtered_consensus_calls.csv.gz')
    svaba_vcf = os.path.join(sv_outdir, '{sample_id}_svaba.vcf')

    single_node = args['single_node']

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples
    )

    workflow.subworkflow(
        name='destruct',
        func=destruct_wgs.create_destruct_wgs_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints),
            mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library),
            mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints),
            mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library),
            mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads),
            mgd.InputInstance('sample_id'),
            refdir_paths['reference'],
            refdir_paths['refdata_destruct'],
            refdir_paths['gtf'],
            refdir_paths['blacklist_destruct']
        ),
        kwargs={'single_node': single_node}
    )

    workflow.subworkflow(
        name='lumpy',
        func=lumpy.create_lumpy_workflow,
        axes=('sample_id',),
        args=(mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),),
        kwargs={
            'tumour_bam': mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            'single_node': single_node
        },
    )

    if args['svaba']:
        workflow.subworkflow(
            name='svaba',
            func=svaba.create_svaba_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('svaba_vcf', 'sample_id', template=svaba_vcf),
                refdir_paths['reference'],
            ),
        )

    workflow.subworkflow(
        name="consensus_calling",
        func=breakpoint_calling_consensus.create_consensus_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints),
            mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
            mgd.OutputFile('consensus_calls', 'sample_id', template=parsed_csv, extensions=['.yaml']),
            chromosomes
        ),
    )

    filenames = [
        destruct_breakpoints,
        destruct_library,
        destruct_raw_breakpoints,
        destruct_raw_library,
        destruct_reads,
        lumpy_vcf,
        parsed_csv
    ]
    if args['svaba']:
        filenames.append(svaba_vcf)

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func=helpers.generate_and_upload_metadata,
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'breakpoint_calling'}
        }
    )

    pyp.run(workflow)
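
# Illustrative only: the metadata step above relies on helpers.expand_list to turn
# the '{sample_id}'-templated paths into concrete per-sample paths. That helper is
# not shown in this module; the sketch below is a guess at its behaviour (name and
# signature copied from the call site, implementation assumed).
def _expand_list_sketch(filenames, samples, key):
    # e.g. '.../{sample_id}/{sample_id}_lumpy.vcf' with samples ['SA123'] and
    # key 'sample_id' -> ['.../SA123/SA123_lumpy.vcf']
    return [path.format(**{key: sample}) for path in filenames for sample in samples]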
def variant_calling_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    var_dir = os.path.join(args['out_dir'], 'variants')

    museq_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_single_annotated.vcf.gz')
    samtools_germline_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_samtools_germline.vcf.gz')
    samtools_roh = os.path.join(var_dir, '{sample_id}', '{sample_id}_roh.csv')
    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}', '{sample_id}_paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(var_dir, '{sample_id}', '{sample_id}_single_museqportrait.pdf')

    somatic_csv = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic.csv.gz')
    somatic_snpeff = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic_snpeff.csv.gz')
    somatic_ma = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic_ma.csv.gz')
    somatic_ids = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic_ids.csv.gz')

    indel_csv = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel.csv.gz')
    indel_snpeff = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_snpeff.csv.gz')
    indel_ma = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_ma.csv.gz')
    indel_ids = os.path.join(var_dir, '{sample_id}', '{sample_id}_indel_ids.csv.gz')

    germline_csv = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline.csv.gz')
    germline_snpeff = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline_snpeff.csv.gz')
    germline_ma = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline_ma.csv.gz')
    germline_ids = os.path.join(var_dir, '{sample_id}', '{sample_id}_germline_ids.csv.gz')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs'))
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if not all(tumours.values()):
        workflow.subworkflow(
            name='variant_calling',
            func=call_germlines_only,
            args=(
                samples,
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
                mgd.OutputFile('samtools_germline', 'sample_id', template=samtools_germline_vcf, axes_origin=[]),
                mgd.OutputFile('samtools_roh', 'sample_id', template=samtools_roh, axes_origin=[]),
                mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
                args['refdir']
            ),
            kwargs={'single_node': args['single_node']}
        )
    else:
        workflow.subworkflow(
            name='variant_calling',
            func=call_variants,
            args=(
                samples,
                mgd.OutputFile('somatic_csv', 'sample_id', template=somatic_csv, axes_origin=[]),
                mgd.OutputFile('somatic_snpeff', 'sample_id', template=somatic_snpeff, axes_origin=[]),
                mgd.OutputFile('somatic_ma', 'sample_id', template=somatic_ma, axes_origin=[]),
                mgd.OutputFile('somatic_ids', 'sample_id', template=somatic_ids, axes_origin=[]),
                mgd.OutputFile('indel_csv', 'sample_id', template=indel_csv, axes_origin=[]),
                mgd.OutputFile('indel_snpeff', 'sample_id', template=indel_snpeff, axes_origin=[]),
                mgd.OutputFile('indel_ma', 'sample_id', template=indel_ma, axes_origin=[]),
                mgd.OutputFile('indel_ids', 'sample_id', template=indel_ids, axes_origin=[]),
                mgd.OutputFile('germline_csv', 'sample_id', template=germline_csv, axes_origin=[]),
                mgd.OutputFile('germline_snpeff', 'sample_id', template=germline_snpeff, axes_origin=[]),
                mgd.OutputFile('germline_ma', 'sample_id', template=germline_ma, axes_origin=[]),
                mgd.OutputFile('germline_ids', 'sample_id', template=germline_ids, axes_origin=[]),
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile('museq', 'sample_id', template=museq_vcf, axes_origin=[]),
                mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
                mgd.OutputFile('samtools_germline', 'sample_id', template=samtools_germline_vcf, axes_origin=[]),
                mgd.OutputFile('roh_calls', 'sample_id', template=samtools_roh, axes_origin=[]),
                mgd.OutputFile('strelka_snv', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
                mgd.OutputFile('strelka_indel', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
                mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
                mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
                args['refdir'],
            ),
            kwargs={
                'single_node': args['single_node'],
                'is_exome': args['is_exome'],
            }
        )

    filenames = [
        somatic_csv, somatic_snpeff, somatic_ma, somatic_ids,
        indel_csv, indel_snpeff, indel_ma, indel_ids,
        germline_csv, germline_snpeff, germline_ma, germline_ids,
        museq_vcf, museq_ss_vcf, strelka_snv_vcf, strelka_indel_vcf,
        museq_paired_pdf, museq_single_pdf
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'}
        }
    )

    pyp.run(workflow)
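
# Illustrative only: the branch above falls back to germline-only calling when any
# sample is missing a tumour bam. Assuming helpers.get_values_from_input maps such
# samples to None (not confirmed here), the check behaves like this:
#
#     all({'SA1': 'SA1_T.bam', 'SA2': 'SA2_T.bam'}.values())  # True  -> paired calling
#     all({'SA1': 'SA1_T.bam', 'SA2': None}.values())         # False -> germlines only
#
# Note the switch is global: a single sample without a tumour sends every sample in
# the run down the germline-only path.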
def cna_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = tumours.keys()

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples
    )

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("target_list", 'sample_id', fnames=targets, axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id', axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id', axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id', axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id', axes_origin=[], fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id', axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
def sv_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = tumours.keys()

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(sv_outdir, 'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples
    )

    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library, axes_origin=[]),
            mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library, axes_origin=[]),
            mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads, axes_origin=[]),
            mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf, axes_origin=[]),
            mgd.OutputFile('parsed_csv', 'sample_id', template=parsed_csv, axes_origin=[])
        )
    )

    pyp.run(workflow)
def wgs_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    samples = tumours.keys()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if args['alignment']:
        tumour_fastqs_r1, tumour_fastqs_r2 = get_fastqs(inputs, samples, 'tumour')
        normal_fastqs_r1, normal_fastqs_r2 = get_fastqs(inputs, samples, 'normal')

        normal_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{norm_sample_id}', '{norm_lane}', 'normal'
        )
        tumour_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{tum_sample_id}', '{tum_lane}', 'tumour'
        )

        workflow.subworkflow(
            name='wgs_alignment_paired_lanes',
            func=paired_alignment,
            args=(
                config,
                mgd.OutputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
                samples,
                tumour_fastqs_r1,
                tumour_fastqs_r2,
                normal_fastqs_r1,
                normal_fastqs_r2,
                normal_alignment_template,
                tumour_alignment_template,
            )
        )

    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(museq_dir, '{sample_id}', 'paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(museq_dir, '{sample_id}', 'single_museqportrait.pdf')

    workflow.subworkflow(
        name='variant_calling',
        func=call_variants,
        args=(
            samples,
            config,
            mgd.OutputFile('parsed_snv_csv', 'sample_id', template=parsed_snv_csv, axes_origin=[]),
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq', 'sample_id', template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
        )
    )

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(sv_outdir, 'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')

    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library, axes_origin=[]),
            mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library, axes_origin=[]),
            mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads, axes_origin=[]),
            mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf, axes_origin=[]),
            mgd.OutputFile('parsed_csv', 'sample_id', template=parsed_csv, axes_origin=[])
        )
    )

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile("target_list", 'sample_id', fnames=targets, axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id', axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id', axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id', axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id', axes_origin=[], template=destruct_breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id', axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
def somatic_calling_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    tumour_ids = helpers.get_values_from_input(inputs, 'tumour_id')
    normal_ids = helpers.get_values_from_input(inputs, 'normal_id')

    var_dir = os.path.join(args['out_dir'], 'somatic')

    museq_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_paired_annotated.maf')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}', '{sample_id}_paired_museqportrait.pdf')

    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_snv_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_snv_annotated.maf')
    strelka_indel_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    strelka_indel_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.maf')

    mutect_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_mutect.vcf.gz')
    mutect_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_mutect.maf')

    consensus_somatic_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_somatic.maf')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    workflow.subworkflow(
        name='variant_calling',
        func=somatic_calling.create_somatic_calling_workflow,
        args=(
            samples,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq_vcf', 'sample_id', template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_maf', 'sample_id', template=museq_maf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('strelka_snv_vcf', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv_maf', 'sample_id', template=strelka_snv_maf, axes_origin=[]),
            mgd.OutputFile('strelka_indel_vcf', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel_maf', 'sample_id', template=strelka_indel_maf, axes_origin=[]),
            mgd.OutputFile('mutect_vcf', 'sample_id', template=mutect_vcf, axes_origin=[]),
            mgd.OutputFile('mutect_maf', 'sample_id', template=mutect_maf, axes_origin=[]),
            mgd.OutputFile('consensus_somatic_maf', 'sample_id', template=consensus_somatic_maf, axes_origin=[]),
            args['refdir'],
            normal_ids,
            tumour_ids,
        ),
        kwargs={
            'single_node': args['single_node'],
            'is_exome': args['is_exome'],
        }
    )

    filenames = [
        museq_vcf,
        museq_maf,
        museq_paired_pdf,
        strelka_snv_vcf,
        strelka_snv_maf,
        strelka_indel_vcf,
        strelka_indel_maf,
        mutect_vcf,
        mutect_maf,
        consensus_somatic_maf
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'}
        }
    )

    pyp.run(workflow)
def single_sample_copynumber_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    bams = helpers.get_values_from_input(inputs, 'bam')
    samples = list(bams.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    hmmcopy_raw_dir = os.path.join(cna_outdir, 'hmmcopy')

    bias_pdf = os.path.join(hmmcopy_raw_dir, 'plots', '{sample_id}_bias.pdf')
    correction_pdf = os.path.join(hmmcopy_raw_dir, 'plots', '{sample_id}_correction.pdf')
    hmmcopy_pdf = os.path.join(hmmcopy_raw_dir, 'plots', '{sample_id}_hmmcopy.pdf')
    correction_table = os.path.join(hmmcopy_raw_dir, '{sample_id}_correctreads_with_state.txt')
    pygenes = os.path.join(hmmcopy_raw_dir, '{sample_id}_hmmcopy.seg.pygenes')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples
    )

    workflow.subworkflow(
        name='hmmcopy',
        func=hmmcopy.create_hmmcopy_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("sample.bam", 'sample_id', fnames=bams, extensions=['.bai']),
            mgd.InputInstance('sample_id'),
            mgd.OutputFile('bias', 'sample_id', template=bias_pdf),
            mgd.OutputFile('correction', 'sample_id', template=correction_pdf),
            mgd.OutputFile('hmmcopy', 'sample_id', template=hmmcopy_pdf),
            mgd.OutputFile('correction_table', 'sample_id', template=correction_table),
            mgd.OutputFile('pygenes', 'sample_id', template=pygenes),
            chromosomes,
            refdir_paths['map_wig'],
            refdir_paths['gc_wig'],
            refdir_paths['gtf']
        ),
    )

    filenames = [
        bias_pdf,
        correction_pdf,
        hmmcopy_pdf,
        correction_table,
        pygenes,
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'single_sample_copynumber_calling'}
        }
    )

    pyp.run(workflow)
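
# Illustrative only: a sketch of how the '{sample_id}' templates used throughout
# this module resolve. pypeliner formats each templated InputFile/OutputFile with
# the chunk value of the 'sample_id' axis, so for a hypothetical sample 'SA123'
# the hmmcopy bias plot above resolves roughly as:
#
#     os.path.join(out_dir, 'copynumber', '{sample_id}', 'hmmcopy', 'plots',
#                  '{sample_id}_bias.pdf').format(sample_id='SA123')
#     # -> '<out_dir>/copynumber/SA123/hmmcopy/plots/SA123_bias.pdf'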
def copynumber_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    run_hmmcopy = args['hmmcopy']
    run_titan = args['titan']
    run_remixt = args['remixt']

    # If no caller was explicitly requested, run all of them.
    if not run_hmmcopy and not run_titan and not run_remixt:
        run_hmmcopy = True
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')

    titan_outfile = os.path.join(titan_raw_dir, '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir, '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir, '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(titan_raw_dir, '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    hmmcopy_normal_raw_dir = os.path.join(cna_outdir, 'hmmcopy_normal')
    normal_bias_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots', '{sample_id}_bias.pdf')
    normal_correction_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots', '{sample_id}_correction.pdf')
    normal_hmmcopy_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots', '{sample_id}_hmmcopy.pdf')
    normal_correction_table = os.path.join(hmmcopy_normal_raw_dir, '{sample_id}_correctreads_with_state.txt')
    normal_pygenes = os.path.join(hmmcopy_normal_raw_dir, '{sample_id}_hmmcopy.seg.pygenes')

    hmmcopy_tumour_raw_dir = os.path.join(cna_outdir, 'hmmcopy_tumour')
    tumour_bias_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots', '{sample_id}_bias.pdf')
    tumour_correction_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots', '{sample_id}_correction.pdf')
    tumour_hmmcopy_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots', '{sample_id}_hmmcopy.pdf')
    tumour_correction_table = os.path.join(hmmcopy_tumour_raw_dir, '{sample_id}_correctreads_with_state.txt')
    tumour_pygenes = os.path.join(hmmcopy_tumour_raw_dir, '{sample_id}_hmmcopy.seg.pygenes')

    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')
    remixt_brk_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_stats.csv.gz')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs'))
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples
    )

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id', fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5', 'sample_id', template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv', 'sample_id', template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv', 'sample_id', template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv', 'sample_id', template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv', 'sample_id', template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv', 'sample_id', template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv', 'sample_id', template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']}
        )

    if run_titan:
        workflow.subworkflow(
            name='titan',
            func=titan.create_titan_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai']),
                mgd.InputFile("target_list", 'sample_id', fnames=targets),
                mgd.OutputFile('outfile', 'sample_id', template=titan_outfile),
                mgd.OutputFile('params', 'sample_id', template=titan_params),
                mgd.OutputFile('segs', 'sample_id', template=titan_segs),
                mgd.OutputFile('igv_segs', 'sample_id', template=titan_igv_segs),
                mgd.OutputFile('parsed', 'sample_id', template=titan_parsed),
                mgd.OutputFile('plots', 'sample_id', template=titan_plots),
                mgd.OutputFile('tar_outputs', 'sample_id', template=titan_tar_outputs),
                mgd.OutputFile('museq.vcf', 'sample_id', template=museq_vcf),
                mgd.InputInstance('sample_id'),
                refdir_paths['reference'],
                chromosomes,
                refdir_paths['het_positions_titan'],
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf'],
            ),
            kwargs={'single_node': args['single_node']}
        )

    if run_hmmcopy:
        workflow.subworkflow(
            name='hmmcopy_normal',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai']),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('normal_bias', 'sample_id', template=normal_bias_pdf),
                mgd.OutputFile('normal_correction', 'sample_id', template=normal_correction_pdf),
                mgd.OutputFile('normal_hmmcopy', 'sample_id', template=normal_hmmcopy_pdf),
                mgd.OutputFile('normal_correction_table', 'sample_id', template=normal_correction_table),
                mgd.OutputFile('normal_pygenes', 'sample_id', template=normal_pygenes),
                chromosomes,
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf']
            ),
        )

        workflow.subworkflow(
            name='hmmcopy_tumour',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai']),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('tumour_bias', 'sample_id', template=tumour_bias_pdf),
                mgd.OutputFile('tumour_correction', 'sample_id', template=tumour_correction_pdf),
                mgd.OutputFile('tumour_hmmcopy', 'sample_id', template=tumour_hmmcopy_pdf),
                mgd.OutputFile('tumour_correction_table', 'sample_id', template=tumour_correction_table),
                mgd.OutputFile('tumour_pygenes', 'sample_id', template=tumour_pygenes),
                chromosomes,
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf']
            ),
        )

    filenames = []
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]
    if run_hmmcopy:
        filenames += [
            normal_bias_pdf,
            normal_correction_pdf,
            normal_hmmcopy_pdf,
            normal_correction_table,
            normal_pygenes,
            tumour_bias_pdf,
            tumour_correction_pdf,
            tumour_hmmcopy_pdf,
            tumour_correction_table,
            tumour_pygenes
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'copynumber_calling'}
        }
    )

    pyp.run(workflow)
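
# Illustrative only: the run_hmmcopy/run_titan/run_remixt defaulting at the top of
# copynumber_calling_workflow makes callers opt-in, but asking for none is the same
# as asking for all. For example (hypothetical flag values):
#
#     args = {'hmmcopy': False, 'titan': False, 'remixt': False, ...}
#     # -> all three callers run
#     args = {'hmmcopy': False, 'titan': True, 'remixt': False, ...}
#     # -> only titan runs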
def germline_calling_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(normals.keys())
    normal_ids = helpers.get_values_from_input(inputs, 'normal_id')

    var_dir = os.path.join(args['out_dir'], 'germline')

    museq_ss_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_single_annotated.vcf.gz')
    museq_ss_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_museq_single_annotated.maf')
    museq_single_pdf = os.path.join(var_dir, '{sample_id}', '{sample_id}_single_museqportrait.pdf')

    samtools_germline_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_samtools_germline.vcf.gz')
    samtools_germline_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_samtools_germline.maf')
    samtools_roh = os.path.join(var_dir, '{sample_id}', '{sample_id}_roh.csv.gz')

    freebayes_germline_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_freebayes_germline.vcf.gz')
    freebayes_germline_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_freebayes_germline.maf')

    rtg_germline_vcf = os.path.join(var_dir, '{sample_id}', '{sample_id}_rtg_germline.vcf.gz')
    rtg_germline_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_rtg_germline.maf')

    consensus_germline_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_consensus_germline.maf')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx()
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    workflow.subworkflow(
        name='germline_calling',
        func=germline_calling.create_germline_calling_workflow,
        args=(
            samples,
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq_ss_vcf', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss_maf', 'sample_id', template=museq_ss_maf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
            mgd.OutputFile('samtools_germline_vcf', 'sample_id', template=samtools_germline_vcf, axes_origin=[]),
            mgd.OutputFile('samtools_germline_maf', 'sample_id', template=samtools_germline_maf, axes_origin=[]),
            mgd.OutputFile('samtools_roh', 'sample_id', template=samtools_roh, axes_origin=[]),
            mgd.OutputFile('freebayes_germline_vcf', 'sample_id', template=freebayes_germline_vcf, axes_origin=[]),
            mgd.OutputFile('freebayes_germline_maf', 'sample_id', template=freebayes_germline_maf, axes_origin=[]),
            mgd.OutputFile('rtg_germline_vcf', 'sample_id', template=rtg_germline_vcf, axes_origin=[]),
            mgd.OutputFile('rtg_germline_maf', 'sample_id', template=rtg_germline_maf, axes_origin=[]),
            mgd.OutputFile('consensus_germline_maf', 'sample_id', template=consensus_germline_maf, axes_origin=[]),
            args['refdir'],
            normal_ids
        ),
        kwargs={
            'single_node': args['single_node'],
        }
    )

    filenames = [
        museq_ss_vcf,
        museq_ss_maf,
        museq_single_pdf,
        samtools_germline_vcf,
        samtools_germline_maf,
        samtools_roh,
        freebayes_germline_vcf,
        freebayes_germline_maf,
        rtg_germline_vcf,
        rtg_germline_maf,
        consensus_germline_maf
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'variant_calling'}
        }
    )

    pyp.run(workflow)
def variant_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = tumours.keys()

    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(museq_dir, '{sample_id}', 'paired_museqportrait.pdf')
    museq_paired_pdf_txt = os.path.join(museq_dir, '{sample_id}', 'paired_museqportrait.txt')
    museq_single_pdf = os.path.join(museq_dir, '{sample_id}', 'single_museqportrait.pdf')
    museq_single_pdf_txt = os.path.join(museq_dir, '{sample_id}', 'single_museqportrait.txt')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    workflow.subworkflow(
        name='variant_calling',
        func=call_variants,
        args=(
            samples,
            museq_dir,
            config,
            mgd.OutputFile('parsed_snv_csv', 'sample_id', template=parsed_snv_csv, axes_origin=[]),
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours, extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals, extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq', 'sample_id', template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf_txt', 'sample_id', template=museq_paired_pdf_txt, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf_txt', 'sample_id', template=museq_single_pdf_txt, axes_origin=[]),
        )
    )

    pyp.run(workflow)