예제 #1
0
파일: alignment.py 프로젝트: wisekh6/wgs
def alignment_workflow(args):
    """Build and run the alignment pipeline.

    Loads the pipeline configuration (``args['config_file']``) and the input
    description (``args['input_yaml']``), pulls the ``fastq1``, ``fastq2``
    and ``bam`` entries out of the input description, and registers a single
    ``align_samples`` subworkflow before running it.

    Args:
        args: mapping of parsed command-line options.  Keys read here are
            ``config_file``, ``input_yaml`` and ``out_dir``; the whole
            mapping is also passed to ``pypeliner.app.Pypeline`` as config.
    """
    runner = pypeliner.app.Pypeline(config=args)
    wf = pypeliner.workflow.Workflow()

    pipeline_config = helpers.load_yaml(args['config_file'])
    sample_sheet = helpers.load_yaml(args['input_yaml'])

    # One lookup per field, in the order the subworkflow expects them.
    read1_fastqs, read2_fastqs, bam_outputs = [
        helpers.get_values_from_input(sample_sheet, field)
        for field in ('fastq1', 'fastq2', 'bam')
    ]

    align_args = (
        pipeline_config,
        read1_fastqs,
        read2_fastqs,
        bam_outputs,
        args['out_dir'],
    )

    wf.subworkflow(
        name="align_samples",
        func=alignment.align_samples,
        args=align_args,
    )

    runner.run(wf)
예제 #2
0
def breakpoint_calling_workflow(args):
    """Build and run the per-sample breakpoint calling pipeline.

    For every sample listed under ``tumour`` in the input YAML this registers
    a destruct subworkflow, a lumpy subworkflow, an optional svaba
    subworkflow (only when ``args['svaba']`` is truthy) and a consensus step
    that consumes the destruct breakpoints and the lumpy VCF.  A final
    transform calls ``helpers.generate_and_upload_metadata`` with the list of
    produced file names.

    Args:
        args: mapping of parsed command-line options.  Keys read here are
            ``input_yaml``, ``out_dir``, ``single_node``, ``refdir`` and
            ``svaba``; the whole mapping is also passed to
            ``pypeliner.app.Pypeline`` as config.
    """
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml')
    input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    # All outputs live under <out_dir>/breakpoints/<sample_id>/.
    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz')
    destruct_library = os.path.join(
        sv_outdir, '{sample_id}_destruct_library.csv.gz')
    destruct_raw_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz')
    destruct_raw_library = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_library.csv.gz')
    destruct_reads = os.path.join(
        sv_outdir, '{sample_id}_destruct_reads.csv.gz')
    lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf')
    parsed_csv = os.path.join(
        sv_outdir, '{sample_id}_filtered_consensus_calls.csv.gz')
    svaba_vcf = os.path.join(sv_outdir, '{sample_id}_svaba.vcf')

    single_node = args['single_node']

    # Resolve the reference directory once and read both sections from the
    # same result instead of repeating the lookup.
    refdir = config.refdir_data(args['refdir'])
    refdir_paths = refdir['paths']
    chromosomes = refdir['params']['chromosomes']

    def tumour_bam():
        # Tumour BAM (with its .bai) for the current sample_id.
        return mgd.InputFile("tumour.bam",
                             'sample_id',
                             fnames=tumours,
                             extensions=['.bai'],
                             axes_origin=[])

    def normal_bam():
        # Normal BAM (with its .bai) for the current sample_id.
        return mgd.InputFile("normal.bam",
                             'sample_id',
                             fnames=normals,
                             extensions=['.bai'],
                             axes_origin=[])

    def sample_output(name, template):
        # Per-sample output file resolved from a '{sample_id}' template.
        return mgd.OutputFile(name, 'sample_id', template=template)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='destruct',
        func=destruct_wgs.create_destruct_wgs_workflow,
        axes=('sample_id', ),
        args=(
            tumour_bam(),
            normal_bam(),
            sample_output('destruct_raw_breakpoints',
                          destruct_raw_breakpoints),
            sample_output('destruct_raw_library', destruct_raw_library),
            sample_output('destruct_breakpoints', destruct_breakpoints),
            sample_output('destruct_library', destruct_library),
            sample_output('destruct_reads', destruct_reads),
            mgd.InputInstance('sample_id'),
            refdir_paths['reference'],
            refdir_paths['refdata_destruct'],
            refdir_paths['gtf'],
            refdir_paths['blacklist_destruct'],
        ),
        kwargs={'single_node': single_node})

    workflow.subworkflow(
        name='lumpy',
        func=lumpy.create_lumpy_workflow,
        axes=('sample_id', ),
        args=(sample_output('lumpy_vcf', lumpy_vcf), ),
        kwargs={
            'tumour_bam': tumour_bam(),
            'normal_bam': normal_bam(),
            'single_node': single_node,
        },
    )

    # svaba is opt-in; its VCF is only added to the metadata list below when
    # the same flag is set.
    if args['svaba']:
        workflow.subworkflow(
            name='svaba',
            func=svaba.create_svaba_workflow,
            axes=('sample_id', ),
            args=(
                tumour_bam(),
                normal_bam(),
                sample_output('svaba_vcf', svaba_vcf),
                refdir_paths['reference'],
            ),
        )

    workflow.subworkflow(
        name="consensus_calling",
        func=breakpoint_calling_consensus.create_consensus_workflow,
        axes=('sample_id', ),
        args=(
            mgd.InputFile('destruct_breakpoints',
                          'sample_id',
                          template=destruct_breakpoints),
            mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
            mgd.OutputFile('consensus_calls',
                           'sample_id',
                           template=parsed_csv,
                           extensions=['.yaml']),
            chromosomes,
        ),
    )

    filenames = [
        destruct_breakpoints, destruct_library, destruct_raw_breakpoints,
        destruct_raw_library, destruct_reads, lumpy_vcf, parsed_csv
    ]

    if args['svaba']:
        filenames.append(svaba_vcf)

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func=helpers.generate_and_upload_metadata,
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'breakpoint_calling'
                           }
                       })

    pyp.run(workflow)
예제 #3
0
def variant_calling_workflow(args):
    """Build and run the variant calling pipeline.

    When every entry returned for ``tumour`` in the input YAML is truthy the
    paired ``call_variants`` subworkflow is registered, followed by a
    transform that calls ``wgs.utils.helpers.generate_and_upload_metadata``.
    Otherwise only ``call_germlines_only`` is registered, working from the
    normal BAMs.

    Args:
        args: mapping of parsed command-line options.  Keys read here are
            ``input_yaml``, ``out_dir``, ``refdir``, ``single_node`` and (in
            the paired branch only) ``is_exome``; the whole mapping is also
            passed to ``pypeliner.app.Pypeline`` as config.
    """
    inputs = helpers.load_yaml(args['input_yaml'])

    out_dir = args['out_dir']
    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    var_dir = os.path.join(out_dir, 'variants')

    def in_sample_dir(filename):
        # Template path: <out_dir>/variants/{sample_id}/<filename>
        return os.path.join(var_dir, '{sample_id}', filename)

    def bam_input(name, fnames):
        # BAM input (with its .bai) keyed by sample_id.
        return mgd.InputFile(name,
                             'sample_id',
                             fnames=fnames,
                             extensions=['.bai'],
                             axes_origin=[])

    def sample_output(name, template):
        # Output file keyed by sample_id, resolved from a path template.
        return mgd.OutputFile(name,
                              'sample_id',
                              template=template,
                              axes_origin=[])

    # Raw caller outputs.
    museq_vcf = in_sample_dir('{sample_id}_museq_paired_annotated.vcf.gz')
    museq_ss_vcf = in_sample_dir('{sample_id}_museq_single_annotated.vcf.gz')
    samtools_germline_vcf = in_sample_dir(
        '{sample_id}_samtools_germline.vcf.gz')
    samtools_roh = in_sample_dir('{sample_id}_roh.csv')
    strelka_snv_vcf = in_sample_dir(
        '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = in_sample_dir(
        '{sample_id}_strelka_indel_annotated.vcf.gz')
    museq_paired_pdf = in_sample_dir('{sample_id}_paired_museqportrait.pdf')
    museq_single_pdf = in_sample_dir('{sample_id}_single_museqportrait.pdf')

    # Consensus tables: somatic, indel and germline.
    somatic_csv = in_sample_dir('{sample_id}_consensus_somatic.csv.gz')
    somatic_snpeff = in_sample_dir(
        '{sample_id}_consensus_somatic_snpeff.csv.gz')
    somatic_ma = in_sample_dir('{sample_id}_consensus_somatic_ma.csv.gz')
    somatic_ids = in_sample_dir('{sample_id}_consensus_somatic_ids.csv.gz')

    indel_csv = in_sample_dir('{sample_id}_indel.csv.gz')
    indel_snpeff = in_sample_dir('{sample_id}_indel_snpeff.csv.gz')
    indel_ma = in_sample_dir('{sample_id}_indel_ma.csv.gz')
    indel_ids = in_sample_dir('{sample_id}_indel_ids.csv.gz')

    germline_csv = in_sample_dir('{sample_id}_germline.csv.gz')
    germline_snpeff = in_sample_dir('{sample_id}_germline_snpeff.csv.gz')
    germline_ma = in_sample_dir('{sample_id}_germline_ma.csv.gz')
    germline_ids = in_sample_dir('{sample_id}_germline_ids.csv.gz')

    pyp = pypeliner.app.Pypeline(config=args)

    default_ctx = helpers.get_default_ctx(
        docker_image=config.containers('wgs'))
    workflow = pypeliner.workflow.Workflow(ctx=default_ctx)

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if all(tumours.values()):
        # Paired mode: every sample has a tumour entry.
        consensus_outputs = tuple(
            sample_output(name, template) for name, template in (
                ('somatic_csv', somatic_csv),
                ('somatic_snpeff', somatic_snpeff),
                ('somatic_ma', somatic_ma),
                ('somatic_ids', somatic_ids),
                ('indel_csv', indel_csv),
                ('indel_snpeff', indel_snpeff),
                ('indel_ma', indel_ma),
                ('indel_ids', indel_ids),
                ('germline_csv', germline_csv),
                ('germline_snpeff', germline_snpeff),
                ('germline_ma', germline_ma),
                ('germline_ids', germline_ids),
            ))
        bam_inputs = (
            bam_input("tumour.bam", tumours),
            bam_input("normal.bam", normals),
        )
        caller_outputs = tuple(
            sample_output(name, template) for name, template in (
                ('museq', museq_vcf),
                ('museq_ss', museq_ss_vcf),
                ('samtools_germline', samtools_germline_vcf),
                ('roh_calls', samtools_roh),
                ('strelka_snv', strelka_snv_vcf),
                ('strelka_indel', strelka_indel_vcf),
                ('museq_paired_pdf', museq_paired_pdf),
                ('museq_single_pdf', museq_single_pdf),
            ))

        # Positional order matters: it mirrors call_variants' signature.
        paired_args = ((samples, ) + consensus_outputs + bam_inputs +
                       caller_outputs + (args['refdir'], ))

        workflow.subworkflow(
            name='variant_calling',
            func=call_variants,
            args=paired_args,
            kwargs={
                'single_node': args['single_node'],
                'is_exome': args['is_exome'],
            },
        )

        # NOTE(review): metadata is produced only in this paired branch, and
        # the samtools germline VCF / ROH files are not in the list below --
        # confirm both are intentional.
        filenames = [
            somatic_csv, somatic_snpeff, somatic_ma, somatic_ids,
            indel_csv, indel_snpeff, indel_ma, indel_ids,
            germline_csv, germline_snpeff, germline_ma, germline_ids,
            museq_vcf, museq_ss_vcf,
            strelka_snv_vcf, strelka_indel_vcf,
            museq_paired_pdf, museq_single_pdf,
        ]

        outputted_filenames = helpers.expand_list(
            filenames, samples, "sample_id")

        metadata_kwargs = {
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'variant_calling'
            },
        }
        workflow.transform(
            name='generate_meta_files_results',
            func='wgs.utils.helpers.generate_and_upload_metadata',
            args=(sys.argv[0:], args['out_dir'], outputted_filenames,
                  mgd.OutputFile(meta_yaml)),
            kwargs=metadata_kwargs,
        )
    else:
        # At least one tumour entry is missing/falsy: germline calling only.
        germline_args = (
            samples,
            bam_input("normal.bam", normals),
            sample_output('museq_ss', museq_ss_vcf),
            sample_output('samtools_germline', samtools_germline_vcf),
            sample_output('samtools_roh', samtools_roh),
            sample_output('museq_single_pdf', museq_single_pdf),
            args['refdir'],
        )
        workflow.subworkflow(
            name='variant_calling',
            func=call_germlines_only,
            args=germline_args,
            kwargs={'single_node': args['single_node']},
        )

    pyp.run(workflow)
예제 #4
0
파일: cna_calling.py 프로젝트: wisekh6/wgs
def cna_calling_workflow(args):
    """Build and run the per-sample copy-number calling pipeline.

    Registers a titan subworkflow and a remixt subworkflow, each split over
    the ``sample_id`` axis, and runs them.

    Args:
        args: mapping of parsed command-line options.  Keys read here are
            ``config_file``, ``input_yaml`` and ``out_dir``; the whole
            mapping is also passed to ``pypeliner.app.Pypeline`` as config.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    # Materialise the sample ids as a list, as the other workflows in this
    # file do: on Python 3 ``dict.keys()`` is a live view that cannot be
    # pickled, which is unsuitable as a stored workflow object.
    samples = list(tumours.keys())

    # Output layout: <out_dir>/copynumber/<sample_id>/{remixt,titan}/...
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("target_list", 'sample_id', fnames=targets,
                          axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id',
                           axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id',
                           axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id',
                           axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    # NOTE(review): remixt registers the BAMs under different managed names
    # ('tumour_bam'/'normal_bam') and without axes_origin, unlike titan
    # above -- confirm this difference is intentional.
    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id',
                          fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id',
                          fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id',
                          axes_origin=[], fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id',
                           axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
예제 #5
0
파일: sv_calling.py 프로젝트: wisekh6/wgs
def sv_calling_workflow(args):
    """Build and run the structural-variant calling pipeline.

    Registers one ``call_breakpoints`` subworkflow that receives the sample
    ids, the loaded config, the tumour/normal BAM inputs and the per-sample
    destruct, lumpy and consensus output files, then runs it.

    Args:
        args: mapping of parsed command-line options.  Keys read here are
            ``config_file``, ``input_yaml`` and ``out_dir``; the whole
            mapping is also passed to ``pypeliner.app.Pypeline`` as config.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    # Materialise the sample ids as a list, as the other workflows in this
    # file do: on Python 3 ``dict.keys()`` is a live view that cannot be
    # pickled, which is unsuitable as a stored workflow object or argument.
    samples = list(tumours.keys())

    # Output layout: <out_dir>/breakpoints/<sample_id>/<file>
    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(sv_outdir,
                                            'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')

    def bam_input(name, fnames):
        # BAM input (with its .bai) keyed by sample_id.
        return mgd.InputFile(name,
                             'sample_id',
                             fnames=fnames,
                             extensions=['.bai'],
                             axes_origin=[])

    def sample_output(name, template):
        # Output file keyed by sample_id, resolved from a path template.
        return mgd.OutputFile(name,
                              'sample_id',
                              template=template,
                              axes_origin=[])

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            bam_input("tumour.bam", tumours),
            bam_input("normal.bam", normals),
            sample_output('destruct_raw_breakpoints',
                          destruct_raw_breakpoints),
            sample_output('destruct_raw_library', destruct_raw_library),
            sample_output('destruct_breakpoints', destruct_breakpoints),
            sample_output('destruct_library', destruct_library),
            sample_output('destruct_reads', destruct_reads),
            sample_output('lumpy_vcf', lumpy_vcf),
            sample_output('parsed_csv', parsed_csv),
        ))

    pyp.run(workflow)
예제 #6
0
파일: wgs_workflow.py 프로젝트: wisekh6/wgs
def wgs_workflow(args):
    """Build and run the end-to-end WGS pipeline.

    Stages registered, in order: optional paired alignment (only when
    ``args['alignment']`` is truthy), variant calling, breakpoint calling,
    and copy-number calling with titan and remixt.  remixt consumes the
    destruct breakpoints written by the breakpoint stage.

    Args:
        args: mapping of parsed command-line options.  Keys read here are
            ``config_file``, ``input_yaml``, ``out_dir`` and ``alignment``;
            the whole mapping is also passed to ``pypeliner.app.Pypeline``
            as config.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    # Materialise the sample ids as a list, as the other workflows in this
    # file do: on Python 3 ``dict.keys()`` is a live view that cannot be
    # pickled, which is unsuitable as a stored workflow object or argument.
    samples = list(tumours.keys())

    def bam_input(name, fnames):
        # BAM input (with its .bai) keyed by sample_id.
        return mgd.InputFile(name, 'sample_id', fnames=fnames,
                             extensions=['.bai'], axes_origin=[])

    def sample_output(name, template):
        # Output file keyed by sample_id, resolved from a path template.
        return mgd.OutputFile(name, 'sample_id', template=template,
                              axes_origin=[])

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if args['alignment']:
        # With alignment enabled the BAMs named in the input YAML are
        # outputs of this stage rather than pre-existing inputs.
        tumour_fastqs_r1, tumour_fastqs_r2 = get_fastqs(
            inputs, samples, 'tumour')
        normal_fastqs_r1, normal_fastqs_r2 = get_fastqs(
            inputs, samples, 'normal')

        normal_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{norm_sample_id}', '{norm_lane}',
            'normal'
        )
        tumour_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{tum_sample_id}', '{tum_lane}',
            'tumour'
        )

        workflow.subworkflow(
            name='wgs_alignment_paired_lanes',
            func=paired_alignment,
            args=(
                config,
                mgd.OutputFile("tumour.bam", 'sample_id', fnames=tumours,
                               extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile("normal.bam", 'sample_id', fnames=normals,
                               extensions=['.bai'], axes_origin=[]),
                samples,
                tumour_fastqs_r1,
                tumour_fastqs_r2,
                normal_fastqs_r1,
                normal_fastqs_r2,
                normal_alignment_template,
                tumour_alignment_template,
            )
        )

    # --- variant calling: <out_dir>/variants/<sample_id>/... ---
    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(
        museq_dir, '{sample_id}', 'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(
        museq_dir, '{sample_id}', 'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(
        museq_dir, '{sample_id}', 'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(
        museq_dir, '{sample_id}', 'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(
        museq_dir, '{sample_id}', 'paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(
        museq_dir, '{sample_id}', 'single_museqportrait.pdf')
    workflow.subworkflow(
        name='variant_calling',
        func=call_variants,
        args=(
            samples,
            config,
            sample_output('parsed_snv_csv', parsed_snv_csv),
            bam_input("tumour.bam", tumours),
            bam_input("normal.bam", normals),
            sample_output('museq', museq_vcf),
            sample_output('museq_ss', museq_ss_vcf),
            sample_output('strelka_snv', strelka_snv_vcf),
            sample_output('strelka_indel', strelka_indel_vcf),
            sample_output('museq_paired_pdf', museq_paired_pdf),
            sample_output('museq_single_pdf', museq_single_pdf),
        )
    )

    # --- breakpoint calling: <out_dir>/breakpoints/<sample_id>/... ---
    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(
        sv_outdir, 'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')
    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            bam_input("tumour.bam", tumours),
            bam_input("normal.bam", normals),
            sample_output('destruct_raw_breakpoints',
                          destruct_raw_breakpoints),
            sample_output('destruct_raw_library', destruct_raw_library),
            sample_output('destruct_breakpoints', destruct_breakpoints),
            sample_output('destruct_library', destruct_library),
            sample_output('destruct_reads', destruct_reads),
            sample_output('lumpy_vcf', lumpy_vcf),
            sample_output('parsed_csv', parsed_csv),
        )
    )

    # --- copy number: <out_dir>/copynumber/<sample_id>/{remixt,titan} ---
    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    # The per-sample subworkflows below take the BAM inputs without
    # axes_origin, exactly as the original code did, so they are spelled
    # out rather than built with bam_input().
    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours,
                          extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile("target_list", 'sample_id', fnames=targets,
                          axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            sample_output('titan_segments_filename',
                          titan_segments_filename),
            sample_output('titan_params_filename', titan_params_filename),
            sample_output('titan_markers_filename', titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )
    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours,
                          extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals,
                          extensions=['.bai']),
            # Produced by the call_breakpoints stage above.
            mgd.InputFile('destruct_breakpoints', 'sample_id',
                          axes_origin=[], template=destruct_breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            sample_output('remixt_results_filename',
                          remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
예제 #7
0
def somatic_calling_workflow(args):
    """Assemble and run the paired tumour/normal somatic calling pipeline.

    ``args`` is the parsed command-line mapping.  The keys ``input_yaml``,
    ``out_dir``, ``refdir``, ``single_node`` and ``is_exome`` are read here,
    and the whole mapping is also handed to ``pypeliner.app.Pypeline`` as its
    configuration.

    The input YAML provides the ``tumour``, ``normal``, ``tumour_id`` and
    ``normal_id`` values; the keys of the ``tumour`` mapping become the
    ``sample_id`` chunks.  Each result file is templated as
    ``<out_dir>/somatic/{sample_id}/{sample_id}<suffix>``.  A final task
    passes the expanded result paths, the loaded input YAML and the
    ``metadata.yaml`` / ``input.yaml`` output locations to
    ``wgs.utils.helpers.generate_and_upload_metadata``.
    """
    sample_inputs = helpers.load_yaml(args['input_yaml'])

    out_root = args['out_dir']
    metadata_path = os.path.join(out_root, 'metadata.yaml')
    input_copy_path = os.path.join(out_root, 'input.yaml')

    tumour_bams = helpers.get_values_from_input(sample_inputs, 'tumour')
    normal_bams = helpers.get_values_from_input(sample_inputs, 'normal')
    sample_ids = list(tumour_bams.keys())

    tumour_names = helpers.get_values_from_input(sample_inputs, 'tumour_id')
    normal_names = helpers.get_values_from_input(sample_inputs, 'normal_id')

    somatic_dir = os.path.join(out_root, 'somatic')

    def per_sample(suffix):
        # Every output lives in its own per-sample directory and carries the
        # sample id as a filename prefix.
        return os.path.join(somatic_dir, '{sample_id}', '{sample_id}' + suffix)

    # (managed object name, path template) in the positional order expected
    # by create_somatic_calling_workflow.
    output_specs = [
        ('museq_vcf', per_sample('_museq_paired_annotated.vcf.gz')),
        ('museq_maf', per_sample('_museq_paired_annotated.maf')),
        ('museq_paired_pdf', per_sample('_paired_museqportrait.pdf')),
        ('strelka_snv_vcf', per_sample('_strelka_snv_annotated.vcf.gz')),
        ('strelka_snv_maf', per_sample('_strelka_snv_annotated.maf')),
        ('strelka_indel_vcf', per_sample('_strelka_indel_annotated.vcf.gz')),
        ('strelka_indel_maf', per_sample('_strelka_indel_annotated.maf')),
        ('mutect_vcf', per_sample('_mutect.vcf.gz')),
        ('mutect_maf', per_sample('_mutect.maf')),
        ('consensus_somatic_maf', per_sample('_consensus_somatic.maf')),
    ]

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=sample_ids)

    # The subworkflow is declared without axes, so every managed file is
    # passed with axes_origin=[] (the sample_id axis is forwarded whole).
    paired_bams = tuple(
        mgd.InputFile(label, 'sample_id', fnames=bam_paths,
                      extensions=['.bai'], axes_origin=[])
        for label, bam_paths in (("tumour.bam", tumour_bams),
                                 ("normal.bam", normal_bams))
    )
    managed_outputs = tuple(
        mgd.OutputFile(label, 'sample_id', template=template, axes_origin=[])
        for label, template in output_specs
    )

    workflow.subworkflow(
        name='variant_calling',
        func=somatic_calling.create_somatic_calling_workflow,
        args=(
            (sample_ids,)
            + paired_bams
            + managed_outputs
            + (args['refdir'], normal_names, tumour_names)
        ),
        kwargs={
            'single_node': args['single_node'],
            'is_exome': args['is_exome'],
        },
    )

    result_templates = [template for _, template in output_specs]
    result_paths = helpers.expand_list(
        result_templates, sample_ids, "sample_id")

    metadata_kwargs = {
        'input_yaml_data': helpers.load_yaml(args['input_yaml']),
        'input_yaml': mgd.OutputFile(input_copy_path),
        'metadata': {'type': 'variant_calling'},
    }
    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            out_root,
            result_paths,
            mgd.OutputFile(metadata_path),
        ),
        kwargs=metadata_kwargs,
    )

    pyp.run(workflow)
예제 #8
0
def single_sample_copynumber_calling_workflow(args):
    """Assemble and run HMMcopy on each BAM listed in the input YAML.

    ``args`` is the parsed command-line mapping.  The keys ``input_yaml``,
    ``out_dir`` and ``refdir`` are read here, and the whole mapping is also
    given to ``pypeliner.app.Pypeline`` as its configuration.

    The ``bam`` values of the input YAML are the inputs; the keys of that
    mapping become the ``sample_id`` chunks.  Results are templated under
    ``<out_dir>/copynumber/{sample_id}/hmmcopy``.  A final task passes the
    expanded result paths, the loaded input YAML and the ``metadata.yaml`` /
    ``input.yaml`` output locations to
    ``wgs.utils.helpers.generate_and_upload_metadata``.
    """
    pyp = pypeliner.app.Pypeline(config=args)

    sample_inputs = helpers.load_yaml(args['input_yaml'])

    out_root = args['out_dir']
    metadata_path = os.path.join(out_root, 'metadata.yaml')
    input_copy_path = os.path.join(out_root, 'input.yaml')

    bam_paths = helpers.get_values_from_input(sample_inputs, 'bam')
    sample_ids = list(bam_paths.keys())

    hmmcopy_dir = os.path.join(
        out_root, 'copynumber', '{sample_id}', 'hmmcopy')
    plots_dir = os.path.join(hmmcopy_dir, 'plots')

    # (managed object name, path template) in the positional order expected
    # by create_hmmcopy_workflow.
    output_specs = [
        ('bias', os.path.join(plots_dir, '{sample_id}_bias.pdf')),
        ('correction', os.path.join(plots_dir, '{sample_id}_correction.pdf')),
        ('hmmcopy', os.path.join(plots_dir, '{sample_id}_hmmcopy.pdf')),
        ('correction_table',
         os.path.join(hmmcopy_dir, '{sample_id}_correctreads_with_state.txt')),
        ('pygenes',
         os.path.join(hmmcopy_dir, '{sample_id}_hmmcopy.seg.pygenes')),
    ]

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=sample_ids)

    sample_bam = mgd.InputFile(
        "sample.bam", 'sample_id', fnames=bam_paths, extensions=['.bai'])
    sample_instance = mgd.InputInstance('sample_id')
    managed_outputs = tuple(
        mgd.OutputFile(label, 'sample_id', template=template)
        for label, template in output_specs
    )

    workflow.subworkflow(
        name='hmmcopy',
        func=hmmcopy.create_hmmcopy_workflow,
        axes=('sample_id', ),
        args=(
            (sample_bam, sample_instance)
            + managed_outputs
            + (
                chromosomes,
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf'],
            )
        ),
    )

    result_templates = [template for _, template in output_specs]
    result_paths = helpers.expand_list(
        result_templates, sample_ids, "sample_id")

    metadata_kwargs = {
        'input_yaml_data': helpers.load_yaml(args['input_yaml']),
        'input_yaml': mgd.OutputFile(input_copy_path),
        'metadata': {'type': 'single_sample_copynumber_calling'},
    }
    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            out_root,
            result_paths,
            mgd.OutputFile(metadata_path),
        ),
        kwargs=metadata_kwargs,
    )

    pyp.run(workflow)
예제 #9
0
def copynumber_calling_workflow(args):
    """Assemble and run the tumour/normal copy-number calling pipeline.

    ``args`` is the parsed command-line mapping.  The keys ``hmmcopy``,
    ``titan``, ``remixt``, ``input_yaml``, ``out_dir``, ``refdir`` and
    ``single_node`` are read here, and the whole mapping is also given to
    ``pypeliner.app.Pypeline`` as its configuration.

    ``hmmcopy``, ``titan`` and ``remixt`` select which callers run; when none
    of them is truthy, all three are run.  The input YAML provides the
    ``tumour``, ``normal``, ``target_list`` and ``breakpoints`` values; the
    keys of the ``tumour`` mapping become the ``sample_id`` chunks.

    Titan and HMMcopy results are templated under
    ``<out_dir>/copynumber/{sample_id}``; ReMixT results under
    ``<out_dir>/remixt/{sample_id}``.  A final task passes the expanded
    result paths of every enabled caller, the loaded input YAML and the
    ``metadata.yaml`` / ``input.yaml`` output locations to
    ``wgs.utils.helpers.generate_and_upload_metadata``.
    """
    pyp = pypeliner.app.Pypeline(config=args)

    run_hmmcopy = args['hmmcopy']
    run_titan = args['titan']
    run_remixt = args['remixt']

    # No caller selected explicitly means "run everything".
    if not (run_hmmcopy or run_titan or run_remixt):
        run_hmmcopy = True
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')

    # --- Titan output templates -------------------------------------------
    titan_raw_dir = os.path.join(cna_outdir, 'titan')

    titan_outfile = os.path.join(titan_raw_dir,
                                 '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir,
                                  '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(titan_raw_dir,
                                     '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    # --- HMMcopy (normal) output templates --------------------------------
    hmmcopy_normal_raw_dir = os.path.join(cna_outdir, 'hmmcopy_normal')
    normal_bias_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    normal_correction_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    normal_hmmcopy_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    normal_correction_table = os.path.join(
        hmmcopy_normal_raw_dir, '{sample_id}_correctreads_with_state.txt')
    normal_pygenes = os.path.join(hmmcopy_normal_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    # --- HMMcopy (tumour) output templates --------------------------------
    hmmcopy_tumour_raw_dir = os.path.join(cna_outdir, 'hmmcopy_tumour')
    tumour_bias_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    tumour_correction_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    tumour_hmmcopy_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    tumour_correction_table = os.path.join(
        hmmcopy_tumour_raw_dir, '{sample_id}_correctreads_with_state.txt')
    tumour_pygenes = os.path.join(hmmcopy_tumour_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    # --- ReMixT output templates (note: not under the copynumber dir) -----
    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')

    remixt_brk_cn_csv = os.path.join(remixt_outdir,
                                     '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir,
                                  '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir,
                                    '{sample_id}_remixt_stats.csv.gz')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id',
                              fnames=tumours, extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id',
                              fnames=normals, extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id', fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5', 'sample_id',
                               template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv', 'sample_id',
                               template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv', 'sample_id',
                               template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv', 'sample_id',
                               template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv', 'sample_id',
                               template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv', 'sample_id',
                               template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv', 'sample_id',
                               template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']})

    if run_titan:
        workflow.subworkflow(
            name='titan',
            func=titan.create_titan_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id',
                              fnames=tumours, extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id',
                              fnames=normals, extensions=['.bai']),
                mgd.InputFile("target_list", 'sample_id', fnames=targets),
                mgd.OutputFile('outfile', 'sample_id',
                               template=titan_outfile),
                mgd.OutputFile('params', 'sample_id',
                               template=titan_params),
                mgd.OutputFile('segs', 'sample_id', template=titan_segs),
                mgd.OutputFile('igv_segs', 'sample_id',
                               template=titan_igv_segs),
                mgd.OutputFile('parsed', 'sample_id',
                               template=titan_parsed),
                mgd.OutputFile('plots', 'sample_id', template=titan_plots),
                mgd.OutputFile('tar_outputs', 'sample_id',
                               template=titan_tar_outputs),
                mgd.OutputFile('museq.vcf', 'sample_id', template=museq_vcf),
                mgd.InputInstance('sample_id'),
                refdir_paths['reference'],
                chromosomes,
                refdir_paths['het_positions_titan'],
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf'],
            ),
            kwargs={'single_node': args['single_node']})

    if run_hmmcopy:
        # HMMcopy is run independently on the normal and on the tumour BAM.
        workflow.subworkflow(
            name='hmmcopy_normal',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("normal.bam", 'sample_id',
                              fnames=normals, extensions=['.bai']),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('normal_bias', 'sample_id',
                               template=normal_bias_pdf),
                mgd.OutputFile('normal_correction', 'sample_id',
                               template=normal_correction_pdf),
                mgd.OutputFile('normal_hmmcopy', 'sample_id',
                               template=normal_hmmcopy_pdf),
                mgd.OutputFile('normal_correction_table', 'sample_id',
                               template=normal_correction_table),
                mgd.OutputFile('normal_pygenes', 'sample_id',
                               template=normal_pygenes),
                chromosomes,
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf'],
            ),
        )

        workflow.subworkflow(
            name='hmmcopy_tumour',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id',
                              fnames=tumours, extensions=['.bai']),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('tumour_bias', 'sample_id',
                               template=tumour_bias_pdf),
                mgd.OutputFile('tumour_correction', 'sample_id',
                               template=tumour_correction_pdf),
                mgd.OutputFile('tumour_hmmcopy', 'sample_id',
                               template=tumour_hmmcopy_pdf),
                mgd.OutputFile('tumour_correction_table', 'sample_id',
                               template=tumour_correction_table),
                mgd.OutputFile('tumour_pygenes', 'sample_id',
                               template=tumour_pygenes),
                chromosomes,
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf'],
            ),
        )

    # Only outputs of callers that actually ran are reported to the metadata
    # task.
    filenames = []
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]
    if run_hmmcopy:
        filenames += [
            normal_bias_pdf,
            normal_correction_pdf,
            normal_hmmcopy_pdf,
            normal_correction_table,
            normal_pygenes,
            tumour_bias_pdf,
            tumour_correction_pdf,
            tumour_hmmcopy_pdf,
            tumour_correction_table,
            tumour_pygenes,
        ]
    if run_remixt:
        # Previously omitted: ReMixT results were produced but never listed.
        # Appended last so the order of the existing entries is unchanged.
        filenames += [
            remixt_outfile,
            remixt_brk_cn_csv,
            remixt_cn_csv,
            remixt_minor_modes_csv,
            remixt_mix_csv,
            remixt_read_depth_csv,
            remixt_stats_csv,
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'copynumber_calling'
            }
        })

    pyp.run(workflow)
예제 #10
0
def germline_calling_workflow(args):
    """Assemble and run the germline variant calling pipeline on normals.

    ``args`` is the parsed command-line mapping.  The keys ``input_yaml``,
    ``out_dir``, ``refdir`` and ``single_node`` are read here, and the whole
    mapping is also given to ``pypeliner.app.Pypeline`` as its configuration.

    The input YAML provides the ``normal`` and ``normal_id`` values; the keys
    of the ``normal`` mapping become the ``sample_id`` chunks.  Each result
    file is templated as
    ``<out_dir>/germline/{sample_id}/{sample_id}<suffix>``.  A final task
    passes the expanded result paths, the loaded input YAML and the
    ``metadata.yaml`` / ``input.yaml`` output locations to
    ``wgs.utils.helpers.generate_and_upload_metadata``.
    """
    sample_inputs = helpers.load_yaml(args['input_yaml'])

    out_root = args['out_dir']
    metadata_path = os.path.join(out_root, 'metadata.yaml')
    input_copy_path = os.path.join(out_root, 'input.yaml')

    normal_bams = helpers.get_values_from_input(sample_inputs, 'normal')
    sample_ids = list(normal_bams.keys())

    normal_names = helpers.get_values_from_input(sample_inputs, 'normal_id')

    germline_dir = os.path.join(out_root, 'germline')

    def per_sample(suffix):
        # Every output lives in its own per-sample directory and carries the
        # sample id as a filename prefix.
        return os.path.join(
            germline_dir, '{sample_id}', '{sample_id}' + suffix)

    # (managed object name, path template) in the positional order expected
    # by create_germline_calling_workflow.
    output_specs = [
        ('museq_ss_vcf', per_sample('_museq_single_annotated.vcf.gz')),
        ('museq_ss_maf', per_sample('_museq_single_annotated.maf')),
        ('museq_single_pdf', per_sample('_single_museqportrait.pdf')),
        ('samtools_germline_vcf', per_sample('_samtools_germline.vcf.gz')),
        ('samtools_germline_maf', per_sample('_samtools_germline.maf')),
        ('samtools_roh', per_sample('_roh.csv.gz')),
        ('freebayes_germline_vcf', per_sample('_freebayes_germline.vcf.gz')),
        ('freebayes_germline_maf', per_sample('_freebayes_germline.maf')),
        ('rtg_germline_vcf', per_sample('_rtg_germline.vcf.gz')),
        ('rtg_germline_maf', per_sample('_rtg_germline.maf')),
        ('consensus_germline_maf', per_sample('_consensus_germline.maf')),
    ]

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx())

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=sample_ids)

    # The subworkflow is declared without axes, so every managed file is
    # passed with axes_origin=[] (the sample_id axis is forwarded whole).
    normal_bam = mgd.InputFile(
        "normal.bam", 'sample_id', fnames=normal_bams,
        extensions=['.bai'], axes_origin=[])
    managed_outputs = tuple(
        mgd.OutputFile(label, 'sample_id', template=template, axes_origin=[])
        for label, template in output_specs
    )

    workflow.subworkflow(
        name='germline_calling',
        func=germline_calling.create_germline_calling_workflow,
        args=(
            (sample_ids, normal_bam)
            + managed_outputs
            + (args['refdir'], normal_names)
        ),
        kwargs={'single_node': args['single_node']},
    )

    result_templates = [template for _, template in output_specs]
    result_paths = helpers.expand_list(
        result_templates, sample_ids, "sample_id")

    metadata_kwargs = {
        'input_yaml_data': helpers.load_yaml(args['input_yaml']),
        'input_yaml': mgd.OutputFile(input_copy_path),
        # NOTE(review): the type is 'variant_calling', the same value the
        # somatic workflow uses -- confirm this is intended for germline.
        'metadata': {'type': 'variant_calling'},
    }
    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            out_root,
            result_paths,
            mgd.OutputFile(metadata_path),
        ),
        kwargs=metadata_kwargs,
    )

    pyp.run(workflow)
예제 #11
0
def variant_calling_workflow(args):
    """Build and run the per-sample variant calling workflow.

    Reads the config and input YAML files named in ``args``, derives the
    per-sample output path templates under ``<out_dir>/variants``, registers
    the ``sample_id`` axis and schedules the ``call_variants`` subworkflow
    with the tumour/normal BAM inputs and all output files.

    Args:
        args: mapping of run options. The keys read here are
            ``'config_file'``, ``'input_yaml'`` and ``'out_dir'``; the whole
            mapping is also passed to ``pypeliner.app.Pypeline`` as its
            config.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    # Materialise the sample ids as a list: on Python 3 ``dict.keys()`` is a
    # view object that cannot be pickled, and the value is handed to the
    # workflow both as the chunk set and as a subworkflow argument.
    # NOTE(review): presumably pypeliner serialises these values -- the other
    # workflows in this file already use ``list(tumours.keys())``.
    samples = list(tumours.keys())

    # Every output lives under <out_dir>/variants/<sample_id>/.
    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(museq_dir, '{sample_id}',
                             'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(museq_dir, '{sample_id}',
                                'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(museq_dir, '{sample_id}',
                                   'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(museq_dir, '{sample_id}',
                                     'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(museq_dir, '{sample_id}',
                                    'paired_museqportrait.pdf')
    museq_paired_pdf_txt = os.path.join(museq_dir, '{sample_id}',
                                        'paired_museqportrait.txt')
    museq_single_pdf = os.path.join(museq_dir, '{sample_id}',
                                    'single_museqportrait.pdf')
    museq_single_pdf_txt = os.path.join(museq_dir, '{sample_id}',
                                        'single_museqportrait.txt')

    # Declare the 'sample_id' axis that the managed files below are split on.
    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    # The positional order of ``args`` must match the signature of
    # ``call_variants``; do not reorder.
    workflow.subworkflow(name='variant_calling',
                         func=call_variants,
                         args=(
                             samples,
                             museq_dir,
                             config,
                             mgd.OutputFile('parsed_snv_csv',
                                            'sample_id',
                                            template=parsed_snv_csv,
                                            axes_origin=[]),
                             mgd.InputFile("tumour.bam",
                                           'sample_id',
                                           fnames=tumours,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.InputFile("normal.bam",
                                           'sample_id',
                                           fnames=normals,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.OutputFile('museq',
                                            'sample_id',
                                            template=museq_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_ss',
                                            'sample_id',
                                            template=museq_ss_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_snv',
                                            'sample_id',
                                            template=strelka_snv_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_indel',
                                            'sample_id',
                                            template=strelka_indel_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_paired_pdf',
                                            'sample_id',
                                            template=museq_paired_pdf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_paired_pdf_txt',
                                            'sample_id',
                                            template=museq_paired_pdf_txt,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_single_pdf',
                                            'sample_id',
                                            template=museq_single_pdf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_single_pdf_txt',
                                            'sample_id',
                                            template=museq_single_pdf_txt,
                                            axes_origin=[]),
                         ))

    pyp.run(workflow)