Example #1
def create_battenberg_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    somatic_breakpoint_file=None,
    normal_id=None,
    **kwargs
):
    if normal_id is None:
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
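
A minimal usage sketch for the workflow above. The paths, the empty config, and the Pypeline construction (mirroring Examples #2 and #11) are all hypothetical:

import pypeliner
import pypeliner.app

# Hypothetical inputs: one normal and one tumour seqdata file.
seqdata_files = {
    'normal': '/data/normal.seqdata.h5',
    'tumour_a': '/data/tumour_a.seqdata.h5',
}

workflow = create_battenberg_workflow(
    seqdata_files,
    config={},
    out_file='/results/battenberg.h5',
    raw_data_dir='/results/raw',
    normal_id='normal',
)

# Hypothetical pypeliner config; 'tmpdir' holds the pipeline state.
pyp = pypeliner.app.Pypeline([], {'tmpdir': '/tmp/pipeline'})
pyp.run(workflow)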
Example #2
def main(args):
    config = cli.load_pypeliner_config(args)

    pyp = pypeliner.app.Pypeline([], config)

    workflow = Workflow()

    workflow.subworkflow(name='snpeff',
                         func=snpeff.create_snpeff_annotation_workflow,
                         args=(pypeliner.managed.InputFile(
                             args.target_vcf_file),
                               pypeliner.managed.TempOutputFile('snpeff.h5')),
                         kwargs={
                             'data_base': args.data_base,
                             'split_size': args.split_size,
                             'table_name': 'snpeff'
                         })

    workflow.transform(name='convert_to_tsv',
                       func=convert_hdf5_to_tsv,
                       ctx={'mem': 2},
                       args=(pypeliner.managed.TempInputFile('snpeff.h5'),
                             'snpeff',
                             pypeliner.managed.OutputFile(args.out_file)),
                       kwargs={
                           'compress': True,
                           'index': False
                       })

    pyp.run(workflow)
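
The local helper convert_hdf5_to_tsv is referenced but not shown. Judging from the call site (input HDF5, table name, output path, plus compress and index kwargs), a plausible pandas-based sketch is:

import pandas as pd

def convert_hdf5_to_tsv(in_file, table_name, out_file, compress=False, index=False):
    # Load the named table from the HDF5 store and write it out as TSV.
    df = pd.read_hdf(in_file, table_name)
    df.to_csv(out_file, sep='\t', index=index,
              compression='gzip' if compress else None)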
Example #3
def create_mappability_wig_file(config, out_file):
    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability_bigwig',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.OutputFile(out_file + '.bigwig'),
        ))

    workflow.commandline(
        name='convert_mappability_to_wig',
        ctx={'mem': 4},
        args=(
            'mapCounter',
            '-w',
            config['window_size'],
            pypeliner.managed.InputFile(out_file + '.bigwig'),
            '>',
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
Example #4
def create_setup_theta_workflow(config, databases, **kwargs):
    mappability_dir = os.path.realpath(
        os.path.join(os.path.dirname(config['mappability_template']),
                     os.pardir))
    map_extract_log = os.path.join(mappability_dir, 'mappability_extract.log')
    chromosomes_dir = os.path.dirname(config['chromosome_template'])

    utils.make_directory(mappability_dir)
    utils.make_directory(chromosomes_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.TempOutputFile('mappability.tar.gz'),
        ))

    workflow.commandline(
        name='extract_mappability',
        args=(
            'tar',
            '-xzvf',
            pypeliner.managed.TempInputFile('mappability.tar.gz'),
            '-C',
            mappability_dir,
            '>',
            pypeliner.managed.OutputFile(map_extract_log),
        ),
    )

    for chromosome in config['chromosomes']:
        workflow.subworkflow(
            name='download_chromosome_{}'.format(chromosome),
            func=biowrappers.components.io.download.create_download_workflow,
            args=(
                config['chromosome_url_template'].format(chromosome),
                pypeliner.managed.TempOutputFile(
                    'chromosome_{}.fa.gz'.format(chromosome)),
            ))

        workflow.commandline(
            name='extract_chromosome_{}'.format(chromosome),
            args=(
                'gunzip',
                '-c',
                pypeliner.managed.TempInputFile(
                    'chromosome_{}.fa.gz'.format(chromosome)),
                '>',
                pypeliner.managed.OutputFile(
                    config['chromosome_template'].format(chromosome)),
            ),
        )

    return workflow
Example #5
def realignment_readgroups_pipeline(
        config,
        in_file,
        out_file):

    workflow = Workflow()

    workflow.transform(
        name='get_read_group_configs',
        func=tasks.get_read_group_configs,
        ret=pypeliner.managed.TempOutputObj('read_group_config', 'read_group_id'),
        args=(
            pypeliner.managed.InputFile(in_file),
        )
    )

    workflow.commandline(
        name='create_read_group_bam',
        axes=('read_group_id',),
        args=(
            'samtools', 'view', '-b',
            '-r', pypeliner.managed.InputInstance('read_group_id'),
            pypeliner.managed.InputFile(in_file),
            '>',
            pypeliner.managed.TempOutputFile('read_group_bam', 'read_group_id'),
        )
    )

    workflow.subworkflow(
        name='realignment_pipeline',
        axes=('read_group_id',),
        func=realignment_pipeline,
        args=(
            config,
            pypeliner.managed.TempInputFile('read_group_bam', 'read_group_id'),
            pypeliner.managed.TempOutputFile('realigned_read_group_bam', 'read_group_id'),
        ),
        kwargs={
            'read_group_info': pypeliner.managed.TempInputObj('read_group_config', 'read_group_id'),
        }
    )

    workflow.transform(
        name='merge_and_markdups',
        ctx={'mem': 48, 'num_retry': 3, 'mem_retry_increment': 16},
        func=bam_tasks.mark_duplicates,
        args=(
            pypeliner.managed.TempInputFile('realigned_read_group_bam', 'read_group_id'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp')
        }
    )

    return workflow
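
tasks.get_read_group_configs is not shown here. Because its ret is a TempOutputObj carrying the 'read_group_id' axis, pypeliner expects the function to return a dict keyed by read group ID (one axis chunk per key). A hypothetical pysam-based sketch:

import pysam

def get_read_group_configs(bam_filename):
    # One axis chunk per @RG entry in the BAM header; the values become
    # the per-read-group 'read_group_config' objects used downstream.
    with pysam.AlignmentFile(bam_filename) as bam:
        return {rg['ID']: rg for rg in bam.header.to_dict().get('RG', [])}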
Example #6
def create_setup_titan_workflow(config, databases, **kwargs):
    workflow = Workflow()

    workflow.subworkflow(name='gc_wig',
                         func=create_gc_wig_file,
                         args=(
                             config,
                             pypeliner.managed.InputFile(
                                 databases['ref_genome']['local_path']),
                             pypeliner.managed.OutputFile(config['gc_wig']),
                         ))

    workflow.subworkflow(name='mappability_wig',
                         func=create_mappability_wig_file,
                         args=(
                             config,
                             pypeliner.managed.OutputFile(
                                 config['mappability_wig']),
                         ))

    return workflow
Example #7
def create_dbsnp_download_workflow(config, out_file):

    workflow = Workflow()

    workflow.subworkflow(
        name='download',
        func=download.create_download_workflow,
        args=(
            config['url'],
            pypeliner.managed.OutputFile(out_file)
        )
    )

    workflow.transform(
        name='index',
        ctx={'mem': 4},
        func=vcf_tasks.index_vcf,
        args=(
            pypeliner.managed.InputFile(out_file),
        )
    )

    return workflow
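
vcf_tasks.index_vcf is referenced but not shown; a minimal sketch, assuming it builds a tabix index beside a bgzipped VCF:

import pysam

def index_vcf(vcf_file):
    # Writes vcf_file + '.tbi'; force=True replaces any stale index.
    # Requires the VCF to be bgzip-compressed.
    pysam.tabix_index(vcf_file, preset='vcf', force=True)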
Example #8
def call_and_annotate_pipeline(config,
                               normal_bam_path,
                               tumour_bam_paths,
                               raw_data_dir,
                               results_file,
                               chromosomes=default_chromosomes):

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id', axes_origin=[0]),
        value=tumour_bam_paths.keys(),
    )

    variant_files = get_variant_files(chromosomes, config, raw_data_dir)

    normal_bam_file = pypeliner.managed.File(normal_bam_path)

    tumour_bam_files = pypeliner.managed.File('tumour_bams',
                                              'tumour_sample_id',
                                              fnames=tumour_bam_paths)

    ref_genome_fasta_file = pypeliner.managed.File(
        config['databases']['ref_genome']['local_path'])

    #===================================================================================================================
    # Multi sample calling
    #===================================================================================================================
    if 'nuseq_multi_sample' in config:
        workflow.subworkflow(
            name='nuseq_multi_sample',
            axes=(),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(), [
                    pypeliner.managed.InputFile(x)
                    for x in tumour_bam_paths.values()
                ], ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_output()),
            kwargs=config['nuseq_multi_sample']['kwargs'])

        workflow.transform(
            name='convert_nuseq_multi_sample_vcf_to_hdf5',
            axes=(),
            ctx=default_ctx,
            func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
            args=(
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_input(),
                variant_files['snv']['hdf']['nuseq_multi_sample'].as_output(),
                '/snv/vcf/nuseq_multi_sample/all',
            ),
            kwargs={'score_callback': vcf_score_callbacks['snv']['nuseq']})

    #===================================================================================================================
    # Single sample calling
    #===================================================================================================================
    if 'nuseq' in config:
        workflow.subworkflow(
            name='nuseq',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(normal_bam_file.as_input(), [
                tumour_bam_files.as_input(),
            ], ref_genome_fasta_file.as_input(),
                  variant_files['snv']['vcf']['nuseq'].as_output()),
            kwargs=config['nuseq']['kwargs'])

    if 'mutect' in config:
        workflow.subworkflow(
            name='mutect',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.mutect.create_mutect_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  config['databases']['cosmic']['local_path'],
                  config['databases']['dbsnp']['local_path'],
                  variant_files['snv']['vcf']['mutect'].as_output()),
            kwargs=config['mutect']['kwargs'])

    if 'strelka' in config:
        workflow.subworkflow(
            name='strelka',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.strelka.create_strelka_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  variant_files['indel']['vcf']['strelka'].as_output(),
                  variant_files['snv']['vcf']['strelka'].as_output()),
            kwargs=config['strelka']['kwargs'])

    #===================================================================================================================
    # Convert vcf to hdf5
    #===================================================================================================================
    for var_type in variant_files:
        for prog in variant_files[var_type]['vcf']:
            if prog == 'nuseq_multi_sample':
                continue

            workflow.transform(
                name='convert_{0}_{1}_to_hdf5'.format(prog, var_type),
                axes=('tumour_sample_id', ),
                ctx=default_ctx,
                func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
                args=(variant_files[var_type]['vcf'][prog].as_input(),
                      variant_files[var_type]['hdf'][prog].as_output(),
                      pypeliner.managed.Template(
                          '/{var_type}/vcf/{prog}/{{tumour_sample_id}}'.format(
                              prog=prog, var_type=var_type),
                          'tumour_sample_id')),
                kwargs={'score_callback': vcf_score_callbacks[var_type][prog]})

    #===================================================================================================================
    # Indel annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_indels',
        ctx=big_mem_ctx,
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=([x.as_input() for x in variant_files['indel']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.indel.vcf')))

    workflow.transform(
        name='finalise_indels',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.indel.vcf'),
              pypeliner.managed.TempOutputFile('all.indel.vcf.gz')))

    workflow.subworkflow(
        name='annotate_indels',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.indel.vcf.gz'),
            pypeliner.managed.TempOutputFile('indel_annotations.h5'),
            os.path.join(raw_data_dir, 'indel'),
        ),
        kwargs={'variant_type': 'indel'})

    #===================================================================================================================
    # SNV
    #===================================================================================================================
    workflow.transform(
        name='merge_snvs',
        ctx=big_mem_ctx,
        func="biowrappers.components.io.vcf.tasks.merge_vcfs",
        args=([x.as_input() for x in variant_files['snv']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.snv.vcf')))

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.snv.vcf'),
              pypeliner.managed.TempOutputFile('all.snv.vcf.gz')))

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={'variant_type': 'snv'})

    workflow.subworkflow(
        name='normal_snv_counts',
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            normal_bam_file.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        ),
        kwargs=get_kwargs(config['snv_counts']['kwargs'],
                          '/snv/counts/normal'))

    workflow.subworkflow(
        name='tumour_snv_counts',
        axes=('tumour_sample_id', ),
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(tumour_bam_files.as_input(),
              pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'snv', 'counts',
                               '{tumour_sample_id}.h5'), 'tumour_sample_id')),
        kwargs=get_kwargs(
            config['snv_counts']['kwargs'],
            pypeliner.managed.Template('/snv/counts/{tumour_sample_id}',
                                       'tumour_sample_id')))

    #===================================================================================================================
    # Create final output
    #===================================================================================================================
    tables = [
        pypeliner.managed.TempInputFile('indel_annotations.h5'),
        pypeliner.managed.TempInputFile('snv_annotations.h5'),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts',
                         '{tumour_sample_id}.h5'), 'tumour_sample_id'),
    ]

    for var_type in variant_files:
        for prog in variant_files[var_type]['hdf']:
            tables.append(variant_files[var_type]['hdf'][prog].as_input())

    workflow.transform(
        name='build_results_file',
        ctx=default_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(tables, pypeliner.managed.OutputFile(results_file)),
        kwargs={
            'drop_duplicates': True,
        })

    return workflow
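
get_kwargs appears at several call sites above but is not defined in this listing. A plausible reading, given that it combines configured kwargs with an HDF5 table name (sometimes a managed Template), is a small merging helper:

def get_kwargs(kwargs, table_name):
    # Copy the configured kwargs and inject the destination table name;
    # `table_name` may be a plain string or a pypeliner managed Template.
    kwargs = dict(kwargs)
    kwargs['table_name'] = table_name
    return kwargs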
Example #9
def call_and_annotate_pipeline(
    config,
    normal_bam_path,
    tumour_bam_paths,
    raw_data_dir,
    results_file,
):
    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id'),
        value=tumour_bam_paths.keys(),
    )

    merge_inputs = {}

    if 'destruct' in config:
        destruct_raw_data = os.path.join(raw_data_dir, 'destruct')
        destruct_results_filename = os.path.join(destruct_raw_data,
                                                 'results.h5')
        make_parent_directory(destruct_results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                pypeliner.managed.OutputFile(destruct_results_filename),
                destruct_raw_data,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = pypeliner.managed.InputFile(
            destruct_results_filename)

    if 'delly' in config:
        delly_raw_data = os.path.join(raw_data_dir, 'delly')
        delly_results_filename = os.path.join(delly_raw_data, 'results.h5')
        make_parent_directory(delly_results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                pypeliner.managed.OutputFile(delly_results_filename),
                delly_raw_data,
            ),
        )

        merge_inputs['/breakpoints/delly'] = pypeliner.managed.InputFile(
            delly_results_filename)

    if 'lumpysv' in config:
        lumpysv_raw_data = os.path.join(raw_data_dir, 'lumpysv')
        lumpysv_results_filename = os.path.join(lumpysv_raw_data, 'results.h5')
        make_parent_directory(lumpysv_results_filename)

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                pypeliner.managed.OutputFile(lumpysv_results_filename),
                lumpysv_raw_data,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = pypeliner.managed.InputFile(
            lumpysv_results_filename)

    workflow.transform(name='merge_results',
                       ctx={'mem': 8},
                       func=hdf5_tasks.merge_hdf5,
                       args=(
                           merge_inputs,
                           pypeliner.managed.OutputFile(results_file),
                       ))

    return workflow
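
hdf5_tasks.merge_hdf5 receives merge_inputs as a dict mapping destination table paths to input HDF5 files (Example #1 instead passes a per-sample dict plus a table_names format string). A pandas-based sketch consistent with both call sites, offered purely as an assumption:

import pandas as pd

def merge_hdf5(in_files, out_file, table_names=None):
    # `in_files` is a dict: destination path (or axis chunk) -> filename.
    with pd.HDFStore(out_file, 'w') as out_store:
        for key, filename in in_files.items():
            dest = table_names.format(key) if table_names else key
            with pd.HDFStore(filename, 'r') as in_store:
                for table in in_store.keys():
                    out_store.put(dest + table, in_store[table])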
Example #10
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            snv_vcf_file,
                            snv_maf_file,
                            indel_vcf_file,
                            indel_maf_file,
                            reference,
                            reference_vep,
                            chromosomes,
                            normal_id,
                            tumour_id,
                            single_node=False,
                            is_exome=False):
    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(name='strelka_one_node',
                           func="wgs.workflows.strelka.tasks.strelka_one_node",
                           args=(
                               pypeliner.managed.InputFile(normal_bam_file,
                                                           extensions=['.bai']),
                               pypeliner.managed.InputFile(tumour_bam_file,
                                                           extensions=['.bai']),
                               reference,
                               mgd.TempOutputFile('indels.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempOutputFile('snvs.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempSpace('call_genome_segment_tmp'),
                               mgd.InputChunks('regions'),
                               mgd.TempInputObj('known_sizes'),
                           ),
                           kwargs={
                               'is_exome': is_exome,
                           })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': False,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    workflow.transform(name='bcftools_normalize_snv',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('snvs.vcf.gz'),
                           mgd.TempOutputFile('normalized_snvs.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(name='bcftools_normalize_indel',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('indels.vcf.gz'),
                           mgd.TempOutputFile('normalized_indels.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="strelka_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(indel_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
Example #11
def main(args):
    biowrappers.components.utils.make_directory(args.out_dir)

    with open(args.config_file) as config_file:
        config_text = config_file.read()
    config_text = config_text.format(out_dir=args.out_dir, ref_db_dir=args.ref_db_dir)
    config = yaml.safe_load(config_text)

    pypeliner_args = vars(args)
    pypeliner_args['tmpdir'] = os.path.join(args.out_dir, 'pipeline')

    pyp = pypeliner.app.Pypeline(modules=[tasks], config=pypeliner_args)

    download_urls = {}

    for sample in ('tumour', 'normal'):
        lanes = config['lanes'][sample]

        for lane in lanes:
            download_urls[(sample, lane)] = config['lanes'][sample][lane]['url']

    raw_lane_template = os.path.join(args.out_dir, 'lanes', 'raw', '{lane}.bam')

    realigned_lane_template = os.path.join(args.out_dir, 'lanes', 'realigned', '{lane}.bam')
    sample_bam_template = os.path.join(args.out_dir, '{sample}.bam')

    workflow = Workflow(default_ctx={'mem': 8})

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('url', 'sample', 'lane'),
        value=download_urls,
    )

    workflow.subworkflow(
        name='download_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            pypeliner.managed.TempInputObj('url', 'sample', 'lane'),
            pypeliner.managed.OutputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
        )
    )

    workflow.subworkflow(
        name='realign_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.pipelines.realignment.realignment_pipeline,
        args=(
            config['realignment'],
            pypeliner.managed.InputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
            pypeliner.managed.OutputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
        )
    )

    workflow.transform(
        name='merge_and_markdups',
        axes=('sample',),
        func=biowrappers.components.io.bam.tasks.mark_duplicates,
        args=(
            pypeliner.managed.InputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
            pypeliner.managed.OutputFile('bam', 'sample', template=sample_bam_template),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp', 'sample')
        }
    )

    pyp.run(workflow)

    normal_bam_file = sample_bam_template.format(sample='normal')
    tumour_bam_file = sample_bam_template.format(sample='tumour')

    workflow = Workflow(default_ctx={'mem': 8})

    breakpoint_raw_data_dir = os.path.join(args.out_dir, 'breakpoints', 'raw')
    breakpoint_results_file = os.path.join(args.out_dir, 'breakpoints', 'results.h5')

    workflow.subworkflow(
        name='breakpoint_call_and_annotate',
        func=biowrappers.pipelines.breakpoint_call_and_annotate.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            breakpoint_raw_data_dir,
            pypeliner.managed.OutputFile(breakpoint_results_file),
        ),
    )

    somatic_breakpoints_file = os.path.join(args.out_dir, 'somatic_breakpoints.tsv')

    workflow.transform(
        name='extract_somatic_breakpoint',
        ctx={'mem': 4},
        func=tasks.extract_somatic_breakpoint,
        args=(
            pypeliner.managed.InputFile(breakpoint_results_file),
            pypeliner.managed.OutputFile(somatic_breakpoints_file),
            config,
        )
    )

    copy_number_raw_data_dir = os.path.join(args.out_dir, 'copy_number', 'raw')
    copy_number_results_file = os.path.join(args.out_dir, 'copy_number', 'results.h5')

    workflow.subworkflow(
        name='copy_number_call_and_annotate',
        func=biowrappers.pipelines.copy_number.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            copy_number_raw_data_dir,
            pypeliner.managed.OutputFile(copy_number_results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': pypeliner.managed.InputFile(somatic_breakpoints_file),
        },
    )

    pyp.run(workflow)
Example #12
def create_setup_tools_workflow(databases, config):

    workflow = Workflow()

    if 'destruct' in config:
        import destruct.create_ref_data
        workflow.transform(
            name='destruct_create_ref_data',
            ctx={'mem': 16},
            func=destruct.create_ref_data.create_ref_data,
            args=(
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
            ),
        )

    if 'delly' in config:
        workflow.subworkflow(
            name='delly_exclude',
            func=download.create_download_workflow,
            args=(
                config['delly']['exclude_url'],
                pypeliner.managed.OutputFile(config['delly']['exclude_file']),
            )
        )

    if 'remixt' in config:
        workflow.subworkflow(
            name='create_setup_remixt_workflow',
            func=biowrappers.components.copy_number_calling.remixt.create_setup_remixt_workflow,
            args=(
                config['remixt']['config'],
                databases,
            ),
            kwargs={
                'ref_data_dir': config['remixt']['ref_data_dir'],
            },
        )

    if 'titan' in config:
        workflow.subworkflow(
            name='create_setup_titan_workflow',
            func=biowrappers.components.copy_number_calling.titan.create_setup_titan_workflow,
            args=(
                config['titan']['config'],
                databases,
            )
        )

    if 'theta' in config:
        workflow.subworkflow(
            name='create_setup_theta_workflow',
            func=biowrappers.components.copy_number_calling.theta.create_setup_theta_workflow,
            args=(
                config['theta']['config'],
                databases,
            )
        )

    if 'clonehd' in config:
        workflow.subworkflow(
            name='create_setup_clonehd_workflow',
            func=biowrappers.components.copy_number_calling.clonehd.create_setup_clonehd_workflow,
            args=(
                config['clonehd']['config'],
                databases,
            )
        )

    return workflow
Example #13
def create_ref_genome_download_and_index_workflow(config, out_file):

    workflow = Workflow()

    if config['url'].endswith('gz'):
        workflow.subworkflow(
            name='download',
            func=download.create_download_workflow,
            args=(
                config['url'],
                pypeliner.managed.TempOutputFile('ref.fasta.gz'),
            )
        )

        workflow.commandline(
            name='gunzip',
            args=(
                'gzip', '-cd',
                pypeliner.managed.TempInputFile('ref.fasta.gz'),
                '>',
                pypeliner.managed.OutputFile(out_file)
            ),
        )

    else:
        workflow.subworkflow(
            name='download',
            func=download.create_download_workflow,
            args=(
                config['url'],
                pypeliner.managed.OutputFile(out_file)
            )
        )

    workflow.commandline(
        name='build_dict',
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'samtools',
            'dict',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(out_file + '.build_dict.log'),
        )
    )

    workflow.commandline(
        name='build_fai',
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'samtools',
            'faidx',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(out_file + '.build_fai.log'),
        )
    )

    workflow.commandline(
        name='build_bwa_index',
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            'bwa',
            'index',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(out_file + '.build_bwa_index.log'),
        )
    )

    return workflow
Example #14
def create_setup_reference_dbs_workflow(config):

    workflow = Workflow()

    if 'cosmic' in config:
        workflow.transform(
            name='cosmic',
            func=tasks.download_cosmic,
            args=(
                config['cosmic'],
                pypeliner.managed.OutputFile(config['cosmic']['local_path']),
                pypeliner.managed.TempSpace('cosmic_work', cleanup=None)
            )
        )

    if 'dbsnp' in config:
        workflow.subworkflow(
            name='dbsnp',
            func=create_dbsnp_download_workflow,
            args=(
                config['dbsnp'],
                pypeliner.managed.OutputFile(config['dbsnp']['local_path']),
            )
        )

    if 'mappability' in config:
        workflow.subworkflow(
            name='mappability',
            func=download.create_download_workflow,
            args=(
                config['mappability']['url'],
                pypeliner.managed.OutputFile(config['mappability']['local_path']),
            )
        )

    if 'ref_genome' in config and 'url' in config['ref_genome']:
        workflow.subworkflow(
            name='ref_genome',
            func=create_ref_genome_download_and_index_workflow,
            args=(
                config['ref_genome'],
                pypeliner.managed.OutputFile(config['ref_genome']['local_path']),
            )
        )

    if 'snpeff' in config:
        workflow.commandline(
            name='snpeff',
            args=(
                'snpEff',
                'download',
                config['snpeff']['db']
            )
        )

    if 'chrom_info' in config:
        workflow.subworkflow(
            name='chrom_info',
            func=download.create_download_workflow,
            args=(
                config['chrom_info']['url'],
                pypeliner.managed.OutputFile(config['chrom_info']['local_path']),
            )
        )

    return workflow
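
For reference, this is the shape of config the function reads; the keys mirror the lookups above, while every value is a hypothetical placeholder ('cosmic' likely carries extra download settings consumed by tasks.download_cosmic):

config = {
    'cosmic': {'local_path': '/refdata/cosmic.vcf.gz'},
    'dbsnp': {'url': 'http://example.org/dbsnp.vcf.gz',
              'local_path': '/refdata/dbsnp.vcf.gz'},
    'mappability': {'url': 'http://example.org/mappability.bigwig',
                    'local_path': '/refdata/mappability.bigwig'},
    'ref_genome': {'url': 'http://example.org/ref.fasta.gz',
                   'local_path': '/refdata/ref.fasta'},
    'snpeff': {'db': 'GRCh37.75'},
    'chrom_info': {'url': 'http://example.org/chromInfo.txt.gz',
                   'local_path': '/refdata/chromInfo.txt.gz'},
}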
Example #15
def create_annotation_workflow(
    config,
    in_vcf_file,
    cosmic_status_file,
    dbsnp_status_file,
    mappability_file,
    snpeff_file,
    trinuc_file,
    variant_type='snv',
):

    annotators = ('cosmic_status', 'dbsnp_status', 'mappability', 'snpeff',
                  'tri_nucleotide_context')

    kwargs = {}

    for a in annotators:
        kwargs[a] = get_kwargs(config[a]['kwargs'],
                               '/{0}/{1}'.format(variant_type, a))

    workflow = Workflow()

    workflow.subworkflow(
        name='cosmic_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(config['databases']['cosmic']['local_path'],
              pypeliner.managed.InputFile(in_vcf_file),
              pypeliner.managed.OutputFile(cosmic_status_file)),
        kwargs=config["cosmic_status"]['kwargs'])

    workflow.subworkflow(
        name='dbsnp_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(config['databases']['dbsnp']['local_path'],
              pypeliner.managed.InputFile(in_vcf_file),
              pypeliner.managed.OutputFile(dbsnp_status_file)),
        kwargs=config["dbsnp_status"]['kwargs'])

    workflow.subworkflow(
        name='mappability',
        func='biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['mappability']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file, extensions=['.tbi']),
            pypeliner.managed.OutputFile(mappability_file),
        ),
        kwargs=config["mappability"]['kwargs'])

    workflow.subworkflow(
        name='snpeff',
        func='biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(config['databases']['snpeff']['db'],
              config['databases']['snpeff']['data_dir'],
              pypeliner.managed.InputFile(in_vcf_file),
              pypeliner.managed.OutputFile(snpeff_file)),
        kwargs=kwargs['snpeff'])

    workflow.subworkflow(
        name='tri_nucleotide_context',
        func='biowrappers.components.variant_calling.tri_nucleotide_context.create_vcf_tric_nucleotide_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['ref_genome']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(trinuc_file),
        ),
        kwargs=config["tri_nucleotide_context"]['kwargs'])

    return workflow
Example #16
def destruct_pipeline(
    normal_bam_file,
    tumour_bam_files,
    config,
    ref_data_dir,
    out_file,
    raw_data_dir,
    normal_sample_id='normal',
):
    bam_files = dict(tumour_bam_files)  # copy so the caller's dict is not mutated
    bam_files[normal_sample_id] = normal_bam_file

    utils.make_directory(os.path.join(raw_data_dir, 'raw'))
    breakpoint_file = os.path.join(raw_data_dir, 'raw', 'breakpoint.tsv')
    breakpoint_library_file = os.path.join(raw_data_dir, 'raw',
                                           'breakpoint_library.tsv')
    breakpoint_read_file = os.path.join(raw_data_dir, 'raw',
                                        'breakpoint_read.tsv')

    utils.make_directory(os.path.join(raw_data_dir, 'somatic'))
    somatic_breakpoint_file = os.path.join(raw_data_dir, 'somatic',
                                           'breakpoint.tsv')
    somatic_breakpoint_library_file = os.path.join(raw_data_dir, 'somatic',
                                                   'breakpoint_library.tsv')

    raw_read_data_dir = os.path.join(raw_data_dir, 'read_data')
    utils.make_directory(raw_read_data_dir)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=bam_files.keys(),
    )

    workflow.subworkflow(
        name='run_destruct',
        func="destruct.workflow.create_destruct_workflow",
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile(breakpoint_file),
            pypeliner.managed.OutputFile(breakpoint_library_file),
            pypeliner.managed.OutputFile(breakpoint_read_file),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_read_data_dir,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func='biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints',
        args=(
            pypeliner.managed.InputFile(breakpoint_file),
            pypeliner.managed.InputFile(breakpoint_library_file),
            [normal_sample_id],
            pypeliner.managed.OutputFile(somatic_breakpoint_file),
            pypeliner.managed.OutputFile(somatic_breakpoint_library_file),
        ),
    )

    workflow.transform(
        name='write_store',
        func='biowrappers.components.breakpoint_calling.destruct.tasks.write_store',
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile(somatic_breakpoint_library_file),
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
Example #17
def create_annotation_workflow(
    config,
    in_vcf_file,
    out_file,
    raw_data_dir,
    variant_type='snv',
    docker_config={},
    snpeff_docker={},
    vcftools_docker={},
):

    annotators = ('cosmic_status', 'dbsnp_status', 'mappability', 'snpeff',
                  'tri_nucleotide_context')

    result_files = {}

    kwargs = {}

    for a in annotators:
        kwargs[a] = get_kwargs(config[a]['kwargs'],
                               '/{0}/{1}'.format(variant_type, a))

        result_files[a] = pypeliner.managed.File(
            os.path.join(raw_data_dir, '{0}.csv.gz'.format(a)))

    if not os.path.isdir(raw_data_dir):
        os.mkdir(raw_data_dir)

    assert os.path.isdir(raw_data_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='cosmic_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['cosmic']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['cosmic_status'].as_output(),
        ),
        kwargs=config["cosmic_status"]['kwargs'])

    workflow.subworkflow(
        name='dbsnp_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['dbsnp']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['dbsnp_status'].as_output(),
        ),
        kwargs=config["dbsnp_status"]['kwargs'])

    workflow.subworkflow(
        name='mappability',
        func='biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['mappability']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file, extensions=['.tbi']),
            result_files['mappability'].as_output(),
        ),
        kwargs=config["mappability"]['kwargs'])

    workflow.subworkflow(
        name='snpeff',
        func='biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['snpeff'].as_output(),
        ),
        kwargs=dict(snpeff_docker=snpeff_docker, **kwargs['snpeff']))

    workflow.subworkflow(
        name='tri_nucleotide_context',
        func='biowrappers.components.variant_calling.tri_nucleotide_context.create_vcf_tric_nucleotide_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['ref_genome']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['tri_nucleotide_context'].as_output(),
        ),
        kwargs=config["tri_nucleotide_context"]['kwargs'])

    workflow.transform(name='build_results_file',
                       ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(
                           [x.as_input() for x in result_files.values()],
                           pypeliner.managed.OutputFile(out_file,
                                                        extensions=[".yaml"]),
                       ))

    return workflow