Example #1
def create_hla_type_workflow(
        normal_bam_file,
        hla_type_file):

    workflow = Workflow()

    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools', 'view', '-bh', '-f', '2', '-F', '4',
            pypeliner.managed.InputFile(normal_bam_file),
            '6',
            '|',
            'samtools', 'collate', '-O', '-', pypeliner.managed.TempSpace('chr6_collate_temp'),
            '|',
            'samtools', 'bam2fq',
            '-1', pypeliner.managed.TempOutputFile('chr6_reads_1.fq'),
            '-2', pypeliner.managed.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    workflow.transform(
        name='optitype',
        ctx={'mem': 24},
        func=tasks.run_optitype,
        args=(
            pypeliner.managed.TempInputFile('chr6_reads_1.fq'),
            pypeliner.managed.TempInputFile('chr6_reads_2.fq'),
            pypeliner.managed.OutputFile(hla_type_file),
            pypeliner.managed.TempSpace('optitype_temp'),
        )
    )

    return workflow
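
Workflow factories like this one only declare the DAG; execution happens through pypeliner's application entry point, as Example #2 below does. A minimal driver sketch, assuming the default app config (the file names and the 'tmpdir' key are placeholders, not part of the original):

import pypeliner
import pypeliner.app

# Hypothetical invocation; paths are placeholders.
workflow = create_hla_type_workflow('normal.bam', 'hla_types.tsv')

# Pypeline takes a list of modules to version-track plus a config mapping.
pyp = pypeliner.app.Pypeline([], {'tmpdir': './pipeline_tmp'})
pyp.run(workflow)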
Example #2
def main(args):
    config = cli.load_pypeliner_config(args)

    pyp = pypeliner.app.Pypeline([], config)

    workflow = Workflow()

    workflow.subworkflow(name='snpeff',
                         func=snpeff.create_snpeff_annotation_workflow,
                         args=(pypeliner.managed.InputFile(
                             args.target_vcf_file),
                               pypeliner.managed.TempOutputFile('snpeff.h5')),
                         kwargs={
                             'data_base': args.data_base,
                             'split_size': args.split_size,
                             'table_name': 'snpeff'
                         })

    workflow.transform(name='convert_to_tsv',
                       func=convert_hdf5_to_tsv,
                       ctx={'mem': 2},
                       args=(pypeliner.managed.TempInputFile('snpeff.h5'),
                             'snpeff',
                             pypeliner.managed.OutputFile(args.out_file)),
                       kwargs={
                           'compress': True,
                           'index': False
                       })

    pyp.run(workflow)
Example #3
def create_battenberg_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    somatic_breakpoint_file=None,
    normal_id=None,
    **kwargs
):
    if normal_id is None:
        raise ValueError('cloneHD requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
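
The sample_id axis above is driven entirely by the keys of seqdata_files, and the results template expands once per tumour sample. A sketch of the expected input shape (the sample IDs, paths, and empty config are invented placeholders):

# Hypothetical inputs: one matched normal plus one tumour sample.
seqdata_files = {
    'sample_n': 'seqdata_sample_n.h5',  # matched normal
    'sample_t': 'seqdata_sample_t.h5',  # tumour
}

# 'sample_n' is popped out as the normal; the remaining keys become
# chunks of the 'sample_id' axis, so 'sample_{sample_id}.h5' expands
# to one results file per tumour.
workflow = create_battenberg_workflow(
    seqdata_files,
    config={},               # assumed tool configuration
    out_file='battenberg.h5',
    raw_data_dir='./raw',
    normal_id='sample_n',
)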
Example #4
def create_samtools_germline_workflow(
        normal_bam_files,
        normal_bai_files,
        ref_genome_fasta_file,
        vcf_file,
        config,
        chromosomes=default_chromosomes,
        base_docker=None,
        samtools_docker=None,
        vcftools_docker=None
):

    ctx = {'mem': config["memory"]['low'],
           'pool_id': config['pools']['standard'],
           'mem_retry_increment': 2,
           'ncpus': 1}
    if base_docker:
        ctx.update(base_docker)

    regions = list(normal_bam_files.keys())

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        ctx=ctx,
        axes=('regions',),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam', 'regions', fnames=normal_bam_files),
            pypeliner.managed.InputFile('normal.split.bam.bai', 'regions', fnames=normal_bai_files),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            'vcftools_docker': vcftools_docker
        },
    )
  
    workflow.transform(
        name='concatenate_variants',
        ctx=ctx,
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker}
    )

    return workflow
Example #5
def realignment_readgroups_pipeline(
        config,
        in_file,
        out_file):

    workflow = Workflow()

    workflow.transform(
        name='get_read_group_configs',
        func=tasks.get_read_group_configs,
        ret=pypeliner.managed.TempOutputObj('read_group_config', 'read_group_id'),
        args=(
            pypeliner.managed.InputFile(in_file),
        )
    )

    workflow.commandline(
        name='create_read_group_bam',
        axes=('read_group_id',),
        args=(
            'samtools', 'view', '-b',
            '-r', pypeliner.managed.InputInstance('read_group_id'),
            pypeliner.managed.InputFile(in_file),
            '>',
            pypeliner.managed.TempOutputFile('read_group_bam', 'read_group_id'),
        )
    )

    workflow.subworkflow(
        name='realignment_pipeline',
        axes=('read_group_id',),
        func=realignment_pipeline,
        args=(
            config,
            pypeliner.managed.TempInputFile('read_group_bam', 'read_group_id'),
            pypeliner.managed.TempOutputFile('realigned_read_group_bam', 'read_group_id'),
        ),
        kwargs={
            'read_group_info': pypeliner.managed.TempInputObj('read_group_config', 'read_group_id'),
        }
    )

    workflow.transform(
        name='merge_and_markdups',
        axes=('read_group_id',),
        ctx={'mem' : 48, 'num_retry' : 3, 'mem_retry_increment' : 16},
        func=bam_tasks.mark_duplicates,
        args=(
            pypeliner.managed.TempInputFile('realigned_read_group_bam', 'read_group_id'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'tmp_dir' : pypeliner.managed.TempSpace('markdup_temp', 'read_group_id')
        }
    )

    return workflow
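
The first transform returns into a TempOutputObj that carries the read_group_id axis: in pypeliner, the keys of the returned dict define the chunks of that axis. A sketch of what tasks.get_read_group_configs could look like (the pysam-based body is an assumption; only the dict-per-chunk contract comes from the workflow above):

import pysam

def get_read_group_configs(bam_filename):
    # Keys of this dict become the chunks of the 'read_group_id' axis;
    # each value is delivered per-chunk through TempInputObj.
    with pysam.AlignmentFile(bam_filename, 'rb') as bam:
        return {rg['ID']: dict(rg) for rg in bam.header.to_dict().get('RG', [])}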
Example #6
def create_ascat_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    if normal_id is None:
        raise ValueError('ASCAT requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt', 'sample_id'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt', 'sample_id'),
            config,
        ),
    )

    return workflow
Example #7
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):

    workflow = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    if copy_ref_genome:
        workflow.commandline(
            name='copy_genome',
            ctx={'local': True},
            args=(
                'cp',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    else:
        workflow.commandline(
            name='link_genome',
            ctx={'local': True},
            args=(
                'ln',
                '-s',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    workflow.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=(
            mgd.InputFile(local_ref_genome_fasta_path),
            mgd.OutputFile(ref_genome_index_prefix),
        )
    )

    workflow.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=(
            mgd.InputFile(ref_genome_index_prefix),
            mgd.InputFile(transcript_gtf_file),
            mgd.OutputFile(transcriptome_index_prefix),
        )
    )

    return workflow
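
A sketch of what the genome index task might reduce to, assuming it shells out to bowtie2 (the body and the choice of bowtie2 are assumptions; the real tasks.build_genome_index may differ):

import subprocess

def build_genome_index(ref_genome_fasta_file, ref_genome_index_prefix):
    # bowtie2-build writes several index files sharing the given prefix.
    subprocess.check_call(
        ['bowtie2-build', ref_genome_fasta_file, ref_genome_index_prefix])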
Example #8
def create_samtools_germline_workflow(normal_bam_files,
                                      ref_genome_fasta_file,
                                      vcf_file,
                                      config,
                                      samtools_docker=None,
                                      vcftools_docker=None):
    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem': config["memory"]['low'],
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': baseimage
    }

    regions = list(normal_bam_files.keys())

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        axes=('regions', ),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam',
                                        'regions',
                                        fnames=normal_bam_files,
                                        extensions=['.bai']),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            'vcftools_docker': vcftools_docker
        },
    )

    workflow.transform(
        name='concatenate_variants',
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker})

    return workflow
Example #9
def create_download_workflow(url, file_name):

    workflow = Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('url'), value=url)

    workflow.transform(name='download',
                       ctx={'local': True},
                       func=tasks.download_from_url,
                       args=(pypeliner.managed.TempInputObj('url'),
                             pypeliner.managed.OutputFile(file_name)))

    return workflow
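
Routing the URL through setobj bakes it into the workflow state, so a changed URL invalidates the download on rerun. A minimal sketch of a compatible task (the body is an assumption; the real tasks.download_from_url likely adds retries and validation):

import urllib.request

def download_from_url(url, file_name):
    # Stream the remote resource into the managed output path.
    urllib.request.urlretrieve(url, file_name)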
Example #10
def create_battenberg_single_workflow(
    normal_seqdata_file,
    tumour_seqdata_file,
    normal_id,
    tumour_id,
    results_file,
    config,
    somatic_breakpoint_file=None,
    **kwargs
):
    workflow = Workflow()

    workflow.transform(
        name='prepare_data',
        ctx={'mem': 20},
        func=tasks.prepare_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile(tumour_seqdata_file),
            normal_id,
            tumour_id,
            pypeliner.managed.TempOutputFile('allele_counts.tar.gz'),
            pypeliner.managed.TempSpace('prepare_battenberg_temp', cleanup=None),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='run_battenberg',
        ctx={'mem': 20},
        func=tasks.run_battenberg,
        args=(
            pypeliner.managed.TempInputFile('allele_counts.tar.gz'),
            normal_id,
            tumour_id,
            pypeliner.managed.OutputFile(results_file),
            pypeliner.managed.TempSpace('run_battenberg_temp', cleanup=None),
            config
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    return workflow
Example #11
def create_snpeff_annotation_workflow(db,
                                      data_dir,
                                      target_vcf_file,
                                      out_file,
                                      base_docker={},
                                      snpeff_docker={},
                                      classic_mode=True,
                                      split_size=int(1e3),
                                      table_name='snpeff'):

    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    if base_docker:
        ctx.update(base_docker)

    workflow = Workflow()

    workflow.transform(name='split_vcf',
                       ctx=dict(mem=2, **ctx),
                       func='biowrappers.components.io.vcf.tasks.split_vcf',
                       args=(mgd.InputFile(target_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='run_snpeff',
        axes=('split', ),
        ctx=dict(mem=8, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.run_snpeff',
        args=(db, data_dir, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('snpeff.vcf', 'split')),
        kwargs={
            'classic_mode': classic_mode,
            'docker_config': snpeff_docker
        })

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split', ),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.convert_vcf_to_table',
        args=(mgd.TempInputFile('snpeff.vcf', 'split'),
              mgd.TempOutputFile('snpeff.csv.gz',
                                 'split',
                                 extensions=['.yaml']), table_name))

    workflow.transform(name='concatenate_tables',
                       ctx=dict(mem=4, **ctx),
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(mgd.TempInputFile('snpeff.csv.gz', 'split'),
                             mgd.OutputFile(out_file, extensions=['.yaml'])))

    return workflow
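
This snpEff workflow is an instance of pypeliner's split/apply/merge idiom: one transform introduces a new axis, per-chunk transforms run along it, and a final transform with no axes consumes every chunk at once. Stripped to a skeleton (the function parameters are placeholders):

import pypeliner.managed as mgd
from pypeliner.workflow import Workflow

def split_apply_merge_skeleton(in_file, out_file, split_func, process_func, merge_func):
    workflow = Workflow()
    # Introduces the 'split' axis: one chunk per temp file written.
    workflow.transform(
        name='split', func=split_func,
        args=(mgd.InputFile(in_file), mgd.TempOutputFile('chunk', 'split')))
    # Runs once per chunk of the 'split' axis.
    workflow.transform(
        name='process', axes=('split',), func=process_func,
        args=(mgd.TempInputFile('chunk', 'split'),
              mgd.TempOutputFile('result', 'split')))
    # No axes argument: receives all chunks together and merges them.
    workflow.transform(
        name='merge', func=merge_func,
        args=(mgd.TempInputFile('result', 'split'), mgd.OutputFile(out_file)))
    return workflow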
Example #12
def create_pvacseq_workflow(
    vcf_file,
    hla_type_file,
    results_file,
    config,
):
    workflow = Workflow()

    workflow.commandline(
        name='vep',
        ctx={'mem': 16},
        args=(
            'variant_effect_predictor.pl',
            '--input_file', pypeliner.managed.InputFile(vcf_file),
            '--format', 'vcf',
            '--output_file', pypeliner.managed.TempOutputFile('vep_annotated.vcf'),
            '--vcf', '--symbol', '--terms', 'SO',
            '--plugin', 'Downstream',
            '--plugin', 'Wildtype',
            '--cache', '--offline', '--force_overwrite',
            '--assembly', 'GRCh37',
            '--dir', config['vep_dir'],
            '--dir_plugins', os.path.join(config['vep_dir'], 'Plugins'),
        ),
    )

    workflow.transform(
        name='run_pvacseq',
        func=tasks.run_pvacseq,
        args=(
            pypeliner.managed.TempInputFile('vep_annotated.vcf'),
            pypeliner.managed.InputFile(hla_type_file),
            pypeliner.managed.OutputFile(results_file),
            pypeliner.managed.TempSpace('pvacseq_temp'),
            config,
        ),
    )

    return workflow
Example #13
def create_clonehd_single_workflow(
    normal_seqdata_file,
    tumour_seqdata_file,
    config,
    results_file,
    somatic_breakpoint_file=None,
    **kwargs
):
    workflow = Workflow()

    workflow.transform(
        name='prepare_data',
        ctx={'mem': 20},
        func=tasks.prepare_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile(tumour_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.cna.txt'),
            pypeliner.managed.TempOutputFile('tumour.cna.txt'),
            pypeliner.managed.TempOutputFile('tumour.baf.txt'),
            config,
        ),
    )

    workflow.transform(
        name='run_clonehd',
        ctx={'mem': 8},
        func=tasks.run_clonehd,
        args=(
            pypeliner.managed.TempInputFile('normal.cna.txt'),
            pypeliner.managed.TempInputFile('tumour.cna.txt'),
            pypeliner.managed.TempInputFile('tumour.baf.txt'),
            pypeliner.managed.TempOutputFile('tumour.summary.txt'),
            pypeliner.managed.TempOutputFile('cna_subclone', 'subclone'),
            pypeliner.managed.TempOutputFile('bam_subclone', 'subclone', axes_origin=[]),
            pypeliner.managed.TempSpace('run_clonehd_temp', cleanup=None),
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='report',
        ctx={'mem': 4},
        func=tasks.report,
        args=(
            pypeliner.managed.TempInputFile('tumour.summary.txt'),
            pypeliner.managed.TempInputFile('cna_subclone', 'subclone'),
            pypeliner.managed.TempInputFile('bam_subclone', 'subclone'),
            pypeliner.managed.OutputFile(results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    return workflow
Example #14
def create_dbsnp_download_workflow(config, out_file):

    workflow = Workflow()

    workflow.subworkflow(
        name='download',
        func=download.create_download_workflow,
        args=(
            config['url'],
            pypeliner.managed.OutputFile(out_file)
        )
    )

    workflow.transform(
        name='index',
        ctx={'mem': 4},
        func=vcf_tasks.index_vcf,
        args=(
            pypeliner.managed.InputFile(out_file),
        )
    )

    return workflow
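
A sketch of an index task consistent with the call above, assuming a bgzip-compressed VCF and pysam (the real vcf_tasks.index_vcf may differ):

import pysam

def index_vcf(vcf_filename):
    # Writes a tabix index (<file>.tbi) alongside the compressed VCF.
    pysam.tabix_index(vcf_filename, preset='vcf', force=True)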
Example #15
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            snv_csv_file,
                            chromosomes=default_chromosomes,
                            use_depth_thresholds=True):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3
    }

    regions = list(normal_bam_file.keys())
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ))

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region', ),
        args=(
            pypeliner.managed.InputFile("normal.split.bam",
                                        "region",
                                        fnames=normal_bam_file,
                                        extensions=['.bai']),
            pypeliner.managed.InputFile("merged_bam",
                                        "region",
                                        fnames=tumour_bam_file,
                                        extensions=['.bai']),
            pypeliner.managed.TempInputObj('known_sizes'),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf',
                                             'region'),
            pypeliner.managed.TempOutputFile(
                'somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf',
                                             'region'),
            pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
            pypeliner.managed.InputInstance("region"),
        ),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf',
                                              'region'),
              pypeliner.managed.TempInputFile('strelka.stats', 'region'),
              pypeliner.managed.TempInputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf',
                                               'chrom'),
              pypeliner.managed.InputInstance("chrom"),
              pypeliner.managed.TempInputObj('known_sizes'), regions),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf',
                                            'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf',
                                             'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"),
        ))

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_snvs_temp"),
        ))

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')))

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')))

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
            pypeliner.managed.OutputFile(indel_vcf_file,
                                         extensions=['.tbi', '.csi']),
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
            pypeliner.managed.OutputFile(snv_vcf_file,
                                         extensions=['.tbi', '.csi']),
        ))

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            pypeliner.managed.InputFile(snv_vcf_file),
            pypeliner.managed.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        })

    workflow.transform(
        name='prep_strelka_csv',
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(pypeliner.managed.TempInputFile('strelka_snv.csv'),
              pypeliner.managed.OutputFile(snv_csv_file,
                                           extensions=['.yaml'])),
        kwargs={'dtypes': dtypes()['snv_strelka']})

    return workflow
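
The score_callback hook lets the VCF-to-CSV converter attach a per-record score column. A plausible sketch of strelka_snv_callback, assuming PyVCF-style records and Strelka's QSS somatic quality tag (both assumptions):

def strelka_snv_callback(record):
    # Strelka reports somatic SNV quality as QSS in the INFO column.
    return record.INFO['QSS']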
Example #16
def call_and_annotate_pipeline(config,
                               normal_bam_path,
                               tumour_bam_paths,
                               raw_data_dir,
                               results_file,
                               chromosomes=default_chromosomes):

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id', axes_origin=[0]),
        value=list(tumour_bam_paths.keys()),
    )

    variant_files = get_variant_files(chromosomes, config, raw_data_dir)

    normal_bam_file = pypeliner.managed.File(normal_bam_path)

    tumour_bam_files = pypeliner.managed.File('tumour_bams',
                                              'tumour_sample_id',
                                              fnames=tumour_bam_paths)

    ref_genome_fasta_file = pypeliner.managed.File(
        config['databases']['ref_genome']['local_path'])

    #===================================================================================================================
    # Multi sample calling
    #===================================================================================================================
    if 'nuseq_multi_sample' in config:
        workflow.subworkflow(
            name='nuseq_multi_sample',
            axes=(),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(), [
                    pypeliner.managed.InputFile(x)
                    for x in tumour_bam_paths.values()
                ], ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_output()),
            kwargs=config['nuseq_multi_sample']['kwargs'])

        workflow.transform(
            name='convert_nuseq_multi_sample_vcf_to_hdf5',
            axes=(),
            ctx=default_ctx,
            func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
            args=(
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_input(),
                variant_files['snv']['hdf']['nuseq_multi_sample'].as_output(),
                '/snv/vcf/nuseq_multi_sample/all',
            ),
            kwargs={'score_callback': vcf_score_callbacks['snv']['nuseq']})

    #===================================================================================================================
    # Single sample calling
    #===================================================================================================================
    if 'nuseq' in config:
        workflow.subworkflow(
            name='nuseq',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(normal_bam_file.as_input(), [
                tumour_bam_files.as_input(),
            ], ref_genome_fasta_file.as_input(),
                  variant_files['snv']['vcf']['nuseq'].as_output()),
            kwargs=config['nuseq']['kwargs'])

    if 'mutect' in config:
        workflow.subworkflow(
            name='mutect',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.mutect.create_mutect_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  config['databases']['cosmic']['local_path'],
                  config['databases']['dbsnp']['local_path'],
                  variant_files['snv']['vcf']['mutect'].as_output()),
            kwargs=config['mutect']['kwargs'])

    if 'strelka' in config:
        workflow.subworkflow(
            name='strelka',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.strelka.create_strelka_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  variant_files['indel']['vcf']['strelka'].as_output(),
                  variant_files['snv']['vcf']['strelka'].as_output()),
            kwargs=config['strelka']['kwargs'])

    #===================================================================================================================
    # Convert vcf to hdf5
    #===================================================================================================================
    for var_type in variant_files:
        for prog in variant_files[var_type]['vcf']:
            if prog == 'nuseq_multi_sample':
                continue

            workflow.transform(
                name='convert_{0}_{1}_to_hdf5'.format(prog, var_type),
                axes=('tumour_sample_id', ),
                ctx=default_ctx,
                func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
                args=(variant_files[var_type]['vcf'][prog].as_input(),
                      variant_files[var_type]['hdf'][prog].as_output(),
                      pypeliner.managed.Template(
                          '/{var_type}/vcf/{prog}/{{tumour_sample_id}}'.format(
                              prog=prog, var_type=var_type),
                          'tumour_sample_id')),
                kwargs={'score_callback': vcf_score_callbacks[var_type][prog]})

    #===================================================================================================================
    # Indel annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_indels',
        ctx=big_mem_ctx,
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=([x.as_input() for x in variant_files['indel']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.indel.vcf')))

    workflow.transform(
        name='finalise_indels',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.indel.vcf'),
              pypeliner.managed.TempOutputFile('all.indel.vcf.gz')))

    workflow.subworkflow(
        name='annotate_indels',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.indel.vcf.gz'),
            pypeliner.managed.TempOutputFile('indel_annotations.h5'),
            os.path.join(raw_data_dir, 'indel'),
        ),
        kwargs={'variant_type': 'indel'})

    #===================================================================================================================
    # SNV
    #===================================================================================================================
    workflow.transform(
        name='merge_snvs',
        ctx=big_mem_ctx,
        func="biowrappers.components.io.vcf.tasks.merge_vcfs",
        args=([x.as_input() for x in variant_files['snv']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.snv.vcf')))

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.snv.vcf'),
              pypeliner.managed.TempOutputFile('all.snv.vcf.gz')))

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={'variant_type': 'snv'})

    workflow.subworkflow(
        name='normal_snv_counts',
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            normal_bam_file.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        ),
        kwargs=get_kwargs(config['snv_counts']['kwargs'],
                          '/snv/counts/normal'))

    workflow.subworkflow(
        name='tumour_snv_counts',
        axes=('tumour_sample_id', ),
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(tumour_bam_files.as_input(),
              pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'snv', 'counts',
                               '{tumour_sample_id}.h5'), 'tumour_sample_id')),
        kwargs=get_kwargs(
            config['snv_counts']['kwargs'],
            pypeliner.managed.Template('/snv/counts/{tumour_sample_id}',
                                       'tumour_sample_id')))

    #===================================================================================================================
    # Create final output
    #===================================================================================================================
    tables = [
        pypeliner.managed.TempInputFile('indel_annotations.h5'),
        pypeliner.managed.TempInputFile('snv_annotations.h5'),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts',
                         '{tumour_sample_id}.h5'), 'tumour_sample_id'),
    ]

    for var_type in variant_files:
        for prog in variant_files[var_type]['hdf']:
            tables.append(variant_files[var_type]['hdf'][prog].as_input())

    workflow.transform(
        name='build_results_file',
        ctx=default_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(tables, pypeliner.managed.OutputFile(results_file)),
        kwargs={
            'drop_duplicates': True,
        })

    return workflow
Example #17
def delly_pipeline(
    normal_bam_file,
    tumour_bam_files,
    ref_genome_fasta_file,
    delly_excl_chrom,
    out_file,
    raw_data_dir,
):
    bams = list()
    for lib_id, bam_filename in tumour_bam_files.items():
        bams += [
            utils.symlink(bam_filename,
                          link_name='{0}.bam'.format(lib_id),
                          link_directory=raw_data_dir)
        ]
        utils.symlink(bam_filename + '.bai',
                      link_name='{0}.bam.bai'.format(lib_id),
                      link_directory=raw_data_dir)

    bams += [
        utils.symlink(normal_bam_file,
                      link_name='Normal.bam',
                      link_directory=raw_data_dir)
    ]
    utils.symlink(normal_bam_file + '.bai',
                  link_name='Normal.bam.bai',
                  link_directory=raw_data_dir)

    sample_type = {'Normal': 'control'}
    for lib_id in tumour_bam_files.keys():
        sample_type[lib_id] = 'tumor'

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('sample_type', 'sample_id'),
        value=sample_type,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sv_type'),
        value=('DEL', 'DUP', 'INV', 'TRA', 'INS'),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type', ),
        ctx={
            'mem': 64,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        func=tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='write_samples_table',
        ctx={'mem': 1},
        func=tasks.write_samples_table,
        args=(
            mgd.TempInputObj('sample_type', 'sample_id'),
            mgd.TempOutputFile('samples.tsv'),
        ),
    )

    workflow.transform(
        name='delly_filter_somatic',
        axes=('sv_type', ),
        ctx={
            'mem': 4,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        func=tasks.run_delly_filter,
        args=(
            mgd.Instance('sv_type'),
            mgd.TempInputFile('samples.tsv'),
            ref_genome_fasta_file,
            mgd.TempInputFile('out.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='concatenate_vcf',
        func=vcf_tasks.concatenate_bcf,
        ctx={
            'mem': 4,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        args=(
            mgd.TempInputFile('somatic.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=tasks.convert_vcf,
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
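
delly's somatic filter expects a tab-separated table labelling each sample tumor or control, which is what the samples.tsv step produces. A sketch of write_samples_table consistent with the setobj above (the body is an assumption):

def write_samples_table(sample_type, out_filename):
    # With no axes on the transform, sample_type arrives as the full
    # dict behind the 'sample_id' axis, e.g. {'Normal': 'control', ...}.
    with open(out_filename, 'w') as f:
        for sample_id, label in sample_type.items():
            f.write('{0}\t{1}\n'.format(sample_id, label))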
Example #18
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            config,
                            chromosomes=default_chromosomes,
                            split_size=int(1e7),
                            use_depth_thresholds=True):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3,
        'docker_image': config['docker']['single_cell_pipeline']
    }

    strelka_docker = {'docker_image': config['docker']['strelka']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    regions = list(normal_bam_file.keys())
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(ref_genome_fasta_file,
              pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
              strelka_docker))

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region', ),
        args=(pypeliner.managed.InputFile("normal.split.bam",
                                          "region",
                                          fnames=normal_bam_file,
                                          extensions=['.bai']),
              pypeliner.managed.InputFile("merged_bam",
                                          "region",
                                          fnames=tumour_bam_file,
                                          extensions=['.bai']),
              pypeliner.managed.TempInputObj('known_sizes'),
              ref_genome_fasta_file,
              pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf',
                                               'region'),
              pypeliner.managed.TempOutputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf',
                                               'region'),
              pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
              pypeliner.managed.InputInstance("region"), strelka_docker),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf',
                                              'region'),
              pypeliner.managed.TempInputFile('strelka.stats', 'region'),
              pypeliner.managed.TempInputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf',
                                               'chrom'),
              pypeliner.managed.InputInstance("chrom"),
              pypeliner.managed.TempInputObj('known_sizes'), regions),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf',
                                            'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf',
                                             'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"), vcftools_docker))

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf',
                                              'chrom'),
              pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempSpace("merge_snvs_temp"), vcftools_docker))

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')))

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')))

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
              pypeliner.managed.OutputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
              vcftools_docker))

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
              pypeliner.managed.OutputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
              vcftools_docker))

    return workflow
Example #19
def create_theta_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    if normal_id is None:
        raise ValueError('Theta requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results',
                                    'sample_{sample_id}.h5')
    bicseq2_seg_template = os.path.join(raw_data_dir, 'bicseq2',
                                        'bicseq2_{sample_id}.seg')
    utils.make_parent_directory(results_template)
    utils.make_parent_directory(bicseq2_seg_template)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    workflow.transform(
        name='run_bicseq2',
        axes=('sample_id', ),
        ctx={'mem': 30},
        func=tasks.run_bicseq2_seg,
        args=(
            pypeliner.managed.OutputFile('bicseq2_seg',
                                         'sample_id',
                                         template=bicseq2_seg_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            config,
            pypeliner.managed.TempSpace('bicseq2_work',
                                        'sample_id',
                                        cleanup=None),
        ),
    )

    workflow.transform(
        name='run_theta',
        axes=('sample_id', ),
        ctx={'mem': 32},
        func=tasks.run_theta,
        args=(
            pypeliner.managed.OutputFile('results',
                                         'sample_id',
                                         template=results_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.InputFile('bicseq2_seg',
                                        'sample_id',
                                        template=bicseq2_seg_template),
            config,
            pypeliner.managed.TempSpace('theta_work',
                                        'sample_id',
                                        cleanup=None),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
            'num_clones': kwargs.get('num_clones', None),
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results',
                                        'sample_id',
                                        template=results_template),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
Example #20
def create_nuseq_classify_workflow(normal_bam_file,
                                   tumour_bam_files,
                                   ref_genome_fasta_file,
                                   out_file,
                                   chromosomes=utils.default_chromosomes,
                                   chunk_size=int(1e5),
                                   indel_threshold=0.05,
                                   min_normal_depth=1,
                                   min_tumour_depth=1,
                                   min_somatic_probability=0.0,
                                   split_size=int(1e7)):

    workflow = Workflow()

    workflow.transform(
        name='get_regions',
        func=utils.get_bam_regions,
        ret=pypeliner.managed.TempOutputObj('regions', 'regions'),
        args=(
            pypeliner.managed.InputFile(tumour_bam_files[0]),
            split_size,
        ),
        kwargs={
            'chromosomes': chromosomes,
        },
    )

    workflow.transform(
        name='run_classify',
        axes=('regions', ),
        ctx={
            'mem': 6,
            'num_retry': 3,
            'mem_retry_increment': 2,
            'io': 1
        },
        func=tasks.run_classify,
        args=(pypeliner.managed.InputFile(normal_bam_file),
              [pypeliner.managed.InputFile(x) for x in tumour_bam_files],
              pypeliner.managed.InputFile(ref_genome_fasta_file),
              pypeliner.managed.TempInputObj('regions', 'regions'),
              pypeliner.managed.TempOutputFile('classified.h5', 'regions')),
        kwargs={
            'chunk_size': chunk_size,
            'min_normal_depth': min_normal_depth,
            'min_tumour_depth': min_tumour_depth,
            'min_somatic_probability': min_somatic_probability
        })

    workflow.transform(
        name='write_vcf',
        axes=('regions', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.write_vcf,
        args=(pypeliner.managed.TempInputFile('classified.h5', 'regions'),
              pypeliner.managed.TempOutputFile('classified.vcf', 'regions')),
        kwargs={'indel_threshold': indel_threshold})

    workflow.transform(name='merge_vcf',
                       ctx={
                           'mem': 8,
                           'num_retry': 3,
                           'mem_retry_increment': 2
                       },
                       func=vcf_tasks.concatenate_vcf,
                       args=(pypeliner.managed.TempInputFile(
                           'classified.vcf', 'regions'),
                             pypeliner.managed.TempOutputFile('merged.vcf')))

    workflow.transform(
        name='filter_snvs',
        ctx={
            'mem': 2,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.filter_vcf,
        args=(pypeliner.managed.TempInputFile('merged.vcf'),
              pypeliner.managed.TempOutputFile('merged.filtered.vcf')))

    workflow.transform(
        name='finalise',
        func=vcf_tasks.finalise_vcf,
        args=(pypeliner.managed.TempInputFile('merged.filtered.vcf'),
              pypeliner.managed.OutputFile(out_file)))

    return workflow
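
get_regions returns into a TempOutputObj carrying the regions axis, so the keys of the returned dict define the chunks and each value is one region string. A window-based sketch of utils.get_bam_regions (the body and the region string format are assumptions):

import pysam

def get_bam_regions(bam_filename, split_size, chromosomes=None):
    # Enumerate fixed-size windows over each chromosome in the BAM header.
    regions = {}
    idx = 0
    with pysam.AlignmentFile(bam_filename, 'rb') as bam:
        for chrom, length in zip(bam.references, bam.lengths):
            if chromosomes is not None and chrom not in chromosomes:
                continue
            for start in range(0, length, split_size):
                end = min(start + split_size, length)
                regions[idx] = '{0}:{1}-{2}'.format(chrom, start + 1, end)
                idx += 1
    return regions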
Example #21
def call_and_annotate_pipeline(
    config,
    normal_bam_path,
    tumour_bam_paths,
    raw_data_dir,
    results_file,
):
    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id'),
        value=list(tumour_bam_paths.keys()),
    )

    merge_inputs = {}

    if 'destruct' in config:
        destruct_raw_data = os.path.join(raw_data_dir, 'destruct')
        destruct_results_filename = os.path.join(destruct_raw_data,
                                                 'results.h5')
        make_parent_directory(destruct_results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                pypeliner.managed.OutputFile(destruct_results_filename),
                destruct_raw_data,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = pypeliner.managed.InputFile(
            destruct_results_filename)

    if 'delly' in config:
        delly_raw_data = os.path.join(raw_data_dir, 'delly')
        delly_results_filename = os.path.join(delly_raw_data, 'results.h5')
        make_parent_directory(delly_results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                pypeliner.managed.OutputFile(delly_results_filename),
                delly_raw_data,
            ),
        )

        merge_inputs['/breakpoints/delly'] = pypeliner.managed.InputFile(
            delly_results_filename)

    if 'lumpysv' in config:
        lumpysv_raw_data = os.path.join(raw_data_dir, 'lumpysv')
        lumpysv_results_filename = os.path.join(lumpysv_raw_data, 'results.h5')
        make_parent_directory(lumpysv_results_filename)

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                pypeliner.managed.OutputFile(lumpysv_results_filename),
                lumpysv_raw_data,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = pypeliner.managed.InputFile(
            lumpysv_results_filename)

    workflow.transform(name='merge_results',
                       ctx={'mem': 8},
                       func=hdf5_tasks.merge_hdf5,
                       args=(
                           merge_inputs,
                           pypeliner.managed.OutputFile(results_file),
                       ))

    return workflow
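
Each structural-variant caller above is enabled purely by the presence of its key in config, so callers can be toggled without code changes. A minimal sketch of a config enabling destruct and delly while skipping lumpysv; all paths are placeholders, and the keys mirror the lookups in the function body:

config = {
    'destruct': {
        'config': {},                                   # destruct parameter overrides
        'ref_data_dir': '/refdata/destruct',            # placeholder path
    },
    'delly': {
        'ref_genome_fasta_file': '/refdata/genome.fa',  # placeholder path
        'exclude_file': '/refdata/delly_exclude.tsv',   # placeholder path
    },
    # no 'lumpysv' key, so that subworkflow and its merge input are skipped
}
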
Exemplo n.º 22
0
def realignment_pipeline(
        config,
        in_file,
        out_file,
        read_group_info=None):

    if read_group_info is None:
        read_group_info = config.get('read_group', {})

    if 'ID' not in read_group_info:
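        # NOTE: under Python 3, str hashes are randomized per process, so this
        # derived ID is only stable within a single run; set a 'read_group'
        # ID in the config for reproducible read groups.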
        read_group_info['ID'] = hash(in_file) % int(1e6)

    ref_genome = pypeliner.managed.InputFile(config['ref_genome']['file'])

    read_1 = pypeliner.managed.TempFile('read_1', 'split')

    read_2 = pypeliner.managed.TempFile('read_2', 'split')

    read_1_sai = pypeliner.managed.TempFile('read_1.sai', 'split')

    read_2_sai = pypeliner.managed.TempFile('read_2.sai', 'split')

    read_group_config = pypeliner.managed.TempObj('read_group_config')

    workflow = Workflow()

    if 'read_group' in config:
        workflow.setobj(
            obj=read_group_config.as_output(),
            value=read_group_info,
        )

    else:
        workflow.transform(
            name='get_read_group_config',
            ctx={'local': True},
            func=tasks.get_read_group_config,
            ret=read_group_config.as_output(),
            args=(
                pypeliner.managed.InputFile(in_file),
            )
        )

    workflow.transform(
        name='bam_to_fasta',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.convert_to_fastqs,
        args=(
            pypeliner.managed.InputFile(in_file),
            {
                1: read_1.as_output(),
                2: read_2.as_output(),
            },
            pypeliner.managed.TempSpace('bam_to_fastq'),
        ),
        kwargs={
            'split_size': config['split_size']
        },
    )

    workflow.transform(
        name='aln_read_1',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_1.as_input(),
            ref_genome,
            read_1_sai.as_output(),
        ),
    )

    workflow.transform(
        name='aln_read_2',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_2.as_input(),
            ref_genome,
            read_2_sai.as_output(),
        ),
    )

    workflow.transform(
        name='sampe',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_sampe,
        args=(
            read_1.as_input(),
            read_2.as_input(),
            read_1_sai.as_input(),
            read_2_sai.as_input(),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
        ),
        kwargs={
            'read_group_info': read_group_config.as_input()
        },
    )

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.sort,
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
        ),
    )

    workflow.transform(
        name='write_header_file',
        axes=(),
        ctx={'local': True},
        func=tasks.write_header_file,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.TempOutputFile('header.sam'),
            config['ref_genome']['header']
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.merge,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'header_file': pypeliner.managed.TempInputFile('header.sam'),
        },
    )

    return workflow
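
The config dict drives everything here: the reference paths, the scatter split size, and the optional fixed read group. A minimal calling sketch; the paths are placeholders and the keys mirror the lookups in the function body:

config = {
    'ref_genome': {
        'file': '/refdata/genome.fa',     # placeholder, indexed for bwa aln/sampe
        'header': '/refdata/header.sam',  # placeholder, consumed by write_header_file
    },
    'split_size': int(1e6),               # reads per fastq chunk
    # optionally pin the read group: 'read_group': {'ID': 1, 'SM': 'sample_a'}
}
workflow = realignment_pipeline(config, 'input.bam', 'realigned.bam')
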
Exemplo n.º 23
0
def main(args):
    biowrappers.components.utils.make_directory(args.out_dir)

    with open(args.config_file) as config_file:
        config_text = config_file.read()
    config_text = config_text.format(out_dir=args.out_dir, ref_db_dir=args.ref_db_dir)
    config = yaml.safe_load(config_text)

    pypeliner_args = vars(args)
    pypeliner_args['tmpdir'] = os.path.join(args.out_dir, 'pipeline')

    pyp = pypeliner.app.Pypeline(modules=[tasks], config=pypeliner_args)

    download_urls = {}

    for sample in ('tumour', 'normal'):
        lanes = config['lanes'][sample]

        for lane in lanes:
            download_urls[(sample, lane)] = config['lanes'][sample][lane]['url']

    raw_lane_template = os.path.join(args.out_dir, 'lanes', 'raw', '{lane}.bam')

    realigned_lane_template = os.path.join(args.out_dir, 'lanes', 'realigned', '{lane}.bam')
    sample_bam_template = os.path.join(args.out_dir, '{sample}.bam')

    workflow = Workflow(default_ctx={'mem': 8})

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('url', 'sample', 'lane'),
        value=download_urls,
    )

    workflow.subworkflow(
        name='download_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            pypeliner.managed.TempInputObj('url', 'sample', 'lane'),
            pypeliner.managed.OutputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
        )
    )

    workflow.subworkflow(
        name='realign_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.pipelines.realignment.realignment_pipeline,
        args=(
            config['realignment'],
            pypeliner.managed.InputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
            pypeliner.managed.OutputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
        )
    )

    workflow.transform(
        name='merge_and_markdups',
        axes=('sample',),
        func=biowrappers.components.io.bam.tasks.mark_duplicates,
        args=(
            pypeliner.managed.InputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
            pypeliner.managed.OutputFile('bam', 'sample', template=sample_bam_template),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp', 'sample')
        }
    )

    pyp.run(workflow)

    normal_bam_file = sample_bam_template.format(sample='normal')
    tumour_bam_file = sample_bam_template.format(sample='tumour')

    workflow = Workflow(default_ctx={'mem': 8})

    breakpoint_raw_data_dir = os.path.join(args.out_dir, 'breakpoints', 'raw')
    breakpoint_results_file = os.path.join(args.out_dir, 'breakpoints', 'results.h5')

    workflow.subworkflow(
        name='breakpoint_call_and_annotate',
        func=biowrappers.pipelines.breakpoint_call_and_annotate.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            breakpoint_raw_data_dir,
            pypeliner.managed.OutputFile(breakpoint_results_file),
        ),
    )

    somatic_breakpoints_file = os.path.join(args.out_dir, 'somatic_breakpoints.tsv')

    workflow.transform(
        name='extract_somatic_breakpoint',
        ctx={'mem': 4},
        func=tasks.extract_somatic_breakpoint,
        args=(
            pypeliner.managed.InputFile(breakpoint_results_file),
            pypeliner.managed.OutputFile(somatic_breakpoints_file),
            config,
        )
    )

    copy_number_raw_data_dir = os.path.join(args.out_dir, 'copy_number', 'raw')
    copy_number_results_file = os.path.join(args.out_dir, 'copy_number', 'results.h5')

    workflow.subworkflow(
        name='copy_number_call_and_annotate',
        func=biowrappers.pipelines.copy_number.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            copy_number_raw_data_dir,
            pypeliner.managed.OutputFile(copy_number_results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': pypeliner.managed.InputFile(somatic_breakpoints_file),
        },
    )

    pyp.run(workflow)
Exemplo n.º 24
0
def create_titan_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    if normal_id is None:
        raise ValueError('Titan requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=tumour_seqdata_files.keys(),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.wig'),
            pypeliner.managed.TempOutputFile('het_positions.tsv'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.TempInputFile('het_positions.tsv'),
            pypeliner.managed.TempOutputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv',
                                             'sample_id'),
            config,
        ),
    )

    workflow.transform(
        name='create_intialization_parameters',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.create_intialization_parameters,
        ret=pypeliner.managed.TempOutputObj('init_params', 'sample_id',
                                            'init_param_id'),
        args=(config, ),
    )

    workflow.transform(
        name='run_titan',
        axes=('sample_id', 'init_param_id'),
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.run_titan,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id',
                                           'init_param_id'),
            pypeliner.managed.TempInputFile('normal.wig'),
            pypeliner.managed.TempInputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempInputFile('tumour_alleles.tsv', 'sample_id'),
            pypeliner.managed.TempOutputFile('cn.tsv', 'sample_id',
                                             'init_param_id'),
            pypeliner.managed.TempOutputFile('params.tsv', 'sample_id',
                                             'init_param_id'),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    workflow.transform(
        name='select_solution',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.select_solution,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id',
                                           'init_param_id'),
            pypeliner.managed.TempInputFile('cn.tsv', 'sample_id',
                                            'init_param_id'),
            pypeliner.managed.TempInputFile('params.tsv', 'sample_id',
                                            'init_param_id'),
            pypeliner.managed.OutputFile('results',
                                         'sample_id',
                                         template=results_files),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_cn_segments.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_igv.tsv'),
                'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'),
                'sample_id'),
            config,
            pypeliner.managed.Template('{sample_id}', 'sample_id'),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
        },
    )

    workflow.setobj(obj=pypeliner.managed.OutputChunks('sample_id',
                                                       'chromosome'),
                    value=config.get('chromosomes', default_chromosomes),
                    axes=('sample_id', ))

    workflow.commandline(
        name='plot_chromosome',
        axes=('sample_id', 'chromosome'),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            'plot_titan_chromosome.R',
            pypeliner.managed.Instance('chromosome'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'),
                'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_chr_{chromosome}.png'), 'sample_id',
                'chromosome'),
        ),
    )

    workflow.transform(
        name='merge_results',
        ctx={
            'mem': 8,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results',
                                        'sample_id',
                                        template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
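
Although normal_id defaults to None, it is required, and omitting it raises ValueError before any jobs are scheduled. A minimal usage sketch with placeholder paths and sample IDs:

seqdata_files = {
    'normal': '/data/normal_seqdata.h5',       # placeholder paths
    'tumour_a': '/data/tumour_a_seqdata.h5',
}
workflow = create_titan_workflow(
    seqdata_files,
    config,                                    # assumed to hold TITAN parameters and 'chromosomes'
    'titan_results.h5',
    './titan_raw_data',
    somatic_breakpoint_file='breakpoints.tsv', # optional, forwarded to select_solution
    normal_id='normal',
)
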
Exemplo n.º 25
0
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            snv_vcf_file,
                            snv_maf_file,
                            indel_vcf_file,
                            indel_maf_file,
                            reference,
                            reference_vep,
                            chromosomes,
                            normal_id,
                            tumour_id,
                            single_node=False,
                            is_exome=False):
    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'),
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file, extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file, extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': False,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    workflow.transform(name='bcftools_normalize_snv',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('snvs.vcf.gz'),
                           mgd.TempOutputFile('normalized_snvs.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(name='bcftools_normalize_indel',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('indels.vcf.gz'),
                           mgd.TempOutputFile('normalized_indels.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="strelka_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(indel_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
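
The single_node flag switches between a scatter of per-region jobs and one large job that loops over the regions internally; everything downstream of the merged VCFs is identical either way. A usage sketch with placeholder paths and sample IDs:

workflow = create_strelka_workflow(
    'normal.bam', 'tumour.bam',
    'snvs.vcf.gz', 'snvs.maf',
    'indels.vcf.gz', 'indels.maf',
    '/refdata/genome.fa',                    # placeholder reference fasta
    '/refdata/vep',                          # placeholder VEP reference dir
    [str(c) for c in range(1, 23)] + ['X'],  # chromosomes to call
    'normal_sample', 'tumour_sample',
    single_node=True,                        # one job iterates over all regions
)
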
Exemplo n.º 26
0
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            chromosomes=default_chromosomes,
                            split_size=int(1e7),
                            use_depth_thresholds=True):

    workflow = Workflow()

    workflow.transform(
        name='get_chromosomes',
        func=get_chromosomes,
        ret=pypeliner.managed.TempOutputObj('chrom_dummy', 'chrom'),
        args=(pypeliner.managed.InputFile(tumour_bam_file), ),
        kwargs={'chromosomes': chromosomes},
    )

    workflow.transform(
        name='split_chromosomes',
        axes=('chrom', ),
        func=get_coords,
        ret=pypeliner.managed.TempOutputObj('coord_dummy', 'chrom', 'coord'),
        args=(pypeliner.managed.InputFile(tumour_bam_file),
              pypeliner.managed.TempInputObj('chrom_dummy',
                                             'chrom'), split_size))

    workflow.transform(
        name='count_fasta_bases',
        ctx={
            'mem': 2,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.count_fasta_bases,
        args=(pypeliner.managed.InputFile(ref_genome_fasta_file),
              pypeliner.managed.TempOutputFile('ref_base_counts.tsv')))

    workflow.transform(
        name='get_known_chromosomes_sizes',
        ctx={'local': True},
        func=get_known_chromosome_sizes,
        ret=pypeliner.managed.TempOutputObj('known_sizes',
                                            'chrom',
                                            axes_origin=[]),
        args=(pypeliner.managed.InputFile(tumour_bam_file),
              pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    workflow.transform(
        name='call_somatic_variants',
        axes=('chrom', 'coord'),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.call_somatic_variants,
        args=(pypeliner.managed.InputFile(normal_bam_file),
              pypeliner.managed.InputFile(tumour_bam_file),
              pypeliner.managed.InputFile(ref_genome_fasta_file),
              pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf',
                                               'chrom', 'coord'),
              pypeliner.managed.TempOutputFile(
                  'somatic.indels.unfiltered.vcf.window', 'chrom', 'coord'),
              pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf',
                                               'chrom', 'coord'),
              pypeliner.managed.TempOutputFile('strelka.stats', 'chrom',
                                               'coord'),
              pypeliner.managed.TempInputObj('chrom_dummy', 'chrom'),
              pypeliner.managed.TempInputObj('coord_dummy', 'chrom', 'coord'),
              pypeliner.managed.TempInputObj('known_sizes', 'chrom')))

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.filter_indel_file_list,
        args=(pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf',
                                              'chrom', 'coord'),
              pypeliner.managed.TempInputFile('strelka.stats', 'chrom',
                                              'coord'),
              pypeliner.managed.TempInputFile(
                  'somatic.indels.unfiltered.vcf.window', 'chrom', 'coord'),
              pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf',
                                               'chrom'),
              pypeliner.managed.TempInputObj('chrom_dummy', 'chrom'),
              pypeliner.managed.TempInputObj('known_sizes', 'chrom')),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.filter_snv_file_list,
        args=(pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf',
                                              'chrom', 'coord'),
              pypeliner.managed.TempInputFile('strelka.stats', 'chrom',
                                              'coord'),
              pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf',
                                               'chrom'),
              pypeliner.managed.TempInputObj('chrom_dummy', 'chrom'),
              pypeliner.managed.TempInputObj('known_sizes', 'chrom')),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(name='merge_indels',
                       ctx={
                           'mem': 4,
                           'num_retry': 3,
                           'mem_retry_increment': 2
                       },
                       func=vcf_tasks.concatenate_vcf,
                       args=(pypeliner.managed.TempInputFile(
                           'somatic.indels.filtered.vcf', 'chrom'),
                             pypeliner.managed.TempOutputFile(
                                 'somatic.indels.filtered.vcf.gz')))

    workflow.transform(
        name='merge_snvs',
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.concatenate_vcf,
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz')))

    workflow.transform(
        name='filter_indels',
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.filter_vcf,
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')))

    workflow.transform(
        name='filter_snvs',
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.filter_vcf,
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')))

    workflow.transform(
        name='finalise_indels',
        func=vcf_tasks.finalise_vcf,
        args=(pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
              pypeliner.managed.OutputFile(indel_vcf_file)))

    workflow.transform(
        name='finalise_snvs',
        func=vcf_tasks.finalise_vcf,
        args=(pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
              pypeliner.managed.OutputFile(snv_vcf_file)))

    return workflow
Exemplo n.º 27
0
def create_mutect_workflow(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        cosmic_vcf_file,
        dbsnp_vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7)):

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('regions', 'regions'),
        value=utils.get_bam_regions(tumour_bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.transform(
        name='run_classify',
        axes=('regions',),
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 2, 'io': 1},
        func=tasks.run_mutect,
        args=(
            pypeliner.managed.InputFile(normal_bam_file),
            pypeliner.managed.InputFile(tumour_bam_file),
            pypeliner.managed.InputFile(ref_genome_fasta_file),
            pypeliner.managed.InputFile(cosmic_vcf_file),
            pypeliner.managed.InputFile(dbsnp_vcf_file),
            pypeliner.managed.TempInputObj('regions', 'regions'),
            pypeliner.managed.TempOutputFile('classified.vcf', 'regions')
        ),
    )

    workflow.transform(
        name='merge_vcf',
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 8},
        func=vcf_tasks.concatenate_vcf,
        args=(
            pypeliner.managed.TempInputFile('classified.vcf', 'regions'),
            pypeliner.managed.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'bcf_index_file': pypeliner.managed.TempOutputFile('merged.vcf.gz.csi'),
            'vcf_index_file': pypeliner.managed.TempOutputFile('merged.vcf.gz.tbi'),
        }
    )

    workflow.transform(
        name='filter_snvs',
        ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2},
        func=vcf_tasks.filter_vcf,
        args=(
            pypeliner.managed.TempInputFile('merged.vcf.gz'),
            pypeliner.managed.TempOutputFile('merged.filtered.vcf')
        )
    )

    workflow.transform(
        name='finalise',
        func=vcf_tasks.finalise_vcf,
        args=(
            pypeliner.managed.TempInputFile('merged.filtered.vcf'),
            pypeliner.managed.OutputFile(out_file)
        )
    )

    return workflow
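
A usage sketch for the MuTect factory; the paths are placeholders, and restricting chromosomes together with a smaller split_size is a convenient way to smoke-test the workflow:

workflow = create_mutect_workflow(
    'normal.bam',
    'tumour.bam',
    '/refdata/genome.fa',      # placeholder
    '/refdata/cosmic.vcf',     # placeholder
    '/refdata/dbsnp.vcf',      # placeholder
    'mutect_somatic.vcf.gz',
    chromosomes=['21', '22'],  # subset for a quick test run
    split_size=int(5e6),
)
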
Exemplo n.º 28
0
def create_annotation_workflow(
    config,
    in_vcf_file,
    out_file,
    raw_data_dir,
    variant_type='snv',
    docker_config=None,
    snpeff_docker=None,
    vcftools_docker=None,
):
    # Use None defaults to avoid the shared-mutable-default-argument pitfall.
    docker_config = docker_config or {}
    snpeff_docker = snpeff_docker or {}
    vcftools_docker = vcftools_docker or {}

    annotators = ('cosmic_status', 'dbsnp_status', 'mappability', 'snpeff',
                  'tri_nucleotide_context')

    result_files = {}

    kwargs = {}

    for a in annotators:
        kwargs[a] = get_kwargs(config[a]['kwargs'],
                               '/{0}/{1}'.format(variant_type, a))

        result_files[a] = pypeliner.managed.File(
            os.path.join(raw_data_dir, '{0}.csv.gz'.format(a)))

    if not os.path.isdir(raw_data_dir):
        os.mkdir(raw_data_dir)

    assert os.path.isdir(raw_data_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='cosmic_status',
        func=
        'biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['cosmic']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['cosmic_status'].as_output(),
        ),
        kwargs=config["cosmic_status"]['kwargs'])

    workflow.subworkflow(
        name='dbsnp_status',
        func=
        'biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['dbsnp']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['dbsnp_status'].as_output(),
        ),
        kwargs=config["dbsnp_status"]['kwargs'])

    workflow.subworkflow(
        name='mappability',
        func=
        'biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['mappability']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file, extensions=['.tbi']),
            result_files['mappability'].as_output(),
        ),
        kwargs=config["mappability"]['kwargs'])

    workflow.subworkflow(
        name='snpeff',
        func=
        'biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['snpeff'].as_output(),
        ),
        kwargs=dict(snpeff_docker=snpeff_docker, **kwargs['snpeff']))

    workflow.subworkflow(
        name='tri_nucleotide_context',
        func=
        'biowrappers.components.variant_calling.tri_nucleotide_context.create_vcf_tric_nucleotide_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
        args=(
            config['databases']['ref_genome']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            result_files['tri_nucleotide_context'].as_output(),
        ),
        kwargs=config["tri_nucleotide_context"]['kwargs'])

    workflow.transform(name='build_results_file',
                       ctx=dict(mem=4, mem_retry_increment=2, **docker_config),
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(
                           [x.as_input() for x in result_files.values()],
                           pypeliner.managed.OutputFile(out_file,
                                                        extensions=[".yaml"]),
                       ))

    return workflow
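
The config shape is implied by the lookups above: one database entry per annotator plus a per-annotator 'kwargs' block, which get_kwargs augments with an HDF-style table path. A minimal sketch with placeholder paths:

config = {
    'databases': {
        'cosmic': {'local_path': '/refdata/cosmic.vcf.gz'},          # placeholders
        'dbsnp': {'local_path': '/refdata/dbsnp.vcf.gz'},
        'mappability': {'local_path': '/refdata/mappability.bigwig'},
        'snpeff': {'db': 'GRCh37.75', 'data_dir': '/refdata/snpeff'},
        'ref_genome': {'local_path': '/refdata/genome.fa'},
    },
    # one kwargs block per name in the annotators tuple
    'cosmic_status': {'kwargs': {}},
    'dbsnp_status': {'kwargs': {}},
    'mappability': {'kwargs': {}},
    'snpeff': {'kwargs': {}},
    'tri_nucleotide_context': {'kwargs': {}},
}
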
Exemplo n.º 29
0
def destruct_pipeline(
    normal_bam_file,
    tumour_bam_files,
    config,
    ref_data_dir,
    out_file,
    raw_data_dir,
    normal_sample_id='normal',
):
    bam_files = dict(tumour_bam_files)  # copy so the caller's mapping is not mutated
    bam_files[normal_sample_id] = normal_bam_file

    utils.make_directory(os.path.join(raw_data_dir, 'raw'))
    breakpoint_file = os.path.join(raw_data_dir, 'raw', 'breakpoint.tsv')
    breakpoint_library_file = os.path.join(raw_data_dir, 'raw',
                                           'breakpoint_library.tsv')
    breakpoint_read_file = os.path.join(raw_data_dir, 'raw',
                                        'breakpoint_read.tsv')

    utils.make_directory(os.path.join(raw_data_dir, 'somatic'))
    somatic_breakpoint_file = os.path.join(raw_data_dir, 'somatic',
                                           'breakpoint.tsv')
    somatic_breakpoint_library_file = os.path.join(raw_data_dir, 'somatic',
                                                   'breakpoint_library.tsv')

    raw_read_data_dir = os.path.join(raw_data_dir, 'read_data')
    utils.make_directory(raw_read_data_dir)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=bam_files.keys(),
    )

    workflow.subworkflow(
        name='run_destruct',
        func="destruct.workflow.create_destruct_workflow",
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile(breakpoint_file),
            pypeliner.managed.OutputFile(breakpoint_library_file),
            pypeliner.managed.OutputFile(breakpoint_read_file),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_read_data_dir,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func=
        'biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints',
        args=(
            pypeliner.managed.InputFile(breakpoint_file),
            pypeliner.managed.InputFile(breakpoint_library_file),
            [normal_sample_id],
            pypeliner.managed.OutputFile(somatic_breakpoint_file),
            pypeliner.managed.OutputFile(somatic_breakpoint_library_file),
        ),
    )

    workflow.transform(
        name='write_store',
        func=
        'biowrappers.components.breakpoint_calling.destruct.tasks.write_store',
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile(somatic_breakpoint_library_file),
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
Exemplo n.º 30
0
def create_vardict_paired_sample_workflow(normal_bam_file,
                                          tumour_bam_file,
                                          ref_genome_fasta_file,
                                          out_file,
                                          chromosomes=default_chromosomes,
                                          java=False,
                                          min_allele_frequency=0.01,
                                          remove_duplicate_reads=False,
                                          sample_names=None,
                                          split_size=int(1e7)):

    workflow = Workflow()
    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=utils.get_bam_regions(normal_bam_file,
                                                split_size,
                                                chromosomes=chromosomes))
    workflow.transform(
        name='run_vardict',
        axes=('regions', ),
        ctx={
            'mem': 12,
            'num_retry': 4,
            'mem_retry_increment': 2
        },
        func=tasks.run_paired_sample_vardict,
        args=(
            pypeliner.managed.InputFile(normal_bam_file),
            pypeliner.managed.InputFile(tumour_bam_file),
            pypeliner.managed.InputFile(ref_genome_fasta_file),
            pypeliner.managed.TempInputObj('config', 'regions'),
            pypeliner.managed.TempOutputFile('result.vcf', 'regions'),
        ),
        kwargs={
            'java': java,
            'min_allele_frequency': min_allele_frequency,
            'remove_duplicate_reads': remove_duplicate_reads,
            'sample_names': sample_names,
        },
    )
    workflow.transform(
        name='compress_tmp',
        axes=('regions', ),
        ctx={
            'mem': 2,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.compress_vcf,
        args=(
            pypeliner.managed.TempInputFile('result.vcf', 'regions'),
            pypeliner.managed.TempOutputFile('result.vcf.gz', 'regions'),
        ),
        kwargs={
            'index_file':
            pypeliner.managed.TempOutputFile('result.vcf.gz.tbi', 'regions'),
        })
    workflow.transform(
        name='concatenate_vcf',
        ctx={
            'mem': 2,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.concatenate_vcf,
        args=(
            pypeliner.managed.TempInputFile('result.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'bcf_index_file': pypeliner.managed.OutputFile(out_file + '.csi'),
            'vcf_index_file': pypeliner.managed.OutputFile(out_file + '.tbi'),
        },
    )
    return workflow
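
A usage sketch for the paired VarDict factory; the paths are placeholders, and the final concatenate step writes .csi and .tbi indexes alongside the output VCF:

workflow = create_vardict_paired_sample_workflow(
    'normal.bam',
    'tumour.bam',
    '/refdata/genome.fa',        # placeholder
    'vardict_calls.vcf.gz',      # .csi/.tbi indexes written alongside
    sample_names=['normal_sample', 'tumour_sample'],
    min_allele_frequency=0.05,   # raise the default 0.01 cutoff
    java=True,                   # use the Java implementation of VarDict
)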