Пример #1
0
def _create_download_decompress_concat_workflow(urls,
                                                out_file,
                                                local_download=False):
    """ Download each URL, decompress it, and concatenate the results.

    One download/decompress subworkflow is created per URL; the decompressed
    files are then concatenated with `cat` into `out_file`.
    """
    workflow = pypeliner.workflow.Workflow()

    # One managed temp file per URL, filled by the download subworkflows below.
    tmp_files = [mgd.TempFile('file_{}'.format(idx)) for idx in range(len(urls))]

    for idx, url in enumerate(urls):
        workflow.setobj(mgd.TempOutputObj('url_{}'.format(idx)), value=url)

        workflow.subworkflow(
            name='download_file_{}'.format(idx),
            func=_create_download_decompress_workflow,
            args=(
                mgd.TempInputObj('url_{}'.format(idx)),
                tmp_files[idx].as_output(),
            ),
            kwargs={'local_download': local_download},
        )

    # Shell redirection: cat file_0 file_1 ... > out_file
    cat_cmd = ['cat']
    cat_cmd.extend(f.as_input() for f in tmp_files)
    cat_cmd.extend(['>', mgd.OutputFile(out_file)])

    workflow.commandline(name='concat', args=cat_cmd)

    return workflow
Пример #2
0
def _create_download_decompress_workflow(url,
                                         local_path,
                                         local_download=False):
    """ Download a single file and decompress it to `local_path`.

    When `local_download` is True the download task is run in the local
    context rather than being submitted to the cluster.
    """
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(mgd.TempOutputObj('url'), value=url)

    workflow.transform(name='download',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('url'),
                             mgd.TempOutputFile('download')))

    workflow.transform(
        name='decompress',
        func=tasks.decompress,
        args=(mgd.TempInputFile('download'), mgd.OutputFile(local_path)),
    )

    return workflow
Пример #3
0
def _create_download_cosmic_workflow(ref_data_version,
                                     out_file,
                                     user,
                                     password,
                                     host='sftp-cancer.sanger.ac.uk',
                                     local_download=False):
    """ Download the COSMIC coding and non-coding VCFs and merge them.

    Files are fetched over SFTP from `host` using the supplied COSMIC
    credentials, then concatenated (allowing overlapping records) into
    `out_file` with a tabix index.
    """
    base_path = '/files/{}/cosmic/v83/VCF'.format(ref_data_version.lower())

    remote_files = {
        'coding': '/'.join([base_path, 'CosmicCodingMuts.vcf.gz']),
        'non_coding': '/'.join([base_path, 'CosmicNonCodingVariants.vcf.gz']),
    }

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Identical download subworkflow for each of the two COSMIC files.
    for key in ('coding', 'non_coding'):
        workflow.setobj(obj=mgd.TempOutputObj('{}_host_path'.format(key)),
                        value=remote_files[key])

        workflow.subworkflow(name='download_{}'.format(key),
                             func=_create_download_cosmic_file_subworkflow,
                             args=(
                                 host,
                                 mgd.TempInputObj('{}_host_path'.format(key)),
                                 user,
                                 password,
                                 mgd.TempOutputFile('{}.vcf.gz'.format(key)),
                             ),
                             kwargs={'local_download': local_download})

    workflow.transform(
        name='merge_files',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=([
            mgd.TempInputFile('coding.vcf.gz'),
            mgd.TempInputFile('non_coding.vcf.gz')
        ], mgd.OutputFile(out_file)),
        kwargs={
            'allow_overlap': True,
            'index_file': mgd.OutputFile(out_file + '.tbi')
        },
    )

    return workflow
Пример #4
0
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):
    """ Call SNPs with VarScan mpileup2snp, parallelized over genome regions.

    The BAM is split into regions of at most `split_size` bases; each region
    is piled up with samtools, converted to SNP calls, compressed, and the
    per-region VCFs are concatenated into `out_file`.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    regions = soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'), value=regions)

    workflow.commandline(name='run_mpileup',
                         axes=('regions',),
                         args=('samtools', 'mpileup',
                               '-f', mgd.InputFile(ref_genome_fasta_file),
                               '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
                               '-r', mgd.TempInputObj('config', 'regions'),
                               mgd.InputFile(bam_file)))

    # VarScan is memory hungry, so bump the context for this step only.
    workflow.transform(name='run_mpileup2snp',
                       axes=('regions',),
                       ctx=med_mem_ctx,
                       func=tasks.mpileup2snp,
                       args=(mgd.TempInputFile('region.mpileup', 'regions'),
                             mgd.TempOutputFile('region.vcf', 'regions')))

    workflow.transform(name='compress',
                       axes=('regions',),
                       func=soil.wrappers.samtools.tasks.compress_vcf,
                       args=(mgd.TempInputFile('region.vcf', 'regions'),
                             mgd.TempOutputFile('region.vcf.gz', 'regions')))

    workflow.transform(name='concatenate_vcfs',
                       func=soil.wrappers.samtools.tasks.concatenate_vcf,
                       args=(mgd.TempInputFile('region.vcf.gz', 'regions'),
                             mgd.OutputFile(out_file)))

    return workflow
Пример #5
0
def create_eagle_ref_data_workflow(vcf_url_template,
                                   out_file,
                                   local_download=False):
    """ Build the EAGLE reference panel for GRCh37 autosomes.

    Downloads a per-chromosome VCF for chromosomes 1-22 using
    `vcf_url_template` (formatted with `chrom=`), renames chromosomes
    according to the packaged chromosome map, concatenates to BCF and
    indexes the result.
    """
    map_path = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    mapping = pd.read_csv(map_path, sep='\t')

    # Restrict to autosomes (NCBI names '1'..'22').
    autosome_names = [str(x) for x in range(1, 23)]

    mapping = mapping[mapping['ncbi'].isin(autosome_names)]

    mapping['url'] = mapping['ncbi'].apply(
        lambda c: vcf_url_template.format(chrom=c))

    urls_by_chrom = mapping['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=urls_by_chrom)

    workflow.transform(name='download_vcf_files',
                       axes=('chrom', ),
                       ctx={'local': local_download},
                       func=soil.ref_data.tasks.download,
                       args=(mgd.TempInputObj('vcf_url', 'chrom'),
                             mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    workflow.transform(name='write_chrom_map',
                       func=tasks.write_chrom_map_file,
                       args=(mgd.InputFile(map_path),
                             mgd.TempOutputFile('chrom_map.tsv')))

    workflow.transform(name='rename_chroms',
                       axes=('chrom', ),
                       func=soil.wrappers.bcftools.tasks.rename_chroms,
                       args=(mgd.TempInputFile('chrom_map.tsv'),
                             mgd.TempInputFile('raw.vcf.gz', 'chrom'),
                             mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(name='concat_vcfs',
                       func=soil.wrappers.bcftools.tasks.concatenate_vcf,
                       args=(mgd.TempInputFile('renamed.bcf', 'chrom'),
                             mgd.OutputFile(out_file)),
                       kwargs={'bcf_output': True})

    workflow.commandline(name='index',
                         args=('bcftools', 'index', mgd.InputFile(out_file),
                               '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
Пример #6
0
def _create_download_vep_plugins_workflow(urls, out_dir, local_download=False):
    """ Download each VEP plugin URL into `out_dir`.

    Output filenames are taken from the basename of each URL.
    """
    workflow = pypeliner.workflow.Workflow()

    for idx, url in enumerate(urls):
        dst_file = os.path.join(out_dir, os.path.basename(url))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(idx)), value=url)

        workflow.transform(
            name='download_file_{}'.format(idx),
            ctx={'local': local_download},
            func=tasks.download,
            args=(mgd.TempInputObj('url_{}'.format(idx)), mgd.OutputFile(dst_file)),
        )

    return workflow
Пример #7
0
def create_titan_workflow(normal_bam_file,
                          tumour_bam_file,
                          dbsnp_vcf_file,
                          mappability_file,
                          ref_genome_fasta_file,
                          out_file,
                          exome_bed_file=None,
                          sample='Tumour',
                          threads=1):
    """ Run the TITAN copy number pipeline on a tumour/normal pair.

    Builds coverage and mappability WIG tracks with the hmmcopy utilities,
    computes allele counts at dbSNP positions, runs TITAN once per
    initialization parameter set, and collects everything into `out_file`.

    Parameters
    ----------
    normal_bam_file: str
        Path to the normal sample BAM.
    tumour_bam_file: str
        Path to the tumour sample BAM.
    dbsnp_vcf_file: str
        Path to a dbSNP VCF used for allele counting.
    mappability_file: str
        Path to the mappability track consumed by mapCounter.
    ref_genome_fasta_file: str
        Path to the reference genome FASTA.
    out_file: str
        Path where the final results file will be written.
    exome_bed_file: str, optional
        Exome target regions BED; when given, coverage is restricted to the
        targets and TITAN is run in exome mode.
    sample: str
        Sample name passed through to TITAN.
    threads: int
        Thread count for the TITAN step.
    """

    sandbox = soil.utils.workflow.get_sandbox(
        ['hmmcopy', 'hmmcopy_utils', 'titan'])

    sandbox.channels.append('conda-forge')

    sandbox.packages.extend(['pandas', 'rpy2'])

    # Autosome names from the BAM header; used by the WIG-building commands.
    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, 'autosomes')

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # One TITAN run per initialization parameter set ('param_idx' axis).
    workflow.setobj(obj=mgd.TempOutputObj('init_params', 'param_idx'),
                    value=tasks.create_intialization_parameters())

    workflow.subworkflow(name='get_allele_counts',
                         func=create_allele_counts_workflow,
                         args=(mgd.InputFile(normal_bam_file),
                               mgd.InputFile(tumour_bam_file),
                               mgd.InputFile(dbsnp_vcf_file),
                               mgd.InputFile(ref_genome_fasta_file),
                               mgd.TempOutputFile('allele_counts.tsv')),
                         kwargs={'chromosomes': 'autosomes'})

    # Read-depth WIG tracks for both samples, plus GC and mappability tracks.
    workflow.commandline(name='build_normal_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(normal_bam_file), '>',
                               mgd.TempOutputFile('normal.wig')))

    workflow.commandline(name='build_tumour_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(tumour_bam_file), '>',
                               mgd.TempOutputFile('tumour.wig')))

    workflow.commandline(name='build_gc_wig',
                         args=('gcCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(ref_genome_fasta_file), '>',
                               mgd.TempOutputFile('gc.wig')))

    workflow.commandline(name='build_mappability_wig',
                         args=('mapCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(mappability_file), '>',
                               mgd.TempOutputFile('mappability.wig')))

    # Combine the four tracks into the corrected coverage input for TITAN.
    workflow.transform(name='build_coverage_file',
                       func=tasks.build_coverage_file,
                       args=(mgd.TempInputFile('normal.wig'),
                             mgd.TempInputFile('tumour.wig'),
                             mgd.TempInputFile('gc.wig'),
                             mgd.TempInputFile('mappability.wig'),
                             mgd.TempOutputFile('coverage.wig')),
                       kwargs={'target_file': exome_bed_file})

    workflow.transform(name='run_titan',
                       axes=('param_idx', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3,
                           'threads': threads
                       },
                       func=tasks.run_titan,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('run.tar.gz', 'param_idx'),
                             mgd.TempSpace('titan_tmp', 'param_idx')),
                       kwargs={
                           'is_exome': (exome_bed_file is not None),
                           'sample': sample,
                           'threads': threads
                       })

    # Summarize all parameter-set runs so the best one can be selected.
    workflow.transform(name='build_run_stats_file',
                       func=tasks.build_run_stats_file,
                       args=(mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('stats.tsv')))

    workflow.transform(name='build_output',
                       func=tasks.build_final_results_file,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputFile('stats.tsv'),
                             mgd.OutputFile(out_file),
                             mgd.TempSpace('build_results')))

    return workflow
Пример #8
0
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes='default',
                            is_exome=False,
                            split_size=int(1e7)):
    """ Call somatic variants with Strelka on a tumour/normal pair.

    Splits the genome into regions of at most `split_size` bases, calls each
    segment, merges indel and SNV calls, filters to PASS/unfiltered records
    with bcftools, and tabix-indexes `out_file`.

    Parameters
    ----------
    normal_bam_file: str
        Path to the normal sample BAM.
    tumour_bam_file: str
        Path to the tumour sample BAM.
    ref_genome_fasta_file: str
        Path to the reference genome FASTA.
    out_file: str
        Path where the filtered, bgzipped VCF will be written (a matching
        `.tbi` index is also produced).
    chromosomes: str or list
        Chromosome selection forwarded to the region/chromosome helpers.
    is_exome: bool
        Whether to run Strelka in exome mode.
    split_size: int
        Maximum region size in bases for the parallel calling axis.
    """

    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'strelka'])

    workflow = pypeliner.workflow.Workflow(default_ctx=med_mem_ctx,
                                           default_sandbox=sandbox)

    # Parallel axis over genome regions for segment calling.
    workflow.setobj(obj=mgd.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    # Separate axis over whole chromosomes for depth estimation.
    workflow.setobj(obj=mgd.TempOutputObj('chrom_names', 'chrom_axis'),
                    value=get_chromosomes(normal_bam_file,
                                          chromosomes=chromosomes))

    workflow.transform(
        name='count_fasta_bases',
        func=soil.wrappers.strelka.tasks.count_fasta_bases,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    # Cheap local step; runs outside the sandbox.
    workflow.transform(
        name='get_genome_size',
        ctx={'local': True},
        func=get_known_genome_size,
        ret=mgd.TempOutputObj('genome_size'),
        args=(
            mgd.InputFile(tumour_bam_file),
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
        sandbox=None,
    )

    workflow.transform(
        name='get_chromosome_depths',
        axes=('chrom_axis', ),
        func=soil.wrappers.strelka.tasks.get_chromosome_depth,
        args=(
            mgd.TempInputObj('chrom_names', 'chrom_axis'),
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom_depth.txt', 'chrom_axis'),
        ),
    )

    workflow.transform(
        name='merge_chromosome_depths',
        func=soil.wrappers.strelka.tasks.merge_chromosome_depth,
        args=(
            mgd.TempInputFile('chrom_depth.txt', 'chrom_axis'),
            mgd.TempOutputFile('chrom_depth_merged.txt'),
        ),
        sandbox=None,
    )

    workflow.transform(name='call_genome_segment',
                       axes=('regions', ),
                       func=soil.wrappers.strelka.tasks.call_genome_segment,
                       args=(
                           mgd.TempInputFile('chrom_depth_merged.txt'),
                           mgd.InputFile(normal_bam_file),
                           mgd.InputFile(tumour_bam_file),
                           mgd.InputFile(ref_genome_fasta_file),
                           mgd.TempOutputFile('indels.vcf', 'regions'),
                           mgd.TempOutputFile('snvs.vcf', 'regions'),
                           mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                           mgd.TempInputObj('config', 'regions'),
                           mgd.TempInputObj('genome_size'),
                       ),
                       kwargs={
                           'is_exome': is_exome,
                       })

    workflow.transform(
        name='merge_indels',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('indels.vcf', 'regions'),
            mgd.TempOutputFile('indels.vcf.gz'),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('snvs.vcf', 'regions'),
            mgd.TempOutputFile('snvs.vcf.gz'),
        ),
    )

    # Indels and SNVs can share positions, so merge with overlap allowed.
    workflow.transform(
        name='merge_all',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [
                mgd.TempInputFile('indels.vcf.gz'),
                mgd.TempInputFile('snvs.vcf.gz')
            ],
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'allow_overlap': True,
        },
    )

    # Keep only records whose FILTER is '.' or PASS; write bgzipped output.
    workflow.commandline(name='filter_vcf',
                         ctx=low_mem_ctx,
                         args=(
                             'bcftools',
                             'view',
                             '-O',
                             'z',
                             '-f',
                             '.,PASS',
                             '-o',
                             mgd.OutputFile(out_file),
                             mgd.TempInputFile('merged.vcf.gz'),
                         ))

    workflow.transform(name='index_vcf',
                       ctx=low_mem_ctx,
                       func=soil.wrappers.samtools.tasks.index_vcf,
                       args=(
                           mgd.InputFile(out_file),
                           mgd.OutputFile(out_file + '.tbi'),
                       ))

    return workflow
Пример #9
0
def create_ref_panel_phase_workflow(genetic_map_file, ref_file, target_file, out_file):
    """ Run EAGLE using a reference panel.

    Splits the reference panel and target file per chromosome, phases each
    chromosome with EAGLE, concatenates the phased chromosomes into
    `out_file` and writes a tabix index alongside it.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'eagle'])

    workflow = pypeliner.workflow.Workflow(default_ctx=default_ctx, default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('chrom', 'chrom'),
                    value=get_chromosomes(target_file))

    # Extract per-chromosome subsets of both the panel and the target.
    for label, variant_file in (('ref', ref_file), ('target', target_file)):
        workflow.transform(name='split_{}'.format(label),
                           axes=('chrom',),
                           func=tasks.get_chrom_variant_file,
                           args=(mgd.TempInputObj('chrom', 'chrom'),
                                 mgd.InputFile(variant_file),
                                 mgd.TempOutputFile('{}.bcf'.format(label), 'chrom')))

    workflow.transform(name='run_eagle',
                       axes=('chrom',),
                       func=tasks.run_eagle,
                       args=(mgd.InputFile(genetic_map_file),
                             mgd.TempInputFile('ref.bcf', 'chrom'),
                             mgd.TempInputFile('target.bcf', 'chrom'),
                             mgd.TempOutputFile('phased.bcf', 'chrom'),
                             mgd.TempSpace('eagle_tmp', 'chrom')))

    workflow.transform(name='concat_results',
                       func=tasks.concat_results,
                       args=(mgd.TempInputFile('phased.bcf', 'chrom'),
                             mgd.OutputFile(out_file)))

    workflow.commandline(name='index',
                         args=('bcftools', 'index', '-t',
                               '-o', mgd.OutputFile(out_file + '.tbi'),
                               mgd.InputFile(out_file)))

    return workflow
Пример #10
0
def create_multiple_lane_align_workflow(fastq_files_1,
                                        fastq_files_2,
                                        ref_genome_dir,
                                        out_bam_file,
                                        add_xs_tag=False,
                                        align_threads=1,
                                        merge_threads=1,
                                        read_group_info=None,
                                        sort_threads=1):
    """ Align paired FASTQs from multiple lanes into one indexed BAM.

    Each lane is aligned by the `create_align_workflow` subworkflow, after
    which all lane BAMs are duplicate-marked and merged with sambamba and
    the result is indexed with samtools.
    """
    if read_group_info is None:
        # Default to no read-group metadata for every lane.
        read_group_info = {key: None for key in fastq_files_1}

    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('read_group_info', 'lane'),
                    value=read_group_info)

    workflow.subworkflow(
        name='align',
        axes=('lane', ),
        func=create_align_workflow,
        args=(
            mgd.InputFile('R1.fq.gz', 'lane', fnames=fastq_files_1),
            mgd.InputFile('R2.fq.gz', 'lane', fnames=fastq_files_2),
            ref_genome_dir,
            mgd.TempOutputFile('lane.bam', 'lane'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'align_threads': align_threads,
            'read_group_info': mgd.TempInputObj('read_group_info', 'lane'),
            'sort_threads': sort_threads,
        },
    )

    # Merging is memory intensive, so raise the context for this step.
    merge_ctx = {
        'mem': 24,
        'mem_retry_increment': 8,
        'num_retry': 3,
        'threads': merge_threads,
    }

    workflow.transform(
        name='markdups_and_merge',
        axes=(),
        ctx=merge_ctx,
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('lane.bam', 'lane'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('markdup_tmp'),
        ),
        kwargs={'threads': merge_threads},
    )

    workflow.commandline(
        name='index',
        args=('samtools', 'index',
              mgd.InputFile(out_bam_file),
              mgd.OutputFile(out_bam_file + '.bai')),
    )
    return workflow
Пример #11
0
def create_topiary_workflow(hla_alleles,
                            in_file,
                            out_file,
                            copy_pyensembl_cache_dir=False,
                            iedb_dir=None,
                            genome='GRCh37',
                            predictor='netmhc',
                            pyensembl_cache_dir=None):
    """ Run topiary.

    Parameters
    ----------
    hla_alleles: list
        List of HLA alleles i.e. A*02:01.
    in_file: str
        Path to VCF file with variants.
    out_file: str
        Path where output will be written in tsv format.
    """
    sandbox = soil.utils.workflow.get_sandbox(['topiary'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('raw_hla_alleles'),
                    value=hla_alleles)

    # Fan out over the peptide lengths topiary should scan.
    workflow.setobj(obj=mgd.OutputChunks('pep_len'), value=[8, 9, 10, 11])

    # Drop alleles the chosen predictor does not support.
    workflow.transform(
        name='filter_hla_alleles',
        func=tasks.filter_hla_alleles,
        args=(mgd.TempInputObj('raw_hla_alleles'), ),
        kwargs={'iedb_dir': iedb_dir, 'predictor': predictor},
        ret=mgd.TempOutputObj('hla_alleles'),
    )

    run_ctx = {'mem': 8, 'mem_retry_increment': 4, 'num_retry': 3}

    workflow.transform(
        name='run_topiary',
        axes=('pep_len', ),
        ctx=run_ctx,
        func=tasks.run_topiary,
        args=(
            mgd.TempInputObj('hla_alleles'),
            mgd.InputFile(in_file),
            mgd.TempOutputFile('raw.tsv', 'pep_len'),
        ),
        kwargs={
            'copy_pyensembl_cache_dir': copy_pyensembl_cache_dir,
            'iedb_dir': iedb_dir,
            'genome': genome,
            'peptide_length': mgd.Template('{pep_len}', 'pep_len'),
            'predictor': predictor,
            'pyensembl_cache_dir': pyensembl_cache_dir,
        },
    )

    workflow.transform(
        name='reformat_output',
        axes=(),
        func=tasks.reformat_output,
        args=(mgd.TempInputFile('raw.tsv', 'pep_len'), mgd.OutputFile(out_file)),
    )

    return workflow
Пример #12
0
def create_mutect_paired_workflow(normal_bam_file,
                                  tumour_bam_file,
                                  ref_genome_fasta_file,
                                  out_file,
                                  chromosomes=None,
                                  normal_name='normal',
                                  split_size=int(1e7),
                                  tumour_name='tumour'):
    """ Call somatic variants on a tumour/normal pair with MuTect.

    The genome is split into regions of at most `split_size` bases; MuTect
    runs per region, calls are flagged by the MuTect filter, merged, and
    restricted to PASS/unfiltered records in `out_file`.
    """
    # Prefer sample names from the BAM headers, falling back to the defaults.
    normal_name = get_sample(normal_bam_file, normal_name)

    tumour_name = get_sample(tumour_bam_file, tumour_name)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'gatk', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx,
                                           default_sandbox=sandbox)

    regions = soil.utils.genome.get_bam_regions(normal_bam_file, split_size,
                                                chromosomes=chromosomes)

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=regions)

    workflow.transform(
        name='run_mutect',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_mutect_paired,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ),
        kwargs={'normal_name': normal_name, 'tumour_name': tumour_name},
    )

    workflow.transform(
        name='run_mutect_filter',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_filter_mutect,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('flagged.vcf', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('flagged.vcf', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
    )

    # Keep only records whose FILTER is '.' or PASS; write bgzipped output.
    workflow.commandline(
        name='filter_vcf',
        ctx=low_mem_ctx,
        args=(
            'bcftools', 'view',
            '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('merged.vcf.gz'),
        ),
    )

    return workflow
Пример #13
0
def crete_download_ref_data_workflow(config,
                                     out_dir,
                                     cosmic=False,
                                     local_download=False):
    """ Download reference files.

    This workflow mainly retrieves files from the internet. There are some light to moderately heavy computational tasks
    as well.

    NOTE(review): the name looks like a typo for ``create_download_ref_data_workflow``;
    renaming would break existing callers, so confirm before fixing.

    :param config: nested dict of reference data settings. Every top-level key ending in
        'url' or 'urls' is registered as a pypeliner temp object of the same name.
    :param out_dir: output directory for the reference bundle; created if missing.
    :param cosmic: if True, interactively prompt for COSMIC credentials at workflow
        build time and add the COSMIC download subworkflow.
    :param local_download: if True, download steps run with ``ctx={'local': True}``
        (and the value is forwarded to the download subworkflows).
    :returns: a ``pypeliner.workflow.Workflow`` ready to be run.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    ref_data_paths = soil.ref_data.paths.SoilRefDataPaths(out_dir)

    # Persist the config used to build this reference bundle alongside the data.
    with open(ref_data_paths.config_file, 'w') as fh:
        yaml.dump(config, fh)

    if cosmic:
        # Credentials are collected up front (while building the workflow),
        # not when the download job actually executes.
        cosmic_user = click.prompt('Please enter COSMIC user ID')

        cosmic_password = click.prompt('Please enter COSMIC password',
                                       hide_input=True)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Expose every top-level '*url'/'*urls' config entry as a managed temp
    # object so the steps below can consume them by name.
    for key in config:
        if key.endswith('url') or key.endswith('urls'):
            workflow.setobj(obj=mgd.TempOutputObj(key), value=config[key])

    # The snpEff URL is nested under config['snpeff'], so register it explicitly.
    workflow.setobj(mgd.TempOutputObj('snpeff_url'),
                    value=config['snpeff']['url'])

    workflow.subworkflow(
        name='download_ref_gene_annotations',
        func=_create_download_decompress_concat_workflow,
        args=(mgd.TempInputObj('ref_gene_annotations_gtf_urls'),
              mgd.OutputFile(ref_data_paths.gene_annotations_gtf_file)),
        kwargs={'local_download': local_download})

    workflow.subworkflow(name='download_ref_genome',
                         func=_create_download_decompress_concat_workflow,
                         args=(mgd.TempInputObj('ref_genome_fasta_urls'),
                               mgd.TempOutputFile('raw_ref.fasta')),
                         kwargs={'local_download': local_download})

    # Sort contigs lexicographically so downstream tools see a deterministic order.
    workflow.transform(name='lexsort_ref_genome',
                       func=tasks.lex_sort_fasta,
                       args=(mgd.TempInputFile('raw_ref.fasta'),
                             mgd.OutputFile(ref_data_paths.genome_fasta_file)))

    workflow.subworkflow(name='download_ref_proteome',
                         func=_create_download_decompress_concat_workflow,
                         args=(mgd.TempInputObj('ref_proteome_fasta_urls'),
                               mgd.TempOutputFile('raw_ref_prot.fasta')),
                         kwargs={'local_download': local_download})

    # NOTE(review): 'filter_bad_proiteins' looks misspelled, but it must match
    # the task name defined in the tasks module — confirm there before renaming.
    workflow.transform(name='filter_bad_proteins',
                       func=tasks.filter_bad_proiteins,
                       args=(mgd.TempInputFile('raw_ref_prot.fasta'),
                             mgd.OutputFile(
                                 ref_data_paths.proteome_fasta_file)))

    workflow.subworkflow(
        name='download_ref_transcriptome',
        func=_create_download_decompress_concat_workflow,
        args=(mgd.TempInputObj('ref_transcriptome_fasta_urls'),
              mgd.OutputFile(ref_data_paths.transcriptome_fasta_file)),
        kwargs={'local_download': local_download})

    workflow.transform(name='download_dbsnp',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('dbsnp_url'),
                             mgd.OutputFile(ref_data_paths.dbsnp_vcf_file)))

    if cosmic:
        workflow.subworkflow(
            name='download_cosmic',
            func=_create_download_cosmic_workflow,
            args=(config['cosmic']['ref_genome_version'],
                  mgd.OutputFile(ref_data_paths.cosmic_vcf_file), cosmic_user,
                  cosmic_password),
            kwargs={'local_download': local_download})

    workflow.transform(name='download_snpeff_db',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('snpeff_url'),
                             mgd.TempOutputFile('snpeff.zip')))

    # The 'done.txt' output acts as a sentinel file marking the unzip as complete,
    # since the real output is a directory tree.
    workflow.transform(
        name='unzip_snpeff',
        func=tasks.unzip_file,
        args=(mgd.TempInputFile('snpeff.zip'),
              mgd.OutputFile(
                  os.path.join(os.path.dirname(ref_data_paths.snpeff_data_dir),
                               'done.txt')), mgd.TempSpace('snpeff_tmp')))

    workflow.transform(name='download_genetic_map',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('genetic_map_txt_url'),
                             mgd.OutputFile(ref_data_paths.genetic_map_file)))

    workflow.subworkflow(
        name='ref_haplotype_panel',
        func=soil.ref_data.haplotype.workflows.create_eagle_ref_data_workflow,
        args=(mgd.TempInputObj('haplotype_vcf_template_url'),
              mgd.OutputFile(ref_data_paths.haplotypes_bcf)),
        kwargs={'local_download': local_download})

    workflow.transform(name='download_iedb_mhc_one',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('iedb_mhc_one_url'),
                             mgd.TempOutputFile('mhc1.tar.gz')))

    # 'extract.done' / 'configure.done' below are sentinel files chaining the
    # extract -> configure steps on the IEDB MHC class I directory.
    workflow.transform(name='extract_iedb_mhc_one',
                       func=tasks.extract_tar_file,
                       args=(mgd.TempInputFile('mhc1.tar.gz'),
                             mgd.OutputFile(
                                 os.path.join(ref_data_paths.iedb_mhc_one_dir,
                                              'extract.done'))))

    workflow.transform(name='config_iedb_mhc_one',
                       func=tasks.configure_iedb_module,
                       args=(mgd.InputFile(
                           os.path.join(ref_data_paths.iedb_mhc_one_dir,
                                        'extract.done')),
                             mgd.OutputFile(
                                 os.path.join(ref_data_paths.iedb_mhc_one_dir,
                                              'configure.done'))))

    workflow.transform(name='download_vep_cache',
                       ctx={'local': local_download},
                       func=tasks.download,
                       args=(mgd.TempInputObj('vep_cache_url'),
                             mgd.TempOutputFile('vep.tar.gz')))

    workflow.transform(name='extract_vep_cache',
                       func=tasks.extract_tar_file,
                       args=(mgd.TempInputFile('vep.tar.gz'),
                             mgd.OutputFile(
                                 os.path.join(ref_data_paths.vep_cache_dir,
                                              'homo_sapiens',
                                              'extract.done'))))

    workflow.subworkflow(name='download_vep_plugins',
                         func=_create_download_vep_plugins_workflow,
                         args=(mgd.TempInputObj('vep_plugins_urls'),
                               ref_data_paths.vep_plugins_dir),
                         kwargs={'local_download': local_download})

    workflow.setobj(obj=mgd.TempOutputObj('pyensembl_version'),
                    value=config['pyensembl']['version'])

    # pyensembl runs in its own sandbox since it is not part of the default one.
    workflow.transform(name='download_pyensembl_cache',
                       ctx={'local': local_download},
                       func=tasks.download_pyensembl_cache,
                       args=(mgd.TempInputObj('pyensembl_version'),
                             mgd.OutputFile(
                                 os.path.join(
                                     ref_data_paths.pyensembl_cache_dir,
                                     'download.done'))),
                       sandbox=soil.utils.workflow.get_sandbox(['pyensembl']))

    return workflow
Пример #14
0
def create_vardict_paired_workflow(normal_bam_file,
                                   tumour_bam_file,
                                   ref_genome_fasta_file,
                                   out_file,
                                   chromosomes=None,
                                   split_size=int(5e6)):
    """ Build a workflow calling somatic variants on a tumour/normal pair with VarDict.

    The genome is split into regions of ``split_size`` bases which are processed in
    parallel. Per-region calls are converted to VCF, merged, restricted to PASS
    records and finally filtered to calls VarDict flags as 'StrongSomatic'.

    :param normal_bam_file: path to the normal sample BAM file.
    :param tumour_bam_file: path to the tumour sample BAM file.
    :param ref_genome_fasta_file: path to the reference genome FASTA file.
    :param out_file: path for the final compressed somatic VCF.
    :param chromosomes: optional list restricting calling to these chromosomes.
    :param split_size: region size (bases) used to parallelise calling.
    :returns: a ``pypeliner.workflow.Workflow``.
    """
    region_axis = ('regions', )

    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'vardict', 'vardict-java'])

    workflow = pypeliner.workflow.Workflow(
        default_ctx=low_mem_ctx, default_sandbox=sandbox)

    # Partition the genome so each region can be called independently.
    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(
            normal_bam_file, split_size, chromosomes=chromosomes))

    # Per-region paired calling; this step needs more memory than the default ctx.
    workflow.transform(
        name='run_vardict',
        axes=region_axis,
        ctx=med_mem_ctx,
        func=tasks.run_vardict_paired,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempOutputFile('call.tsv', 'regions'),
        ))

    # Run the somatic test step on the raw per-region calls.
    workflow.transform(
        name='test_somatic',
        axes=region_axis,
        func=tasks.run_test_somatic,
        args=(
            mgd.TempInputFile('call.tsv', 'regions'),
            mgd.TempOutputFile('somatic.tsv', 'regions'),
        ))

    # Convert the tabular somatic calls to per-region VCFs.
    workflow.transform(
        name='write_vcf',
        axes=region_axis,
        func=tasks.run_build_paired_vcf,
        args=(
            mgd.TempInputFile('somatic.tsv', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ))

    # Compress each region VCF so bcftools can concatenate them.
    workflow.commandline(
        name='compress_vcf',
        axes=region_axis,
        args=(
            'bcftools', 'view', '-O', 'z',
            '-o', mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputFile('region.vcf', 'regions'),
        ))

    # Merge all regions into a single genome-wide VCF.
    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ))

    # Keep only records whose FILTER is PASS or unset ('.').
    workflow.commandline(
        name='filter_vcf',
        args=(
            'bcftools', 'view', '-O', 'z', '-f', '.,PASS',
            '-o', mgd.TempOutputFile('filtered.vcf.gz'),
            mgd.TempInputFile('merged.vcf.gz'),
        ))

    # Restrict the final output to calls VarDict marks as strongly somatic.
    workflow.commandline(
        name='filter_somatics',
        args=(
            'bcftools', 'filter', '-i', 'INFO/STATUS[0]="StrongSomatic"',
            '-O', 'z', '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('filtered.vcf.gz'),
        ))

    return workflow