Example #1
File: realign.py Project: diljotgrewal/wgs
def realign_bam_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    with open(args['input_yaml']) as input_yaml_fh:
        yamldata = yaml.safe_load(input_yaml_fh)

    samples = list(yamldata.keys())

    input_bams = {sample: yamldata[sample]['input'] for sample in samples}

    output_bams = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics = os.path.join(outdir, '{sample_id}', '{sample_id}.txt')
    metrics_tar = os.path.join(outdir, '{sample_id}', '{sample_id}.tar')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name="realign",
        func=realign_bams,
        ctx=helpers.get_default_ctx(),
        args=(
            samples,
            mgd.InputFile("input.bam", 'sample_id', fnames=input_bams,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.bam", 'sample_id', template=output_bams,
                           extensions=['.bai', '.tdf'], axes_origin=[]),
            mgd.OutputFile("realigned.txt", 'sample_id', template=metrics,
                           extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.tar", 'sample_id', template=metrics_tar,
                           extensions=['.bai'], axes_origin=[]),
            args['refdir'],
        ),
        kwargs={'single_node': args['single_node']}
    )

    outputted_filenames = helpers.expand_list([output_bams, metrics, metrics_tar], samples, 'sample_id')

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'realignment'}
        }
    )

    pyp.run(workflow)
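A minimal sketch of how this workflow might be driven; the keys in args come from the wgs pipeline's command line (the same dict also carries pypeliner's own options), so the paths and flags below are placeholders only:

args = {
    'input_yaml': 'input.yaml',    # maps each sample_id to {'input': '/path/to/sample.bam'}
    'out_dir': 'realign_results',  # per-sample outputs land in {out_dir}/{sample_id}/
    'refdir': '/refdata',          # reference bundle consumed by realign_bams
    'single_node': False,          # False: scatter across jobs; True: run on a single node
}
realign_bam_workflow(args)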
Example #2
File: workflow.py Project: mwkyuen/remixt
def create_remixt_bam_workflow(
    breakpoint_filename,
    bam_filenames,
    results_filenames,
    raw_data_directory,
    config,
    ref_data_dir,
    normal_id=None,
):
    sample_ids = list(bam_filenames.keys())

    tumour_ids = list(bam_filenames.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    seqdata_template = os.path.join(raw_data_directory, 'seqdata', 'sample_{sample_id}.h5')

    results_filenames = dict([(tumour_id, results_filenames[tumour_id]) for tumour_id in tumour_ids])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    workflow.subworkflow(
        name='extract_seqdata_workflow',
        axes=('sample_id',),
        func=remixt.workflow.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam', 'sample_id', fnames=bam_filenames),
            mgd.OutputFile('seqdata', 'sample_id', template=seqdata_template),
            config,
            ref_data_dir,
        ),
    )

    workflow.subworkflow(
        name='remixt_seqdata_workflow',
        func=create_remixt_seqdata_workflow,
        args=(
            mgd.InputFile(breakpoint_filename),
            mgd.InputFile('seqdata', 'sample_id', template=seqdata_template),
            mgd.OutputFile('results', 'tumour_id', fnames=results_filenames, axes_origin=[]),
            raw_data_directory,
            config,
            ref_data_dir,
        ),
        kwargs={
            'normal_id': normal_id,
        },
    )

    return workflow
Example #3
File: realign.py Project: diljotgrewal/wgs
def realign_bams(samples, inputs, outputs, metrics, metrics_tar, refdir, single_node=False):
    outputs = dict([(sampid, outputs[sampid])
                    for sampid in samples])
    inputs = dict([(sampid, inputs[sampid])
                   for sampid in samples])

    metrics = dict([(sampid, metrics[sampid])
                    for sampid in samples])
    metrics_tar = dict([(sampid, metrics_tar[sampid])
                        for sampid in samples])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='realign_bam_file',
        func=realignment.realign_bam_files,
        args=(
            mgd.InputFile("input.bam", "sample_id", axes_origin=[], fnames=inputs),
            mgd.OutputFile("output.bam", "sample_id", axes_origin=[], fnames=outputs),
            mgd.OutputFile("output.txt", "sample_id", axes_origin=[], fnames=metrics),
            mgd.OutputFile("output.tar", "sample_id", axes_origin=[], fnames=metrics_tar),
            refdir,
            samples
        ),
        kwargs={'single_node': single_node}
    )

    return workflow
Example #4
def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='wgs.workflows.alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.OutputFile(r1_html),
            mgd.OutputFile(r1_plot),
            mgd.TempSpace('fastqc_R1'),
        ),
    )

    workflow.transform(
        name="fastqc_r2",
        func='wgs.workflows.alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.OutputFile(r2_html),
            mgd.OutputFile(r2_plot),
            mgd.TempSpace('fastqc_R2'),
        ),
    )

    return workflow
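A hypothetical way to attach this FastQC workflow to a parent pipeline, following the subworkflow pattern used elsewhere on this page; all file names are placeholders:

workflow.subworkflow(
    name='fastqc',
    func=fastqc_workflow,
    args=(
        mgd.InputFile('sample_R1.fastq.gz'),
        mgd.InputFile('sample_R2.fastq.gz'),
        mgd.OutputFile('sample_R1_fastqc.html'),
        mgd.OutputFile('sample_R1_fastqc.png'),
        mgd.OutputFile('sample_R2_fastqc.html'),
        mgd.OutputFile('sample_R2_fastqc.png'),
    ),
)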
Example #5
def run_Strelka(config, normal_bam, tumour_bam, snv_output_file,
                indel_output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='configure_bed',
                       func=tasks.configure_bed,
                       args=(mgd.TempSpace('bed_space'),
                             mgd.InputFile(config['bed_file']),
                             mgd.TempOutputFile('bed.gz'),
                             mgd.TempOutputFile('bed.gz.tbi')))

    workflow.transform(name='run_strelka',
                       ctx={
                           'mem': 10,
                           'ncpus': 1,
                           'walltime': '08:00'
                       },
                       func=tasks.run_strelka,
                       args=(
                           config,
                           mgd.InputFile(normal_bam),
                           mgd.InputFile(tumour_bam),
                           mgd.TempInputFile('bed.gz'),
                           mgd.TempInputFile('bed.gz.tbi'),
                           mgd.TempSpace('strelka_workspace'),
                           mgd.OutputFile(snv_output_file),
                           mgd.OutputFile(indel_output_file),
                       ))

    return workflow
Example #6
def create_cohort_qc_report(cohort_label, out_dir, filtered_cohort_maf,
                            cna_table, report_path):

    oncoplot = os.path.join(out_dir, cohort_label, "cohort_oncoplot.png")
    somatic_interactions_plot = os.path.join(out_dir, cohort_label,
                                             "somatic_interactions.png")
    summary_plot = os.path.join(out_dir, cohort_label, "summary.png")
    burden_plot = os.path.join(out_dir, cohort_label, "mutation_burden.png")

    workflow = pypeliner.workflow.Workflow()

    non_synonymous_labels = [
        "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
        "Translation_Start_Site", "Nonsense_Mutation", "Nonstop_Mutation",
        "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation"
    ]

    workflow.transform(
        name='postprocess_maf',
        func='wgs.workflows.cohort_qc.tasks.prepare_maf_for_maftools',
        args=(cohort_label, mgd.InputFile(filtered_cohort_maf),
              mgd.TempOutputFile("prepared_maf"), non_synonymous_labels,
              mgd.TempOutputFile("vcNames")),
    )

    workflow.transform(
        name='burden_plot',
        func='wgs.workflows.cohort_qc.tasks.plot_mutation_burden',
        args=(
            mgd.InputFile(filtered_cohort_maf),
            mgd.OutputFile(burden_plot),
        ),
    )

    workflow.transform(
        name='build_gene_list',
        func='wgs.workflows.cohort_qc.tasks.build_gene_list',
        args=(mgd.InputFile(cna_table), mgd.TempOutputFile("genelist")),
    )
    workflow.transform(
        name='make_cohort_plots',
        func='wgs.workflows.cohort_qc.tasks.make_R_cohort_plots',
        args=(mgd.TempInputFile("prepared_maf"), mgd.InputFile(cna_table),
              mgd.OutputFile(oncoplot),
              mgd.OutputFile(somatic_interactions_plot),
              mgd.OutputFile(summary_plot), mgd.TempInputFile("vcNames"),
              mgd.TempInputFile("genelist")))

    workflow.transform(name='make_report',
                       func='wgs.workflows.cohort_qc.tasks.make_report',
                       args=(
                           cohort_label,
                           mgd.InputFile(oncoplot),
                           mgd.InputFile(somatic_interactions_plot),
                           mgd.InputFile(summary_plot),
                           mgd.InputFile(burden_plot),
                           mgd.OutputFile(report_path),
                       ))

    return workflow
Example #7
def _create_download_cosmic_workflow(ref_data_version,
                                     out_file,
                                     user,
                                     password,
                                     host='sftp-cancer.sanger.ac.uk',
                                     local_download=False):

    host_base_path = '/files/{}/cosmic/v83/VCF'.format(
        ref_data_version.lower())

    coding_host_path = '/'.join([host_base_path, 'CosmicCodingMuts.vcf.gz'])

    non_coding_host_path = '/'.join(
        [host_base_path, 'CosmicNonCodingVariants.vcf.gz'])

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('coding_host_path'),
                    value=coding_host_path)

    workflow.setobj(obj=mgd.TempOutputObj('non_coding_host_path'),
                    value=non_coding_host_path)

    workflow.subworkflow(name='download_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    workflow.subworkflow(name='download_non_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('non_coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('non_coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    workflow.transform(name='merge_files',
                       func=soil.wrappers.samtools.tasks.concatenate_vcf,
                       args=([
                           mgd.TempInputFile('coding.vcf.gz'),
                           mgd.TempInputFile('non_coding.vcf.gz')
                       ], mgd.OutputFile(out_file)),
                       kwargs={
                           'allow_overlap': True,
                           'index_file': mgd.OutputFile(out_file + '.tbi')
                       })

    return workflow
Example #8
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 config,
                                 tag=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ctx={
                           'docker_image': config['docker']['destruct'],
                           'disk': 200
                       },
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.subworkflow(name='process_individual_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 config,
                                 mgd.InputFile(normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )

        workflow.subworkflow(name='process_individual_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 config,
                                 mgd.InputFile('bam',
                                               'normal_cell_id',
                                               fnames=normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})

    return workflow
Example #9
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 tag=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.transform(
            name='bamdisc_normal',
            func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
            ctx={
                'io': 1,
                'mem': 8,
                'disk': 200
            },
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_stats),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.TempSpace('bamdisc_normal_tempspace'),
            ))
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )

        workflow.subworkflow(name='process_normal_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 mgd.InputFile('bam',
                                               'normal_cell_id',
                                               fnames=normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})

    return workflow
Example #10
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):

    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='star_align',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': align_threads
                       },
                       func=tasks.align,
                       args=(
                           mgd.InputFile(fastq_file_1),
                           mgd.InputFile(fastq_file_2),
                           ref_genome_dir,
                           mgd.TempOutputFile('aligned.bam'),
                           mgd.TempSpace('align_tmp'),
                       ),
                       kwargs={
                           'add_xs_tag': add_xs_tag,
                           'read_group_info': read_group_info,
                           'threads': align_threads,
                       })

    workflow.transform(name='sort',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': sort_threads
                       },
                       func=soil.wrappers.sambamba.tasks.sort,
                       args=(
                           mgd.TempInputFile('aligned.bam'),
                           mgd.OutputFile(out_bam_file),
                           mgd.TempSpace('sort_tmp'),
                       ),
                       kwargs={'threads': sort_threads})

    workflow.commandline(name='index',
                         args=(
                             'samtools',
                             'index',
                             mgd.InputFile(out_bam_file),
                             mgd.OutputFile(out_bam_file + '.bai'),
                         ))

    return workflow
Example #11
def create_lumpy_workflow(config, normal_bam, tumour_cell_bams,
                          lumpy_breakpoints_csv, lumpy_breakpoints_evidence,
                          lumpy_breakpoints_bed):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(normal_bam, config,
              mgd.TempOutputFile('normal.discordants.sorted.bam'),
              mgd.TempOutputFile('normal.splitters.sorted.bam'),
              mgd.TempOutputFile('hist_normal_formatted.csv'),
              mgd.TempOutputFile('normal_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(mgd.InputFile('tumour_cells.bam',
                            'cell_id',
                            extensions=['.bai'],
                            fnames=tumour_cell_bams), config,
              mgd.TempOutputFile('tumour.discordants.sorted.bam'),
              mgd.TempOutputFile('tumour.splitters.sorted.bam'),
              mgd.TempOutputFile('hist_tumour_formatted.csv'),
              mgd.TempOutputFile('tumour_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        func="single_cell.workflows.lumpy.lumpy_calling_workflow",
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam'),
            mgd.TempInputFile('tumour.splitters.sorted.bam'),
            mgd.TempInputFile('hist_tumour_formatted.csv'),
            mgd.TempInputFile('tumour_mean_stdev.yaml'),
            mgd.OutputFile(lumpy_breakpoints_bed),
            mgd.OutputFile(lumpy_breakpoints_csv, extensions=['.yaml']),
            mgd.OutputFile(lumpy_breakpoints_evidence, extensions=['.yaml']),
        ),
    )

    return workflow
Example #12
File: cna_calling.py Project: wisekh6/wgs
def call_copynumber(
        samples, config, tumours, normals, breakpoints,
        titan_raw_dir, remixt_results,
        remixt_raw_dir, titan_segments, titan_params, titan_markers
):
    breakpoints = dict([(sampid, breakpoints[sampid])
                        for sampid in samples])
    remixt_results = dict([(sampid, remixt_results[sampid])
                           for sampid in samples])
    titan_segments = dict([(sampid, titan_segments[sampid])
                           for sampid in samples])
    titan_params = dict([(sampid, titan_params[sampid])
                         for sampid in samples])
    titan_markers = dict([(sampid, titan_markers[sampid])
                          for sampid in samples])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments', 'sample_id', fnames=titan_segments),
            mgd.OutputFile('titan_params', 'sample_id', fnames=titan_params),
            mgd.OutputFile('titan_markers', 'sample_id', fnames=titan_markers),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('breakpoints', 'sample_id', fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results', 'sample_id', fnames=remixt_results),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    return workflow
Example #13
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):

    workflow = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    if copy_ref_genome:
        workflow.commandline(
            name='copy_genome',
            ctx={'local': True},
            args=(
                'cp',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    else:
        workflow.commandline(
            name='link_genome',
            ctx={'local': True},
            args=(
                'ln',
                '-s',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    workflow.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=(
            mgd.InputFile(local_ref_genome_fasta_path),
            mgd.OutputFile(ref_genome_index_prefix),
        )
    )

    workflow.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=(
            mgd.InputFile(ref_genome_index_prefix),
            mgd.InputFile(transcript_gtf_file),
            mgd.OutputFile(transcriptome_index_prefix),
        )
    )

    return workflow
Example #14
def count_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['count_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    allele_counts_filename = os.path.join(args["out_dir"],
                                          "allele_counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    haplotypes_filename, tumour_cells = inpututils.load_count_haps_input(
        args['input_yaml'])

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(
        name='extract_allele_readcounts',
        func='single_cell.workflows.extract_allele_readcounts.extract_allele_readcounts',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.InputFile('tumour_cells.bam',
                          'tumour_cell_id',
                          extensions=['.bai'],
                          axes_origin=[],
                          fnames=tumour_cells),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [allele_counts_filename],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'count_haps'
            }
        })

    return workflow
Example #15
def infer_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['infer_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haplotypes_filename = os.path.join(args["out_dir"], "haplotypes.tsv")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    normal_data = inpututils.load_infer_haps_input(args['input_yaml'])

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        bam_file = mgd.InputFile('normal.bam',
                                 'normal_cell_id',
                                 fnames=normal_data,
                                 extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(normal_data, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [haplotypes_filename],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'infer_haps'
            }
        })

    return workflow
Example #16
def patient_workflow(config, patient_id, patient_input, output_file):
    workflow = pypeliner.workflow.Workflow()

    patient_bam_dir = config["bam_directory"] + patient_id
    patient_result_dir = config["results_dir"] + patient_id

    helpers.makedirs(patient_bam_dir)
    helpers.makedirs(patient_result_dir)

    input_args = helpers.create_input_args(patient_input, patient_bam_dir)

    workflow.setobj(obj=mgd.OutputChunks('sample_id', ),
                    value=input_args['all_samples'])

    workflow.subworkflow(name='align_samples',
                         func=alignment.align_sample,
                         axes=('sample_id', ),
                         args=(
                             config,
                             mgd.InputFile('fastq_1',
                                           'sample_id',
                                           fnames=input_args['fastqs_r1']),
                             mgd.InputFile('fastq_2',
                                           'sample_id',
                                           fnames=input_args['fastqs_r2']),
                             mgd.InputInstance('sample_id'),
                             mgd.OutputFile('sample.bam',
                                            'sample_id',
                                            fnames=input_args['all_bams']),
                             mgd.OutputFile('sample.bam.bai',
                                            'sample_id',
                                            fnames=input_args['all_bais']),
                         ))

    workflow.subworkflow(name='run_analyses',
                         func=analysis.partition_tumour,
                         args=(
                             config,
                             input_args,
                             patient_id,
                             patient_result_dir,
                             mgd.InputFile('sample.bam',
                                           'sample_id',
                                           fnames=input_args['all_bams'],
                                           axes_origin=[]),
                             mgd.InputFile('sample.bam.bai',
                                           'sample_id',
                                           fnames=input_args['all_bais'],
                                           axes_origin=[]),
                             mgd.OutputFile(output_file),
                         ))

    return workflow
Example #17
File: workflows.py Project: aroth85/soil
def create_eagle_ref_data_workflow(vcf_url_template,
                                   out_file,
                                   local_download=False):

    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    chrom_map = chrom_map[chrom_map['ncbi'].isin(
        [str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(name='download_vcf_files',
                       axes=('chrom', ),
                       ctx={'local': local_download},
                       func=soil.ref_data.tasks.download,
                       args=(mgd.TempInputObj('vcf_url', 'chrom'),
                             mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    workflow.transform(name='write_chrom_map',
                       func=tasks.write_chrom_map_file,
                       args=(mgd.InputFile(chrom_map_file),
                             mgd.TempOutputFile('chrom_map.tsv')))

    workflow.transform(name='rename_chroms',
                       axes=('chrom', ),
                       func=soil.wrappers.bcftools.tasks.rename_chroms,
                       args=(mgd.TempInputFile('chrom_map.tsv'),
                             mgd.TempInputFile('raw.vcf.gz', 'chrom'),
                             mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(name='concat_vcfs',
                       func=soil.wrappers.bcftools.tasks.concatenate_vcf,
                       args=(mgd.TempInputFile('renamed.bcf', 'chrom'),
                             mgd.OutputFile(out_file)),
                       kwargs={'bcf_output': True})

    workflow.commandline(name='index',
                         args=('bcftools', 'index', mgd.InputFile(out_file),
                               '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
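A hypothetical call; the URL template is a placeholder and only needs a {chrom} field, which the workflow fills with NCBI chromosome names 1-22 before renaming, concatenating and indexing the resulting BCF:

workflow = create_eagle_ref_data_workflow(
    'ftp://example.org/phase3/ALL.chr{chrom}.genotypes.vcf.gz',
    'eagle_reference_panel.bcf',
    local_download=True,
)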
Example #18
def create_patient_workflow(pseudo_bulk_group, mafs, sample_all_snv_csvs,
                            mutationreport, merged_maf, high_impact_maf,
                            merged_snvs, merged_high_impact_snvs):
    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.transform(
        name='merge_mafs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_mafs',
        args=(
            mafs,
            mgd.OutputFile(merged_maf),
        ),
        kwargs={"id_colname": True})
    workflow.transform(
        name='filter_merged_maf',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_maf_for_high_impact',
        args=(
            mgd.InputFile(merged_maf),
            mgd.OutputFile(high_impact_maf),
        ),
    )
    workflow.transform(
        name='merge_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_snvs',
        args=(
            sample_all_snv_csvs,
            mgd.OutputFile(merged_snvs),
        ),
        kwargs={"id_colname": True})
    workflow.transform(
        name='filter_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_snvs_for_high_impact',
        args=(
            mgd.InputFile(merged_snvs),
            mgd.OutputFile(merged_high_impact_snvs),
        ),
    )

    workflow.transform(
        name='mutationreport',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.create_mutation_report',
        args=(pseudo_bulk_group, mgd.InputFile(merged_maf),
              mgd.InputFile(high_impact_maf),
              mgd.InputFile(merged_high_impact_snvs),
              mgd.OutputFile(mutationreport), mgd.TempSpace("mutationreport")),
    )

    return workflow
Example #19
def conversion_workflow(args):
    docker = docker_containers()

    converted_dir = args["out_dir"]

    cell_ids, cfse_images, livedead_images = get_cell_images(
        args['input_yaml'])

    converted_image_template = os.path.join(converted_dir, '{cell_id}.png')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': docker['microscope_image_converter']})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='convert',
        func='microscope_image_converter.tasks.convert',
        axes=('cell_id', ),
        args=(
            mgd.InputFile('livedead.tif', 'cell_id', fnames=livedead_images),
            mgd.InputFile('cfse.tif', 'cell_id', fnames=cfse_images),
            mgd.OutputFile('converted.png',
                           'cell_id',
                           template=converted_image_template,
                           axes_origin=[]),
        ),
    )

    converted_meta = os.path.join(converted_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(converted_dir, 'input.yaml')
    workflow.transform(
        name='generate_meta_files_results',
        func='microscope_image_converter.tasks.generate_and_upload_metadata',
        args=(sys.argv[0:], converted_dir,
              mgd.Template('converted.png',
                           'cell_id',
                           template=converted_image_template),
              mgd.OutputFile(converted_meta)),
        kwargs={
            'input_yaml_data': load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'cell_ids': cell_ids,
                'type': 'dlp_microscope_merged',
            }
        })

    return workflow
Example #20
def infer_haps_workflow(args):
    config = helpers.load_config(args)
    config = config['infer_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results",
                                          "allele_counts.tsv")

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    if args['normal']:
        bam_file = normal_cells if normal_cells else normal_wgs
    else:
        bam_file = tumour_cells if tumour_cells else tumour_wgs

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )
        bam_file = mgd.InputFile('tumour.bam',
                                 'cell_id',
                                 fnames=bam_file,
                                 extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(bam_file, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
        kwargs={'normal': args['normal']},
    )

    return workflow
Example #21
def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
    merged_bams = dict([(region, merged_bams[region])
                        for region in regions])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    one_split_job = config["one_split_job"]

    if one_split_job:
        workflow.transform(
            name='merge_bams',
            ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(
                mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
                mgd.OutputFile('merged.bam', "region", fnames=merged_bams, axes_origin=[], extensions=['.bai']),
                regions,
                mgd.TempSpace("merge_bams_tempdir")
            ),
            kwargs={"ncores": config["max_cores"]}
        )
    else:
        workflow.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region',),
            args=(
                mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile(
                    'tumour_regions.bam', 'region', axes_origin=[], extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
            ),
        )

    return workflow
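A config fragment covering only the keys this workflow reads; the values are illustrative placeholders, not recommended settings:

config = {
    'one_split_job': True,   # merge all regions in one multi-core job instead of one job per region
    'memory': {'med': 8},    # memory (GB) requested for the single merge job
    'max_cores': 8,          # cores requested and passed through as ncores
}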
Example #22
def create_sample_qc_workflow_normal_only(
        sample_id,
        refdir,
        normal_bam,
        roh,
        germline_calls,
        genome_wide_plot,
        normal_coverage,
        chromosomes,
        bins,
        mapping_qual_threshold,
        single_node=False
):

    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(
        name='coverage_normal_data',
        func=get_coverage_data,
        args=(
            mgd.InputFile(normal_bam),
            mgd.OutputFile(normal_coverage),
            refdir,
            chromosomes,
            mapping_qual_threshold,
            bins,
        ),
        kwargs={'single_node': single_node}
    )

    workflow.transform(
        name='generate_genome_wide_plot',
        ctx=helpers.get_default_ctx(
            memory=10,
        ),
        func="wgs.workflows.sample_qc.tasks.genome_wide",
        args=(
            sample_id,
            mgd.InputFile(roh),
            mgd.InputFile(germline_calls),
            mgd.InputFile(normal_coverage),
            chromosomes,
            mgd.OutputFile(genome_wide_plot),
        ),
        kwargs={"normal_only": True}
    )

    return workflow
Example #23
File: workflows.py Project: aroth85/soil
def create_custom_dna_proteome_from_fastq_workflow(normal_fastq_file_1,
                                                   normal_fastq_file_2,
                                                   tumour_fastq_file_1,
                                                   tumour_fastq_file_2,
                                                   ref_genome_fasta_file,
                                                   ref_proteome_fasta_file,
                                                   normal_bam_file,
                                                   tumour_bam_file,
                                                   custom_proteome_file,
                                                   strelka_file,
                                                   genome_version='GRCh37',
                                                   is_exome=False,
                                                   pyensembl_cache_dir=None,
                                                   threads=1):

    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(name='align_normal',
                         func=create_align_workflow,
                         args=(mgd.InputFile(normal_fastq_file_1),
                               mgd.InputFile(normal_fastq_file_2),
                               mgd.InputFile(ref_genome_fasta_file),
                               mgd.OutputFile(normal_bam_file)),
                         kwargs={'threads': threads})

    workflow.subworkflow(name='align_tumour',
                         func=create_align_workflow,
                         args=(mgd.InputFile(tumour_fastq_file_1),
                               mgd.InputFile(tumour_fastq_file_2),
                               mgd.InputFile(ref_genome_fasta_file),
                               mgd.OutputFile(tumour_bam_file)),
                         kwargs={'threads': threads})

    workflow.subworkflow(name='create_db',
                         func=create_custom_proteom_from_bam_workflow,
                         args=(mgd.InputFile(normal_bam_file),
                               mgd.InputFile(tumour_bam_file),
                               mgd.InputFile(ref_genome_fasta_file),
                               mgd.InputFile(ref_proteome_fasta_file),
                               mgd.OutputFile(custom_proteome_file),
                               mgd.OutputFile(strelka_file)),
                         kwargs={
                             'genome_version': genome_version,
                             'is_exome': is_exome,
                             'pyensembl_cache_dir': pyensembl_cache_dir
                         })

    return workflow
Example #24
def create_vcf2maf_workflow(vcf_file,
                            maf_file,
                            reference,
                            tumour_id=None,
                            normal_id=None):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='vcf2maf',
                       func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
                       args=(mgd.InputFile(vcf_file),
                             mgd.TempOutputFile('maf_file.maf'),
                             mgd.TempSpace('vcf2maf_temp'), reference),
                       kwargs={
                           'tumour_id': tumour_id,
                           'normal_id': normal_id
                       })

    workflow.transform(name='update_ids',
                       func='wgs.workflows.vcf2maf.tasks.update_ids',
                       args=(
                           mgd.TempInputFile('maf_file.maf'),
                           tumour_id,
                           normal_id,
                           mgd.OutputFile(maf_file),
                       ))

    return workflow
Example #25
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('interval',), value=[str(x) for x in range(1, 23)] + ['X'])

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
            )
        )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
            )
        )

    return workflow
Example #26
def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(name='read',
                       func=read_stuff,
                       ret=mgd.TempOutputObj('input_data'),
                       args=(mgd.InputFile(input_filename), ))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'), ))

    # Write the object to an output file
    workflow.transform(name='write',
                       func=write_stuff,
                       args=(mgd.TempInputObj('output_data'),
                             mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(name='sub_workflow_2',
                         func=create_workflow_2,
                         args=(mgd.TempInputFile('output_file'),
                               mgd.OutputFile(output_filename)))

    return workflow
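The task functions referenced above come from the pypeliner tutorial and are not shown on this page; a minimal sketch of what they might look like, with a commented-out driver (create_workflow_2 is assumed to be defined alongside):

import pypeliner
import pypeliner.managed as mgd


class Stuff(object):
    def __init__(self, some_string):
        # accessed in the workflow via mgd.TempInputObj('input_data').prop('some_string')
        self.some_string = some_string


def read_stuff(in_filename):
    # read the raw text and wrap it in an object suitable for a managed temp object
    with open(in_filename) as f:
        return Stuff(f.read().strip())


def do_stuff(some_string):
    # trivial transformation whose result is stored in another temp object
    return some_string.upper()


def write_stuff(data, out_filename):
    # write the transformed value to the temp output file
    with open(out_filename, 'w') as f:
        f.write(data)


# pyp = pypeliner.app.Pypeline(config={'tmpdir': './pypeliner_tmp'})
# pyp.run(create_workflow_1('input.txt', 'output.txt'))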
Example #27
def create_lumpy_workflow(lumpy_vcf,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=False):
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
    else:
        normal_disc = None
        normal_split = None

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
    else:
        tumour_disc = None
        tumour_split = None

    if normal_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(normal_bam,
                  mgd.TempOutputFile('normal.discordants.sorted.bam'),
                  mgd.TempOutputFile('normal.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    if tumour_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(tumour_bam,
                  mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                  mgd.TempOutputFile('tumour.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(mgd.OutputFile(lumpy_vcf),
              config.default_params('breakpoint_calling')['lumpy_paths']),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy')
        })

    return workflow
Example #28
def create_db_annotation_workflow(in_vcf_file,
                                  out_csv_file,
                                  db_vcf_file,
                                  split_size=1e4):
    workflow = pypeliner.workflow.Workflow(
        ctx=dict(mem=2, num_retry=3, mem_retry_increment=2))

    workflow.transform(name='split_vcf',
                       func='single_cell.utils.vcfutils.split_vcf',
                       args=(mgd.InputFile(in_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split', ),
        func='single_cell.workflows.db_annotation.tasks.annotate_db_status',
        args=(db_vcf_file, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('annotated.csv.gz',
                                 'split',
                                 extensions=['.yaml'])))

    workflow.transform(name='merge_tables',
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(mgd.TempInputFile('annotated.csv.gz',
                                               'split',
                                               extensions=['.yaml']),
                             mgd.OutputFile(out_csv_file,
                                            extensions=['.yaml'])))

    return workflow
Example #29
def create_extract_seqdata_workflow(
    bam_filename,
    seqdata_filename,
    remixt_config,
    remixt_ref_data_dir,
    config,
):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'docker_image': config['docker']['single_cell_pipeline'],
        'mem': config["memory"]['high']
    }

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='create_cell_seqdata',
        ctx=ctx,
        func="single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata",
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.InputFile(bam_filename, extensions=['.bai']),
            mgd.TempSpace("extract_seqdata_temp"),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={'chromosomes': config['chromosomes']})

    return workflow
Example #30
File: workflows.py Project: aroth85/soil
def create_db_workflow(in_file,
                       ref_proteome_fasta_file,
                       out_file,
                       genome_version='GRCh37',
                       pyensembl_cache_dir=None):

    sandbox = pypeliner.sandbox.CondaSandbox(pip_packages=['varcode'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='clean_ref_fasta',
                       func=tasks.clean_ref_proteome_ids,
                       args=(mgd.InputFile(ref_proteome_fasta_file),
                             mgd.TempOutputFile('ref.fasta')))

    workflow.transform(name='build_variant_table',
                       func=tasks.build_variant_table,
                       args=(mgd.InputFile(in_file),
                             mgd.TempOutputFile('variant_table.tsv.gz')),
                       kwargs={
                           'genome_version': genome_version,
                           'pyensembl_cache_dir': pyensembl_cache_dir
                       })

    workflow.transform(name='build_variant_fasta',
                       func=tasks.build_variant_fasta,
                       args=(mgd.TempInputFile('variant_table.tsv.gz'),
                             mgd.TempOutputFile('var.fasta')))

    workflow.commandline(name='build_db',
                         args=('cat', mgd.TempInputFile('ref.fasta'),
                               mgd.TempInputFile('var.fasta'), '>',
                               mgd.OutputFile(out_file)))

    return workflow