Example #1
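All of the snippets below are pypeliner workflow factories excerpted from larger pipeline modules. They assume the usual module-level imports, sketched here; names such as tasks, helpers, and config refer to each pipeline's own modules and are not shown in every example.

# Common imports assumed by every snippet below.
import pypeliner
import pypeliner.managed as mgd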
def run_Strelka(config, normal_bam, tumour_bam, snv_output_file,
                indel_output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='configure_bed',
                       func=tasks.configure_bed,
                       args=(mgd.TempSpace('bed_space'),
                             mgd.InputFile(config['bed_file']),
                             mgd.TempOutputFile('bed.gz'),
                             mgd.TempOutputFile('bed.gz.tbi')))

    workflow.transform(name='run_strelka',
                       ctx={
                           'mem': 10,
                           'ncpus': 1,
                           'walltime': '08:00'
                       },
                       func=tasks.run_strelka,
                       args=(
                           config,
                           mgd.InputFile(normal_bam),
                           mgd.InputFile(tumour_bam),
                           mgd.TempInputFile('bed.gz'),
                           mgd.TempInputFile('bed.gz.tbi'),
                           mgd.TempSpace('strelka_workspace'),
                           mgd.OutputFile(snv_output_file),
                           mgd.OutputFile(indel_output_file),
                       ))

    return workflow
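A factory like run_Strelka only assembles and returns the workflow; nothing executes until the object is handed to a pypeliner runner. A minimal local-execution sketch, assuming pypeliner is installed; the runner keys shown ('tmpdir', 'maxjobs', 'submit') are standard pypeliner options and all paths are hypothetical:

import pypeliner
import pypeliner.app

config = {'bed_file': '/refs/regions.bed'}  # hypothetical layout
pyp = pypeliner.app.Pypeline(config={'tmpdir': './pipeline_tmp',
                                     'maxjobs': 4,
                                     'submit': 'local'})
wf = run_Strelka(config, 'normal.bam', 'tumour.bam',
                 'somatic.snv.vcf.gz', 'somatic.indel.vcf.gz')
pyp.run(wf)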
Example #2
def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='wgs.workflows.alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.OutputFile(r1_html),
            mgd.OutputFile(r1_plot),
            mgd.TempSpace('fastqc_R1'),
        ),
    )

    workflow.transform(
        name="fastqc_r2",
        func='wgs.workflows.alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.OutputFile(r2_html),
            mgd.OutputFile(r2_plot),
            mgd.TempSpace('fastqc_R2'),
        ),
    )

    return workflow
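Unlike Example #1, this workflow passes func as a dotted import path rather than a function object; pypeliner accepts both, and the string form defers the import until the job actually runs. A hypothetical side-by-side sketch:

import pypeliner
import pypeliner.workflow
import pypeliner.managed as mgd

def touch(path):
    # Trivial illustrative task.
    open(path, 'w').close()

workflow = pypeliner.workflow.Workflow()
# Callable form: resolved when the workflow is built.
workflow.transform(name='by_object', func=touch,
                   args=(mgd.TempOutputFile('a.txt'),))
# String form: 'mymodule.touch' is a hypothetical import path,
# resolved only in the worker process that runs the job.
workflow.transform(name='by_path', func='mymodule.touch',
                   args=(mgd.TempOutputFile('b.txt'),))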
Example #3
def create_optitype_workflow(bam_file, hla_type_file, is_rna=False, threads=1):
    if check_chr_prefix(bam_file):
        chrom_str = 'chr6'
    else:
        chrom_str = '6'

    sandbox = soil.utils.workflow.get_sandbox(
        ['optitype', 'razers3', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools',
            'view',
            '-bh',
            '-f',
            '2',
            '-F',
            '4',
            mgd.InputFile(bam_file),
            chrom_str,
            '|',
            'samtools',
            'collate',
            '-O',
            '-',
            mgd.TempSpace('chr6_collate_temp'),
            '|',
            'samtools',
            'bam2fq',
            '-1',
            mgd.TempOutputFile('chr6_reads_1.fq'),
            '-2',
            mgd.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    workflow.transform(name='optitype',
                       ctx={
                           'mem': 24,
                           'mem_retry_increment': 8,
                           'num_retry': 3,
                           'threads': threads
                       },
                       func=tasks.run_optitype,
                       args=(
                           mgd.TempInputFile('chr6_reads_1.fq'),
                           mgd.TempInputFile('chr6_reads_2.fq'),
                           mgd.OutputFile(hla_type_file),
                           mgd.TempSpace('optitype_temp'),
                       ),
                       kwargs={
                           'is_rna': is_rna,
                           'threads': threads,
                       })

    return workflow
Example #4
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):

    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='star_align',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': align_threads
                       },
                       func=tasks.align,
                       args=(
                           mgd.InputFile(fastq_file_1),
                           mgd.InputFile(fastq_file_2),
                           ref_genome_dir,
                           mgd.TempOutputFile('aligned.bam'),
                           mgd.TempSpace('align_tmp'),
                       ),
                       kwargs={
                           'add_xs_tag': add_xs_tag,
                           'read_group_info': read_group_info,
                           'threads': align_threads,
                       })

    workflow.transform(name='sort',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': sort_threads
                       },
                       func=soil.wrappers.sambamba.tasks.sort,
                       args=(
                           mgd.TempInputFile('aligned.bam'),
                           mgd.OutputFile(out_bam_file),
                           mgd.TempSpace('sort_tmp'),
                       ),
                       kwargs={'threads': sort_threads})

    workflow.commandline(name='index',
                         args=(
                             'samtools',
                             'index',
                             mgd.InputFile(out_bam_file),
                             mgd.OutputFile(out_bam_file + '.bai'),
                         ))

    return workflow
Example #5
def create_vcf2maf_workflow(vcf_file,
                            maf_file,
                            reference,
                            tumour_id=None,
                            normal_id=None):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='vcf2maf',
                       func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
                       args=(mgd.InputFile(vcf_file),
                             mgd.TempOutputFile('maf_file.maf'),
                             mgd.TempSpace('vcf2maf_temp'), reference),
                       kwargs={
                           'tumour_id': tumour_id,
                           'normal_id': normal_id
                       })

    workflow.transform(name='update_ids',
                       func='wgs.workflows.vcf2maf.tasks.update_ids',
                       args=(
                           mgd.TempInputFile('maf_file.maf'),
                           tumour_id,
                           normal_id,
                           mgd.OutputFile(maf_file),
                       ))

    return workflow
Example #6
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('interval'),
                    value=[str(x) for x in range(1, 23)] + ['X'])

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
            )
        )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
            )
        )

    return workflow
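The shape above is pypeliner's standard scatter/gather: setobj declares the chunks of the 'interval' axis, the transform with axes=('interval',) fans out one job per chunk, and the merge transform (no axes of its own, axes_origin=[]) receives the chunked temp file as a dict keyed by chunk. A stripped-down sketch of the same pattern with hypothetical task functions:

import pypeliner
import pypeliner.workflow
import pypeliner.managed as mgd

def per_chrom(chrom, out_file):
    # Runs once per chromosome chunk.
    with open(out_file, 'w') as f:
        f.write(chrom + '\n')

def gather(in_files, out_file):
    # in_files arrives as a dict mapping chunk id ('1', '2', 'X') to path.
    with open(out_file, 'w') as out:
        for key in sorted(in_files):
            with open(in_files[key]) as f:
                out.write(f.read())

workflow = pypeliner.workflow.Workflow()
workflow.setobj(obj=mgd.OutputChunks('chrom'), value=['1', '2', 'X'])
workflow.transform(name='per_chrom', axes=('chrom',), func=per_chrom,
                   args=(mgd.InputInstance('chrom'),
                         mgd.TempOutputFile('counts.txt', 'chrom')))
workflow.transform(name='gather', func=gather,
                   args=(mgd.TempInputFile('counts.txt', 'chrom',
                                           axes_origin=[]),
                         mgd.OutputFile('all_counts.txt')))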
Example #7
def create_svaba_workflow(
    tumour_bam,
    normal_bam,
    svaba_vcf,
    reference,
):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='run_svaba',
        ctx=helpers.get_default_ctx(memory=10,
                                    walltime='72:00',
                                    ncpus='8',
                                    disk=300),
        func='wgs.workflows.svaba.tasks.run_svaba',
        args=(mgd.InputFile(tumour_bam), mgd.InputFile(normal_bam),
              mgd.TempOutputFile('germline.indel.vcf.gz'),
              mgd.TempOutputFile('germline.sv.vcf.gz'),
              mgd.TempOutputFile('somatic.indel.vcf.gz'),
              mgd.OutputFile(svaba_vcf),
              mgd.TempOutputFile('unfiltered.germline.indel.vcf.gz'),
              mgd.TempOutputFile('unfiltered.germline.sv.vcf.gz'),
              mgd.TempOutputFile('unfiltered.somatic.indel.vcf.gz'),
              mgd.TempOutputFile('unfiltered.somatic.sv.vcf.gz'), reference,
              mgd.TempSpace('svaba_tempdir_full')),
        kwargs={
            'ncores': 8,
        })

    return workflow
Example #8
def create_extract_seqdata_workflow(
    bam_filename,
    seqdata_filename,
    remixt_config,
    remixt_ref_data_dir,
    config,
):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'docker_image': config['docker']['single_cell_pipeline'],
        'mem': config["memory"]['high']
    }

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='create_cell_seqdata',
        ctx=ctx,
        func="single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata",
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.InputFile(bam_filename, extensions=['.bai']),
            mgd.TempSpace("extract_seqdata_temp"),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={'chromosomes': config['chromosomes']})

    return workflow
Example #9
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 tag=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.transform(
            name='bamdisc_normal',
            func="single_cell.workflows.destruct_singlecell.tasks."
                 "destruct_bamdisc_and_numreads",
            ctx={
                'io': 1,
                'mem': 8,
                'disk': 200
            },
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_stats),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.TempSpace('bamdisc_normal_tempspace'),
            ))
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )

        workflow.subworkflow(name='process_normal_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 mgd.InputFile('bam',
                                               'normal_cell_id',
                                               fnames=normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})

    return workflow
Example #10
def pre_alignment(fastq_r1, fastq_r2, metrics_tar):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.TempOutputFile('R1.html'),
            mgd.TempOutputFile('R1.pdf'),
            mgd.TempSpace('fastqc_R1'),
        ),
        kwargs={
            'docker_image': config.containers("fastqc"),
        })

    workflow.transform(
        name="fastqc_r2",
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.TempOutputFile('R2.html'),
            mgd.TempOutputFile('R2.pdf'),
            mgd.TempSpace('fastqc_R2'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        })

    workflow.transform(name='tar',
                       func='alignment.utils.helpers.make_tar_from_files',
                       args=(mgd.OutputFile(metrics_tar), [
                           mgd.TempInputFile('R1.html'),
                           mgd.TempInputFile('R1.pdf'),
                           mgd.TempInputFile('R2.html'),
                           mgd.TempInputFile('R2.pdf'),
                       ], mgd.TempSpace('wgs_metrics')))

    return workflow
Example #11
def create_patient_workflow(pseudo_bulk_group, mafs, sample_all_snv_csvs,
                            mutationreport, merged_maf, high_impact_maf,
                            merged_snvs, merged_high_impact_snvs):
    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.transform(
        name='merge_mafs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_mafs',
        args=(
            mafs,
            mgd.OutputFile(merged_maf),
        ),
        kwargs={"id_colname": True})
    workflow.transform(
        name='filter_merged_maf',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_maf_for_high_impact',
        args=(
            mgd.InputFile(merged_maf),
            mgd.OutputFile(high_impact_maf),
        ),
    )
    workflow.transform(
        name='merge_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.merge_snvs',
        args=(
            sample_all_snv_csvs,
            mgd.OutputFile(merged_snvs),
        ),
        kwargs={"id_colname": True})
    workflow.transform(
        name='filter_snvs',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.filter_snvs_for_high_impact',
        args=(
            mgd.InputFile(merged_snvs),
            mgd.OutputFile(merged_high_impact_snvs),
        ),
    )

    workflow.transform(
        name='mutationreport',
        func='single_cell.workflows.pseudo_bulk_qc.tasks.create_mutation_report',
        args=(pseudo_bulk_group, mgd.InputFile(merged_maf),
              mgd.InputFile(high_impact_maf),
              mgd.InputFile(merged_high_impact_snvs),
              mgd.OutputFile(mutationreport), mgd.TempSpace("mutationreport")),
    )

    return workflow
Example #12
def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
    merged_bams = {region: merged_bams[region] for region in regions}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    one_split_job = config["one_split_job"]

    if one_split_job:
        workflow.transform(
            name='merge_bams',
            ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(
                mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
                mgd.OutputFile('merged.bam', "region", fnames=merged_bams, axes_origin=[], extensions=['.bai']),
                regions,
                mgd.TempSpace("merge_bams_tempdir")
            ),
            kwargs={"ncores": config["max_cores"]}
        )
    else:
        workflow.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region',),
            args=(
                mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile(
                    'tumour_regions.bam', 'region', axes_origin=[], extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
            ),
        )

    return workflow
Example #13
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes=None,
                            split_size=int(1e7)):

    regions = utils.get_bam_regions(normal_bam_file,
                                    split_size,
                                    chromosomes=chromosomes)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=regions)

    workflow.transform(
        name='run_somatic',
        axes=('regions', ),
        ctx={
            'mem': 6,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=tasks.run_somatic,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempSpace('varscan_tmp', 'regions'),
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={
            'mem': 2,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example #14
def circos_plot(titan_calls, remixt_calls, sample_id, breakpoints,
                circos_plot_remixt, circos_plot_titan):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='prep_titan',
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempOutputFile("titan_prepped"),
        )
    )

    workflow.transform(
        name='prep_remixt',
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.InputFile(remixt_calls),
            sample_id,
            mgd.TempOutputFile("remixt_prepped"),
        )
    )
    workflow.transform(
        name='circos_plot',
        func='wgs.workflows.sample_qc.tasks.circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.TempInputFile("titan_prepped"),
            mgd.TempInputFile("remixt_prepped"),
            sample_id,
            breakpoints,
            mgd.OutputFile(circos_plot_remixt),
            mgd.OutputFile(circos_plot_titan),
            mgd.TempSpace("circos")
        )
    )

    return workflow
Example #15
File: workflows.py, Project: aroth85/soil
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_fasta_file,
                          out_bam_file,
                          threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='align',
        func=soil.wrappers.bwa.workflows.create_align_workflow,
        args=(mgd.InputFile(fastq_file_1), mgd.InputFile(fastq_file_2),
              mgd.InputFile(ref_genome_fasta_file),
              mgd.TempOutputFile('aligned.bam')),
        kwargs={
            'align_threads': threads,
            'sort_threads': threads
        })

    workflow.transform(name='mark_dups',
                       func=soil.wrappers.sambamba.tasks.markdups,
                       args=(mgd.TempInputFile('aligned.bam'),
                             mgd.OutputFile(out_bam_file),
                             mgd.TempSpace('mark_dups_tmp')),
                       kwargs={'threads': threads})

    workflow.commandline(name='index',
                         args=(
                             'samtools',
                             'index',
                             mgd.InputFile(out_bam_file),
                             mgd.OutputFile(out_bam_file + '.bai'),
                         ))

    return workflow
Example #16
def create_destruct_workflow(
    bam_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    # Optionally cache raw reads for quicker rerun
    if raw_data_dir is not None:
        mgd_stats = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_stats.txt'), 'bylibrary')
        mgd_reads_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads1.fq.gz'),
            'bylibrary')
        mgd_reads_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads2.fq.gz'),
            'bylibrary')
        mgd_sample_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample1.fq.gz'),
            'bylibrary')
        mgd_sample_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample2.fq.gz'),
            'bylibrary')

    else:
        mgd_stats = mgd.TempFile('stats.txt', 'bylibrary')
        mgd_reads_1 = mgd.TempFile('reads1.fq.gz', 'bylibrary')
        mgd_reads_2 = mgd.TempFile('reads2.fq.gz', 'bylibrary')
        mgd_sample_1 = mgd.TempFile('sample1.fq.gz', 'bylibrary')
        mgd_sample_2 = mgd.TempFile('sample2.fq.gz', 'bylibrary')

    config = destruct.defaultconfig.get_config(ref_data_dir, config)

    workflow = pypeliner.workflow.Workflow()

    # Set the library ids

    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(bam_filenames.keys()),
    )

    # Retrieve discordant reads and stats from bam files

    workflow.commandline(
        name='bamdisc',
        axes=('bylibrary', ),
        ctx={
            'io': 1,
            'mem': 8
        },
        args=(
            'destruct_bamdiscordantfastq',
            '-r',
            '-c',
            config['bam_max_soft_clipped'],
            '-f',
            config['bam_max_fragment_length'],
            '-b',
            mgd.InputFile('bam', 'bylibrary', fnames=bam_filenames),
            '-s',
            mgd_stats.as_output(),
            '--fastq1',
            mgd_reads_1.as_output(),
            '--fastq2',
            mgd_reads_2.as_output(),
            '-t',
            mgd.TempSpace('bamdisc.tempspace', 'bylibrary'),
            '-n',
            config['num_read_samples'],
            '--sample1',
            mgd_sample_1.as_output(),
            '--sample2',
            mgd_sample_2.as_output(),
        ),
    )

    workflow.subworkflow(
        name='destruct_fastq',
        func=create_destruct_fastq_workflow,
        args=(
            mgd_reads_1.as_input(),
            mgd_reads_2.as_input(),
            mgd_sample_1.as_input(),
            mgd_sample_2.as_input(),
            mgd_stats.as_input(),
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
            mgd.OutputFile(breakpoint_read_table),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    return workflow
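The conditional at the top of this factory is the part worth studying: depending on raw_data_dir, the same variables bind either to persistent path templates (mgd.File) or to managed temporaries (mgd.TempFile), and the jobs below only ever see .as_output() or .as_input(). A hypothetical reduction of that switch:

import os
import pypeliner.managed as mgd

cache_dir = None  # hypothetical; set to a directory path to keep the fastqs

if cache_dir is not None:
    # Persistent template: one file per 'bylibrary' chunk, kept on disk
    # so a rerun can skip the expensive extraction step.
    reads = mgd.File(os.path.join(cache_dir, '{bylibrary}_reads.fq.gz'),
                     'bylibrary')
else:
    # Managed temporary: pypeliner owns the files and may clean them up.
    reads = mgd.TempFile('reads.fq.gz', 'bylibrary')

# Producing jobs then take reads.as_output() in their args;
# consuming jobs take reads.as_input().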
Example #17
def create_destruct_fastq_workflow(
    fastq1_filenames,
    fastq2_filenames,
    sample1_filenames,
    sample2_filenames,
    stats_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    workflow = pypeliner.workflow.Workflow()

    # Set the library ids

    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(fastq1_filenames.keys()),
    )

    workflow.transform(
        name='readstats',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.read_stats',
        ret=mgd.TempOutputObj('stats', 'bylibrary'),
        args=(
            mgd.InputFile('stats.txt', 'bylibrary', fnames=stats_filenames),
            config['fragment_length_num_stddevs'],
        ),
    )

    # Align a sample of reads and calculate alignment statistics

    workflow.transform(
        name='prepseed_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.InputFile('sample1.fq.gz',
                          'bylibrary',
                          fnames=sample1_filenames),
            mgd.InputFile('sample2.fq.gz',
                          'bylibrary',
                          fnames=sample2_filenames),
            36,
            mgd.TempOutputFile('sample.seed', 'bylibrary'),
        ),
    )

    workflow.commandline(
        name='bwtrealign_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('sample.seed', 'bylibrary'),
            '--chunkmbs',
            '512',
            '-k',
            '1000',
            '-m',
            '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_aligntrue',
            '-a',
            '-',
            '-1',
            mgd.InputFile('sample1.fq.gz',
                          'bylibrary',
                          fnames=sample1_filenames),
            '-2',
            mgd.InputFile('sample2.fq.gz',
                          'bylibrary',
                          fnames=sample2_filenames),
            '-r',
            config['genome_fasta'],
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmin',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '-s',
            mgd.TempOutputFile('samples.align.true', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='scorestats',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.score_stats.create_score_stats',
        args=(
            mgd.TempInputFile('samples.align.true', 'bylibrary'),
            config['match_score'],
            mgd.TempOutputFile('score.stats', 'bylibrary'),
        ),
    )

    # Split discordant fastqs and align

    workflow.transform(
        name='splitfastq1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads1.fq.gz', 'bylibrary',
                          fnames=fastq1_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads1', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='splitfastq2',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads2.fq.gz', 'bylibrary',
                          fnames=fastq2_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads2', 'bylibrary', 'byread',
                               axes_origin=[]),
        ),
    )

    workflow.transform(
        name='prepseed',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            36,
            mgd.TempOutputFile('reads.seed', 'bylibrary', 'byread'),
        ),
    )

    workflow.commandline(
        name='bwtrealign',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('reads.seed', 'bylibrary', 'byread'),
            '--chunkmbs',
            '512',
            '-k',
            '1000',
            '-m',
            '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_realign2',
            '-l',
            mgd.TempInputObj('library_id', 'bylibrary'),
            '-a',
            '-',
            '-1',
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2',
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '-r',
            config['genome_fasta'],
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmin',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--tchimer',
            config['chimeric_threshold'],
            '--talign',
            config['alignment_threshold'],
            '--pchimer',
            config['chimeric_prior'],
            '--tvalid',
            config['readvalid_threshold'],
            '-z',
            mgd.TempInputFile('score.stats', 'bylibrary'),
            '--span',
            mgd.TempOutputFile('spanning.alignments', 'bylibrary', 'byread'),
            '--split',
            mgd.TempOutputFile('split.alignments', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='merge_spanning_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('spanning.alignments_1', 'bylibrary'),
        ),
    )

    workflow.commandline(
        name='filterreads',
        axes=('bylibrary', ),
        ctx=lowmem,
        args=(
            'destruct_filterreads',
            '-n',
            '2',
            '-a',
            mgd.TempInputFile('spanning.alignments_1', 'bylibrary'),
            '-r',
            config['satellite_regions'],
            '>',
            mgd.TempOutputFile('spanning.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('split.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_spanning_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary'),
            mgd.TempOutputFile('spanning.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary'),
            mgd.TempOutputFile('split.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    # Cluster spanning reads

    workflow.setobj(
        obj=mgd.TempOutputObj('chrom.args', 'bychromarg'),
        value=destruct.tasks.generate_chromosome_args(config['chromosomes']),
    )

    workflow.transform(
        name='write_stats_table',
        ctx=lowmem,
        func='destruct.tasks.write_stats_table',
        args=(
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.TempInputObj('stats', 'bylibrary'),
            mgd.TempOutputFile('libstats.tsv'),
        ),
    )

    workflow.commandline(
        name='cluster',
        axes=('bychromarg', ),
        ctx=medmem,
        args=(
            'destruct_mclustermatepairs',
            '-a',
            mgd.TempInputFile('spanning.alignments'),
            '-s',
            mgd.TempInputFile('libstats.tsv'),
            '-c',
            mgd.TempOutputFile('clusters', 'bychromarg'),
            mgd.TempInputObj('chrom.args', 'bychromarg'),
            '--clustmin',
            config['cluster_readcount_threshold'],
            '--fragmax',
            config['fragment_length_max'],
        ),
    )

    # Predict breakpoints from split reads

    workflow.transform(
        name='predict_breaks',
        axes=('bychromarg', ),
        ctx=medmem,
        func='destruct.predict_breaks.predict_breaks',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('spanning.alignments'),
            mgd.TempInputFile('split.alignments'),
            mgd.TempOutputFile('breakpoints_2', 'bychromarg'),
        ),
    )

    workflow.transform(
        name='merge_clusters',
        ctx=lowmem,
        func='destruct.tasks.merge_clusters',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('breakpoints_2', 'bychromarg'),
            mgd.TempOutputFile('clusters'),
            mgd.TempOutputFile('breakpoints_2'),
            mgd.TempOutputFile('merge_clusters.debug'),
        ),
    )

    # Realign reads to breakpoints

    workflow.commandline(
        name='realigntobreaks',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'destruct_realigntobreaks2',
            '-r',
            config['genome_fasta'],
            '-b',
            mgd.TempInputFile('breakpoints_2'),
            '-c',
            mgd.TempInputFile('clusters'),
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--span',
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            '-1',
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2',
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '--realignments',
            mgd.TempOutputFile('realignments', 'bylibrary', 'byread'),
        ),
    )

    # Calculate likelihoods based on realignments

    workflow.transform(
        name='calculate_realignment_likelihoods',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.predict_breaks.calculate_realignment_likelihoods',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempInputFile('realignments', 'bylibrary', 'byread'),
            mgd.TempInputFile('score.stats', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary', 'byread'),
            config['match_score'],
            mgd.TempInputObj('stats',
                             'bylibrary').prop('fragment_length_mean'),
            mgd.TempInputObj('stats',
                             'bylibrary').prop('fragment_length_stddev'),
        ),
    )

    workflow.transform(
        name='merge_likelihoods_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary', 'byread'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary'),
            mgd.TempSpace('merge_likelihoods_1_temp', 'bylibrary'),
            '1',
        ),
    )

    workflow.transform(
        name='merge_likelihoods_2',
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2'),
            mgd.TempSpace('merge_likelihoods_2_temp'),
            '1',
        ),
    )

    # Set cover for multi mapping reads

    workflow.transform(
        name='calc_weights',
        ctx=medmem,
        func='destruct.predict_breaks.calculate_cluster_weights',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('cluster_weights'),
        ),
    )

    workflow.commandline(
        name='setcover',
        ctx=medmem,
        args=(
            'destruct_setcover',
            '-c',
            mgd.TempInputFile('clusters'),
            '-w',
            mgd.TempInputFile('cluster_weights'),
            '-a',
            mgd.TempOutputFile('clusters_setcover'),
        ),
    )

    # Select cluster based on setcover

    workflow.transform(
        name='select_clusters',
        ctx=medmem,
        func='destruct.predict_breaks.select_clusters',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('breakpoints_1'),
            mgd.TempInputFile('likelihoods_2'),
            mgd.TempOutputFile('likelihoods_1'),
        ),
    )

    # Select prediction based on max likelihood

    workflow.transform(
        name='select_predictions',
        ctx=himem,
        func='destruct.predict_breaks.select_predictions',
        args=(
            mgd.TempInputFile('breakpoints_1'),
            mgd.TempOutputFile('breakpoints'),
            mgd.TempInputFile('likelihoods_1'),
            mgd.TempOutputFile('likelihoods'),
            config['mate_score_threshold'],
            config['template_length_min_threshold'],
            config['min_alignment_log_likelihood'],
        ),
    )

    # Optionally tabulate supporting reads

    workflow.transform(
        name='tabreads',
        ctx=medmem,
        func='destruct.tasks.tabulate_reads',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.InputFile('reads1.fq.gz', 'bylibrary',
                          fnames=fastq1_filenames),
            mgd.InputFile('reads2.fq.gz', 'bylibrary',
                          fnames=fastq2_filenames),
            mgd.TempOutputFile('breakreads.table.unsorted'),
        ),
    )

    workflow.commandline(
        name='sortreads',
        ctx=medmem,
        args=(
            'sort',
            '-n',
            mgd.TempInputFile('breakreads.table.unsorted'),
            '>',
            mgd.OutputFile(breakpoint_read_table),
        ),
    )

    # Tabulate results

    workflow.transform(
        name='tabulate',
        ctx=himem,
        func='destruct.tasks.tabulate_results',
        args=(
            mgd.TempInputFile('breakpoints'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            config['genome_fasta'],
            config['gtf_filename'],
            config['dgv_filename'],
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
        ),
    )

    return workflow
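Two mechanisms carry most of this workflow's plumbing: ret=mgd.TempOutputObj(...) stores a task's return value as a managed object, and .prop('...') later dereferences a single attribute of that object when a downstream command line is built. A compact sketch of the pair, using a hypothetical stats class:

import pypeliner
import pypeliner.workflow
import pypeliner.managed as mgd

class ReadStats(object):
    # Hypothetical return type; only the attribute name matters here.
    def __init__(self):
        self.fragment_length_min = 100

def compute_stats():
    return ReadStats()

workflow = pypeliner.workflow.Workflow()
workflow.transform(name='stats', func=compute_stats,
                   ret=mgd.TempOutputObj('stats'))
workflow.commandline(name='use_stats',
                     args=('echo',
                           mgd.TempInputObj('stats').prop('fragment_length_min')))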
Example #18
File: workflows.py, Project: aroth85/soil
def create_search_workflow(in_fasta_file,
                           in_mzml_file,
                           out_file,
                           add_decoys=True,
                           fixed_mods=None,
                           max_mods=1,
                           precursor_mass_tolerance='20ppm',
                           search_mem=5,
                           split_size=1000,
                           variable_mods=None):

    sandbox = soil.utils.workflow.get_sandbox(['msgf_plus', 'proteowizard'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='index_db',
                       ctx={
                           'mem': 4,
                           'mem_retry_increment': 8,
                           'num_retry': 3
                       },
                       func=tasks.build_index_sentinel,
                       args=(mgd.InputFile(in_fasta_file),
                             mgd.TempOutputFile('db.sentinel')),
                       kwargs={'add_decoys': add_decoys})

    workflow.transform(name='split_mzml_file',
                       func=soil.wrappers.proteowizard.tasks.split_mzml_file,
                       args=(
                           mgd.InputFile(in_mzml_file),
                           mgd.TempOutputFile('spec_data.mzml', 'split'),
                           mgd.TempSpace('split_tmp'),
                       ),
                       kwargs={
                           'split_size': split_size,
                       })

    workflow.transform(name='run_msgf_plus',
                       axes=('split', ),
                       ctx={
                           'mem': search_mem + 3,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.run_search_sentinel,
                       args=(
                           mgd.TempInputFile('db.sentinel'),
                           mgd.TempInputFile('spec_data.mzml', 'split'),
                           mgd.TempOutputFile('search.mzid', 'split'),
                           mgd.TempSpace('msgf_tmp', 'split'),
                       ),
                       kwargs={
                           'add_decoys': False,
                           'fixed_mods': fixed_mods,
                           'max_mods': max_mods,
                           'mem': search_mem,
                           'precursor_mass_tolerance':
                           precursor_mass_tolerance,
                           'variable_mods': variable_mods
                       })

    workflow.transform(name='convert_to_tsv',
                       axes=('split', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.convert_mzid_to_tsv,
                       args=(
                           mgd.TempInputFile('search.mzid', 'split'),
                           mgd.TempOutputFile('search.tsv', 'split'),
                       ))

    workflow.transform(name='merge_results',
                       func=tasks.merge_results,
                       args=(mgd.TempInputFile('search.tsv', 'split'),
                             mgd.TempOutputFile('merged.tsv')))

    workflow.transform(name='convert_output',
                       func=tasks.convert_msgf_to_final,
                       args=(mgd.TempInputFile('merged.tsv'),
                             mgd.TempOutputFile('final.tsv.gz')))

    workflow.transform(name='clean_up',
                       func=tasks.clean_up,
                       args=(mgd.TempInputFile('db.sentinel'),
                             mgd.TempInputFile('final.tsv.gz'),
                             mgd.OutputFile(out_file)))

    return workflow
Example #19
File: workflows.py, Project: aroth85/soil
def create_percolator_workflow(in_fasta_file,
                               in_mzml_file,
                               out_file,
                               fixed_mods=None,
                               max_mods=1,
                               split_size=1000,
                               variable_mods=None):

    sandbox = soil.utils.workflow.get_sandbox(
        ['msgf_plus', 'percolator', 'proteowizard'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='build_decoy_db',
                       func=tasks.build_decoy_db,
                       args=(
                           mgd.InputFile(in_fasta_file),
                           mgd.TempOutputFile('decoy.no_index.fasta'),
                       ),
                       kwargs={'decoy_prefix': 'XXX_'})

    workflow.transform(name='index_decoy_db',
                       func=tasks.build_index,
                       args=(mgd.TempInputFile('decoy.no_index.fasta'),
                             mgd.TempOutputFile('decoy.fasta')),
                       kwargs={'add_decoys': False})

    workflow.transform(name='index_target_db',
                       func=tasks.build_index,
                       args=(mgd.InputFile(in_fasta_file),
                             mgd.TempOutputFile('target.fasta')),
                       kwargs={'add_decoys': False})

    workflow.transform(name='split_mzml_file',
                       func=soil.wrappers.proteowizard.tasks.split_mzml_file,
                       args=(
                           mgd.InputFile(in_mzml_file),
                           mgd.TempOutputFile('spec_data.mzml', 'split'),
                           mgd.TempSpace('split_tmp'),
                       ),
                       kwargs={
                           'split_size': split_size,
                       })

    workflow.transform(name='run_msgf_plus_decoy',
                       axes=('split', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.run_search,
                       args=(
                           mgd.TempInputFile('decoy.fasta'),
                           mgd.TempInputFile('spec_data.mzml', 'split'),
                           mgd.TempOutputFile('decoy_search_results.mzid',
                                              'split'),
                           mgd.TempSpace('msgf_decoy_tmp', 'split'),
                       ),
                       kwargs={
                           'add_decoys': False,
                           'add_features': True,
                           'fixed_mods': fixed_mods,
                           'max_mods': max_mods,
                           'variable_mods': variable_mods
                       })

    workflow.transform(name='run_msgf_plus_target',
                       axes=('split', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.run_search,
                       args=(
                           mgd.TempInputFile('target.fasta'),
                           mgd.TempInputFile('spec_data.mzml', 'split'),
                           mgd.TempOutputFile('target_search_results.mzid',
                                              'split'),
                           mgd.TempSpace('msgf_target_tmp', 'split'),
                       ),
                       kwargs={
                           'add_decoys': False,
                           'add_features': True,
                           'fixed_mods': fixed_mods,
                           'max_mods': max_mods,
                           'variable_mods': variable_mods
                       })

    workflow.transform(name='convert_to_tsv_decoy',
                       axes=('split', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.convert_mzid_to_tsv,
                       args=(
                           mgd.TempInputFile('decoy_search_results.mzid',
                                             'split'),
                           mgd.TempOutputFile('decoy_search.tsv', 'split'),
                       ))

    workflow.transform(name='convert_to_tsv_target',
                       axes=('split', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.convert_mzid_to_tsv,
                       args=(
                           mgd.TempInputFile('target_search_results.mzid',
                                             'split'),
                           mgd.TempOutputFile('target_search.tsv', 'split'),
                       ))

    workflow.transform(name='merge_results',
                       func=tasks.merge_results,
                       args=([
                           mgd.TempInputFile('decoy_search.tsv', 'split'),
                           mgd.TempInputFile('target_search.tsv', 'split')
                       ], mgd.TempOutputFile('merged.tsv')))

    workflow.transform(name='convert_output',
                       func=tasks.convert_msgf_to_final,
                       args=(mgd.TempInputFile('merged.tsv'),
                             mgd.OutputFile(
                                 out_file.replace('.tsv', '.msgf.tsv.gz'))))

    workflow.transform(name='run_msgf2pin',
                       ctx={
                           'mem': 4,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=soil.wrappers.percolator.tasks.convert_msgf_to_pin,
                       args=(mgd.TempInputFile('decoy_search_results.mzid',
                                               'split'),
                             mgd.TempInputFile('target_search_results.mzid',
                                               'split'),
                             mgd.TempOutputFile('percolator_input.tsv'),
                             mgd.TempSpace('msgf2pin_tmp')))

    workflow.transform(name='run_percolator',
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=soil.wrappers.percolator.tasks.run_percolator,
                       args=(mgd.TempInputFile('percolator_input.tsv'),
                             mgd.TempOutputFile('final.tsv')))

    workflow.transform(name='clean_up_decoy',
                       func=tasks.clean_up,
                       args=([
                           mgd.TempInputFile('decoy.fasta'),
                           mgd.TempInputFile('target.fasta')
                       ], mgd.TempInputFile('final.tsv'),
                             mgd.OutputFile(out_file)))

    return workflow
Example #20
def infer_haps(
    bam_file,
    haplotypes_filename,
    allele_counts_filename,
    config,
    normal=False,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               **baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # don't parallelize over chromosomes for per-cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id', ),
            func='single_cell.workflows.extract_seqdata.create_extract_seqdata_workflow',
            args=(
                mgd.InputFile('bam_markdups',
                              'cell_id',
                              fnames=bam_file,
                              extensions=['.bai']),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                config.get('extract_seqdata', {}),
                config['ref_data_dir'],
                config,
            ))
        workflow.transform(
            name='merge_all_seqdata',
            func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
            args=(mgd.TempOutputFile('seqdata_file.h5'),
                  mgd.TempInputFile("seqdata_cell.h5",
                                    "cell_id"), config["chromosomes"]),
        )

    else:
        # if it's a single bam, it's probably whole genome,
        # so parallelize over chromosomes
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if normal:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome', ),
        ctx=dict(mem=16, **ctx),
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome', ),
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(name='merge_haps',
                       ctx=dict(mem=16, **ctx),
                       func='remixt.utils.merge_tables',
                       args=(
                           mgd.OutputFile(haplotypes_filename),
                           mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
                       ))

    workflow.transform(
        name='create_segments',
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.segment.create_segments',
        args=(
            mgd.TempOutputFile('segments.tsv'),
            remixt_config,
            config['ref_data_dir'],
        ),
    )

    workflow.transform(
        name='haplotype_allele_readcount',
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(mgd.OutputFile(allele_counts_filename),
              mgd.TempInputFile('segments.tsv'),
              mgd.TempInputFile('seqdata_file.h5'),
              mgd.InputFile(haplotypes_filename), remixt_config),
    )

    return workflow
Example #21
def run_LoLoPicker(config, args, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('region'),
                    value=[str(x) for x in range(1, 23)] + ['X'])

    workflow.transform(name='create_axes_beds',
                       axes=('region', ),
                       func=tasks.create_axes_beds,
                       args=(mgd.InputFile(config["bed_file"]),
                             mgd.InputInstance('region'),
                             mgd.TempOutputFile('region.bed', 'region')))

    workflow.transform(name='LoLoPicker_somatic',
                       axes=('region', ),
                       func=tasks.LoLoPicker_somatic,
                       args=(config, mgd.InputFile(tumour_bam),
                             mgd.InputFile(normal_bam),
                             mgd.TempInputFile('region.bed', 'region'),
                             mgd.TempSpace('LoLoPicker_somatic_temp',
                                           'region'),
                             mgd.TempOutputFile("raw_somatic_varants.txt",
                                                'region')))

    workflow.transform(name='make_sample_list',
                       func=tasks.make_sample_list,
                       args=(
                           args,
                           mgd.TempOutputFile('samplelist.txt'),
                       ))

    workflow.transform(name='LoLoPicker_control',
                       axes=('region', ),
                       func=tasks.LoLoPicker_control,
                       args=(config, mgd.TempInputFile('samplelist.txt'),
                             mgd.TempSpace('LoLoPicker_control_temp',
                                           'region'),
                             mgd.TempInputFile("raw_somatic_varants.txt",
                                               'region'),
                             mgd.TempOutputFile("control_stats.txt",
                                                'region')))

    workflow.transform(name='LoLoPicker_stats',
                       axes=('region', ),
                       func=tasks.LoLoPicker_stats,
                       args=(
                           mgd.TempSpace('LoLoPicker_stats_temp', 'region'),
                           mgd.TempInputFile("raw_somatic_varants.txt",
                                             'region'),
                           mgd.TempInputFile("control_stats.txt", 'region'),
                           mgd.TempOutputFile("stats_calls.txt", 'region'),
                       ))

    workflow.transform(name='merge_LoLoPicker',
                       func=tasks.merge_LoLoPicker,
                       args=(mgd.TempSpace("merge_LoLo"),
                             mgd.TempInputFile("stats_calls.txt",
                                               'region',
                                               axes_origin=[]),
                             mgd.OutputFile(output_file)))

    return workflow
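
The merge_LoLoPicker step above passes axes_origin=[] on its 'region' input,
which collapses the axis so the task runs once and receives a dict mapping
each region chunk to its file path. A hedged stand-in for such a merge task,
assuming that dict-shaped input:

import os

def merge_stats_calls(temp_space, stats_calls, output_file):
    # stats_calls is assumed to be e.g. {'1': '/tmp/.../stats_calls.txt', ...}
    os.makedirs(temp_space, exist_ok=True)
    with open(output_file, 'w') as out:
        for region in sorted(stats_calls):
            with open(stats_calls[region]) as infile:
                out.write(infile.read())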
Example #22
def analyze_tumour_normal(config, input_args, results_dir, normal_bam,
                          tumour_sample, tumour_bam, snv_tsv, indel_tsv,
                          snv_vcf, indel_vcf):
    workflow = pypeliner.workflow.Workflow()

    matched_results_dir = os.path.join(results_dir, tumour_sample)

    helpers.makedirs(matched_results_dir)

    workflow.subworkflow(name='run_deepSNV',
                         func=deepSNV.run_deepSNV,
                         args=(config, mgd.InputFile(normal_bam),
                               mgd.InputFile(tumour_bam),
                               mgd.OutputFile(
                                   os.path.join(matched_results_dir,
                                                'deepSNV_out.tsv'))))

    workflow.subworkflow(name='run_VarScan',
                         func=VarScan.run_VarScan,
                         args=(
                             config,
                             mgd.InputFile(normal_bam),
                             mgd.InputFile(tumour_bam),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'VarScan_out.vcf')),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'VarScan_indel_out.vcf')),
                         ))

    workflow.subworkflow(name='run_MutationSeq',
                         func=MutationSeq.run_MutationSeq,
                         args=(
                             config,
                             mgd.InputFile(normal_bam),
                             mgd.InputFile(tumour_bam),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'museq_out.vcf')),
                         ))

    workflow.subworkflow(name='run_Strelka',
                         func=Strelka.run_Strelka,
                         args=(config, mgd.InputFile(normal_bam),
                               mgd.InputFile(tumour_bam),
                               mgd.OutputFile(
                                   os.path.join(matched_results_dir,
                                                'strelka_out.vcf')),
                               mgd.OutputFile(
                                   os.path.join(matched_results_dir,
                                                'strelka_indel_out.vcf'))))

    workflow.subworkflow(name='run_LoLoPicker',
                         func=LoLoPicker.run_LoLoPicker,
                         args=(
                             config,
                             input_args,
                             mgd.InputFile(normal_bam),
                             mgd.InputFile(tumour_bam),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'LoLoPicker_out.tsv')),
                         ))

    workflow.transform(
        name='create_result_dict',
        func=union.create_result_dict,
        ret=mgd.TempOutputObj('result_dict'),
        args=(
            mgd.InputFile(os.path.join(matched_results_dir,
                                       'deepSNV_out.tsv')),
            mgd.InputFile(os.path.join(matched_results_dir,
                                       'VarScan_out.vcf')),
            mgd.InputFile(os.path.join(matched_results_dir, 'museq_out.vcf')),
            mgd.InputFile(os.path.join(matched_results_dir,
                                       'strelka_out.vcf')),
            mgd.InputFile(
                os.path.join(matched_results_dir, 'LoLoPicker_out.tsv')),
        ))

    workflow.transform(name='union_results',
                       func=union.union_results,
                       args=(
                           config,
                           mgd.InputFile(normal_bam),
                           mgd.InputFile(tumour_bam),
                           mgd.TempInputObj('result_dict'),
                           mgd.TempSpace('union_space'),
                           mgd.OutputFile(snv_tsv),
                           mgd.OutputFile(snv_vcf),
                       ))

    workflow.transform(name='union_indels',
                       func=union.union_indels,
                       args=(
                           config,
                           mgd.InputFile(
                               os.path.join(matched_results_dir,
                                            'strelka_indel_out.vcf')),
                           mgd.InputFile(
                               os.path.join(matched_results_dir,
                                            'VarScan_indel_out.vcf')),
                           mgd.OutputFile(indel_tsv),
                           mgd.OutputFile(indel_vcf),
                       ))

    return workflow
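
Note the ret= mechanism used in create_result_dict above: pypeliner captures a
task's Python return value as a managed temp object, which downstream tasks
receive via mgd.TempInputObj. A minimal sketch of the pattern with hypothetical
functions:

workflow.transform(
    name='make_params',
    func='mymodule.make_params',      # hypothetical: returns any picklable object
    ret=mgd.TempOutputObj('params'),
    args=(),
)

workflow.transform(
    name='use_params',
    func='mymodule.use_params',       # hypothetical consumer
    args=(mgd.TempInputObj('params'),),
)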
Example #23
def create_delly_wrapper_workflow(bam_filenames, output_filename, raw_data_dir,
                                  control_id=None, ref_genome_fasta_file=None,
                                  delly_excl_chrom=None):
    bams = list()
    for lib_id, bam_filename in bam_filenames.items():
        bams += [destruct.benchmark.wrappers.utils.symlink(
            bam_filename,
            link_name='{0}.bam'.format(lib_id),
            link_directory=raw_data_dir)]
        destruct.benchmark.wrappers.utils.symlink(
            bam_filename + '.bai',
            link_name='{0}.bam.bai'.format(lib_id),
            link_directory=raw_data_dir)

    workflow = pypeliner.workflow.Workflow()
    
    workflow.transform(
        name='get_sv_types',
        func=destruct.benchmark.wrappers.delly.tasks.get_sv_types,
        ret=pypeliner.managed.OutputChunks('sv_type'),
        args=(
            mgd.InputFile(ref_genome_fasta_file),
        ),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type',),
        ctx={'mem': 64, 'num_retry': 2, 'mem_retry_factor': 2},
        func=destruct.benchmark.wrappers.delly.tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    if control_id is None:
        concat_input = mgd.TempInputFile('out.bcf', 'sv_type')

    else:
        workflow.transform(
            name='delly_filter_somatic',
            axes=('sv_type',),
            ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
            func=destruct.benchmark.wrappers.delly.tasks.run_delly_filter,
            args=(
                mgd.Instance('sv_type'),
                bam_filenames.keys(),
                control_id, 
                mgd.TempSpace('samples.tsv'),
                ref_genome_fasta_file,
                mgd.TempInputFile('out.bcf', 'sv_type'),
                mgd.TempOutputFile('somatic.bcf', 'sv_type'),
            ),
        )

        concat_input = mgd.TempInputFile('somatic.bcf', 'sv_type')

    workflow.transform(
        name='concatenate_vcf',
        func=destruct.benchmark.wrappers.tasks.concatenate_bcf,
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        args=(
            concat_input,
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=destruct.benchmark.wrappers.delly.tasks.convert_vcf,
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        }
    )

    return workflow
Example #24
File: workflows.py Project: aroth85/soil
def create_allele_counts_workflow(normal_bam_file,
                                  tumour_bam_file,
                                  dbsnp_vcf_file,
                                  ref_genome_fasta_file,
                                  allele_counts_file,
                                  chromosomes='autosomes'):

    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, chromosomes)

    sandbox = soil.utils.workflow.get_sandbox(['snpsift'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='call_snps',
        func=soil.wrappers.platypus.workflows.create_single_sample_workflow,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('normal.vcf.gz'),
        ),
        kwargs={
            'chromosomes': chromosomes,
            'split_size': int(1e7)
        })

    workflow.commandline(name='annotate_dbsnp_status',
                         ctx={
                             'mem': 6,
                             'mem_retry_increment': 4,
                             'num_retry': 3
                         },
                         args=('SnpSift', 'annotate',
                               mgd.InputFile(dbsnp_vcf_file),
                               mgd.TempInputFile('normal.vcf.gz'), '>',
                               mgd.TempOutputFile('normal.dbsnp.vcf')))

    workflow.commandline(name='annotate_variant_type',
                         ctx={
                             'mem': 6,
                             'mem_retry_increment': 4,
                             'num_retry': 3
                         },
                         args=('SnpSift', 'varType',
                               mgd.TempInputFile('normal.dbsnp.vcf'), '>',
                               mgd.TempOutputFile('normal.dbsnp.vartype.vcf')))

    workflow.commandline(
        name='filter_het_snps',
        ctx={
            'mem': 6,
            'mem_retry_increment': 4,
            'num_retry': 3
        },
        args=('SnpSift', 'filter',
              "isHet(GEN[0]) & ((exists ID) & ( ID =~ 'rs' )) & (exists SNP)",
              mgd.TempInputFile('normal.dbsnp.vartype.vcf'), '>',
              mgd.TempOutputFile('het.snps.vcf')))

    workflow.transform(name='split_vcf',
                       ctx={
                           'mem': 6,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.split_vcf,
                       args=(mgd.TempInputFile('het.snps.vcf'),
                             mgd.TempOutputFile('split.vcf', 'split'),
                             mgd.TempSpace('split_tmp')),
                       kwargs={'split_size': int(1e4)})

    workflow.transform(name='get_allele_counts',
                       axes=('split', ),
                       func=tasks.get_snv_allele_counts_for_vcf_targets,
                       args=(mgd.InputFile(tumour_bam_file),
                             mgd.TempInputFile('split.vcf', 'split'),
                             mgd.TempOutputFile('split.tsv', 'split')))

    workflow.transform(name='merge_counts',
                       func=tasks.merge_counts,
                       args=(mgd.TempInputFile('split.tsv', 'split'),
                             mgd.OutputFile(allele_counts_file)))

    return workflow
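
As the SnpSift steps above show, workflow.commandline treats each element of
args as a shell token, so '|' and '>' can be used to chain tools and redirect
output. A small hedged sketch of the same mechanism (the gzip step is
illustrative, not part of the original workflow):

workflow.commandline(
    name='compress_het_snps',
    args=(
        'cat', mgd.TempInputFile('het.snps.vcf'),
        '|', 'gzip', '>',
        mgd.TempOutputFile('het.snps.vcf.gz'),
    ),
)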
Example #25
File: workflows.py Project: aroth85/soil
def create_titan_workflow(normal_bam_file,
                          tumour_bam_file,
                          dbsnp_vcf_file,
                          mappability_file,
                          ref_genome_fasta_file,
                          out_file,
                          exome_bed_file=None,
                          sample='Tumour',
                          threads=1):

    sandbox = soil.utils.workflow.get_sandbox(
        ['hmmcopy', 'hmmcopy_utils', 'titan'])

    sandbox.channels.append('conda-forge')

    sandbox.packages.extend(['pandas', 'rpy2'])

    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, 'autosomes')

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('init_params', 'param_idx'),
                    value=tasks.create_intialization_parameters())

    workflow.subworkflow(name='get_allele_counts',
                         func=create_allele_counts_workflow,
                         args=(mgd.InputFile(normal_bam_file),
                               mgd.InputFile(tumour_bam_file),
                               mgd.InputFile(dbsnp_vcf_file),
                               mgd.InputFile(ref_genome_fasta_file),
                               mgd.TempOutputFile('allele_counts.tsv')),
                         kwargs={'chromosomes': 'autosomes'})

    workflow.commandline(name='build_normal_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(normal_bam_file), '>',
                               mgd.TempOutputFile('normal.wig')))

    workflow.commandline(name='build_tumour_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(tumour_bam_file), '>',
                               mgd.TempOutputFile('tumour.wig')))

    workflow.commandline(name='build_gc_wig',
                         args=('gcCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(ref_genome_fasta_file), '>',
                               mgd.TempOutputFile('gc.wig')))

    workflow.commandline(name='build_mappability_wig',
                         args=('mapCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(mappability_file), '>',
                               mgd.TempOutputFile('mappability.wig')))

    workflow.transform(name='build_coverage_file',
                       func=tasks.build_coverage_file,
                       args=(mgd.TempInputFile('normal.wig'),
                             mgd.TempInputFile('tumour.wig'),
                             mgd.TempInputFile('gc.wig'),
                             mgd.TempInputFile('mappability.wig'),
                             mgd.TempOutputFile('coverage.wig')),
                       kwargs={'target_file': exome_bed_file})

    workflow.transform(name='run_titan',
                       axes=('param_idx', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3,
                           'threads': threads
                       },
                       func=tasks.run_titan,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('run.tar.gz', 'param_idx'),
                             mgd.TempSpace('titan_tmp', 'param_idx')),
                       kwargs={
                           'is_exome': (exome_bed_file is not None),
                           'sample': sample,
                           'threads': threads
                       })

    workflow.transform(name='build_run_stats_file',
                       func=tasks.build_run_stats_file,
                       args=(mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('stats.tsv')))

    workflow.transform(name='build_output',
                       func=tasks.build_final_results_file,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputFile('stats.tsv'),
                             mgd.OutputFile(out_file),
                             mgd.TempSpace('build_results')))

    return workflow
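
The setobj call at the top fans the workflow out over 'param_idx', so the value
returned by tasks.create_intialization_parameters is presumably a dict keyed by
param_idx. Its actual contents are not shown here; a hypothetical stand-in
consistent with that shape (TITAN is typically restarted over a grid of cluster
counts and ploidy values):

def create_initialization_parameters():
    # hypothetical: one entry per (num_clusters, ploidy) combination
    params = {}
    idx = 0
    for num_clusters in (1, 2, 3):
        for ploidy in (2, 3, 4):
            params[idx] = {'num_clusters': num_clusters, 'ploidy': ploidy}
            idx += 1
    return params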
Example #26
def infer_haps(
        bam_file,
        haplotypes_filename,
        config,
        from_tumour=False,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}

    remixt_image = config['docker']['remixt']

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1, **baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # don't parallelize over chromosomes for per-cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id',),
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'docker_image': remixt_image},
            args=(
                mgd.InputFile(
                    'bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']
                ),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                remixt_config,
                remixt_ref_data_dir,
            ),
            kwargs={'no_parallelism': True}
        )
        workflow.transform(
            name='merge_all_seqdata',
            func="remixt.seqdataio.merge_overlapping_seqdata",
            ctx={'docker_image': remixt_image},
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                config["chromosomes"]
            ),
        )
    else:
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150, 'docker_image': remixt_image},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if from_tumour:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.utils.merge_tables',
        args=(
            mgd.TempOutputFile('haplotypes_merged.tsv'),
            mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
        )
    )

    workflow.transform(
        name='finalize_csv',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('haplotypes_merged.tsv'),
            mgd.OutputFile(haplotypes_filename, extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['haplotypes']
        },
    )

    return workflow
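
infer_haps dispatches on the type of bam_file: a dict of per-cell BAMs selects
the per-cell extraction plus merge branch, while a plain path is treated as a
single whole-genome BAM. A hedged sketch of the two accepted shapes (paths and
cell ids are illustrative):

# per-cell mode: dict keyed by cell_id
bam_file = {
    'cell_0001': '/data/cells/cell_0001.bam',
    'cell_0002': '/data/cells/cell_0002.bam',
}

# bulk mode: a single whole-genome BAM path
bam_file = '/data/bulk/tumour.bam'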
Example #27
def create_qc_annotation_workflow(
    hmmcopy_metrics,
    hmmcopy_reads,
    alignment_metrics,
    gc_metrics,
    segs_tar,
    merged_metrics,
    qc_report,
    corrupt_tree,
    consensus_tree,
    phylo_csv,
    rank_trees,
    filtered_data,
    corrupt_tree_pdf,
    pass_segs,
    fail_segs,
    corrupt_tree_heatmap_output,
    plot_heatmap_ec_filt_output,
    config,
    library_id,
    no_corrupt_tree=False,
):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.transform(
        name='cell_cycle_classifier',
        func="single_cell.workflows.qc_annotation.tasks.cell_cycle_classifier",
        args=(mgd.InputFile(hmmcopy_reads),
              mgd.InputFile(hmmcopy_metrics, extensions=['.yaml']),
              mgd.InputFile(alignment_metrics),
              mgd.TempOutputFile('cell_state_classifier.csv.gz',
                                 extensions=['.yaml']),
              mgd.TempSpace('tempdata_cell_cycle')),
        kwargs={'docker_image': config['docker']['cell_cycle_classifier']})

    workflow.transform(
        name="add_quality",
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.qc_annotation.tasks.add_quality",
        args=(mgd.TempInputFile('cell_state_classifier.csv.gz',
                                extensions=['.yaml']),
              mgd.InputFile(alignment_metrics, extensions=['.yaml']),
              mgd.TempOutputFile("hmmcopy_quality_metrics.csv.gz",
                                 extensions=['.yaml']),
              config['classifier_training_data'],
              mgd.TempSpace("hmmcopy_classify_tempdir")),
    )

    workflow.transform(
        name='merge_alignment_hmmcopy_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.qc_annotation.tasks.merge_metrics",
        args=(mgd.TempInputFile("hmmcopy_quality_metrics.csv.gz",
                                extensions=['.yaml']),
              mgd.InputFile(alignment_metrics, extensions=['.yaml']),
              mgd.TempOutputFile('merged_metrics.csv.gz',
                                 extensions=['.yaml'])))

    workflow.transform(
        name='generate_qc_report',
        func="single_cell.workflows.qc_annotation.tasks.generate_qc_report",
        args=(mgd.TempSpace("QC_report_singlecellpipeline"),
              config['reference_gc'],
              mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
              mgd.InputFile(gc_metrics,
                            extensions=['.yaml']), mgd.OutputFile(qc_report)))

    workflow.transform(
        name='filter_segs_plots',
        func="single_cell.workflows.qc_annotation.tasks.filter_plot_tar",
        args=(mgd.TempInputFile('merged_metrics.csv.gz',
                                extensions=['.yaml']), mgd.InputFile(segs_tar),
              mgd.OutputFile(pass_segs), mgd.OutputFile(fail_segs),
              mgd.TempSpace("filter_seg_plots"), config['good_cells']))

    workflow.transform(
        name='plot_heatmap_ec_filtered',
        func="single_cell.workflows.qc_annotation.tasks.plot_pcolor",
        args=(
            mgd.InputFile(hmmcopy_reads, extensions=['.yaml']),
            mgd.TempInputFile('merged_metrics.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_filt_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': config['chromosomes'],
            'max_cn': config['num_states'],
            'scale_by_cells': False,
            'cell_filters': config["good_cells"],
            'mappability_threshold': config["map_cutoff"]
        })

    if no_corrupt_tree:
        workflow.transform(
            name='finalize_metrics',
            ctx={
                'mem': config['memory']['med'],
                'ncpus': 1,
                'num_retry': 1
            },
            func="single_cell.utils.csvutils.finalize_csv",
            args=(
                mgd.TempInputFile('merged_metrics.csv.gz',
                                  extensions=['.yaml']),
                mgd.OutputFile(merged_metrics, extensions=['.yaml']),
            ),
        )
    else:

        workflow.transform(name='finalize_metrics',
                           ctx={
                               'mem': config['memory']['med'],
                               'ncpus': 1,
                               'num_retry': 1
                           },
                           func="single_cell.utils.csvutils.finalize_csv",
                           args=(mgd.TempInputFile('merged_metrics.csv.gz',
                                                   extensions=['.yaml']),
                                 mgd.TempOutputFile(
                                     'merged_metrics_with_header.csv.gz',
                                     extensions=['.yaml'])))

        workflow.subworkflow(
            name='corrupt_tree',
            func=
            'single_cell.workflows.corrupt_tree.create_corrupt_tree_workflow',
            args=(mgd.TempInputFile('merged_metrics_with_header.csv.gz',
                                    extensions=['.yaml']),
                  mgd.InputFile(hmmcopy_reads), mgd.OutputFile(corrupt_tree),
                  mgd.OutputFile(consensus_tree), mgd.OutputFile(phylo_csv),
                  mgd.OutputFile(rank_trees), mgd.OutputFile(filtered_data),
                  mgd.OutputFile(corrupt_tree_pdf), library_id, config))

        workflow.transform(
            name="add_corrupt_tree_order",
            ctx={
                'mem': config['memory']['med'],
                'ncpus': 1
            },
            func=
            "single_cell.workflows.qc_annotation.tasks.add_corrupt_tree_order",
            args=(mgd.InputFile(corrupt_tree),
                  mgd.TempInputFile('merged_metrics_with_header.csv.gz',
                                    extensions=['.yaml']),
                  mgd.OutputFile(merged_metrics, extensions=['.yaml'])),
        )

        workflow.transform(
            name='plot_heatmap_corrupt_tree',
            func="single_cell.workflows.qc_annotation.tasks.plot_pcolor",
            args=(
                mgd.InputFile(hmmcopy_reads, extensions=['.yaml']),
                mgd.TempInputFile('merged_metrics.csv.gz',
                                  extensions=['.yaml']),
                mgd.OutputFile(corrupt_tree_heatmap_output),
            ),
            kwargs={
                'plot_title': 'QC pipeline metrics',
                'column_name': 'state',
                'plot_by_col': 'experimental_condition',
                'color_by_col': 'cell_call',
                'chromosomes': config['chromosomes'],
                'max_cn': config['num_states'],
                'scale_by_cells': False,
                'corrupt_tree': mgd.InputFile(corrupt_tree),
            })

    return workflow
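
create_qc_annotation_workflow reads a number of keys from config; a hedged
sketch of the expected shape, with every key taken from the usages above and
all values purely illustrative:

config = {
    'docker': {
        'single_cell_pipeline': 'quay.io/example/single_cell:v0',  # illustrative
        'cell_cycle_classifier': 'quay.io/example/cell_cycle:v0',  # illustrative
    },
    'memory': {'med': 6},
    'chromosomes': [str(c) for c in range(1, 23)] + ['X', 'Y'],
    'num_states': 12,
    'map_cutoff': 0.9,
    'good_cells': [],  # cell filter spec consumed by the filtering/plotting tasks
    'classifier_training_data': '/refdata/training_data.h5',  # illustrative path
    'reference_gc': '/refdata/reference_gc.csv',  # illustrative path
}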
Example #28
def create_remixt_workflow(
    tumour_path,
    normal_path,
    breakpoints,
    sample_id,
    remixt_results_filename,
    remixt_brk_cn_csv,
    remixt_cn_csv,
    remixt_minor_modes_csv,
    remixt_mix_csv,
    remixt_read_depth_csv,
    remixt_stats_csv,
    remixt_refdata,
    reference,
    single_node=False,
):
    ctx = {'docker_image': config.containers('wgs')}

    params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is None:
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )

        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )

    else:
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(mgd.InputFile(breakpoints),
                  mgd.TempOutputFile('filtered_breakpoints.csv'),
                  params['min_num_reads']))

    if single_node:
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(memory=15, walltime='120:00', ncpus=8),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )
    else:
        workflow.subworkflow(name='remixt',
                             func="remixt.workflow.create_remixt_bam_workflow",
                             ctx={
                                 'docker_image': config.containers('remixt'),
                                 'walltime': '48:00'
                             },
                             args=(
                                 mgd.TempInputFile('filtered_breakpoints.csv'),
                                 {
                                     sample_id:
                                     mgd.InputFile(tumour_path,
                                                   extensions=['.bai']),
                                     sample_id + 'N':
                                     mgd.InputFile(normal_path,
                                                   extensions=['.bai'])
                                 },
                                 {
                                     sample_id:
                                     mgd.OutputFile(remixt_results_filename)
                                 },
                                 mgd.TempSpace('remixt_raw_dir'),
                                 remixt_config,
                                 remixt_refdata,
                             ),
                             kwargs={
                                 'normal_id': sample_id + 'N',
                             })

    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(mgd.InputFile(remixt_results_filename), [
            mgd.OutputFile(remixt_brk_cn_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_cn_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_minor_modes_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_mix_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_read_depth_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_stats_csv, extensions=['.yaml']),
        ], ['/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth',
            '/stats'], mgd.TempSpace('tempdir_parse')))

    return workflow
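
When breakpoints is None, the workflow above materializes an empty placeholder
via a temp object plus write_empty_breakpoints, so downstream ReMixT steps see
a uniform input file either way. A hedged stand-in for that task (the column
names are assumptions, not taken from the wgs package):

import csv

def write_empty_breakpoints(breakpoints, out_path):
    # breakpoints is the empty managed object; emit a header-only CSV
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['prediction_id', 'chromosome_1', 'position_1',
                         'chromosome_2', 'position_2'])  # assumed columns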
Example #29
def create_hmmcopy_workflow(
        bam_file, reads, segs, metrics, params, igv_seg_filename,
        segs_pdf, bias_pdf, plot_heatmap_ec_output,
        plot_metrics_output,
        plot_kernel_density_output, hmmcopy_data_tar,
        cell_ids, hmmparams, sample_info
):
    chromosomes = hmmparams["chromosomes"]

    baseimage = hmmparams['docker']['single_cell_pipeline']
    hmmcopy_docker = hmmparams['docker']['hmmcopy']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info)

    workflow.transform(
        name='run_hmmcopy',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']),
            mgd.TempOutputFile('reads.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('segs.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('params.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_metrics.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            hmmparams,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            hmmcopy_docker
        ),
    )

    workflow.transform(
        name='merge_reads',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile('reads_merged.csv.gz', extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='add_mappability_bool',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_mappability_col",
        args=(
            mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(reads, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_segs',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(segs, extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='merge_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_params',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='get_max_cn',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_max_cn",
        ret=mgd.TempOutputObj('max_cn'),
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
        )
    )

    workflow.transform(
        name='hmmcopy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            hmmparams['ref_genome'],
            mgd.TempOutputFile('segments.png', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('bias.png', 'cell_id', axes_origin=[]),
            mgd.InputInstance('cell_id'),
        ),
        kwargs={
            'num_states': hmmparams['num_states'],
            'sample_info': mgd.TempInputObj('sampleinfo', 'cell_id'),
            'max_cn': mgd.TempInputObj("max_cn")
        }
    )

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.add_clustering_order",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(metrics, extensions=['.yaml']),
        ),
        kwargs={
            'chromosomes': hmmparams["chromosomes"],
            'sample_info': sample_info
        }
    )

    workflow.transform(
        name='merge_hmm_copy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
        args=(
            [
                mgd.TempInputFile('segments.png', 'cell_id'),
                mgd.TempInputFile('bias.png', 'cell_id'),
            ],
            [
                mgd.OutputFile(segs_pdf),
                mgd.OutputFile(bias_pdf),
            ],
            mgd.InputFile(metrics, extensions=['.yaml']),
            None,
            mgd.TempSpace("hmmcopy_plot_merge_temp"),
            ['segments', 'bias']
        )
    )

    workflow.transform(
        name='create_igv_seg',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.InputFile(segs, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(igv_seg_filename),
            hmmparams,
        )
    )

    workflow.transform(
        name='plot_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics_output),
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_kernel_density',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_kernel_density_output),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_heatmap_ec',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'mappability_threshold': hmmparams["map_cutoff"]
        }
    )

    workflow.transform(
        name='merge_hmmcopy_data_tars',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.utils.helpers.tar_files",
        args=(
            mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]),
            mgd.OutputFile(hmmcopy_data_tar),
            mgd.TempSpace("merge_tarballs")
        ),

    )

    return workflow
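
The shapes of this workflow's inputs follow from the setobj calls at the top:
bam_file is a dict keyed by cell_id, and sample_info is a per-cell dict fanned
out over the same axis. A hedged sketch (ids, paths, and info keys are
illustrative):

cell_ids = ['SA501-R03-C05', 'SA501-R04-C11']
bam_file = {c: '/data/bams/{}.bam'.format(c) for c in cell_ids}
sample_info = {
    c: {'experimental_condition': 'A', 'cell_call': 'C1'}  # assumed keys
    for c in cell_ids
}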
Example #30
def partition_tumour(config, input_args, patient_id, results_dir, input_bams,
                     input_bais, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('tumour_id', ),
                    value=input_args['tumour_samples'])
    workflow.setobj(obj=mgd.OutputChunks('normal_id', ),
                    value=input_args['normal_samples'])

    workflow.transform(name='merge_normal',
                       func=tasks.merge_normal,
                       args=(config,
                             mgd.InputFile('normal.bam',
                                           'normal_id',
                                           fnames=input_args['normal_bams'],
                                           axes_origin=[]),
                             mgd.OutputFile(
                                 os.path.join(input_args['patient_bam_dir'],
                                              'merged_normal.bam')),
                             mgd.OutputFile(
                                 os.path.join(input_args['patient_bam_dir'],
                                              'merged_normal.bam.bai'))))

    workflow.subworkflow(
        name='analyze_tumour',
        func=analyze_tumour_normal,
        axes=('tumour_id', ),
        args=(
            config,
            input_args,
            results_dir,
            mgd.InputFile(
                os.path.join(input_args['patient_bam_dir'],
                             'merged_normal.bam')),
            mgd.InputInstance('tumour_id'),
            mgd.InputFile('tumour.bam', 'tumour_id', fnames=input_bams),
            mgd.OutputFile(
                os.path.join(results_dir, patient_id + '_{tumour_id}.snv.tsv'),
                'tumour_id'),
            mgd.OutputFile(
                os.path.join(results_dir,
                             patient_id + '_{tumour_id}.indel.tsv'),
                'tumour_id'),
            mgd.TempOutputFile('snv.vcf', 'tumour_id'),
            mgd.TempOutputFile('indel.vcf', 'tumour_id'),
        ))

    workflow.transform(name='annotate_snvs',
                       func=tasks.annotate_outputs,
                       axes=('tumour_id', ),
                       args=(
                           config,
                           mgd.TempSpace('snv_annotation_space', 'tumour_id'),
                           mgd.TempInputFile('snv.vcf', 'tumour_id'),
                           mgd.OutputFile(
                               os.path.join(
                                   results_dir,
                                   patient_id + '_{tumour_id}.snv.txt'),
                               'tumour_id'),
                       ))

    workflow.transform(name='annotate_indels',
                       func=tasks.annotate_outputs,
                       axes=('tumour_id', ),
                       args=(
                           config,
                           mgd.TempSpace('indel_annotation_space',
                                         'tumour_id'),
                           mgd.TempInputFile('indel.vcf', 'tumour_id'),
                           mgd.OutputFile(
                               os.path.join(
                                   results_dir,
                                   patient_id + '_{tumour_id}.indel.txt'),
                               'tumour_id'),
                       ))

    workflow.transform(name='vcf_annotate_indels',
                       func=tasks.vcf_annotate_outputs,
                       axes=('tumour_id', ),
                       args=(
                           config,
                           mgd.TempSpace('indel_vcf_annotation_space',
                                         'tumour_id'),
                           mgd.TempInputFile('indel.vcf', 'tumour_id'),
                           mgd.OutputFile(
                               os.path.join(
                                   results_dir,
                                   patient_id + '_{tumour_id}.indel.vcf'),
                               'tumour_id'),
                       ))

    workflow.transform(
        name='vcf_annotate_snvs',
        func=tasks.vcf_annotate_outputs,
        axes=('tumour_id', ),
        args=(
            config,
            mgd.TempSpace('snv_vcf_annotation_space', 'tumour_id'),
            mgd.TempInputFile('snv.vcf', 'tumour_id'),
            mgd.OutputFile(
                os.path.join(results_dir, patient_id + '_{tumour_id}.snv.vcf'),
                'tumour_id'),
        ))

    workflow.transform(
        name='log_patient_analysis',
        func=tasks.log_patient_analysis,
        args=(
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.snv.tsv'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.indel.tsv'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.snv.txt'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.indel.txt'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.snv.vcf'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.indel.vcf'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.OutputFile(output_file),
        ))

    return workflow
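
All of these functions construct and return a pypeliner Workflow without
executing anything; running it is left to a driver. A hedged sketch of the
usual driver pattern (scheduler settings are illustrative, and real pipelines
typically build them from argparse via pypeliner.app.add_arguments):

import pypeliner
import pypeliner.app

pyp_config = {'tmpdir': './pipeline_tmp', 'submit': 'local', 'maxjobs': 4}
pyp = pypeliner.app.Pypeliner(config=pyp_config)

# config, input_args, patient_id, etc. are assumed defined as described above
workflow = partition_tumour(config, input_args, patient_id, results_dir,
                            input_bams, input_bais, output_file)
pyp.run(workflow)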