Example #1
def align_pe(fastq1, fastq2, output, reports, metrics, tempdir, reference,
             instrument, centre, sample_info, cell_id, lane_id, library_id,
             config):
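    """Align one cell/lane fastq pair: run fastqc, align with bwa-mem or
    bwa-aln (trimming first for bwa-aln on non-N550 runs), then sort the
    bam with picard and write flagstat metrics."""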

    readgroup = get_readgroup(lane_id, cell_id, library_id, centre,
                              sample_info)

    run_fastqc(fastq1, fastq2, reports, tempdir, config)

    aln_temp = os.path.join(tempdir, "temp_alignments.bam")
    if config["aligner"] == "bwa-mem":
        bwa_mem_paired_end(fastq1, fastq2, aln_temp, reference, readgroup,
                           tempdir, config['containers'])
    elif config["aligner"] == "bwa-aln":
        if instrument != "N550":
            fastq1, fastq2 = trim_fastqs(fastq1, fastq2, cell_id, tempdir,
                                         config)
        bwa_aln_paired_end(fastq1, fastq2, aln_temp, tempdir, reference,
                           readgroup, config['containers'])
    else:
        raise Exception(
            "Aligner %s not supported, pipeline supports bwa-aln and bwa-mem" %
            config["aligner"])

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'picard',
                                              docker_only=True)

    picardutils.bam_sort(aln_temp, output, tempdir, **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'samtools',
                                              docker_only=True)
    bamutils.bam_flagstat(output, metrics, **container_ctx)
Example #2
def merge_bams(inputs, output, output_index, config):
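    """Merge the input bams with picard and index the result with samtools."""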
    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'picard',
                                              docker_only=True)
    picardutils.merge_bams(inputs, output, **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'samtools',
                                              docker_only=True)
    bamutils.bam_index(output, output_index, **container_ctx)
Example #3
def bwa_aln_paired_end(fastq1, fastq2, output, tempdir, reference, readgroup,
                       config):
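    """Align a fastq pair with bwa aln and convert the resulting sam to bam."""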
    container_ctx = helpers.get_container_ctx(config, 'bwa', docker_only=True)

    samfile = os.path.join(tempdir, "bwa_aln.sam")
    bamutils.bwa_aln_paired_end(fastq1, fastq2, samfile, tempdir, reference,
                                readgroup, **container_ctx)

    container_ctx = helpers.get_container_ctx(config,
                                              'samtools',
                                              docker_only=True)
    bamutils.samtools_sam_to_bam(samfile, output, **container_ctx)
Example #4
def create_variant_counting_workflow(
    vcfs,
    tumour_cell_bams,
    results_h5,
    config,
):
    """ Count variant reads for multiple sets of variants across cells.
    """

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=tumour_cell_bams.keys(),
    )

    workflow.transform(name='merge_snvs',
                       func='biowrappers.components.io.vcf.tasks.merge_vcfs',
                       args=([mgd.InputFile(vcf) for vcf in vcfs],
                             mgd.TempOutputFile('all.snv.vcf')))

    workflow.transform(name='finalise_snvs',
                       func="biowrappers.components.io.vcf.tasks.finalise_vcf",
                       args=(mgd.TempInputFile('all.snv.vcf'),
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi'])),
                       kwargs={
                           'docker_config':
                           helpers.get_container_ctx(config['containers'],
                                                     'vcftools')
                       })

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.OutputFile(results_h5),
        ),
        kwargs={
            'docker_config':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        },
    )

    return workflow
Example #5
def run_hmmcopy_script(corrected_reads, tempdir, cell_id, hmmparams, config):
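    """Run the HMMcopy R script on the corrected read counts for one cell,
    passing the copy number parameters from hmmparams on the command line."""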
    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'hmmcopy',
                                              docker_only=True)

    if container_ctx.get("container_type") == 'docker':
        cmd = ["hmmcopy"]
    else:
        cmd = ['Rscript', run_hmmcopy_rscript]

    # run hmmcopy
    cmd += [
        '--corrected_data=' + corrected_reads, '--outdir=' + tempdir,
        '--sample_id=' + cell_id
    ]

    multipliers = ','.join(map(str, hmmparams['multipliers']))

    cmd.append('--param_str=' + str(hmmparams['strength']))
    cmd.append('--param_e=' + str(hmmparams['e']))
    cmd.append('--param_mu=' + str(hmmparams['mu']))
    cmd.append('--param_l=' + str(hmmparams['lambda']))
    cmd.append('--param_nu=' + str(hmmparams['nu']))
    cmd.append('--param_k=' + str(hmmparams['kappa']))
    cmd.append('--param_m=' + str(hmmparams['m']))
    cmd.append('--param_eta=' + str(hmmparams['eta']))
    cmd.append('--param_g=' + str(hmmparams['g']))
    cmd.append('--param_s=' + str(hmmparams['s']))
    cmd.append('--param_multiplier=' + multipliers)

    pypeliner.commandline.execute(*cmd, **container_ctx)
Example #6
def realign(input_bams, input_bais, output_bams, tempdir, config, interval):
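    """Run GATK indel realignment over one interval for a set of cell bams,
    staging inputs and outputs through tempdir."""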
    container_ctx = helpers.get_container_ctx(config['containers'], 'samtools', docker_only=True)

    # make the dir
    if not os.path.exists(tempdir):
        os.makedirs(tempdir)

    # copy inputs into tempdir; the inputs share a filename, but the copies
    # must be distinct for the realigner's nwayout file mapping to work
    new_inputs = {}
    for key, bamfile in input_bams.items():
        new_bam = os.path.join(tempdir, key + '.bam')
        new_bai = os.path.join(tempdir, key + '.bam.bai')

        shutil.copy(bamfile, new_bam)
        shutil.copy(bamfile + '.bai', new_bai)
        new_inputs[key] = new_bam

    # save intervals file in tempdir
    targets = os.path.join(tempdir, 'realn_positions.intervals')
    gatkutils.generate_targets(input_bams, config, targets, interval, **container_ctx)

    # run gatk realigner
    gatkutils.gatk_realigner(new_inputs, config, targets, interval, tempdir, **container_ctx)

    # copy generated files in temp dir to the specified output paths
    for key in input_bams.keys():
        realigned_bam = os.path.join(tempdir, key + '_indel_realigned.bam')
        realigned_bai = os.path.join(tempdir, key + '_indel_realigned.bai')
        output_bam_filename = output_bams[key]
        output_bai_filename = output_bam_filename + '.bai'

        shutil.move(realigned_bam, output_bam_filename)
        shutil.move(realigned_bai, output_bai_filename)
Example #7
def run_fastqc(fastq1, fastq2, reports, tempdir, config):
    """
    run fastqc on both fastq files
    run trimgalore if needed, copy if not.
    """
    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'fastqc',
                                              docker_only=True)

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    out_html = os.path.join(reports_dir, 'fastqc_R1.html')
    out_plot = os.path.join(reports_dir, 'fastqc_R1.zip')
    if os.path.getsize(fastq1) > 0:
        bamutils.produce_fastqc_report(fastq1, out_html, out_plot, tempdir,
                                       **container_ctx)
    else:
        warnings.warn("fastq file %s is empty, skipping fastqc" % fastq1)

    out_html = os.path.join(reports_dir, 'fastqc_R2.html')
    out_plot = os.path.join(reports_dir, 'fastqc_R2.zip')
    if os.path.getsize(fastq2) > 0:
        bamutils.produce_fastqc_report(fastq2, out_html, out_plot, tempdir,
                                       **container_ctx)
    else:
        warnings.warn("fastq file %s is empty, skipping fastqc" % fastq2)

    helpers.make_tarfile(reports, reports_dir)
Example #8
def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf,
        config):
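    """Workflow that runs mutationseq per region on matched tumour/normal
    bams and concatenates the per-region vcfs into a single snv vcf."""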

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=normal_bam.keys(),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': helpers.get_container_ctx(config['containers'], 'mutationseq')}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow
Example #9
def bam_collect_gc_metrics(bam_filename, ref_genome, metrics_filename,
                           summary_filename, chart_filename, tempdir, config):
    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'picard',
                                              docker_only=True)

    picardutils.bam_collect_gc_metrics(bam_filename, ref_genome,
                                       metrics_filename, summary_filename,
                                       chart_filename, tempdir,
                                       **container_ctx)
Example #10
def create_merge_bams_workflow(
    input_bams,
    merged_bams,
    cell_ids,
    config,
    regions):
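    """Workflow that merges per-cell bams into one bam per region."""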

    merged_bams = dict([(region, merged_bams[region])
                         for region in regions])

    ctx = {'mem_retry_increment': 2}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )


    workflow.transform(
        name='merge_bams',
        ctx=dict(mem=config['memory']['high'], pool_id=config['pools']['multicore'],
                 ncpus=config['max_cores'], **ctx),
        func="single_cell.workflows.merge_bams.tasks.merge_bams",
        args=(
            mgd.InputFile('bam', 'cell_id', fnames=input_bams),
            mgd.OutputFile('merged.bam', "region", fnames=merged_bams, axes_origin=[]),
            regions,
            helpers.get_container_ctx(config['containers'], 'samtools')
        ),
        kwargs={"ncores": config["max_cores"]}
    )

    return workflow
Example #11
def postprocess_bam(infile, outfile, outfile_index, tempdir, config,
                    markdups_metrics, flagstat_metrics):
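    """Sort the bam, mark duplicates, then index the output bam and write
    flagstat metrics."""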

    if not os.path.exists(tempdir):
        helpers.makedirs(tempdir)

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'picard',
                                              docker_only=True)
    sorted_bam = os.path.join(tempdir, 'sorted.bam')
    picardutils.bam_sort(infile, sorted_bam, tempdir, **container_ctx)

    picardutils.bam_markdups(sorted_bam, outfile, markdups_metrics, tempdir,
                             **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'samtools',
                                              docker_only=True)
    bamutils.bam_index(outfile, outfile_index, **container_ctx)
    bamutils.bam_flagstat(outfile, flagstat_metrics, **container_ctx)
Example #12
def get_postprocess_metrics(infile, infile_bai, tempdir, config,
                            markdups_metrics, flagstat_metrics):
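    """Mark duplicates into a temporary bam in tempdir, index it, and write
    the markdups and flagstat metrics files."""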

    if not os.path.exists(tempdir):
        helpers.makedirs(tempdir)

    outfile = os.path.join(tempdir, 'markdups.bam')
    outfile_index = outfile + '.bai'

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'picard',
                                              docker_only=True)

    picardutils.bam_markdups(infile, outfile, markdups_metrics, tempdir,
                             **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'samtools',
                                              docker_only=True)
    bamutils.bam_index(outfile, outfile_index, **container_ctx)
    bamutils.bam_flagstat(outfile, flagstat_metrics, **container_ctx)
Example #13
def bam_collect_insert_metrics(bam_filename, flagstat_metrics_filename,
                               metrics_filename, histogram_filename, tempdir,
                               config):
    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'picard',
                                              docker_only=True)

    picardutils.bam_collect_insert_metrics(bam_filename,
                                           flagstat_metrics_filename,
                                           metrics_filename,
                                           histogram_filename, tempdir,
                                           **container_ctx)
Example #14
def variant_calling_workflow(args):
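    """Resolve input bams, per-region tumour/normal bams and output paths
    from the command line args, then build the variant calling workflow."""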

    config = helpers.load_config(args)

    ctx = {'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    cellids = helpers.get_samples(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')

    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    wgs_bam_template = args["tumour_template"]
    normal_bam_template = args["normal_template"]

    regions = refgenome.get_split_regions(config["split_size"])

    tumour_region_bams = {
        r: wgs_bam_template.format(region=r)
        for r in regions
    }
    normal_region_bams = {
        r: normal_bam_template.format(region=r)
        for r in regions
    }

    return create_variant_calling_workflow(
        bam_files,
        tumour_region_bams,
        normal_region_bams,
        museq_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        snv_h5,
        config,
        raw_data_dir,
    )
Example #15
def create_extract_seqdata_workflow(
    bam_filename,
    seqdata_filename,
    remixt_config,
    remixt_ref_data_dir,
    config,
    multiprocess=False,
):
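    """Workflow that extracts per-chromosome seqdata from a bam and merges
    it into a single seqdata file with remixt."""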
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='create_chromosome_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func=
        "single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata",
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={
            'multiprocess': multiprocess,
            'ncores': config['max_cores']
        })

    workflow.transform(
        name='merge_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="remixt.seqdataio.merge_seqdata",
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow
Example #16
def run_correction_hmmcopy(bam_file, correct_reads_out, readcount_wig, config,
                           hmmparams):
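    """Bin read counts from the bam into a wig file, then correct them for
    GC and mappability with either the loess R script or the modal method."""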

    container_ctx = helpers.get_container_ctx(config['containers'],
                                              'hmmcopy',
                                              docker_only=True)

    run_readcount_rscript = os.path.join(scripts_directory,
                                         'correct_read_count.R')

    rc = ReadCounter(bam_file,
                     readcount_wig,
                     hmmparams['bin_size'],
                     config['chromosomes'],
                     hmmparams['min_mqual'],
                     excluded=hmmparams['exclude_list'])
    rc.main()

    if hmmparams["smoothing_function"] == 'loess':
        cmd = [
            'Rscript', run_readcount_rscript, readcount_wig,
            hmmparams['gc_wig_file'], hmmparams['map_wig_file'],
            correct_reads_out
        ]
        pypeliner.commandline.execute(*cmd, **container_ctx)
    elif hmmparams["smoothing_function"] == 'modal':
        CorrectReadCount(hmmparams["gc_wig_file"],
                         hmmparams['map_wig_file'],
                         readcount_wig,
                         correct_reads_out,
                         mappability=hmmparams['map_cutoff']).main()
    else:
        raise Exception(
            "smoothing function %s not supported. pipeline supports loess and modal"
            % hmmparams["smoothing_function"])

    return correct_reads_out
Example #17
def merge_bams_workflow(workflow, args):
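    """Top-level workflow: split the reference into regions, merge per-cell
    bams into per-region bams, and write an info.yaml describing the run."""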

    input_yaml = args["input_yaml"]
    output_template = args["merged_bam_template"]

    info_file = os.path.join(args["out_dir"], 'results', 'merge_bams',
                             "info.yaml")
    config = helpers.load_config(args)
    bam_files, bai_files = helpers.get_bams(input_yaml)
    cellids = helpers.get_samples(input_yaml)

    wgs_bam_template = output_template
    wgs_bai_template = wgs_bam_template + ".bai"

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.InputFile('bam_markdups',
                                           'cell_id',
                                           fnames=bam_files,
                                           extensions=['.bai']),
                             mgd.OutputFile("merged_bam",
                                            "region",
                                            axes_origin=[],
                                            template=wgs_bam_template,
                                            extensions=['.bai']),
                             cellids,
                             config,
                             mgd.TempInputObj("region"),
                         ))

    workflow.transform(name="get_files",
                       ctx=dict(mem=2,
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(pypeliner.managed.TempInputObj('region'),
                             wgs_bam_template, 'region'))

    inputs = {k: helpers.format_file_yaml(v) for k, v in bam_files.items()}

    metadata = {
        'merge_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': inputs,
            'results': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=2,
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example #18
def germline_calling_workflow(workflow, args):
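    """Top-level germline calling workflow: call germline variants on the
    normal with samtools, annotate mappability, genotype and snpeff effects,
    count alleles per cell, and concatenate the results into germline.h5."""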

    config = helpers.load_config(args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'pool_id': config['pools']['standard'],
    }
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    sampleids = helpers.get_samples(args['input_yaml'])

    normal_bam_template = args["input_template"]
    normal_bai_template = args["input_template"] + ".bai"

    if "{reads}" in normal_bam_template:
        raise ValueError(
            "input template for germline calling only support region based splits"
        )

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'germline_calling')

    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw',
                                         'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw',
                                            'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=bam_files.keys(),
    )

    workflow.transform(
        name="get_regions",
        ctx=ctx,
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(name='samtools_germline',
                         func=germline.create_samtools_germline_workflow,
                         args=(
                             mgd.InputFile("normal.split.bam",
                                           "region",
                                           template=normal_bam_template),
                             mgd.InputFile("normal.split.bam.bai",
                                           "region",
                                           template=normal_bai_template),
                             config['ref_genome'],
                             mgd.OutputFile(samtools_germline_vcf,
                                            extensions=['.tbi']),
                             config,
                         ),
                         kwargs={
                             'chromosomes':
                             config["chromosomes"],
                             'base_docker':
                             helpers.get_container_ctx(config['containers'],
                                                       'single_cell_pipeline'),
                             'vcftools_docker':
                             helpers.get_container_ctx(config['containers'],
                                                       'vcftools'),
                             'samtools_docker':
                             helpers.get_container_ctx(config['containers'],
                                                       'samtools'),
                         })

    workflow.subworkflow(
        name='annotate_mappability',
        func=
        "biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        })

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        ctx=ctx,
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func=
        "biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output':
            False,
            'base_docker':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline'),
            'vcftools_docker':
            helpers.get_container_ctx(config['containers'], 'vcftools'),
            'snpeff_docker':
            helpers.get_container_ctx(config['containers'], 'snpeff'),
        })

    workflow.subworkflow(
        name='read_counts',
        func=
        "single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            config,
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            mgd.InputFile('tumour.bam.bai', 'cell_id', fnames=bai_files),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
        ),
        kwargs={
            'table_name':
            '/germline_allele_counts',
            'docker_config':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        },
    )

    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        ctx=ctx,
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    info_file = os.path.join(args["out_dir"], 'results', 'germline_calling',
                             "info.yaml")

    results = {
        'germline_data': helpers.format_file_yaml(germline_h5_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v)
        for k, v in bam_files.items()
    }

    metadata = {
        'germline_calling': {
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                mem_retry_increment=2,
                                ncpus=1),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example #19
def create_variant_calling_workflow(
    tumour_cell_bams,
    tumour_region_bams,
    normal_region_bams,
    museq_vcf,
    strelka_snv_vcf,
    strelka_indel_vcf,
    snv_h5,
    config,
    raw_data_dir,
):
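    """Workflow that calls SNVs with mutationseq and strelka, annotates them,
    counts alleles across cells, and concatenates the results into snv_h5."""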
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    workflow = pypeliner.workflow.Workflow()

    workflow.set_filenames('normal_regions.bam',
                           'region',
                           fnames=normal_region_bams)
    workflow.set_filenames('tumour_cells.bam',
                           'cell_id',
                           fnames=tumour_cell_bams)
    workflow.set_filenames('tumour_regions.bam',
                           'region',
                           fnames=tumour_region_bams)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=tumour_cell_bams.keys(),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=tumour_region_bams.keys(),
    )

    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai']),
            mgd.InputFile('tumour_regions.bam', 'region', extensions=['.bai']),
            config['ref_genome'],
            mgd.OutputFile(museq_vcf),
            config,
        ),
    )

    workflow.subworkflow(name='strelka',
                         func=strelka.create_strelka_workflow,
                         args=(
                             mgd.InputFile('normal_regions.bam',
                                           'region',
                                           extensions=['.bai']),
                             mgd.InputFile('tumour_regions.bam',
                                           'region',
                                           extensions=['.bai']),
                             config['ref_genome'],
                             mgd.OutputFile(strelka_indel_vcf),
                             mgd.OutputFile(strelka_snv_vcf),
                             config,
                         ),
                         kwargs={"chromosomes": config["chromosomes"]})

    workflow.transform(
        name='convert_museq_to_hdf5',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq.h5'),
            '/museq/vcf/',
        ),
        kwargs={
            'score_callback': museq_callback,
        })

    workflow.transform(
        name='convert_strelka_to_hdf5',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        args=(
            mgd.InputFile(strelka_snv_vcf),
            mgd.TempOutputFile('strelka_snv.h5'),
            '/strelka/vcf/',
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        })

    workflow.transform(name='merge_snvs',
                       func='biowrappers.components.io.vcf.tasks.merge_vcfs',
                       ctx=dict(mem=2,
                                pool_id=config['pools']['standard'],
                                **ctx),
                       args=([
                           mgd.InputFile(museq_vcf),
                           mgd.InputFile(strelka_snv_vcf),
                       ], mgd.TempOutputFile('all.snv.vcf')),
                       kwargs={
                           'docker_config':
                           helpers.get_container_ctx(config['containers'],
                                                     'vcftools')
                       })

    workflow.transform(name='finalise_snvs',
                       func="biowrappers.components.io.vcf.tasks.finalise_vcf",
                       ctx=dict(mem=2,
                                pool_id=config['pools']['standard'],
                                **ctx),
                       args=(mgd.TempInputFile('all.snv.vcf'),
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi'])),
                       kwargs={
                           'docker_config':
                           helpers.get_container_ctx(config['containers'],
                                                     'vcftools')
                       })

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=
        "biowrappers.pipelines.snv_call_and_annotate.create_annotation_workflow",
        args=(
            config,
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={
            'variant_type':
            'snv',
            'docker_config':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        })

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai']),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.TempOutputFile('snv_counts.h5'),
        ),
        kwargs={
            'chromosomes':
            config['chromosomes'],
            'docker_config':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        })

    workflow.transform(
        name='build_results_file',
        ctx=dict(mem=config['memory']['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            [
                mgd.TempInputFile('snv_counts.h5'),
                mgd.TempInputFile('snv_annotations.h5'),
                mgd.TempInputFile('museq.h5'),
                mgd.TempInputFile('strelka_snv.h5'),
            ],
            pypeliner.managed.OutputFile(snv_h5),
        ),
        kwargs={
            'drop_duplicates': True,
            'in_memory': False,
        })

    # raw_data_dir is <varcalls_dir>/raw (see the caller), so the info file
    # sits one directory up, alongside the other variant calling outputs
    info_file = os.path.join(os.path.dirname(raw_data_dir), "info.yaml")
    normals = {
        k: helpers.format_file_yaml(v)
        for k, v in normal_region_bams.items()
    }
    tumours = {
        k: helpers.format_file_yaml(v)
        for k, v in tumour_region_bams.items()
    }
    cells = {
        k: helpers.format_file_yaml(v)
        for k, v in tumour_cell_bams.items()
    }
    inputs = {'normal': normals, 'tumour': tumours, 'cells': cells}

    metadata = {
        'variant_calling': {
            'name': 'variant_calling',
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': None,
            'input_datasets': inputs,
            'results': {
                'variant_calling_data': helpers.format_file_yaml(snv_h5)
            }
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard']),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example #20
def align_workflow(workflow, args):
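    """Top-level alignment workflow: align per cell/lane fastqs (unless
    metrics_only is set), compute alignment metrics and plots, and write an
    info.yaml describing the run."""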

    config = helpers.load_config(args)

    sampleinfo = helpers.get_sample_info(args['input_yaml'])

    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    lib = args["library_id"]

    outdir = os.path.join(args["out_dir"], "results", "alignment")

    info_file = os.path.join(outdir, "info.yaml")

    alignment_metrics_h5 = os.path.join(outdir,
                                        '{}_alignment_metrics.h5'.format(lib))

    plots_dir = os.path.join(outdir, 'plots')
    plot_metrics_output = os.path.join(plots_dir,
                                       '{}_plot_metrics.pdf'.format(lib))

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    if not args["metrics_only"]:
        fastq1_files, fastq2_files = helpers.get_fastqs(args['input_yaml'])
        instrumentinfo = helpers.get_instrument_info(args['input_yaml'])
        centerinfo = helpers.get_center_info(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id', 'lane'),
            value=fastq1_files.keys(),
        )

        workflow.subworkflow(
            name='alignment_workflow',
            func=align.create_alignment_workflow,
            args=(
                mgd.InputFile('fastq_1',
                              'cell_id',
                              'lane',
                              fnames=fastq1_files,
                              axes_origin=[]),
                mgd.InputFile('fastq_2',
                              'cell_id',
                              'lane',
                              fnames=fastq2_files,
                              axes_origin=[]),
                mgd.OutputFile('bam_markdups',
                               'cell_id',
                               fnames=bam_files,
                               axes_origin=[]),
                mgd.OutputFile('bai_markdups',
                               'cell_id',
                               fnames=bai_files,
                               axes_origin=[]),
                config['ref_genome'],
                config,
                args,
                instrumentinfo,
                centerinfo,
                sampleinfo,
                cellids,
            ),
        )
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=cellids,
        )

    workflow.subworkflow(
        name='metrics_workflow',
        func=alignment_metrics.create_alignment_metrics_workflow,
        args=(
            mgd.InputFile('bam_markdups',
                          'cell_id',
                          fnames=bam_files,
                          axes_origin=[]),
            mgd.InputFile('bai_markdups',
                          'cell_id',
                          fnames=bai_files,
                          axes_origin=[]),
            mgd.OutputFile(alignment_metrics_h5),
            mgd.OutputFile(plot_metrics_output),
            config['ref_genome'],
            config,
            args,
            sampleinfo,
            cellids,
        ),
    )

    inputs = helpers.get_fastq_files(args["input_yaml"])
    outputs = {
        k: helpers.format_file_yaml(v)
        for k, v in bam_files.items()
    }

    metadata = {
        'alignment': {
            'name': 'alignment',
            'cell_batch_realign': args["realign"],
            'metrics_table': '/alignment/metrics',
            'gc_metrics_table': '/alignment/gc_metrics',
            'aligner': config["aligner"],
            'adapter': config["adapter"],
            'adapter2': config["adapter2"],
            'picardtools_wgsmetrics_params': config['picard_wgs_params'],
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': outputs,
            'input_datasets': inputs,
            'results': {
                'alignment_metrics':
                helpers.format_file_yaml(alignment_metrics_h5),
                'alignment_plots':
                helpers.format_file_yaml(plot_metrics_output),
            },
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example #21
def create_titan_workflow(normal_seqdata, tumour_seqdata, ref_genome,
                          raw_data_dir, out_file, config, args, tumour_cells,
                          normal_cells, cloneid):
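    """Workflow that merges normal seqdata, prepares normal and tumour TITAN
    inputs, runs TITAN over a grid of initialisation parameters, selects the
    best solution and plots per-chromosome results."""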

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    results_files = os.path.join(raw_data_dir, 'results', 'sample.h5')
    tumour_alleles_file = os.path.join(raw_data_dir, 'results',
                                       'het_counts.h5')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=tumour_cells,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('normal_cell_id'),
        value=normal_cells,
    )

    workflow.transform(
        name='merge_all_normal_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
        args=(mgd.TempOutputFile("seqdata_normal_all_cells_merged.h5"),
              pypeliner.managed.InputFile('normal_sample.h5',
                                          'normal_cell_id',
                                          fnames=normal_seqdata),
              config["titan_params"]["chromosomes"]),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func=
        "biowrappers.components.copy_number_calling.titan.tasks.prepare_normal_data",
        args=(
            mgd.TempInputFile("seqdata_normal_all_cells_merged.h5"),
            pypeliner.managed.TempOutputFile('normal.wig'),
            pypeliner.managed.TempOutputFile('het_positions.tsv'),
            config["titan_params"],
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('tumour_cell_id', ),
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func=
        "biowrappers.components.copy_number_calling.titan.tasks.prepare_tumour_data",
        args=(
            pypeliner.managed.InputFile('tumour_sample.h5',
                                        'tumour_cell_id',
                                        fnames=tumour_seqdata),
            pypeliner.managed.TempInputFile('het_positions.tsv'),
            pypeliner.managed.TempOutputFile('tumour.wig', 'tumour_cell_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv',
                                             'tumour_cell_id'),
            config["titan_params"],
        ),
    )

    workflow.transform(
        name='merge_tumour_alleles',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.merge_tumour_alleles",
        args=(
            pypeliner.managed.TempInputFile('tumour_alleles.tsv',
                                            'tumour_cell_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv'),
        ),
    )

    workflow.transform(
        name='concat_tumour_alleles',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.concat_tumour_alleles",
        args=(pypeliner.managed.TempInputFile('tumour_alleles.tsv',
                                              'tumour_cell_id'),
              pypeliner.managed.OutputFile(tumour_alleles_file),
              config["titan_params"]['chromosomes']),
    )

    workflow.transform(
        name='merge_wigs_tumour',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.merge_wig_files",
        args=(
            pypeliner.managed.TempInputFile('tumour.wig', 'tumour_cell_id'),
            pypeliner.managed.TempOutputFile('tumour.wig'),
        ),
    )

    workflow.transform(
        name='create_intialization_parameters',
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func=
        "biowrappers.components.copy_number_calling.titan.tasks.create_intialization_parameters",
        ret=pypeliner.managed.TempOutputObj('init_params', 'init_param_id'),
        args=(config["titan_params"], ),
    )

    workflow.transform(
        name='run_titan',
        axes=('init_param_id', ),
        func="biowrappers.components.copy_number_calling.titan.tasks.run_titan",
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        args=(
            pypeliner.managed.TempInputObj('init_params', 'init_param_id'),
            pypeliner.managed.TempInputFile('normal.wig'),
            pypeliner.managed.TempInputFile('tumour.wig'),
            pypeliner.managed.TempInputFile('tumour_alleles.tsv'),
            pypeliner.managed.TempOutputFile('cn.tsv', 'init_param_id'),
            pypeliner.managed.TempOutputFile('params.tsv', 'init_param_id'),
            config["titan_params"],
        ),
        kwargs={
            'docker_config':
            helpers.get_container_ctx(config['containers'], 'titan')
        })

    workflow.transform(
        name='select_solution',
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func=
        "biowrappers.components.copy_number_calling.titan.tasks.select_solution",
        args=(pypeliner.managed.TempInputObj('init_params', 'init_param_id'),
              pypeliner.managed.TempInputFile('cn.tsv', 'init_param_id'),
              pypeliner.managed.TempInputFile('params.tsv', 'init_param_id'),
              pypeliner.managed.OutputFile('results', template=results_files),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'output', 'cn_loci.tsv')),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'output', 'cn_segments.tsv')),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'output', 'cn_igv.tsv')),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'output',
                               'params.tsv')), config, cloneid),
        kwargs={
            'docker_config':
            helpers.get_container_ctx(config['containers'], 'titan'),
            'breakpoints_filename':
            None,
        },
    )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=config['titan_params']["chromosomes"],
    )

    workflow.commandline(
        name='plot_chromosome',
        axes=('chromosome', ),
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 ncpus=1,
                 num_retry=3,
                 mem_retry_increment=2,
                 **helpers.get_container_ctx(config['containers'], 'titan')),
        args=(
            'plot_titan_chromosome.R',
            pypeliner.managed.Instance('chromosome'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', 'cn_loci.tsv')),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', 'params.tsv')),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', 'chr_{chromosome}.png'),
                'chromosome'),
        ),
    )

    # just leaving it here in case we parallelize by samples later.
    workflow.transform(
        name='merge_results',
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="biowrappers.components.io.hdf5.tasks.merge_hdf5",
        args=(
            {
                cloneid:
                pypeliner.managed.InputFile('results', template=results_files)
            },
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}'.format(cloneid),
        },
    )

    return workflow
Example #22
def create_alignment_workflow(fastq_1_filename, fastq_2_filename, bam_filename,
                              bai_filename, ref_genome, config, args,
                              instrumentinfo, centerinfo, sample_info,
                              cell_ids):
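    """Workflow that aligns each cell/lane fastq pair, merges lanes per cell,
    optionally realigns around indels per chromosome, and post-processes the
    final per-cell bams (sort, markdups, index, flagstat)."""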

    out_dir = args['out_dir']

    merge_metrics = os.path.join(out_dir, 'metrics')

    lane_metrics = os.path.join(args['out_dir'], 'metrics_per_lane', '{lane}')

    bam_filename = dict([(cellid, bam_filename[cellid])
                         for cellid in cell_ids])

    bai_filename = dict([(cellid, bai_filename[cellid])
                         for cellid in cell_ids])

    chromosomes = config["chromosomes"]

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=fastq_1_filename.keys(),
    )

    workflow.setobj(obj=mgd.TempOutputObj('instrument',
                                          'cell_id',
                                          'lane',
                                          axes_origin=[]),
                    value=instrumentinfo)

    workflow.setobj(obj=mgd.TempOutputObj('center',
                                          'cell_id',
                                          'lane',
                                          axes_origin=[]),
                    value=centerinfo)

    workflow.setobj(obj=mgd.TempOutputObj('sampleinfo',
                                          'cell_id',
                                          axes_origin=[]),
                    value=sample_info)

    fastqc_reports = os.path.join(lane_metrics, "fastqc",
                                  "{cell_id}_reports.tar.gz")
    flagstat_metrics = os.path.join(lane_metrics, 'flagstat', '{cell_id}.txt')
    workflow.transform(
        name='align_reads',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=(
            'cell_id',
            'lane',
        ),
        func="single_cell.workflows.align.tasks.align_pe",
        args=(mgd.InputFile('fastq_1',
                            'cell_id',
                            'lane',
                            fnames=fastq_1_filename),
              mgd.InputFile('fastq_2',
                            'cell_id',
                            'lane',
                            fnames=fastq_2_filename),
              mgd.TempOutputFile('aligned_per_cell_per_lane.sorted.bam',
                                 'cell_id', 'lane'),
              mgd.OutputFile(fastqc_reports, 'cell_id', 'lane'),
              mgd.OutputFile(flagstat_metrics, 'cell_id', 'lane'),
              mgd.TempSpace('alignment_temp', 'cell_id', 'lane'), ref_genome,
              mgd.TempInputObj('instrument', 'cell_id', 'lane'),
              mgd.TempInputObj('center', 'cell_id', 'lane'),
              mgd.TempInputObj('sampleinfo',
                               'cell_id'), mgd.InputInstance('cell_id'),
              mgd.InputInstance('lane'), args['library_id'], config))

    workflow.transform(name='merge_bams',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.workflows.align.tasks.merge_bams",
                       axes=('cell_id', ),
                       args=(mgd.TempInputFile(
                           'aligned_per_cell_per_lane.sorted.bam', 'cell_id',
                           'lane'),
                             mgd.TempOutputFile('merged_lanes.bam', 'cell_id'),
                             mgd.TempOutputFile('merged_lanes.bam.bai',
                                                'cell_id'), config))

    if args['realign']:
        workflow.transform(name='realignment',
                           axes=('chrom', ),
                           ctx=dict(mem=config['memory']['high'],
                                    pool_id=config['pools']['highmem'],
                                    **ctx),
                           func="single_cell.workflows.align.tasks.realign",
                           args=(mgd.TempInputFile('merged_lanes.bam',
                                                   'cell_id'),
                                 mgd.TempInputFile('merged_lanes.bam.bai',
                                                   'cell_id'),
                                 mgd.TempOutputFile('realigned.bam', 'chrom',
                                                    'cell_id'),
                                 mgd.TempSpace('realignment_temp',
                                               'chrom',
                                               cleanup='before'), config,
                                 mgd.InputInstance('chrom')))

        workflow.transform(
            name='merge_realignment',
            ctx=dict(mem=config['memory']['high'],
                     pool_id=config['pools']['highmem'],
                     **ctx),
            axes=('cell_id', ),
            func="single_cell.workflows.align.tasks.merge_realignment",
            args=(mgd.TempInputFile('realigned.bam', 'chrom', 'cell_id'),
                  mgd.TempOutputFile('merged_realign.bam', 'cell_id'), config,
                  mgd.InputInstance('cell_id')))

    final_bam = mgd.TempInputFile('merged_lanes.bam', 'cell_id')
    if args["realign"]:
        final_bam = mgd.TempInputFile('merged_realign.bam', 'cell_id')

    markdups_metrics = os.path.join(merge_metrics, 'markdups_metrics',
                                    '{cell_id}.markdups_metrics.txt')
    flagstat_metrics = os.path.join(merge_metrics, 'flagstat_metrics',
                                    '{cell_id}.flagstat_metrics.txt')
    workflow.transform(
        name='postprocess_bam',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id', ),
        func="single_cell.workflows.align.tasks.postprocess_bam",
        args=(
            final_bam,
            mgd.OutputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.OutputFile('sorted_markdups_index',
                           'cell_id',
                           fnames=bai_filename),
            mgd.TempSpace('tempdir', 'cell_id'),
            config,
            mgd.OutputFile(markdups_metrics, 'cell_id'),
            mgd.OutputFile(flagstat_metrics, 'cell_id'),
        ),
    )

    return workflow
Example #23
0
def create_hmmcopy_workflow(bam_file,
                            bai_file,
                            hmmcopy_data,
                            igv_seg_filename,
                            segs_pdf,
                            bias_pdf,
                            plot_heatmap_ec_output,
                            plot_heatmap_ec_filt_output,
                            plot_metrics_output,
                            plot_kernel_density_output,
                            cell_ids,
                            config,
                            args,
                            hmmparams,
                            params_tag,
                            results_dir,
                            alignment_metrics=None):
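    """ Run per cell HMMcopy and collate the results.

    Per cell reads, segments, params and metrics are merged into HDF5 stores,
    annotated with sample info and clustering, and summarised as an IGV segment
    file, QC plots and heatmaps.
    """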

    sample_info = helpers.get_sample_info(args["input_yaml"])

    chromosomes = config["chromosomes"]

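    # hmmcopy results are generated for each configured multiplier plus a default 0 multiplier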
    multipliers = copy.deepcopy(hmmparams["multipliers"])
    multipliers.append(0)

    rows = [int(cellinfo["row"]) for cellinfo in sample_info.values()]
    rows = sorted(set(rows))

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(obj=mgd.TempOutputObj('sampleinfo',
                                          'cell_id',
                                          axes_origin=[]),
                    value=sample_info)

    workflow.setobj(
        obj=mgd.OutputChunks('row'),
        value=rows,
    )

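    # shared job context: retry and cpu settings plus the single_cell_pipeline container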
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow.transform(
        name='run_hmmcopy',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file),
            mgd.InputFile('bai_markdups', 'cell_id', fnames=bai_file),
            mgd.TempOutputFile('reads.h5', 'cell_id'),
            mgd.TempOutputFile('segs.h5', 'cell_id'),
            mgd.TempOutputFile('params.h5', 'cell_id'),
            mgd.TempOutputFile('hmm_metrics.h5', 'cell_id'),
            mgd.TempOutputFile('segments.png', 'cell_id'),
            mgd.TempOutputFile('bias.png', 'cell_id'),
            mgd.InputInstance('cell_id'),
            config['ref_genome'],
            config,
            hmmparams,
            multipliers,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
        ),
    )

    workflow.transform(
        name='merge_reads',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_on_disk",
        args=(mgd.TempInputFile('reads.h5', 'cell_id'),
              mgd.TempOutputFile("reads.h5"), multipliers, 'hmmcopy/reads'),
        kwargs={
            'dtypes': {
                'valid': bool,
                'ideal': bool,
                'state': float,
                'multiplier': float
            }
        })

    workflow.transform(
        name='merge_segs',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_on_disk",
        args=(mgd.TempInputFile('segs.h5',
                                'cell_id'), mgd.TempOutputFile("segments.h5"),
              multipliers, 'hmmcopy/segments'),
        kwargs={'dtypes': {
            'end': float,
            'median': float
        }})

    workflow.transform(
        name='merge_metrics',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_in_memory",
        args=(mgd.TempInputFile('hmm_metrics.h5', 'cell_id'),
              mgd.TempOutputFile("hmmcopy_metrics.h5"), multipliers,
              'hmmcopy/metrics'),
        kwargs={'dtypes': {
            'mad_neutral_state': float
        }})

    workflow.transform(
        name='merge_params',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_in_memory",
        args=(mgd.TempInputFile('params.h5', 'cell_id'),
              mgd.TempOutputFile("params.h5"), multipliers, 'hmmcopy/params'),
    )

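    # when alignment metrics are supplied, fold them into the hmmcopy metrics before annotation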
    annotation_input = 'hmmcopy_metrics.h5'
    if alignment_metrics:
        annotation_input = 'hmmcopy_quality_metrics.h5'
        workflow.transform(
            name="add_quality",
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['standard'],
                     **ctx),
            func="single_cell.workflows.hmmcopy.tasks.add_quality",
            args=(
                mgd.TempInputFile('hmmcopy_metrics.h5'),
                mgd.InputFile(alignment_metrics),
                multipliers,
                mgd.TempOutputFile("hmmcopy_quality_metrics.h5"),
                hmmparams['classifier_training_data'],
            ),
        )

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.annotate_metrics",
        args=(
            mgd.TempInputFile('reads.h5'),
            mgd.TempInputFile(annotation_input),
            mgd.TempOutputFile("annotated_metrics.h5"),
            sample_info,
            cell_ids,
            multipliers,
        ),
        kwargs={'chromosomes': config["chromosomes"]})

    workflow.transform(name='merge_hmm_copy_plots',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
                       args=([
                           mgd.TempInputFile('segments.png', 'cell_id'),
                           mgd.TempInputFile('bias.png', 'cell_id'),
                       ], [
                           mgd.OutputFile(segs_pdf),
                           mgd.OutputFile(bias_pdf),
                       ], mgd.TempInputFile("annotated_metrics.h5"), None,
                             mgd.TempSpace("hmmcopy_plot_merge_temp"),
                             ['segments', 'bias']))

    workflow.transform(
        name='create_igv_seg',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(mgd.TempInputFile("segments.h5"),
              mgd.TempInputFile("annotated_metrics.h5"),
              mgd.OutputFile(igv_seg_filename), hmmparams))

    workflow.transform(name='plot_metrics',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
                       args=(mgd.TempInputFile("annotated_metrics.h5"),
                             mgd.OutputFile(plot_metrics_output),
                             mgd.TempSpace("plot_metrics_temp"),
                             'QC pipeline metrics', multipliers))

    workflow.transform(
        name='plot_kernel_density',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(mgd.TempInputFile('annotated_metrics.h5'),
              mgd.OutputFile(plot_kernel_density_output),
              mgd.TempSpace("hmmcopy_kde_plot_temp"), ',', 'mad_neutral_state',
              'QC pipeline metrics', multipliers))

    workflow.transform(name='plot_heatmap_ec',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
                       args=(mgd.TempInputFile('reads.h5'),
                             mgd.TempInputFile('annotated_metrics.h5'),
                             mgd.OutputFile(plot_heatmap_ec_output),
                             mgd.TempSpace("heatmap_ec_temp"), multipliers),
                       kwargs={
                           'plot_title': 'QC pipeline metrics',
                           'column_name': 'state',
                           'plot_by_col': 'experimental_condition',
                           'color_by_col': 'cell_call',
                           'chromosomes': chromosomes,
                           'max_cn': hmmparams['num_states'],
                           'scale_by_cells': False,
                           'mappability_threshold': hmmparams["map_cutoff"]
                       })

    workflow.transform(name='plot_heatmap_ec_filtered',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
                       args=(mgd.TempInputFile('reads.h5'),
                             mgd.TempInputFile('annotated_metrics.h5'),
                             mgd.OutputFile(plot_heatmap_ec_filt_output),
                             mgd.TempSpace("heatmap_ec_filt_temp"),
                             multipliers),
                       kwargs={
                           'plot_title': 'QC pipeline metrics',
                           'column_name': 'state',
                           'plot_by_col': 'experimental_condition',
                           'color_by_col': 'cell_call',
                           'chromosomes': chromosomes,
                           'max_cn': hmmparams['num_states'],
                           'scale_by_cells': False,
                           'cell_filters': config["good_cells"],
                           'mappability_threshold': hmmparams["map_cutoff"]
                       })

    workflow.transform(name='merge_all_hdf5_stores',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.workflows.hmmcopy.tasks.merge_tables",
                       args=(mgd.TempInputFile("reads.h5"),
                             mgd.TempInputFile("segments.h5"),
                             mgd.TempInputFile("annotated_metrics.h5"),
                             mgd.TempInputFile("params.h5"),
                             mgd.TempOutputFile("hmmcopy_precast.h5"),
                             cell_ids))

    workflow.transform(name='cast_h5',
                       ctx=dict(mem=config['memory']['high'],
                                pool_id=config['pools']['highmem'],
                                **ctx),
                       func="single_cell.utils.hdfutils.cast_h5_file",
                       args=(
                           mgd.TempInputFile("hmmcopy_precast.h5"),
                           mgd.OutputFile(hmmcopy_data),
                       ))

    return workflow
def create_strelka_workflow(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        indel_vcf_file,
        snv_vcf_file,
        config,
        chromosomes=default_chromosomes,
        split_size=int(1e7),
        use_depth_thresholds=True):
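    """ Call somatic SNVs and indels with Strelka on matched normal and tumour
    region bams, filter the per chromosome calls and merge them into final
    indel and SNV VCF files.
    """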

    ctx = {'mem_retry_increment': 2, 'ncpus': 1, 'num_retry': 3}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

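    # normal and tumour bams must be split over the same set of regions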
    regions = normal_bam_file.keys()
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
            helpers.get_container_ctx(config['containers'], 'strelka')
        )
    )

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(
              pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes
        )
    )

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region',),
        args=(
            pypeliner.managed.InputFile("normal.split.bam", "region", fnames=normal_bam_file),
            pypeliner.managed.InputFile("merged_bam", "region", fnames=tumour_bam_file),
            pypeliner.managed.TempInputObj('known_sizes'),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf', 'region'),
            pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf', 'region'),
            pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
            pypeliner.managed.InputInstance("region"),
            helpers.get_container_ctx(config['containers'], 'strelka')
        ),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom',),
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf', 'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf', 'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions
        ),
        kwargs={'use_depth_filter': use_depth_thresholds}
    )

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom',),
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf', 'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf', 'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds}
    )

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf', 'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf', 'chrom'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_snvs_temp"),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')
        )
    )

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')
        )
    )

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
            pypeliner.managed.OutputFile(indel_vcf_file),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2,
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
            pypeliner.managed.OutputFile(snv_vcf_file),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    return workflow
Example #25
0
def create_split_workflow(normal_bam,
                          normal_bai,
                          normal_split_bam,
                          normal_split_bai,
                          regions,
                          config,
                          by_reads=False):
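    """ Split a normal bam into per region bams, either by read count, as a
    single multicore job, or as one job per region.
    """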

    ctx = {'mem_retry_increment': 2}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    normal_split_bam = dict([(ival, normal_split_bam[ival])
                             for ival in regions])
    normal_split_bai = dict([(ival, normal_split_bai[ival])
                             for ival in regions])

    one_split_job = config["one_split_job"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    # splitting by reads always runs on a single node
    if by_reads:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['multicore'],
                     ncpus=config['max_cores'],
                     **ctx),
            func=
            "single_cell.workflows.split_bams.tasks.split_bam_file_by_reads",
            args=(mgd.InputFile(normal_bam), mgd.InputFile(normal_bai),
                  mgd.OutputFile("normal.split.bam",
                                 "region",
                                 fnames=normal_split_bam,
                                 axes_origin=[]),
                  mgd.OutputFile("normal.split.bam.bai",
                                 "region",
                                 fnames=normal_split_bai,
                                 axes_origin=[]),
                  mgd.TempSpace("bam_split_by_reads"), regions,
                  helpers.get_container_ctx(config['containers'], 'samtools')),
        )

    elif one_split_job:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['multicore'],
                     ncpus=config['max_cores'],
                     **ctx),
            func=
            "single_cell.workflows.split_bams.tasks.split_bam_file_one_job",
            args=(mgd.InputFile(normal_bam, extensions=['.bai']),
                  mgd.OutputFile(
                      "normal.split.bam",
                      "region",
                      fnames=normal_split_bam,
                      axes_origin=[],
                      extensions=['.bai'],
                  ), regions,
                  helpers.get_container_ctx(config['containers'], 'samtools')),
            kwargs={"ncores": config["max_cores"]})

    else:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['standard'],
                     ncpus=1,
                     **ctx),
            axes=('region', ),
            func="single_cell.workflows.split_bams.tasks.split_bam_file",
            args=(mgd.InputFile(normal_bam), mgd.InputFile(normal_bai),
                  mgd.OutputFile("normal.split.bam",
                                 "region",
                                 fnames=normal_split_bam),
                  mgd.OutputFile("normal.split.bam.bai",
                                 "region",
                                 fnames=normal_split_bai),
                  mgd.InputInstance('region'),
                  helpers.get_container_ctx(config['containers'], 'samtools')))

    return workflow
def copy_number_calling_workflow(workflow, args):
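    """ Extract per cell seqdata from tumour and normal bams, run the TITAN
    copy number calling subworkflow and write a metadata yaml describing the
    run.
    """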

    config = helpers.load_config(args)

    ctx = {'mem_retry_increment': 2, 'ncpus': 1,
           'mem': config["memory"]['low'],
           'pool_id': config['pools']['standard']}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    tumour_bam_files, tumour_bai_files = helpers.get_bams(args['tumour_yaml'])

    normal_bam_files, normal_bai_files = helpers.get_bams(args['normal_yaml'])

    tumour_cellids = helpers.get_samples(args['tumour_yaml'])

    normal_cellids = helpers.get_samples(args['normal_yaml'])

    if set(tumour_bam_files.keys()) != set(tumour_cellids):
        raise ValueError()

    if set(normal_bam_files.keys()) != set(normal_cellids):
        raise ValueError()

    copynumber_dir = os.path.join(args["out_dir"], "copynumber")

    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

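    # remixt settings for seqdata extraction come from the titan_params config section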
    remixt_config = config['titan_params'].get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=tumour_cellids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('normal_cell_id'),
        value=normal_cellids,
    )

    workflow.transform(
        name="get_snp_positions_filename",
        ctx=ctx,
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
              remixt_config,
              config['titan_params']['ref_data_dir'],
              'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
              remixt_config,
              'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
              remixt_config,
              'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
              remixt_config,
              'bam_check_proper_pair'
        )
    )


    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'tumour_cell_id',
                fnames=tumour_bam_files),
            mgd.InputFile(
                'bam_markdups_index',
                'tumour_cell_id',
                fnames=tumour_bai_files),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('normal_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'normal_cell_id',
                fnames=normal_bam_files),
            mgd.InputFile(
                'bam_markdups_index',
                'normal_cell_id',
                fnames=normal_bai_files),
            mgd.TempOutputFile("normal.h5", "normal_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        )
    )

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "normal_cell_id"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            out_file,
            config,
            args,
            tumour_cellids,
            normal_cellids,
            cloneid
        ),
    )

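    # record inputs, results and container versions in a metadata yaml alongside the results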
    info_file = os.path.join(args["out_dir"],'results','copynumber_calling', "info.yaml")

    results = {
        'copynumber_data': helpers.format_file_yaml(out_file),
    }

    tumours = {k: helpers.format_file_yaml(v) for k, v in tumour_bam_files.items()}
    normals = {k: helpers.format_file_yaml(v) for k, v in normal_bam_files.items()}
    input_datasets = {'tumour': tumours, 'normal': normals}

    metadata = {
        'copynumber_calling': {
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
Example #27
0
def create_aneufinder_workflow(bam_file,
                               cell_ids,
                               config,
                               aneufinder_output,
                               aneufinder_results_filename,
                               aneufinder_pdf_filename,
                               library_id,
                               ):
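    """ Run Aneufinder on each cell bam, merge the per cell reads and segments
    tables into an HDF5 store and the per cell plots into a single PDF.
    """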

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    aneufinder_docker = helpers.get_container_ctx(config['containers'], 'aneufinder', docker_only=True)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='run_aneufinder_on_individual_cells',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.run_aneufinder",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_file', 'cell_id', fnames=bam_file),
            mgd.TempSpace('working_dir', 'cell_id'),
            mgd.InputInstance('cell_id'),
            aneufinder_output,
            mgd.TempOutputFile('segments.csv', 'cell_id'),
            mgd.TempOutputFile('reads.csv', 'cell_id'),
            mgd.TempOutputFile('dnacopy.pdf', 'cell_id'),
        ),
        kwargs={'docker_config': aneufinder_docker}
    )

    workflow.transform(
        name='merge_outputs',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_outputs_to_hdf",
        args=(
            mgd.TempInputFile('reads.csv', 'cell_id'),
            mgd.TempInputFile('segments.csv', 'cell_id'),
            mgd.OutputFile(aneufinder_results_filename),
            mgd.TempSpace("aneufinder_merge"),
        )
    )

    workflow.transform(
        name='merge_aneufinder_pdfs',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_pdf",
        args=(
            [mgd.TempInputFile('dnacopy.pdf', 'cell_id')],
            [mgd.OutputFile(aneufinder_pdf_filename)],
        )
    )

    return workflow
def create_alignment_metrics_workflow(bam_filename, bai_filename,
                                      alignment_metrics, plot_metrics,
                                      ref_genome, config, args, sample_info,
                                      cell_ids):
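    """ Collect per cell alignment QC metrics (duplicate marking, flagstat,
    WGS, GC and insert size metrics), merge and annotate them into a single
    HDF5 store and plot the results.
    """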

    out_dir = args['out_dir']

    merge_metrics = os.path.join(out_dir, 'metrics')

    bam_filename = dict([(cellid, bam_filename[cellid])
                         for cellid in cell_ids])

    bai_filename = dict([(cellid, bai_filename[cellid])
                         for cellid in cell_ids])

    chromosomes = config["chromosomes"]

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(obj=mgd.TempOutputObj('sampleinfo',
                                          'cell_id',
                                          axes_origin=[]),
                    value=sample_info)

    markdups_metrics = os.path.join(merge_metrics, 'markdups_metrics',
                                    '{cell_id}.markdups_metrics.txt')
    flagstat_metrics = os.path.join(merge_metrics, 'flagstat_metrics',
                                    '{cell_id}.flagstat_metrics.txt')
    workflow.transform(
        name='postprocess_bam',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id', ),
        func=
        "single_cell.workflows.alignment_metrics.tasks.get_postprocess_metrics",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.InputFile('sorted_markdups_index', 'cell_id', fnames=bai_filename),
            mgd.TempSpace('tempdir', 'cell_id'),
            config,
            mgd.OutputFile(markdups_metrics, 'cell_id'),
            mgd.OutputFile(flagstat_metrics, 'cell_id'),
        ),
    )

    wgs_metrics_filename = os.path.join(merge_metrics, 'wgs_metrics',
                                        '{cell_id}.wgs_metrics.txt')
    workflow.transform(
        name='bam_collect_wgs_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func=
        "single_cell.workflows.alignment_metrics.tasks.bam_collect_wgs_metrics",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile(wgs_metrics_filename, 'cell_id'),
            config,
            mgd.TempSpace('wgs_tempdir', 'cell_id'),
        ),
    )

    gc_metrics_filename = os.path.join(merge_metrics, 'gc_metrics',
                                       '{cell_id}.gc_metrics.txt')
    gc_summary_filename = os.path.join(merge_metrics, 'gc_metrics',
                                       '{cell_id}.gc_metrics.summ.txt')
    gc_chart_filename = os.path.join(merge_metrics, 'gc_metrics',
                                     '{cell_id}.gc_metrics.pdf')
    workflow.transform(
        name='bam_collect_gc_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func=
        "single_cell.workflows.alignment_metrics.tasks.bam_collect_gc_metrics",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile(gc_metrics_filename, 'cell_id'),
            mgd.OutputFile(gc_summary_filename, 'cell_id'),
            mgd.OutputFile(gc_chart_filename, 'cell_id'),
            mgd.TempSpace('gc_tempdir', 'cell_id'),
            config,
        ),
    )

    insert_metrics_filename = os.path.join(merge_metrics, 'insert_metrics',
                                           '{cell_id}.insert_metrics.txt')
    insert_histogram_filename = os.path.join(merge_metrics, 'insert_metrics',
                                             '{cell_id}.insert_metrics.pdf')
    workflow.transform(
        name='bam_collect_insert_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func=
        "single_cell.workflows.alignment_metrics.tasks.bam_collect_insert_metrics",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.InputFile(flagstat_metrics, 'cell_id'),
            mgd.OutputFile(insert_metrics_filename, 'cell_id'),
            mgd.OutputFile(insert_histogram_filename, 'cell_id'),
            mgd.TempSpace('insert_tempdir', 'cell_id'),
            config,
        ),
    )

    workflow.transform(
        name="collect_gc_metrics",
        func="single_cell.workflows.alignment_metrics.tasks.collect_gc",
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        args=(mgd.InputFile(gc_metrics_filename, 'cell_id', axes_origin=[]),
              mgd.TempOutputFile("gc_metrics.h5"), mgd.TempSpace("temp_gc")))

    workflow.transform(
        name='collect_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.collect_metrics",
        args=(
            mgd.InputFile(flagstat_metrics, 'cell_id', axes_origin=[]),
            mgd.InputFile(markdups_metrics, 'cell_id', axes_origin=[]),
            mgd.InputFile(insert_metrics_filename, 'cell_id', axes_origin=[]),
            mgd.InputFile(wgs_metrics_filename, 'cell_id', axes_origin=[]),
            mgd.TempSpace("tempdir_collect_metrics"),
            mgd.TempOutputFile("alignment_metrics.h5"),
        ),
    )

    workflow.transform(
        name='annotate_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.annotate_metrics",
        args=(
            mgd.TempInputFile("alignment_metrics.h5"),
            sample_info,
            mgd.TempOutputFile("alignment_metrics_annotated.h5"),
        ))

    workflow.transform(
        name='plot_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.plot_metrics",
        args=(
            mgd.TempInputFile("alignment_metrics_annotated.h5"),
            mgd.OutputFile(plot_metrics),
            'QC pipeline metrics',
            mgd.TempInputFile("gc_metrics.h5"),
            config['gc_windows'],
        ))

    workflow.transform(
        name='concatenate_all_hdf_tables',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.utils.hdfutils.concat_hdf_tables",
        args=(
            [
                mgd.TempInputFile("alignment_metrics_annotated.h5"),
                mgd.TempInputFile("gc_metrics.h5"),
            ],
            mgd.TempOutputFile("alignment_precast.h5"),
        ),
    )

    workflow.transform(name='cast_h5',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.utils.hdfutils.cast_h5_file",
                       args=(
                           mgd.TempInputFile("alignment_precast.h5"),
                           mgd.OutputFile(alignment_metrics),
                       ))

    return workflow