Example #1
def annotation_workflow(args):
    config = inpututils.load_config(args)

    annotation_infiles = inpututils.load_yaml(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['annotation']['docker']['single_cell_pipeline']})

    annotation_dir = args["out_dir"]

    input_yaml_blob = os.path.join(annotation_dir, 'input.yaml')
    annotation_files = get_output_files(annotation_dir, lib)
    annotation_meta = os.path.join(annotation_dir, 'metadata.yaml')

    workflow.subworkflow(
        name='annotation_workflow',
        func=qc_annotation.create_qc_annotation_workflow,
        args=(
            mgd.InputFile(annotation_infiles['hmmcopy_metrics']),
            mgd.InputFile(annotation_infiles['hmmcopy_reads']),
            mgd.InputFile(annotation_infiles['alignment_metrics']),
            mgd.InputFile(annotation_infiles['gc_metrics']),
            mgd.InputFile(annotation_infiles['segs_pdf_tar']),
            mgd.OutputFile(annotation_files['merged_metrics_csvs']),
            mgd.OutputFile(annotation_files['qc_report']),
            mgd.OutputFile(annotation_files['corrupt_tree_newick']),
            mgd.OutputFile(annotation_files['consensus_tree_newick']),
            mgd.OutputFile(annotation_files['phylo_csv']),
            mgd.OutputFile(annotation_files['loci_rank_trees']),
            mgd.OutputFile(annotation_files['filtered_data']),
            mgd.OutputFile(annotation_files['corrupt_tree_pdf']),
            mgd.OutputFile(annotation_files['segs_pass']),
            mgd.OutputFile(annotation_files['segs_fail']),
            mgd.OutputFile(annotation_files['corrupt_heatmap_pdf']),
            mgd.OutputFile(annotation_files['heatmap_filt_pdf']),
            config['annotation'],
            lib,
        ),
        kwargs={'no_corrupt_tree': args['no_corrupt_tree']})

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], annotation_dir, list(annotation_files.values()),
              mgd.OutputFile(annotation_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'type': 'annotation'
            }
        })

    return workflow
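
The factory functions in these examples build and return a pypeliner workflow object; they do not run it. A minimal runner sketch, following the Pypeline pattern shown in Examples #5 and #6 (run_annotation is a hypothetical wrapper, not part of the pipeline):

import pypeliner

def run_annotation(args):
    # hand the returned workflow to a pypeliner.app.Pypeline runner,
    # exactly as Examples #5 and #6 do for their workflows
    pyp = pypeliner.app.Pypeline(config=args)
    pyp.run(annotation_workflow(args))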
Example #2
def create_genotyping_workflow(args):
    strelka_vcf, museq_vcf, tumour_cell_bams = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_output = os.path.join(args['out_dir'], "counts.csv.gz")

    config = inpututils.load_config(args)
    config = config['variant_calling']
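
Example #2 is cut off after loading its inputs and config. A hedged sketch of the missing tail, assuming create_genotyping_workflow mirrors the counting pattern of Example #12; the subworkflow wiring over strelka_vcf, museq_vcf and tumour_cell_bams is indicated in comments rather than guessed:

    # assumed continuation: build the workflow in the pipeline's docker context
    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    # the vcf-merge and per-cell allele counting steps that write counts_output
    # would be attached here, as in Example #12

    return workflow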
Example #3
def count_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['count_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    allele_counts_filename = os.path.join(args["out_dir"],
                                          "allele_counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    haplotypes_filename, tumour_cells = inpututils.load_count_haps_input(
        args['input_yaml'])

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(
        name='extract_allele_readcounts',
        func='single_cell.workflows.extract_allele_readcounts.extract_allele_readcounts',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.InputFile('tumour_cells.bam',
                          'tumour_cell_id',
                          extensions=['.bai'],
                          axes_origin=[],
                          fnames=tumour_cells),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [allele_counts_filename],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'count_haps'
            }
        })

    return workflow
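
A note on the axis convention used above and throughout: setobj with mgd.OutputChunks declares an axis whose chunks are the dict keys, and mgd.InputFile(..., fnames=...) then resolves one file per chunk. A small sketch with hypothetical paths:

# hypothetical mapping, shaped like the dict returned by
# inpututils.load_count_haps_input in Example #3
tumour_cells = {
    'cell_A': '/data/bams/cell_A.bam',
    'cell_B': '/data/bams/cell_B.bam',
}
# setobj(mgd.OutputChunks('tumour_cell_id'), list(tumour_cells.keys()))
# declares the tumour_cell_id axis; mgd.InputFile('tumour_cells.bam',
# 'tumour_cell_id', fnames=tumour_cells) then yields one bam per chunk.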
Example #4
def infer_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['infer_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haplotypes_filename = os.path.join(args["out_dir"], "haplotypes.tsv")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    normal_data = inpututils.load_infer_haps_input(args['input_yaml'])

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        bam_file = mgd.InputFile('normal.bam',
                                 'normal_cell_id',
                                 fnames=normal_data,
                                 extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(normal_data, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [haplotypes_filename],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'infer_haps'
            }
        })

    return workflow
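
Examples #4 and #16 share a convention for the normal input: a dict of per-cell bams becomes an axis-split input, while a single path becomes a plain input file. A hypothetical helper capturing that branch (not part of the pipeline; assumes pypeliner.managed is imported as mgd):

def normal_bam_input(workflow, normal_data):
    # dict: one bam per normal cell, split over a normal_cell_id axis
    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        return mgd.InputFile('normal.bam',
                             'normal_cell_id',
                             fnames=normal_data,
                             extensions=['.bai'])
    # str: a single whole-genome normal bam
    return mgd.InputFile(normal_data, extensions=['.bai'])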
Example #5
def sv_genotyping_pipeline(args):
    pyp = pypeliner.app.Pypeline(config=args)

    lumpy_sv, destruct_sv, tumour_bams = inpututils.load_sv_genotyper_input(
        args['input_yaml'])

    out_dir = args['out_dir']

    config = inpututils.load_config(args)

    workflow = create_sv_genotyper_workflow(tumour_bams, lumpy_sv, destruct_sv,
                                            out_dir, config)

    pyp.run(workflow)
Example #6
def cohort_qc_pipeline(args):
    """Process maf, run classify copynumber, make plots.
    Args:
        args ([dict]): [pipeline arguments]
    """
    config = inpututils.load_config(args)
    config = config["cohort_qc"]

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow()

    out_dir = args["out_dir"]
    api_key = args["API_key"]

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    # inputs
    cohort, germline_mafs, vcfs, hmmcopy = inpututils.load_cohort_qc_inputs(
        args["input_yaml"]
    )

    museq = {
        label: data["museq"] for label, data in vcfs.items()
    }
    strelka_snv = {
        label: data["strelka_snv"] for label, data in vcfs.items()
    }
    strelka_indel = {
        label: data["strelka_indel"] for label, data in vcfs.items()
    }
    hmmcopy_files = {
        label: data["hmmcopy"] for label, data in hmmcopy.items()
    }
    hmmcopy_metrics_files = {
        label: data["hmmcopy_metrics"] for label, data in hmmcopy.items()
    }
    # outputs
    cbiofile_paths = get_cbioportal_paths(os.path.join(out_dir, cohort))
    maftools_filepaths = get_maftools_paths(os.path.join(out_dir, cohort))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_label', 'library_label'),
        value=list(museq.keys()),
    )
    workflow.subworkflow(
        name="merge_somatic_mafs",
        func="single_cell.workflows.cohort_qc.merge_somatic_mafs",
        axes=('sample_label',),
        args=(
            mgd.InputInstance('sample_label'),
            config,
            mgd.InputFile(
                'museq', 'sample_label', 'library_label',
                fnames=museq, axes_origin=[]
            ),
            mgd.InputFile(
                'strelka_snv', 'sample_label', 'library_label',
                fnames=strelka_snv, axes_origin=[]
            ),
            mgd.InputFile(
                'strelka_indel', 'sample_label', 'library_label',
                fnames=strelka_indel, axes_origin=[]
            ),
            mgd.TempOutputFile('somatic_maf', 'sample_label')
        ),
    )
    
    workflow.subworkflow(
        name="classifycopynumber",
        func="single_cell.workflows.cohort_qc.cna_annotation_workflow",
        args=(
            config,
            mgd.InputFile(
                'hmmcopy_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_files, axes_origin=[]
            ),
            mgd.InputFile(
                'hmmcopy_metrics_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_metrics_files, axes_origin=[]
            ),
            mgd.OutputFile(cbiofile_paths["cna_table"]),
            mgd.OutputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(cbiofile_paths["segments"]),
            config["gtf"],

        ),
    )

    workflow.subworkflow(
        name="maf_annotation_workflow",
        func="single_cell.workflows.cohort_qc.preprocess_mafs_workflow",
        args=(
            config,
            mgd.InputFile(
                'germline_mafs_dict',  'sample_label',
                fnames=germline_mafs, axes_origin=[]
            ),
            mgd.TempInputFile(
                'somatic_maf',  'sample_label',
                axes_origin=[]
            ),
            mgd.OutputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.OutputFile(cbiofile_paths["annotated_somatic_maf"]),
            api_key
        ),
    )
    workflow.subworkflow(
        name="make_plots_and_report",
        func="single_cell.workflows.cohort_qc.create_cohort_oncoplot",
        args=(
            config,
            mgd.InputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.InputFile(cbiofile_paths["annotated_somatic_maf"]),
            mgd.InputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(maftools_filepaths["maftools_maf"]),
            mgd.OutputFile(maftools_filepaths["cohort_oncoplot"]),
            mgd.OutputFile(maftools_filepaths["report"]),
            cohort
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            list(cbiofile_paths.values()) + list(maftools_filepaths.values()),
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'cohort_qc'}
        }
    )
    pyp.run(workflow)
Example #7
def germline_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['germline_calling']

    normal_bams = inpututils.load_germline_data(args['input_yaml'])

    varcalls_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')
    out_files = get_output_files(args['out_dir'])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal_split.bam",
                          "region",
                          extensions=['.bai'],
                          fnames=normal_bams),
            config['ref_genome'],
            mgd.OutputFile(out_files['samtools_germline_vcf'],
                           extensions=['.tbi']),
            config,
        ),
    )

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['mappability_filename']),
        ),
        kwargs={'chromosomes': config['chromosomes']})

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['normal_genotype_filename']),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['snpeff_vcf_filename']),
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(out_files.values()),
              mgd.OutputFile(varcalls_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'germline_calling'
            }
        })

    return workflow
Example #8
def alignment_workflow(args):
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    trim = args['trim']
    center = args['sequencing_center']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])

    cellids = inpututils.get_samples(args['input_yaml'])
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted(set([v[1] for v in fastq1_files.keys()]))
    cells = sorted(set([v[0] for v in fastq1_files.keys()]))

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1',
                          'cell_id',
                          'lane',
                          fnames=fastq1_files,
                          axes_origin=[]),
            mgd.InputFile('fastq_2',
                          'cell_id',
                          'lane',
                          fnames=fastq2_files,
                          axes_origin=[]),
            mgd.OutputFile('bam_markdups',
                           'cell_id',
                           template=bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups',
                           'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
            trim,
            center,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], alignment_dir, list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment'
            }
        })

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], bams_dir,
              mgd.Template('aligned.bam',
                           'cell_id',
                           template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams'
            },
            'template':
            (mgd.InputChunks('cell_id'), bam_files_template, 'cell_id'),
        })

    return workflow
Example #9
def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(
        args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )
    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi',
                                                               '.csi']),
            config,
        ),
    )

    workflow.subworkflow(name='strelka',
                         func=strelka.create_strelka_workflow,
                         args=(
                             mgd.InputFile('normal_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=normal_bams),
                             mgd.InputFile('tumour_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=tumour_bams),
                             config['ref_genome'],
                             mgd.OutputFile(filepaths['strelka_indel'],
                                            extensions=['.tbi', '.csi']),
                             mgd.OutputFile(filepaths['strelka_snv'],
                                            extensions=['.tbi', '.csi']),
                             config,
                         ),
                         kwargs={"chromosomes": config["chromosomes"]})

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        ctx=ctx,
        args=([
            mgd.InputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
            mgd.InputFile(filepaths['strelka_snv'],
                          extensions=['.tbi', '.csi']),
        ], mgd.TempOutputFile('all.snv.vcf')),
    )

    workflow.transform(name='finalise_snvs',
                       func="biowrappers.components.io.vcf.tasks.finalise_vcf",
                       ctx=ctx,
                       args=(mgd.TempInputFile('all.snv.vcf'),
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi', '.csi'])),
                       kwargs={'docker_config': vcftools_docker})

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        ctx=ctx,
        func="biowrappers.pipelines.snv_call_and_annotate.create_annotation_workflow",
        args=(
            config,
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempOutputFile('snv_annotations.h5'),
            mgd.TempSpace('raw_data_dir_annotate'),
        ),
        kwargs={
            'variant_type': 'snv',
            'docker_config': basedocker,
            'snpeff_docker': vcftools_docker,
            'vcftools_docker': vcftools_docker
        })

    workflow.transform(
        name='convert_museq_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['museq_vcf']),
            mgd.TempOutputFile('museq.csv'),
        ),
        kwargs={
            'score_callback': museq_callback,
        })

    workflow.transform(name='prep_museq_csv',
                       func='single_cell.utils.csvutils.prep_csv_files',
                       args=(mgd.TempInputFile('museq.csv'),
                             mgd.OutputFile(filepaths['museq_csv'],
                                            extensions=['.yaml'])),
                       kwargs={'header': True})

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['strelka_snv']),
            mgd.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        })

    workflow.transform(name='prep_strelka_csv',
                       func='single_cell.utils.csvutils.prep_csv_files',
                       args=(mgd.TempInputFile('strelka_snv.csv'),
                             mgd.OutputFile(filepaths['strelka_csv'],
                                            extensions=['.yaml'])),
                       kwargs={'header': True})

    workflow.transform(name='convert_h5_to_csv',
                       func='single_cell.utils.hdfutils.convert_hdf_to_csv',
                       args=(mgd.TempInputFile('snv_annotations.h5'), {
                           '/snv/cosmic_status':
                           mgd.OutputFile(filepaths['cosmic_csv'],
                                          extensions=['.yaml']),
                           '/snv/dbsnp_status':
                           mgd.OutputFile(filepaths['dbsnp_csv'],
                                          extensions=['.yaml']),
                           '/snv/mappability':
                           mgd.OutputFile(filepaths['mappability_csv'],
                                          extensions=['.yaml']),
                           '/snv/snpeff':
                           mgd.OutputFile(filepaths['snpeff_csv'],
                                          extensions=['.yaml']),
                           '/snv/tri_nucleotide_context':
                           mgd.OutputFile(filepaths['trinuc_csv'],
                                          extensions=['.yaml']),
                       }))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(filepaths.values()),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'variant_calling'
            }
        })

    return workflow
Example #10
def hmmcopy_workflow(args):
    config = inpututils.load_config(args)
    config = config['hmmcopy']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    bam_files = inpututils.get_bams(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    hmmcopy_dir = args["out_dir"]

    hmmcopy_files = get_output_files(hmmcopy_dir, lib)
    hmmcopy_meta = os.path.join(hmmcopy_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(hmmcopy_dir, 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='hmmcopy_workflow',
        func=hmmcopy.create_hmmcopy_workflow,
        args=(mgd.InputFile('bam_markdups',
                            'cell_id',
                            fnames=bam_files,
                            extensions=['.bai']),
              mgd.OutputFile(hmmcopy_files['reads_csvs']),
              mgd.OutputFile(hmmcopy_files['segs_csvs']),
              mgd.OutputFile(hmmcopy_files['metrics_csvs']),
              mgd.OutputFile(hmmcopy_files['params_csvs']),
              mgd.OutputFile(hmmcopy_files['igv_csvs']),
              mgd.OutputFile(hmmcopy_files['segs_pdf']),
              mgd.OutputFile(hmmcopy_files['bias_pdf']),
              mgd.OutputFile(hmmcopy_files['heatmap_pdf']),
              mgd.OutputFile(hmmcopy_files['metrics_pdf']),
              mgd.OutputFile(hmmcopy_files['kernel_density_pdf']),
              mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']), cellids,
              config, sampleinfo),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], hmmcopy_dir, list(hmmcopy_files.values()),
              mgd.OutputFile(hmmcopy_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': list(bam_files.keys()),
                'type': 'hmmcopy',
            }
        })

    return workflow
Example #11
def pseudo_bulk_qc_workflow(args):
    data = inpututils.load_qc_input(args["input_yaml"])
    config = inpututils.load_config(args)
    config = config["qc"]

    out_dir = args["out_dir"]

    mutationreports = os.path.join(out_dir, 'patient', "mutationreport.html")
    grouplevelmafs = os.path.join(out_dir, 'patient', "grouplevelmaf.maf")
    grouplevel_high_impact_mafs = os.path.join(
        out_dir, 'patient', "grouplevel_high_impact_maf.maf")
    grouplevel_high_impact_merged_snvs = os.path.join(
        out_dir, 'patient', "grouplevel_high_impact_merged_snvs.csv")
    grouplevel_snvs = os.path.join(out_dir, 'patient', "grouplevel_snvs.csv")

    isabl_ids = {label: paths["isabl_id"] for label, paths in data.items()}

    mappability_files = {
        label: paths["mappability"]
        for label, paths in data.items()
    }
    strelka_files = {label: paths["strelka"] for label, paths in data.items()}
    museq_files = {label: paths["museq"] for label, paths in data.items()}
    cosmic_status_files = {
        label: paths["cosmic_status"]
        for label, paths in data.items()
    }
    snpeff_files = {label: paths["snpeff"] for label, paths in data.items()}
    dbsnp_status_files = {
        label: paths["dbsnp_status"]
        for label, paths in data.items()
    }
    trinuc_files = {label: paths["trinuc"] for label, paths in data.items()}
    counts_files = {label: paths["counts"] for label, paths in data.items()}
    breakpoint_counts_files = {
        label: paths["destruct_breakpoint_counts"]
        for label, paths in data.items()
    }
    destruct_breakpoint_annotation_files = {
        label: paths["destruct_breakpoint_annotation"]
        for label, paths in data.items()
    }
    lumpy_breakpoint_annotation_files = {
        label: paths["lumpy_breakpoint_annotation"]
        for label, paths in data.items()
    }
    lumpy_breakpoint_evidence_files = {
        label: paths["lumpy_breakpoint_evidence"]
        for label, paths in data.items()
    }
    haplotype_allele_data_files = {
        label: paths["haplotype_allele_data"]
        for label, paths in data.items()
    }
    annotation_metrics_files = {
        label: paths["annotation_metrics"]
        for label, paths in data.items()
    }
    hmmcopy_reads_files = {
        label: paths["hmmcopy_reads"]
        for label, paths in data.items()
    }
    hmmcopy_segs_files = {
        label: paths["hmmcopy_segs"]
        for label, paths in data.items()
    }
    hmmcopy_metrics_files = {
        label: paths["hmmcopy_metrics"]
        for label, paths in data.items()
    }
    alignment_metrics_files = {
        label: paths["alignment_metrics"]
        for label, paths in data.items()
    }
    gc_metrics_files = {
        label: paths["gc_metrics"]
        for label, paths in data.items()
    }
    indel_files = {label: paths["indel_file"] for label, paths in data.items()}

    label_dir = os.path.join(out_dir, '{patient}', '{sample_id}',
                             '{library_id}')
    sample_level_report_htmls = os.path.join(label_dir, "mainreport.html")
    sample_level_maf = os.path.join(label_dir, "samplelevelmaf.maf")
    snvs_all = os.path.join(label_dir, 'snvs_all.csv')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks(
            'patient',
            'sample_id',
            'library_id',
        ),
        value=list(data.keys()),
    )

    workflow.subworkflow(
        name='create_sample_level_plots',
        func="single_cell.workflows.pseudo_bulk_qc.create_sample_level_plots",
        axes=(
            'patient',
            'sample_id',
            'library_id',
        ),
        args=(mgd.InputInstance('patient'), mgd.InputInstance('sample_id'),
              mgd.InputInstance('library_id'),
              mgd.InputFile('mappability',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=mappability_files),
              mgd.InputFile('strelka',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=strelka_files),
              mgd.InputFile('museq',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=museq_files),
              mgd.InputFile('cosmic_status',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=cosmic_status_files),
              mgd.InputFile('snpeff',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=snpeff_files),
              mgd.InputFile('dbsnp_status',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=dbsnp_status_files),
              mgd.InputFile('trinuc',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=trinuc_files),
              mgd.InputFile('counts',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=counts_files),
              mgd.InputFile('destruct_breakpoint_annotation',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=destruct_breakpoint_annotation_files),
              mgd.InputFile('destruct_breakpoint_counts',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=breakpoint_counts_files),
              mgd.InputFile('lumpy_breakpoint_annotation',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=lumpy_breakpoint_annotation_files),
              mgd.InputFile('lumpy_breakpoint_evidence',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=lumpy_breakpoint_evidence_files),
              mgd.InputFile('haplotype_allele_data',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=haplotype_allele_data_files),
              mgd.InputFile('annotation_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=annotation_metrics_files),
              mgd.InputFile('hmmcopy_reads',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=hmmcopy_reads_files),
              mgd.InputFile('isabl_ids',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=isabl_ids),
              mgd.InputFile('hmmcopy_segs',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=hmmcopy_segs_files),
              mgd.InputFile('hmmcopy_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=hmmcopy_metrics_files),
              mgd.InputFile('alignment_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=alignment_metrics_files),
              mgd.InputFile('gc_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=gc_metrics_files),
              mgd.InputFile('indel_files',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=indel_files),
              mgd.OutputFile('sample_level_report_htmls',
                             'patient',
                             'sample_id',
                             'library_id',
                             template=sample_level_report_htmls),
              mgd.OutputFile('mafs',
                             'patient',
                             'sample_id',
                             'library_id',
                             template=sample_level_maf),
              mgd.OutputFile('snvs_all',
                             'patient',
                             'sample_id',
                             'library_id',
                             template=snvs_all), out_dir, config),
    )
    workflow.subworkflow(
        name='create_patient_workflow',
        func="single_cell.workflows.pseudo_bulk_qc.create_patient_workflow",
        axes=('patient', ),
        args=(
            mgd.InputInstance('patient'),
            mgd.InputFile("mafs",
                          "patient",
                          "sample_id",
                          "library_id",
                          template=sample_level_maf,
                          axes_origin=[]),
            mgd.InputFile("snvs_all",
                          "patient",
                          "sample_id",
                          "library_id",
                          template=snvs_all,
                          axes_origin=[]),
            mgd.OutputFile('mutationreport',
                           'patient',
                           template=mutationreports),
            mgd.OutputFile('grouplevelmaf', 'patient',
                           template=grouplevelmafs),
            mgd.OutputFile('grouplevel_high_impact_maf',
                           'patient',
                           template=grouplevel_high_impact_mafs),
            mgd.OutputFile('grouplevel_snvs',
                           'patient',
                           template=grouplevel_snvs),
            mgd.OutputFile('grouplevel_high_impact_merged_snvs',
                           'patient',
                           template=grouplevel_high_impact_merged_snvs),
            config,
        ),
    )

    return workflow
Example #12
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells.
    """

    strelka_vcf, museq_vcf, tumour_cell_bams = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_output = os.path.join(args['out_dir'], "counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(name='merge_snvs_museq',
                       func='single_cell.utils.vcfutils.merge_vcf',
                       args=([
                           mgd.InputFile('museq.vcf',
                                         'sample_id',
                                         'library_id',
                                         fnames=museq_vcf,
                                         extensions=['.tbi', '.csi'],
                                         axes_origin=[]),
                           mgd.InputFile('strelka.vcf',
                                         'sample_id',
                                         'library_id',
                                         fnames=strelka_vcf,
                                         extensions=['.tbi', '.csi'],
                                         axes_origin=[]),
                       ],
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi', '.csi']),
                             mgd.TempSpace("merge_vcf_temp")),
                       kwargs={'docker_image': config['docker']['vcftools']})

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(counts_output),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [counts_output],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping'
            }
        })

    return workflow
Example #13
def split_bam_workflow(args):
    config = inpututils.load_config(args)
    config = config['split_bam']

    bam_file = inpututils.load_split_wgs_input(args['input_yaml'])

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    workflow.transform(
        name="get_regions",
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1
        },
        args=(
            mgd.InputFile(bam_file),
            mgd.OutputFile("normal.split.bam",
                           'region',
                           template=split_bam_template,
                           axes_origin=[]),
            pypeliner.managed.TempInputObj('region'),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=split_bam_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'wgs_regionbams'
            },
            'template':
            (mgd.TempInputObj('region'), split_bam_template, 'region'),
        })

    return workflow
Example #14
def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(
        args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'], config)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    ctx = {
        'ncpus': 1,
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'mem': config["memory"]['low'],
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=tumour_bams),
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi',
                                                               '.csi']),
            mgd.OutputFile(filepaths['museq_csv'], extensions=['.yaml']),
            config,
        ),
    )

    workflow.subworkflow(name='strelka',
                         func=strelka.create_strelka_workflow,
                         args=(
                             mgd.InputFile('normal_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=normal_bams),
                             mgd.InputFile('tumour_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=tumour_bams),
                             config['ref_genome'],
                             mgd.OutputFile(filepaths['strelka_indel'],
                                            extensions=['.tbi', '.csi']),
                             mgd.OutputFile(filepaths['strelka_snv'],
                                            extensions=['.tbi', '.csi']),
                             mgd.OutputFile(filepaths['strelka_csv'],
                                            extensions=['.yaml']),
                         ),
                         kwargs={
                             "chromosomes": config["chromosomes"],
                             "use_depth_thresholds":
                             config['use_depth_thresholds']
                         })

    workflow.subworkflow(
        name='annotate_snv_vcf_files',
        func=snv_annotate.create_snv_annotate_workflow,
        args=(config,
              mgd.InputFile(filepaths['museq_vcf'],
                            extensions=['.tbi', '.csi']),
              mgd.InputFile(filepaths['strelka_snv'],
                            extensions=['.tbi', '.csi']),
              mgd.OutputFile(filepaths['mappability_csv'],
                             extensions=['.yaml']),
              mgd.OutputFile(filepaths['snpeff_csv'], extensions=['.yaml']),
              mgd.OutputFile(filepaths['trinuc_csv'], extensions=['.yaml']), {
                  k: mgd.OutputFile(v)
                  for k, v in filepaths['additional_databases'].items()
              }, config['memory']))

    allfiles = [
        filepaths[k] for k in filepaths if k != 'additional_databases'
    ]
    allfiles += list(filepaths['additional_databases'].values())

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], allfiles,
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'variant_calling'
            }
        })

    return workflow
Example #15
def merge_bams_workflow(args):
    config = inpututils.load_config(args)
    config = config['merge_bams']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    bam_files = inpututils.load_merge_cell_bams(args['input_yaml'])

    merge_out_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.transform(
        name="remove_softclipped_reads",
        func="single_cell.utils.pysamutils.remove_softclipped_reads",
        axes=('cell_id', ),
        args=(mgd.InputFile('bam_markdups',
                            'cell_id',
                            fnames=bam_files,
                            extensions=['.bai']),
              mgd.TempOutputFile('bam_rm_softclipped.bam',
                                 'cell_id',
                                 extensions=['.bai']),
              args['softclipped_reads_threshold']))

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.TempInputFile('bam_rm_softclipped.bam',
                                               'cell_id',
                                               extensions=['.bai']),
                             mgd.OutputFile("merged.bam",
                                            "region",
                                            axes_origin=[],
                                            extensions=['.bai'],
                                            template=merge_out_template),
                             mgd.InputChunks("region"),
                             config,
                         ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=merge_out_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'template':
            (mgd.InputChunks('region'), merge_out_template, 'region'),
            'metadata': {
                'type': 'pseudowgs_regionbams',
                'cell_ids': list(bam_files.keys())
            }
        })

    return workflow
Example #16
def breakpoint_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['breakpoint_calling']

    run_destruct = bool(args['destruct'])
    run_lumpy = bool(args['lumpy'])

    # default to running both callers when neither was requested explicitly
    if not run_destruct and not run_lumpy:
        run_destruct = True
        run_lumpy = True

    normal_data, tumour_cells = inpututils.load_breakpoint_calling_input(args['input_yaml'])

    bkp_dir = os.path.join(args['out_dir'])
    bkp_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    out_files = get_output_files(bkp_dir, run_destruct, run_lumpy)

    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        normal_bam = mgd.InputFile(
            'normal_cells.bam', 'normal_cell_id',
            extensions=['.bai'], fnames=normal_data
        )
    else:
        normal_bam = mgd.InputFile(normal_data, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            func="single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells),
                config.get('destruct_config', {}),
                config,
                ref_data_directory,
                mgd.OutputFile(out_files['destruct_breakpoints_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_breakpoints_lib_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_cell_counts_filename'], extensions=['.yaml']),
            ),
        )

    if run_lumpy:
        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells, extensions=['.bai']),
                mgd.OutputFile(out_files['lumpy_breakpoints_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_evidence_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_bed']),
            ),
        )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            bkp_dir,
            list(out_files.values()),
            mgd.OutputFile(bkp_meta)
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'breakpoint_calling'}
        }
    )

    return workflow
Example #17
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells.
    """

    vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_template = '{sample_id}_{library_id}_counts.csv.gz'
    counts_output_template = os.path.join(args['out_dir'], counts_template)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=([mgd.InputFile(vcf_file) for vcf_file in vcf_files],
              mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi',
                                                               '.csi']),
              mgd.TempSpace("merge_vcf_temp")),
    )

    workflow.subworkflow(
        name='count_alleles',
        axes=('sample_id', 'library_id'),
        func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
            mgd.Instance('sample_id'),
            mgd.Instance('library_id'),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping',
                'counts': {
                    'template': counts_template,
                    'instances': sample_library,
                }
            }
        })

    return workflow
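
For reference, the per-sample counts template above resolves to one output per (sample_id, library_id) pair. A hedged illustration, assuming sample_library is an iterable of such pairs as loaded by inpututils.load_variant_counting_input:

counts_template = '{sample_id}_{library_id}_counts.csv.gz'
# hypothetical instances; the real pairs come from the input yaml
sample_library = [('SA123', 'A001'), ('SA123', 'A002')]
for sample_id, library_id in sample_library:
    # prints SA123_A001_counts.csv.gz, then SA123_A002_counts.csv.gz
    print(counts_template.format(sample_id=sample_id, library_id=library_id))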