Example #1
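All of the examples below are pypeliner workflow factories from a single-cell analysis pipeline, shown without their module preambles. A minimal sketch of the imports they rely on, reconstructed from the identifiers used (the exact module paths, and the per-example workflow imports such as qc_annotation, align, or hmmcopy, are assumptions):

import os
import sys

import pypeliner
import pypeliner.managed as mgd

# assumed module path; adjust to wherever inpututils lives in your checkout
from single_cell.utils import inpututils
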
def annotation_workflow(args):
    config = inpututils.load_config(args)

    annotation_infiles = inpututils.load_yaml(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['annotation']['docker']['single_cell_pipeline']})

    annotation_dir = args["out_dir"]

    input_yaml_blob = os.path.join(annotation_dir, 'input.yaml')
    annotation_files = get_output_files(annotation_dir, lib)
    annotation_meta = os.path.join(annotation_dir, 'metadata.yaml')

    workflow.subworkflow(
        name='annotation_workflow',
        func=qc_annotation.create_qc_annotation_workflow,
        args=(
            mgd.InputFile(annotation_infiles['hmmcopy_metrics']),
            mgd.InputFile(annotation_infiles['hmmcopy_reads']),
            mgd.InputFile(annotation_infiles['alignment_metrics']),
            mgd.InputFile(annotation_infiles['gc_metrics']),
            mgd.InputFile(annotation_infiles['segs_pdf_tar']),
            mgd.OutputFile(annotation_files['merged_metrics_csvs']),
            mgd.OutputFile(annotation_files['qc_report']),
            mgd.OutputFile(annotation_files['corrupt_tree_newick']),
            mgd.OutputFile(annotation_files['consensus_tree_newick']),
            mgd.OutputFile(annotation_files['phylo_csv']),
            mgd.OutputFile(annotation_files['loci_rank_trees']),
            mgd.OutputFile(annotation_files['filtered_data']),
            mgd.OutputFile(annotation_files['corrupt_tree_pdf']),
            mgd.OutputFile(annotation_files['segs_pass']),
            mgd.OutputFile(annotation_files['segs_fail']),
            mgd.OutputFile(annotation_files['corrupt_heatmap_pdf']),
            mgd.OutputFile(annotation_files['heatmap_filt_pdf']),
            config['annotation'],
            lib,
        ),
        kwargs={'no_corrupt_tree': args['no_corrupt_tree']})

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], annotation_dir, list(annotation_files.values()),
              mgd.OutputFile(annotation_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'type': 'annotation'
            }
        })

    return workflow
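
A hedged sketch of how a factory like this is typically driven; Example #7 below uses the same Pypeline pattern inline:

# Sketch only: build the workflow from parsed CLI args, then execute it.
pyp = pypeliner.app.Pypeline(config=args)
pyp.run(annotation_workflow(args))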
Example #2
def count_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['count_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    allele_counts_filename = os.path.join(args["out_dir"],
                                          "allele_counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    haplotypes_filename, tumour_cells = inpututils.load_count_haps_input(
        args['input_yaml'])

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )
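    # The 'tumour_cell_id' axis declared above drives the per-cell tumour bam
    # input below (mgd.InputFile(..., 'tumour_cell_id', fnames=tumour_cells)).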

    workflow.subworkflow(
        name='extract_allele_readcounts',
        func='single_cell.workflows.extract_allele_readcounts.extract_allele_readcounts',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.InputFile('tumour_cells.bam',
                          'tumour_cell_id',
                          extensions=['.bai'],
                          axes_origin=[],
                          fnames=tumour_cells),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [allele_counts_filename],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'count_haps'
            }
        })

    return workflow
Example #3
def infer_haps_workflow(args):
    config = inpututils.load_config(args)
    config = config['infer_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haplotypes_filename = os.path.join(args["out_dir"], "haplotypes.tsv")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    normal_data = inpututils.load_infer_haps_input(args['input_yaml'])
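    # The input yaml may specify the normal as per-cell bams (a dict of
    # cell_id -> path) or as one bulk bam (a plain path); both are handled.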

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        bam_file = mgd.InputFile('normal.bam',
                                 'normal_cell_id',
                                 fnames=normal_data,
                                 extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(normal_data, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [haplotypes_filename],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'infer_haps'
            }
        })

    return workflow
Example #4
def make_meta(args):
    workflow = pypeliner.workflow.Workflow()

    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')
    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    filelist = []
    for root, _, files in os.walk(args['out_dir']):
        for filename in files:
            filelist.append(os.path.join(root, filename))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], filelist,
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'cohort_qc'
            }
        })
    return workflow
Example #5
def breakpoint_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['breakpoint_calling']

    run_destruct = bool(args['destruct'])
    run_lumpy = bool(args['lumpy'])

    # default to running both callers when neither is requested explicitly
    if not run_destruct and not run_lumpy:
        run_destruct = True
        run_lumpy = True

    normal_data, tumour_cells = inpututils.load_breakpoint_calling_input(args['input_yaml'])

    bkp_dir = args['out_dir']
    bkp_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    out_files = get_output_files(bkp_dir, run_destruct, run_lumpy)

    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        normal_bam = mgd.InputFile(
            'normal_cells.bam', 'normal_cell_id',
            extensions=['.bai'], fnames=normal_data
        )
    else:
        normal_bam = mgd.InputFile(normal_data, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            func="single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells),
                config.get('destruct_config', {}),
                config,
                ref_data_directory,
                mgd.OutputFile(out_files['destruct_breakpoints_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_breakpoints_lib_filename'], extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_cell_counts_filename'], extensions=['.yaml']),
            ),
        )

    if run_lumpy:
        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                normal_bam,
                mgd.InputFile('tumour.bam', 'tumour_cell_id', fnames=tumour_cells, extensions=['.bai']),
                mgd.OutputFile(out_files['lumpy_breakpoints_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_evidence_csv'], extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_bed']),
            ),
        )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            bkp_dir,
            list(out_files.values()),
            mgd.OutputFile(bkp_meta)
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'breakpoint_calling'}
        }
    )

    return workflow
Example #6
def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(
        args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'], config)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    ctx = {
        'ncpus': 1,
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'mem': config["memory"]['low'],
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=tumour_bams),
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi',
                                                               '.csi']),
            mgd.OutputFile(filepaths['museq_csv'], extensions=['.tbi',
                                                               '.csi']),
            config,
        ),
    )

    workflow.subworkflow(name='strelka',
                         func=strelka.create_strelka_workflow,
                         args=(
                             mgd.InputFile('normal_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=normal_bams),
                             mgd.InputFile('tumour_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=tumour_bams),
                             config['ref_genome'],
                             mgd.OutputFile(filepaths['strelka_indel'],
                                            extensions=['.tbi', '.csi']),
                             mgd.OutputFile(filepaths['strelka_snv'],
                                            extensions=['.tbi', '.csi']),
                             mgd.OutputFile(filepaths['strelka_csv'],
                                            extensions=['.yaml']),
                         ),
                         kwargs={
                             "chromosomes": config["chromosomes"],
                             "use_depth_thresholds":
                             config['use_depth_thresholds']
                         })

    workflow.subworkflow(
        name='annotate_snv_vcf_files',
        func=snv_annotate.create_snv_annotate_workflow,
        args=(config,
              mgd.InputFile(filepaths['museq_vcf'],
                            extensions=['.tbi', '.csi']),
              mgd.InputFile(filepaths['strelka_snv'],
                            extensions=['.tbi', '.csi']),
              mgd.OutputFile(filepaths['mappability_csv'],
                             extensions=['.yaml']),
              mgd.OutputFile(filepaths['snpeff_csv'], extensions=['.yaml']),
              mgd.OutputFile(filepaths['trinuc_csv'], extensions=['.yaml']),
              {k: mgd.OutputFile(v)
               for k, v in filepaths['additional_databases'].items()},
              config['memory']))

    # collect every output for the metadata step, flattening the databases dict
    allfiles = [
        filepaths[k] for k in filepaths if k != 'additional_databases'
    ]
    allfiles += list(filepaths['additional_databases'].values())

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], allfiles,
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'variant_calling'
            }
        })

    return workflow
Example #7
def cohort_qc_pipeline(args):
    """Process maf, run classify copynumber, make plots.
    Args:
        args ([dict]): [pipeline arguments]
    """
    config = inpututils.load_config(args)
    config = config["cohort_qc"]

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow()

    out_dir = args["out_dir"]
    api_key = args["API_key"]

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    # inputs
    cohort, germline_mafs, vcfs, hmmcopy = inpututils.load_cohort_qc_inputs(
        args["input_yaml"]
    )

    museq = {
        label: data["museq"] for label, data in vcfs.items()
    }
    strelka_snv = {
        label: data["strelka_snv"] for label, data in vcfs.items()
    }
    strelka_indel = {
        label: data["strelka_indel"] for label, data in vcfs.items()
    }
    hmmcopy_files = {
        label: data["hmmcopy"] for label, data in hmmcopy.items()
    }
    hmmcopy_metrics_files = {
        label: data["hmmcopy_metrics"] for label, data in hmmcopy.items()
    }
    # outputs
    cbiofile_paths = get_cbioportal_paths(os.path.join(out_dir, cohort))
    maftools_filepaths = get_maftools_paths(os.path.join(out_dir, cohort))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_label', 'library_label'),
        value=list(museq.keys()),
    )
    workflow.subworkflow(
        name="merge_somatic_mafs",
        func="single_cell.workflows.cohort_qc.merge_somatic_mafs",
        axes=('sample_label',),
        args=(
            mgd.InputInstance('sample_label'),
            config,
            mgd.InputFile(
                'museq', 'sample_label', 'library_label',
                fnames=museq, axes_origin=[]
            ),
            mgd.InputFile(
                'strelka_snv', 'sample_label', 'library_label',
                fnames=strelka_snv, axes_origin=[]
            ),
            mgd.InputFile(
                'strelka_indel', 'sample_label', 'library_label',
                fnames=strelka_indel, axes_origin=[]
            ),
            mgd.TempOutputFile('somatic_maf', 'sample_label')
        ),
    )
    
    workflow.subworkflow(
        name="classifycopynumber",
        func="single_cell.workflows.cohort_qc.cna_annotation_workflow",
        args=(
            config,
            mgd.InputFile(
                'hmmcopy_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_files, axes_origin=[]
            ),
            mgd.InputFile(
                'hmmcopy_metrics_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_metrics_files, axes_origin=[]
            ),
            mgd.OutputFile(cbiofile_paths["cna_table"]),
            mgd.OutputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(cbiofile_paths["segments"]),
            config["gtf"],

        ),
    )

    workflow.subworkflow(
        name="maf_annotation_workflow",
        func="single_cell.workflows.cohort_qc.preprocess_mafs_workflow",
        args=(
            config,
            mgd.InputFile(
                'germline_mafs_dict',  'sample_label',
                fnames=germline_mafs, axes_origin=[]
            ),
            mgd.TempInputFile(
                'somatic_maf',  'sample_label',
                axes_origin=[]
            ),
            mgd.OutputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.OutputFile(cbiofile_paths["annotated_somatic_maf"]),
            api_key
        ),
    )
    workflow.subworkflow(
        name="make_plots_and_report",
        func="single_cell.workflows.cohort_qc.create_cohort_oncoplot",
        args=(
            config,
            mgd.InputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.InputFile(cbiofile_paths["annotated_somatic_maf"]),
            mgd.InputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(maftools_filepaths["maftools_maf"]),
            mgd.OutputFile(maftools_filepaths["cohort_oncoplot"]),
            mgd.OutputFile(maftools_filepaths["report"]),
            cohort
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            list(cbiofile_paths.values()) + list(maftools_filepaths.values()),
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'cohort_qc'}
        }
    )
    pyp.run(workflow)
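Unlike the other examples, which return the workflow to their caller, this function constructs a Pypeline from the CLI arguments and executes the workflow itself with pyp.run(workflow).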
Example #8
def germline_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['germline_calling']

    normal_bams = inpututils.load_germline_data(args['input_yaml'])

    varcalls_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')
    out_files = get_output_files(args['out_dir'])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal_split.bam",
                          "region",
                          extensions=['.bai'],
                          fnames=normal_bams),
            config['ref_genome'],
            mgd.OutputFile(out_files['samtools_germline_vcf'],
                           extensions=['.tbi']),
            config,
        ),
    )

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['mappability_filename']),
        ),
        kwargs={'chromosomes': config['chromosomes']})

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['normal_genotype_filename']),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            config['databases']['snpeff']['data_dir'],
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['snpeff_vcf_filename']),
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(out_files.values()),
              mgd.OutputFile(varcalls_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'germline_calling'
            }
        })

    return workflow
Example #9
def alignment_workflow(args):
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    trim = args['trim']
    center = args['sequencing_center']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])

    cellids = inpututils.get_samples(args['input_yaml'])
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    # per-cell bam outputs: the '{cell_id}' placeholder is filled in per chunk
    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted({v[1] for v in fastq1_files})
    cells = sorted({v[0] for v in fastq1_files})

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1',
                          'cell_id',
                          'lane',
                          fnames=fastq1_files,
                          axes_origin=[]),
            mgd.InputFile('fastq_2',
                          'cell_id',
                          'lane',
                          fnames=fastq2_files,
                          axes_origin=[]),
            mgd.OutputFile('bam_markdups',
                           'cell_id',
                           template=bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups',
                           'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
            trim,
            center,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], alignment_dir, list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment'
            }
        })

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], bams_dir,
              mgd.Template('aligned.bam',
                           'cell_id',
                           template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams'
            },
            'template':
            (mgd.InputChunks('cell_id'), bam_files_template, 'cell_id'),
        })

    return workflow
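
The 'template' kwarg in generate_meta_files_bams above appears to pair the resolved 'cell_id' chunks with bam_files_template so the metadata task can reconstruct the per-cell bam paths it is describing; the same pattern shows up in Examples #11 and #14.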
Example #10
def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(
        args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )
    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi',
                                                               '.csi']),
            config,
        ),
    )

    workflow.subworkflow(name='strelka',
                         func=strelka.create_strelka_workflow,
                         args=(
                             mgd.InputFile('normal_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=normal_bams),
                             mgd.InputFile('tumour_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=tumour_bams),
                             config['ref_genome'],
                             mgd.OutputFile(filepaths['strelka_indel'],
                                            extensions=['.tbi', '.csi']),
                             mgd.OutputFile(filepaths['strelka_snv'],
                                            extensions=['.tbi', '.csi']),
                             config,
                         ),
                         kwargs={"chromosomes": config["chromosomes"]})

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        ctx=ctx,
        args=([
            mgd.InputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
            mgd.InputFile(filepaths['strelka_snv'],
                          extensions=['.tbi', '.csi']),
        ], mgd.TempOutputFile('all.snv.vcf')),
    )

    workflow.transform(name='finalise_snvs',
                       func="biowrappers.components.io.vcf.tasks.finalise_vcf",
                       ctx=ctx,
                       args=(mgd.TempInputFile('all.snv.vcf'),
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi', '.csi'])),
                       kwargs={'docker_config': vcftools_docker})

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        ctx=ctx,
        func="biowrappers.pipelines.snv_call_and_annotate.create_annotation_workflow",
        args=(
            config,
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempOutputFile('snv_annotations.h5'),
            mgd.TempSpace('raw_data_dir_annotate'),
        ),
        kwargs={
            'variant_type': 'snv',
            'docker_config': basedocker,
            'snpeff_docker': vcftools_docker,
            'vcftools_docker': vcftools_docker
        })

    workflow.transform(
        name='convert_museq_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['museq_vcf']),
            mgd.TempOutputFile('museq.csv'),
        ),
        kwargs={
            'score_callback': museq_callback,
        })

    workflow.transform(name='prep_museq_csv',
                       func='single_cell.utils.csvutils.prep_csv_files',
                       args=(mgd.TempInputFile('museq.csv'),
                             mgd.OutputFile(filepaths['museq_csv'],
                                            extensions=['.yaml'])),
                       kwargs={'header': True})

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['strelka_snv']),
            mgd.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        })

    workflow.transform(name='prep_strelka_csv',
                       func='single_cell.utils.csvutils.prep_csv_files',
                       args=(mgd.TempInputFile('strelka_snv.csv'),
                             mgd.OutputFile(filepaths['strelka_csv'],
                                            extensions=['.yaml'])),
                       kwargs={'header': True})

    workflow.transform(
        name='convert_h5_to_csv',
        func='single_cell.utils.hdfutils.convert_hdf_to_csv',
        args=(
            mgd.TempInputFile('snv_annotations.h5'),
            {
                '/snv/cosmic_status':
                    mgd.OutputFile(filepaths['cosmic_csv'], extensions=['.yaml']),
                '/snv/dbsnp_status':
                    mgd.OutputFile(filepaths['dbsnp_csv'], extensions=['.yaml']),
                '/snv/mappability':
                    mgd.OutputFile(filepaths['mappability_csv'], extensions=['.yaml']),
                '/snv/snpeff':
                    mgd.OutputFile(filepaths['snpeff_csv'], extensions=['.yaml']),
                '/snv/tri_nucleotide_context':
                    mgd.OutputFile(filepaths['trinuc_csv'], extensions=['.yaml']),
            },
        ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(filepaths.values()),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'variant_calling'
            }
        })

    return workflow
Example #11
def merge_bams_workflow(args):
    config = inpututils.load_config(args)
    config = config['merge_bams']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    bam_files = inpututils.load_merge_cell_bams(args['input_yaml'])

    merge_out_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    # Per cell, strip reads that are mostly softclipped (threshold taken from
    # the CLI) before cells are merged into per-region bams.
    workflow.transform(
        name="remove_softclipped_reads",
        func="single_cell.utils.pysamutils.remove_softclipped_reads",
        axes=('cell_id',),
        args=(mgd.InputFile('bam_markdups',
                            'cell_id',
                            fnames=bam_files,
                            extensions=['.bai']),
              mgd.TempOutputFile('bam_rm_softclipped.bam',
                                 'cell_id',
                                 extensions=['.bai']),
              args['softclipped_reads_threshold']))

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.TempInputFile('bam_rm_softclipped.bam',
                                               'cell_id',
                                               extensions=['.bai']),
                             mgd.OutputFile("merged.bam",
                                            "region",
                                            axes_origin=[],
                                            extensions=['.bai'],
                                            template=merge_out_template),
                             mgd.InputChunks("region"),
                             config,
                         ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=merge_out_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'template':
            (mgd.InputChunks('region'), merge_out_template, 'region'),
            'metadata': {
                'type': 'pseudowgs_regionbams',
                'cell_ids': list(bam_files.keys())
            }
        })

    return workflow
Example #12
def hmmcopy_workflow(args):
    config = inpututils.load_config(args)
    config = config['hmmcopy']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    bam_files = inpututils.get_bams(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    hmmcopy_dir = args["out_dir"]

    hmmcopy_files = get_output_files(hmmcopy_dir, lib)
    hmmcopy_meta = os.path.join(hmmcopy_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(hmmcopy_dir, 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='hmmcopy_workflow',
        func=hmmcopy.create_hmmcopy_workflow,
        args=(mgd.InputFile('bam_markdups',
                            'cell_id',
                            fnames=bam_files,
                            extensions=['.bai']),
              mgd.OutputFile(hmmcopy_files['reads_csvs']),
              mgd.OutputFile(hmmcopy_files['segs_csvs']),
              mgd.OutputFile(hmmcopy_files['metrics_csvs']),
              mgd.OutputFile(hmmcopy_files['params_csvs']),
              mgd.OutputFile(hmmcopy_files['igv_csvs']),
              mgd.OutputFile(hmmcopy_files['segs_pdf']),
              mgd.OutputFile(hmmcopy_files['bias_pdf']),
              mgd.OutputFile(hmmcopy_files['heatmap_pdf']),
              mgd.OutputFile(hmmcopy_files['metrics_pdf']),
              mgd.OutputFile(hmmcopy_files['kernel_density_pdf']),
              mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']), cellids,
              config, sampleinfo),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], hmmcopy_dir, list(hmmcopy_files.values()),
              mgd.OutputFile(hmmcopy_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': list(bam_files.keys()),
                'type': 'hmmcopy',
            }
        })

    return workflow
Example #13
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells.
    """

    strelka_vcf, museq_vcf, tumour_cell_bams = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_output = os.path.join(args['out_dir'], "counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=(
            [
                mgd.InputFile('museq.vcf', 'sample_id', 'library_id',
                              fnames=museq_vcf, extensions=['.tbi', '.csi'],
                              axes_origin=[]),
                mgd.InputFile('strelka.vcf', 'sample_id', 'library_id',
                              fnames=strelka_vcf, extensions=['.tbi', '.csi'],
                              axes_origin=[]),
            ],
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempSpace("merge_vcf_temp"),
        ),
        kwargs={'docker_image': config['docker']['vcftools']})

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(counts_output),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [counts_output],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping'
            }
        })

    return workflow
Example #14
def split_bam_workflow(args):
    config = inpututils.load_config(args)
    config = config['split_bam']

    bam_file = inpututils.load_split_wgs_input(args['input_yaml'])

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    # The region list is computed once and passed around as a managed temp
    # object (TempOutputObj here, TempInputObj in the split subworkflow below).
    workflow.transform(
        name="get_regions",
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1
        },
        args=(
            mgd.InputFile(bam_file),
            mgd.OutputFile("normal.split.bam",
                           'region',
                           template=split_bam_template,
                           axes_origin=[]),
            pypeliner.managed.TempInputObj('region'),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=split_bam_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'wgs_regionbams'
            },
            'template':
            (mgd.TempInputObj('region'), split_bam_template, 'region'),
        })

    return workflow
Example #15
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells.
    """

    vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_template = '{sample_id}_{library_id}_counts.csv.gz'
    counts_output_template = os.path.join(args['out_dir'], counts_template)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow()

    # The axis triple mirrors the (sample_id, library_id, cell_id) keys of
    # tumour_cell_bams; count_alleles below fans out over the first two levels.
    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=([mgd.InputFile(vcf_file) for vcf_file in vcf_files],
              mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi',
                                                               '.csi']),
              mgd.TempSpace("merge_vcf_temp")),
    )

    workflow.subworkflow(
        name='count_alleles',
        axes=('sample_id', 'library_id'),
        func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
            mgd.Instance('sample_id'),
            mgd.Instance('library_id'),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping',
                'counts': {
                    'template': counts_template,
                    'instances': sample_library,
                }
            }
        })

    return workflow