def create_snv_allele_counts_for_vcf_targets_workflow(
    bam_files,
    vcf_file,
    out_file,
    memory_cfg,
    count_duplicates=False,
    min_bqual=0,
    min_mqual=0,
    table_name='snv_allele_counts',
    vcf_to_bam_chrom_map=None,
):
    ctx = {
        'mem': memory_cfg['low'],
        'num_retry': 3,
        'mem_retry_increment': 2,
        'ncpus': 1,
        'disk_retry_increment': 50,
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('sample_id', 'library_id', 'cell_id'),
        func=
        "biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets",
        args=(
            mgd.InputFile('tumour.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          fnames=bam_files,
                          extensions=['.bai']),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'sample_id', 'library_id',
                               'cell_id'),
            table_name,
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
            'cell_id': mgd.Instance('cell_id'),
            'sample_id': mgd.Instance('sample_id'),
            'library_id': mgd.Instance('library_id'),
            'report_zero_count_positions': False,
        })

    workflow.transform(
        name='merge_snv_allele_counts',
        ctx={
            'mem': memory_cfg['high'],
            'disk': 20
        },
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            mgd.TempInputFile('counts.h5', 'sample_id', 'library_id',
                              'cell_id'),
            mgd.TempOutputFile('merged_counts.h5'),
        ),
        kwargs={
            'in_memory': False,
        },
    )

    workflow.transform(name='convert_h5_to_csv',
                       func='single_cell.utils.hdfutils.convert_hdf_to_csv',
                       args=(mgd.TempInputFile('merged_counts.h5'), {
                           '/snv_allele_counts':
                           mgd.OutputFile(out_file, extensions=['.yaml']),
                       }))

    return workflow
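
A minimal sketch of how the returned workflow might be driven, following the pypeliner.app.Pypeline pattern used in the wgs_workflow example further below. All paths, IDs, and scheduler options here are hypothetical placeholders.

import pypeliner.app

# Hypothetical scheduler options; the available keys come from pypeliner's
# own argument parser (e.g. 'submit', 'maxjobs').
runner = pypeliner.app.Pypeline(config={'submit': 'local', 'maxjobs': 4})

workflow = create_snv_allele_counts_for_vcf_targets_workflow(
    # fnames dicts are keyed by tuples matching the declared axes
    # ('sample_id', 'library_id', 'cell_id'); values are hypothetical.
    bam_files={('SA123', 'A001', 'CELL-001'): '/data/CELL-001.bam'},
    vcf_file='/data/targets.vcf.gz',
    out_file='/results/snv_allele_counts.csv.gz',
    memory_cfg={'low': 4, 'high': 16},
)
runner.run(workflow)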
Example #2
def germline_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['germline_calling']

    vcftoolsdocker = {'docker_image': config['docker']['vcftools']}
    samtoolsdocker = {'docker_image': config['docker']['samtools']}
    snpeffdocker = {'docker_image': config['docker']['snpeff']}

    normal_bams = inpututils.load_germline_data(args['input_yaml'])

    varcalls_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')
    out_files = get_output_files(args['out_dir'])

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )

    workflow.subworkflow(name='samtools_germline',
                         func=germline.create_samtools_germline_workflow,
                         args=(
                             mgd.InputFile("normal_split.bam",
                                           "region",
                                           extensions=['.bai'],
                                           fnames=normal_bams),
                             config['ref_genome'],
                             mgd.OutputFile(out_files['samtools_germline_vcf'],
                                            extensions=['.tbi']),
                             config,
                         ),
                         kwargs={
                             'vcftools_docker': vcftoolsdocker,
                             'samtools_docker': samtoolsdocker,
                         })

    workflow.subworkflow(
        name='annotate_mappability',
        func=
        "biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['mappability_filename']),
        ),
        kwargs={'chromosomes': config['chromosomes']})

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['normal_genotype_filename']),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func=
        "biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(out_files['samtools_germline_vcf'],
                          extensions=['.tbi']),
            mgd.OutputFile(out_files['snpeff_vcf_filename']),
        ),
        kwargs={
            'hdf5_output': False,
            'vcftools_docker': vcftoolsdocker,
            'snpeff_docker': snpeffdocker,
        })

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(out_files.values()),
              mgd.OutputFile(varcalls_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'germline_calling'
            }
        })

    return workflow
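
A note on the input shape: with only a single 'region' axis declared, the fnames mapping passed as normal_bams is keyed by plain region strings rather than the multi-axis tuples used in the first example. Region labels and paths below are hypothetical.

normal_bams = {
    '1-1-100000000': '/data/normal_split/1-1-100000000.bam',
    '1-100000001-199373566': '/data/normal_split/1-100000001-199373566.bam',
}
# list(normal_bams.keys()) seeds mgd.OutputChunks('region'), and pypeliner
# substitutes each key back into
# mgd.InputFile('normal_split.bam', 'region', fnames=normal_bams).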
Example #3
def create_alignment_workflow(fastq_1_filename,
                              fastq_2_filename,
                              bam_filename,
                              alignment_metrics,
                              gc_metrics,
                              detailed_fastqscreen_metrics,
                              plot_metrics,
                              ref_genome,
                              config,
                              triminfo,
                              centerinfo,
                              sample_info,
                              cell_ids,
                              metrics_tar,
                              library_id,
                              realign=False):
    baseimage = config['docker']['single_cell_pipeline']

    bam_filename = {cellid: bam_filename[cellid] for cellid in cell_ids}

    chromosomes = config["chromosomes"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(obj=mgd.TempOutputObj('sampleinfo',
                                          'cell_id',
                                          axes_origin=[]),
                    value=sample_info)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq_1_filename.keys()),
    )

    workflow.setobj(obj=mgd.TempOutputObj('trim',
                                          'cell_id',
                                          'lane',
                                          axes_origin=[]),
                    value=triminfo)

    workflow.setobj(obj=mgd.TempOutputObj('center',
                                          'cell_id',
                                          'lane',
                                          axes_origin=[]),
                    value=centerinfo)

    workflow.transform(
        name='run_fastq_screen',
        ctx={
            'mem': 7,
            'ncpus': 1,
            'docker_image': baseimage
        },
        axes=(
            'cell_id',
            'lane',
        ),
        func="single_cell.workflows.align.fastqscreen.organism_filter",
        args=(mgd.InputFile('fastq_1',
                            'cell_id',
                            'lane',
                            fnames=fastq_1_filename),
              mgd.InputFile('fastq_2',
                            'cell_id',
                            'lane',
                            fnames=fastq_2_filename),
              mgd.TempOutputFile('fastq_r1_matching_reads.fastq.gz', 'cell_id',
                                 'lane'),
              mgd.TempOutputFile('fastq_r2_matching_reads.fastq.gz', 'cell_id',
                                 'lane'),
              mgd.TempOutputFile('organism_detailed_count_per_lane.csv',
                                 'cell_id', 'lane'),
              mgd.TempOutputFile('organism_summary_count_per_lane.csv',
                                 'cell_id', 'lane'),
              mgd.TempSpace("tempdir_organism_filter", 'cell_id',
                            'lane'), mgd.InputInstance('cell_id'),
              config['fastq_screen_params'], config['ref_type']),
        kwargs={
            'docker_image':
            config['docker']['fastq_screen'],
            'filter_contaminated_reads':
            config['fastq_screen_params']['filter_contaminated_reads']
        })

    workflow.transform(
        name='merge_fastq_screen_metrics',
        ctx={
            'mem': 7,
            'ncpus': 1,
            'docker_image': baseimage
        },
        func=
        "single_cell.workflows.align.fastqscreen.merge_fastq_screen_counts",
        args=(
            mgd.TempInputFile('organism_detailed_count_per_lane.csv',
                              'cell_id', 'lane'),
            mgd.TempInputFile('organism_summary_count_per_lane.csv', 'cell_id',
                              'lane'),
            mgd.OutputFile(detailed_fastqscreen_metrics, extensions=['.yaml']),
            mgd.TempOutputFile('organism_summary_count_per_cell.csv'),
        ))

    workflow.transform(
        name='align_reads',
        ctx={
            'mem': 7,
            'ncpus': 1,
            'docker_image': baseimage
        },
        axes=(
            'cell_id',
            'lane',
        ),
        func="single_cell.workflows.align.tasks.align_pe",
        args=(
            mgd.TempInputFile('fastq_r1_matching_reads.fastq.gz', 'cell_id',
                              'lane'),
            mgd.TempInputFile('fastq_r2_matching_reads.fastq.gz', 'cell_id',
                              'lane'),
            mgd.TempOutputFile('aligned_per_cell_per_lane.sorted.bam',
                               'cell_id', 'lane'),
            mgd.TempOutputFile('fastqc_reports.tar.gz', 'cell_id', 'lane'),
            mgd.TempOutputFile('flagstat_metrics.txt', 'cell_id', 'lane'),
            mgd.TempSpace('alignment_temp', 'cell_id', 'lane'),
            ref_genome,
            mgd.TempInputObj('trim', 'cell_id', 'lane'),
            mgd.TempInputObj('center', 'cell_id', 'lane'),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
            mgd.InputInstance('cell_id'),
            mgd.InputInstance('lane'),
            library_id,
            config['aligner'],
            config['docker'],
            config['adapter'],
            config['adapter2'],
            config['fastq_screen_params'],
        ))

    workflow.transform(name='merge_bams',
                       ctx={
                           'mem': config['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.align.tasks.merge_bams",
                       axes=('cell_id', ),
                       args=(mgd.TempInputFile(
                           'aligned_per_cell_per_lane.sorted.bam', 'cell_id',
                           'lane'),
                             mgd.TempOutputFile('merged_lanes.bam', 'cell_id'),
                             mgd.TempOutputFile('merged_lanes.bam.bai',
                                                'cell_id'), config['docker']))

    if realign:
        workflow.transform(name='realignment',
                           axes=('chrom', ),
                           ctx={
                               'mem': config['memory']['med'],
                               'ncpus': 1,
                               'docker_image': baseimage
                           },
                           func="single_cell.workflows.align.tasks.realign",
                           args=(mgd.TempInputFile('merged_lanes.bam',
                                                   'cell_id'),
                                 mgd.TempInputFile('merged_lanes.bam.bai',
                                                   'cell_id'),
                                 mgd.TempOutputFile('realigned.bam', 'chrom',
                                                    'cell_id'),
                                 mgd.TempSpace('realignment_temp',
                                               'chrom'), config,
                                 mgd.InputInstance('chrom')))

        workflow.transform(
            name='merge_realignment',
            ctx={
                'mem': config['memory']['med'],
                'ncpus': 1,
                'docker_image': baseimage
            },
            axes=('cell_id', ),
            func="single_cell.workflows.align.tasks.merge_realignment",
            args=(mgd.TempInputFile('realigned.bam', 'chrom', 'cell_id'),
                  mgd.TempOutputFile('merged_realign.bam', 'cell_id'), config,
                  mgd.InputInstance('cell_id')))

    final_bam = mgd.TempInputFile('merged_lanes.bam', 'cell_id')
    if realign:
        final_bam = mgd.TempInputFile('merged_realign.bam', 'cell_id')

    workflow.transform(
        name='postprocess_bam',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        axes=('cell_id', ),
        func="single_cell.workflows.align.tasks.postprocess_bam",
        args=(
            final_bam,
            mgd.OutputFile('sorted_markdups',
                           'cell_id',
                           fnames=bam_filename,
                           extensions=['.bai']),
            mgd.TempSpace('tempdir', 'cell_id'),
            config['docker'],
        ),
    )

    workflow.subworkflow(
        name='metrics_subworkflow',
        func="single_cell.workflows.align.bam_metrics_workflow",
        args=(mgd.InputFile('sorted_markdups',
                            'cell_id',
                            fnames=bam_filename,
                            extensions=['.bai']),
              mgd.TempInputFile('organism_summary_count_per_cell.csv'),
              mgd.TempOutputFile('alignment_metrics.csv',
                                 extensions=['.yaml']),
              mgd.OutputFile(gc_metrics, extensions=['.yaml']),
              mgd.TempOutputFile('markdups_metrics.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('flagstat_metrics.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('wgs_metrics.txt', 'cell_id', axes_origin=[]),
              mgd.TempOutputFile('gc_metrics.txt', 'cell_id', axes_origin=[]),
              mgd.TempOutputFile('gc_metrics_summary.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('gc_metrics.pdf', 'cell_id', axes_origin=[]),
              mgd.TempOutputFile('insert_metrics.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('insert_metrics.pdf',
                                 'cell_id',
                                 axes_origin=[]), ref_genome, sample_info,
              config, cell_ids))

    workflow.transform(
        name='add_contamination_status',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.align.tasks.add_contamination_status",
        args=(
            mgd.TempInputFile('alignment_metrics.csv', extensions=['.yaml']),
            mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
        ),
        kwargs={
            'reference': config['ref_type'],
            'strict_validation':
            config['fastq_screen_params']['strict_validation']
        })

    workflow.transform(name='plot_metrics',
                       ctx={
                           'mem': config['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.align.tasks.plot_metrics",
                       args=(
                           mgd.InputFile(alignment_metrics,
                                         extensions=['.yaml']),
                           mgd.OutputFile(plot_metrics),
                           'QC pipeline metrics',
                           mgd.InputFile(gc_metrics, extensions=['.yaml']),
                           config['gc_windows'],
                       ))

    workflow.transform(name='tar_all_files',
                       ctx={
                           'mem': config['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.utils.helpers.tar_files",
                       args=([
                           mgd.TempInputFile('fastqc_reports.tar.gz',
                                             'cell_id', 'lane'),
                           mgd.TempInputFile('flagstat_metrics.txt', 'cell_id',
                                             'lane'),
                           mgd.TempInputFile('markdups_metrics.txt',
                                             'cell_id'),
                           mgd.TempInputFile('flagstat_metrics.txt',
                                             'cell_id'),
                           mgd.TempInputFile('wgs_metrics.txt', 'cell_id'),
                           mgd.TempInputFile('gc_metrics.txt', 'cell_id'),
                           mgd.TempInputFile('gc_metrics_summary.txt',
                                             'cell_id'),
                           mgd.TempInputFile('gc_metrics.pdf', 'cell_id'),
                           mgd.TempInputFile('insert_metrics.txt', 'cell_id'),
                           mgd.TempInputFile('insert_metrics.pdf', 'cell_id'),
                       ], mgd.OutputFile(metrics_tar),
                             mgd.TempSpace("merge_metrics_tar")))

    return workflow
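
The triminfo, centerinfo, and sample_info arguments are plain dicts that setobj turns into per-chunk managed objects; a sketch of the shapes this workflow appears to assume (all values hypothetical):

# Keyed by (cell_id, lane) tuples, matching the ('cell_id', 'lane') axes:
triminfo = {('CELL-001', 'L001'): True, ('CELL-001', 'L002'): False}
centerinfo = {('CELL-001', 'L001'): 'BCCAGSC', ('CELL-001', 'L002'): 'BCCAGSC'}
# Keyed by cell_id only, matching the single 'cell_id' axis:
sample_info = {'CELL-001': {'sample_id': 'SA123', 'condition': 'A'}}
# Each entry is later read back per chunk, e.g. via
# mgd.TempInputObj('trim', 'cell_id', 'lane') in the align_reads task.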
Example #4
def breakpoint_calling_workflow(workflow, args):

    config = helpers.load_config(args)

    normal_bam_file = args['matched_normal']
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results',
                                'breakpoint_calling')
    raw_data_directory = os.path.join(varcalls_dir, 'raw')
    breakpoints_filename = os.path.join(varcalls_dir, 'breakpoints.h5')
    ref_data_directory = '/refdata'

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='destruct',
        func=
        "biowrappers.components.breakpoint_calling.destruct.destruct_pipeline",
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            config.get('destruct', {}),
            ref_data_directory,
            mgd.OutputFile(breakpoints_filename),
            raw_data_directory,
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'breakpoint_calling',
                             "info.yaml")

    results = {
        'destruct_data': helpers.format_file_yaml(breakpoints_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v)
        for k, v in bam_files.items()
    }
    input_datasets = {'normal': normal_bam_file, 'tumour': input_datasets}

    metadata = {
        'breakpoint_calling': {
            'ref_data': ref_data_directory,
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                mem_retry_increment=2,
                                ncpus=1),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example #5
def wgs_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    samples = list(tumours.keys())

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if args['alignment']:
        tumour_fastqs_r1, tumour_fastqs_r2 = get_fastqs(inputs, samples, 'tumour')
        normal_fastqs_r1, normal_fastqs_r2 = get_fastqs(inputs, samples, 'normal')

        normal_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{norm_sample_id}', '{norm_lane}', 'normal'
        )
        tumour_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{tum_sample_id}', '{tum_lane}', 'tumour'
        )

        workflow.subworkflow(
            name='wgs_alignment_paired_lanes',
            func=paired_alignment,
            args=(
                config,
                mgd.OutputFile("tumour.bam", 'sample_id', fnames=tumours,
                               extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile("normal.bam", 'sample_id', fnames=normals,
                               extensions=['.bai'], axes_origin=[]),
                samples,
                tumour_fastqs_r1,
                tumour_fastqs_r2,
                normal_fastqs_r1,
                normal_fastqs_r2,
                normal_alignment_template,
                tumour_alignment_template,
            )
        )

    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(museq_dir, '{sample_id}', 'paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(museq_dir, '{sample_id}', 'single_museqportrait.pdf')
    workflow.subworkflow(
        name='variant_calling',
        func=call_variants,
        args=(
            samples,
            config,
            mgd.OutputFile('parsed_snv_csv', 'sample_id', template=parsed_snv_csv, axes_origin=[]),
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq', 'sample_id', template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
        )
    )

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(sv_outdir, 'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')
    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library, axes_origin=[]),
            mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library, axes_origin=[]),
            mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads, axes_origin=[]),
            mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf, axes_origin=[]),
            mgd.OutputFile('parsed_csv', 'sample_id', template=parsed_csv, axes_origin=[])
        )
    )

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')
    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile("target_list", 'sample_id', fnames=targets, axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id', axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id', axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id', axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )
    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id', axes_origin=[], template=destruct_breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id', axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
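
helpers.get_values_from_input is not shown here, so the schema of the parsed input_yaml is an assumption; a shape consistent with the keys this function pulls out ('tumour', 'normal', 'target_list') might be:

# Hypothetical parsed form of input_yaml after helpers.load_yaml:
inputs = {
    'SAMPLE_001': {
        'tumour': '/data/SAMPLE_001/tumour.bam',
        'normal': '/data/SAMPLE_001/normal.bam',
        'target_list': '/refdata/SAMPLE_001_targets.tsv',
    },
}
# get_values_from_input(inputs, 'tumour') would then return
# {'SAMPLE_001': '/data/SAMPLE_001/tumour.bam'}, keyed by sample_id.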
Example #6
def split_bam_workflow(workflow, args):

    config = helpers.load_config(args)

    info_file = os.path.join(args["out_dir"], 'results', 'split_bam',
                             'info.yaml')
    split_bam_template = args["split_bam_template"]
    split_bai_template = args["split_bam_template"] + ".bai"

    by_reads = "{region}" not in split_bam_template
    splitkeyword = "reads" if by_reads else "region"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]

        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )

    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': 2,
                'num_retry': 3,
                'mem_retry_increment': 2,
                'pool_id': config['pools']['standard'],
                'ncpus': 1
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(name="split_normal",
                         func=split_bams.create_split_workflow,
                         args=(
                             mgd.InputFile(args['wgs_bam']),
                             mgd.InputFile(args['wgs_bam'] + ".bai"),
                             mgd.OutputFile("normal.split.bam",
                                            splitkeyword,
                                            template=split_bam_template,
                                            axes_origin=[]),
                             mgd.OutputFile("normal.split.bam.bai",
                                            splitkeyword,
                                            template=split_bai_template,
                                            axes_origin=[]),
                             pypeliner.managed.TempInputObj(splitkeyword),
                             config,
                         ),
                         kwargs={"by_reads": by_reads})

    regions = mgd.InputChunks(
        'reads') if by_reads else pypeliner.managed.TempInputObj('region')
    workflow.transform(name="get_files",
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(regions, split_bam_template, splitkeyword))

    metadata = {
        'split_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': args['wgs_bam'],
            'results': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(
                           mem=config['memory']['med'],
                           pool_id=config['pools']['standard'],
                       ),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
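
The split-mode selection at the top of this function depends only on the template string; a standalone restatement with no pipeline dependencies:

def split_mode(split_bam_template):
    # Splitting is by genomic region when the template carries a '{region}'
    # placeholder, and by read count otherwise.
    by_reads = '{region}' not in split_bam_template
    splitkeyword = 'reads' if by_reads else 'region'
    return by_reads, splitkeyword

assert split_mode('/out/split/{region}.bam') == (False, 'region')
assert split_mode('/out/split/{reads}.bam') == (True, 'reads')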
Example #7
def variant_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['variant_calling']

    normal_bams, tumour_bams = inpututils.load_variant_calling_input(
        args['input_yaml'])

    filepaths = get_file_paths(args['out_dir'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}
    snpeff_docker = {'docker_image': config['docker']['snpeff']}

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bams.keys()),
    )
    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=normal_bams),
            mgd.InputFile('tumour_regions.bam',
                          'region',
                          extensions=['.bai'],
                          fnames=tumour_bams),
            config['ref_genome'],
            mgd.OutputFile(filepaths['museq_vcf'], extensions=['.tbi',
                                                               '.csi']),
            config,
        ),
    )

    workflow.subworkflow(name='strelka',
                         func=strelka.create_strelka_workflow,
                         args=(
                             mgd.InputFile('normal_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=normal_bams),
                             mgd.InputFile('tumour_regions.bam',
                                           'region',
                                           extensions=['.bai'],
                                           fnames=tumour_bams),
                             config['ref_genome'],
                             mgd.OutputFile(filepaths['strelka_indel'],
                                            extensions=['.tbi', '.csi']),
                             mgd.OutputFile(filepaths['strelka_snv'],
                                            extensions=['.tbi', '.csi']),
                             config,
                         ),
                         kwargs={
                             "chromosomes": config["chromosomes"],
                             "use_depth_thresholds":
                             config['use_depth_thresholds']
                         })

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        ctx=ctx,
        args=([
            mgd.InputFile(filepaths['museq_vcf'], extensions=['.tbi', '.csi']),
            mgd.InputFile(filepaths['strelka_snv'],
                          extensions=['.tbi', '.csi']),
        ], mgd.TempOutputFile('all.snv.vcf')),
    )

    workflow.transform(name='finalise_snvs',
                       func="biowrappers.components.io.vcf.tasks.finalise_vcf",
                       ctx=ctx,
                       args=(mgd.TempInputFile('all.snv.vcf'),
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi', '.csi'])),
                       kwargs={'docker_config': vcftools_docker})

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        ctx=ctx,
        func=
        "biowrappers.pipelines.snv_call_and_annotate.create_annotation_workflow",
        args=(
            config,
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.TempOutputFile('snv_annotations.h5'),
            mgd.TempSpace('raw_data_dir_annotate'),
        ),
        kwargs={
            'variant_type': 'snv',
            'docker_config': basedocker,
            'snpeff_docker': snpeff_docker,
            'vcftools_docker': vcftools_docker
        })

    workflow.transform(
        name='convert_museq_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['museq_vcf']),
            mgd.TempOutputFile('museq.csv'),
        ),
        kwargs={
            'score_callback': museq_callback,
        })

    workflow.transform(
        name='prep_museq_csv',
        func='single_cell.utils.csvutils.finalize_csv',
        args=(mgd.TempInputFile('museq.csv'),
              mgd.OutputFile(filepaths['museq_csv'], extensions=['.yaml'])),
    )

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            mgd.InputFile(filepaths['strelka_snv']),
            mgd.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        })

    workflow.transform(
        name='prep_strelka_csv',
        func='single_cell.utils.csvutils.finalize_csv',
        args=(mgd.TempInputFile('strelka_snv.csv'),
              mgd.OutputFile(filepaths['strelka_csv'], extensions=['.yaml'])),
    )

    workflow.transform(name='convert_h5_to_csv',
                       func='single_cell.utils.hdfutils.convert_hdf_to_csv',
                       args=(mgd.TempInputFile('snv_annotations.h5'), {
                           '/snv/cosmic_status':
                           mgd.OutputFile(filepaths['cosmic_csv'],
                                          extensions=['.yaml']),
                           '/snv/dbsnp_status':
                           mgd.OutputFile(filepaths['dbsnp_csv'],
                                          extensions=['.yaml']),
                           '/snv/mappability':
                           mgd.OutputFile(filepaths['mappability_csv'],
                                          extensions=['.yaml']),
                           '/snv/snpeff':
                           mgd.OutputFile(filepaths['snpeff_csv'],
                                          extensions=['.yaml']),
                           '/snv/tri_nucleotide_context':
                           mgd.OutputFile(filepaths['trinuc_csv'],
                                          extensions=['.yaml']),
                       }))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], list(filepaths.values()),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'variant_calling'
            }
        })

    return workflow
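
convert_hdf_to_csv is called here (and in the first example) with a mapping from HDF5 table keys to managed output files; a minimal pandas-based sketch of what such a helper plausibly does (an assumed reimplementation, not the pipeline's actual code):

import pandas as pd

def convert_hdf_to_csv_sketch(h5_path, key_to_csv):
    # Dump each requested table in the HDF5 store to its own CSV, so a single
    # store fans out to several outputs in one task.
    with pd.HDFStore(h5_path, mode='r') as store:
        for key, csv_path in key_to_csv.items():
            store[key].to_csv(csv_path, index=False)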
Example #8
def infer_haps(
    bam_file,
    haplotypes_filename,
    config,
    from_tumour=False,
):

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # don't parallelize over chromosomes for per-cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id', ),
            func='remixt.workflow.create_extract_seqdata_workflow',
            args=(
                mgd.InputFile('bam_markdups',
                              'cell_id',
                              fnames=bam_file,
                              extensions=['.bai']),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                remixt_config,
                remixt_ref_data_dir,
            ),
            kwargs={'no_parallelism': True})
        workflow.transform(
            name='merge_all_seqdata',
            func="remixt.seqdataio.merge_overlapping_seqdata",
            args=(mgd.TempOutputFile('seqdata_file.h5'),
                  mgd.TempInputFile("seqdata_cell.h5",
                                    "cell_id"), config["chromosomes"]),
        )
    else:
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if from_tumour:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome', ),
        ctx={'mem': 16},
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome', ),
        ctx={'mem': 16},
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(name='merge_haps',
                       ctx={'mem': 16},
                       func='remixt.utils.merge_tables',
                       args=(
                           mgd.TempOutputFile('haplotypes_merged.tsv'),
                           mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
                       ))

    workflow.transform(
        name='annotate_haps',
        ctx={'mem': 16},
        func='single_cell.workflows.infer_haps.tasks.annotate_ref_alt',
        args=(
            mgd.TempInputFile('haplotypes_merged.tsv'),
            remixt_ref_data_dir,
            mgd.TempOutputFile('haplotypes_annotated.tsv'),
        ))

    workflow.transform(
        name='finalize_csv',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('haplotypes_annotated.tsv'),
            mgd.OutputFile(haplotypes_filename, extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['haplotypes']
        },
    )

    return workflow
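
The bam_file argument is polymorphic: a {cell_id: path} dict fans seqdata extraction out per cell and merges the results, while a single path runs one extraction subworkflow. A usage sketch with hypothetical paths and config:

wf_bulk = infer_haps('/data/normal.bam', '/results/haplotypes.csv.gz', config)
wf_cells = infer_haps(
    {'CELL-001': '/data/CELL-001.bam', 'CELL-002': '/data/CELL-002.bam'},
    '/results/haplotypes.csv.gz',
    config,
)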
Example #9
def create_museq_workflow(snv_vcf,
                          snv_maf,
                          museqportrait_pdf,
                          reference,
                          reference_vep,
                          chromosomes,
                          normal_id=None,
                          tumour_id=None,
                          thousand_genomes=None,
                          dbsnp=None,
                          germline_refdata=None,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=None):
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(memory=15,
                                                       walltime='48:00',
                                                       ncpus=8,
                                                       disk=600),
                           func='wgs.utils.museq_utils.run_museq_one_job',
                           args=(
                               mgd.TempSpace("run_museq_temp"),
                               mgd.TempOutputFile('merged.vcf'),
                               reference,
                               mgd.InputChunks('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                           })
        workflow.transform(
            name='fix_vcf_merged',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.mutationseq.tasks.fix_museq_vcf',
            args=(
                mgd.TempInputFile('merged.vcf'),
                mgd.TempOutputFile('merged_fixed.vcf'),
            ),
        )
    else:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(
                               memory=15,
                               walltime='24:00',
                           ),
                           axes=('interval', ),
                           func='wgs.utils.museq_utils.run_museq',
                           args=(mgd.TempOutputFile('museq.vcf', 'interval'),
                                 mgd.TempOutputFile('museq.log',
                                                    'interval'), reference,
                                 mgd.InputInstance('interval'),
                                 params['museq_params'],
                                 mgd.TempSpace('museq_temp', 'interval')),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                           })
        workflow.transform(
            name='fix_vcf',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.mutationseq.tasks.fix_museq_vcf',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq_fixed.vcf', 'interval'),
            ),
        )
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq_fixed.vcf', 'interval'),
                mgd.TempOutputFile('merged_fixed.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(name='bcftools_normalize',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('merged_fixed.vcf'),
                           mgd.TempOutputFile('normalized.vcf'),
                           reference,
                       ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    workflow.subworkflow(name="mutationseq_single_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(mgd.InputFile(snv_vcf,
                                             extensions=['.tbi', '.csi']),
                               mgd.OutputFile(snv_maf), reference_vep),
                         kwargs={
                             'normal_id': normal_id,
                             'tumour_id': tumour_id
                         })

    return workflow
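
The task name built at the top of create_museq_workflow doubles as the mode flag; restated standalone:

def museq_mode(tumour_bam, normal_bam):
    # Suffixes record which BAMs were supplied; `single` is False only when
    # both tumour and normal are present (paired mode).
    name = 'run_museq'
    if tumour_bam:
        name += '_tumour'
    if normal_bam:
        name += '_normal'
    return name, name != 'run_museq_tumour_normal'

assert museq_mode('t.bam', 'n.bam') == ('run_museq_tumour_normal', False)
assert museq_mode('t.bam', None) == ('run_museq_tumour', True)
assert museq_mode(None, None) == ('run_museq', True)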
Example #10
def create_alignment_workflow(fastq_1_filename, fastq_2_filename, bam_filename,
                              bai_filename, ref_genome, config, args,
                              instrumentinfo, centerinfo, sample_info,
                              cell_ids):

    out_dir = args['out_dir']

    merge_metrics = os.path.join(out_dir, 'metrics')

    lane_metrics = os.path.join(args['out_dir'], 'metrics_per_lane', '{lane}')

    bam_filename = {cellid: bam_filename[cellid] for cellid in cell_ids}

    bai_filename = {cellid: bai_filename[cellid] for cellid in cell_ids}

    chromosomes = config["chromosomes"]

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq_1_filename.keys()),
    )

    workflow.setobj(obj=mgd.TempOutputObj('instrument',
                                          'cell_id',
                                          'lane',
                                          axes_origin=[]),
                    value=instrumentinfo)

    workflow.setobj(obj=mgd.TempOutputObj('center',
                                          'cell_id',
                                          'lane',
                                          axes_origin=[]),
                    value=centerinfo)

    workflow.setobj(obj=mgd.TempOutputObj('sampleinfo',
                                          'cell_id',
                                          axes_origin=[]),
                    value=sample_info)

    fastqc_reports = os.path.join(lane_metrics, "fastqc",
                                  "{cell_id}_reports.tar.gz")
    flagstat_metrics = os.path.join(lane_metrics, 'flagstat', '{cell_id}.txt')
    workflow.transform(
        name='align_reads',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=(
            'cell_id',
            'lane',
        ),
        func="single_cell.workflows.align.tasks.align_pe",
        args=(mgd.InputFile('fastq_1',
                            'cell_id',
                            'lane',
                            fnames=fastq_1_filename),
              mgd.InputFile('fastq_2',
                            'cell_id',
                            'lane',
                            fnames=fastq_2_filename),
              mgd.TempOutputFile('aligned_per_cell_per_lane.sorted.bam',
                                 'cell_id', 'lane'),
              mgd.OutputFile(fastqc_reports, 'cell_id', 'lane'),
              mgd.OutputFile(flagstat_metrics, 'cell_id', 'lane'),
              mgd.TempSpace('alignment_temp', 'cell_id', 'lane'), ref_genome,
              mgd.TempInputObj('instrument', 'cell_id', 'lane'),
              mgd.TempInputObj('center', 'cell_id', 'lane'),
              mgd.TempInputObj('sampleinfo',
                               'cell_id'), mgd.InputInstance('cell_id'),
              mgd.InputInstance('lane'), args['library_id'], config))

    workflow.transform(name='merge_bams',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.workflows.align.tasks.merge_bams",
                       axes=('cell_id', ),
                       args=(mgd.TempInputFile(
                           'aligned_per_cell_per_lane.sorted.bam', 'cell_id',
                           'lane'),
                             mgd.TempOutputFile('merged_lanes.bam', 'cell_id'),
                             mgd.TempOutputFile('merged_lanes.bam.bai',
                                                'cell_id'), config))

    if args['realign']:
        workflow.transform(name='realignment',
                           axes=('chrom', ),
                           ctx=dict(mem=config['memory']['high'],
                                    pool_id=config['pools']['highmem'],
                                    **ctx),
                           func="single_cell.workflows.align.tasks.realign",
                           args=(mgd.TempInputFile('merged_lanes.bam',
                                                   'cell_id'),
                                 mgd.TempInputFile('merged_lanes.bam.bai',
                                                   'cell_id'),
                                 mgd.TempOutputFile('realigned.bam', 'chrom',
                                                    'cell_id'),
                                 mgd.TempSpace('realignment_temp',
                                               'chrom',
                                               cleanup='before'), config,
                                 mgd.InputInstance('chrom')))

        workflow.transform(
            name='merge_realignment',
            ctx=dict(mem=config['memory']['high'],
                     pool_id=config['pools']['highmem'],
                     **ctx),
            axes=('cell_id', ),
            func="single_cell.workflows.align.tasks.merge_realignment",
            args=(mgd.TempInputFile('realigned.bam', 'chrom', 'cell_id'),
                  mgd.TempOutputFile('merged_realign.bam', 'cell_id'), config,
                  mgd.InputInstance('cell_id')))

    final_bam = mgd.TempInputFile('merged_lanes.bam', 'cell_id')
    if args["realign"]:
        final_bam = mgd.TempInputFile('merged_realign.bam', 'cell_id')

    markdups_metrics = os.path.join(merge_metrics, 'markdups_metrics',
                                    '{cell_id}.markdups_metrics.txt')
    flagstat_metrics = os.path.join(merge_metrics, 'flagstat_metrics',
                                    '{cell_id}.flagstat_metrics.txt')
    workflow.transform(
        name='postprocess_bam',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id', ),
        func="single_cell.workflows.align.tasks.postprocess_bam",
        args=(
            final_bam,
            mgd.OutputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.OutputFile('sorted_markdups_index',
                           'cell_id',
                           fnames=bai_filename),
            mgd.TempSpace('tempdir', 'cell_id'),
            config,
            mgd.OutputFile(markdups_metrics, 'cell_id'),
            mgd.OutputFile(flagstat_metrics, 'cell_id'),
        ),
    )

    return workflow
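# A minimal sketch of the inputs the fragment above assumes (hypothetical
# values; the real schema comes from the pipeline's config loader). Note how
# args['realign'] toggles the extra per-chromosome realignment stage before
# postprocess_bam consumes final_bam.
example_args = {
    'library_id': 'A12345',
    'realign': True,
}
example_config = {
    'memory': {'med': 8, 'high': 16},
    'pools': {'standard': 'standard', 'highmem': 'highmem'},
}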
Example #11
def alignment_workflow(args):
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    laneinfo = inpututils.get_lane_info(args['input_yaml'])

    cellids = inpututils.get_samples(args['input_yaml'])
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted({v[1] for v in fastq1_files.keys()})
    cells = sorted({v[0] for v in fastq1_files.keys()})

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1',
                          'cell_id',
                          'lane',
                          fnames=fastq1_files,
                          axes_origin=[]),
            mgd.InputFile('fastq_2',
                          'cell_id',
                          'lane',
                          fnames=fastq2_files,
                          axes_origin=[]),
            mgd.OutputFile('bam_markdups',
                           'cell_id',
                           template=bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups',
                           'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            laneinfo,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], alignment_dir, list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment'
            }
        })

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], bams_dir,
              mgd.Template('aligned.bam',
                           'cell_id',
                           template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams'
            },
            'template':
            (mgd.InputChunks('cell_id'), bam_files_template, 'cell_id'),
        })

    return workflow
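# Hypothetical driver sketch: alignment_workflow() only builds a workflow
# object. Executing it takes a pypeliner Pypeline, the same pattern
# somatic_calling_workflow uses below when it calls pyp.run() itself.
import pypeliner.app

def run_alignment(args):
    # args is the parsed CLI/config mapping read by alignment_workflow above.
    pyp = pypeliner.app.Pypeline(config=args)
    pyp.run(alignment_workflow(args))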
def create_merge_bams_workflow(
    input_bams,
    merged_bams,
    regions,
    config,
):
    baseimage = config['docker']['single_cell_pipeline']

    merged_bams = {region: merged_bams[region] for region in regions}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    one_split_job = config["one_split_job"]

    if one_split_job:
        workflow.transform(
            name='merge_bams',
            ctx={
                'mem': config['memory']['med'],
                'ncpus': config['max_cores'],
                'docker_image': baseimage
            },
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(mgd.InputFile('bam',
                                'cell_id',
                                fnames=input_bams,
                                extensions=['.bai']),
                  mgd.OutputFile('merged.bam',
                                 "region",
                                 fnames=merged_bams,
                                 axes_origin=[],
                                 extensions=['.bai']), regions,
                  config['docker']['samtools'],
                  mgd.TempSpace("merge_bams_tempdir")),
            kwargs={"ncores": config["max_cores"]})
    else:
        workflow.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region', ),
            args=(
                mgd.InputFile('tumour_cells.bam',
                              'cell_id',
                              extensions=['.bai'],
                              fnames=input_bams),
                mgd.OutputFile('tumour_regions.bam',
                               'region',
                               axes_origin=[],
                               extensions=['.bai'],
                               fnames=merged_bams),
                mgd.Instance('region'),
                config['docker']['samtools'],
            ),
        )

    return workflow
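# Sketch of the argument shapes create_merge_bams_workflow() expects, with
# hypothetical paths: input_bams is keyed by cell_id, merged_bams by region
# string, and config['one_split_job'] selects between the single multi-core
# merge and the per-region fan-out above.
input_bams = {
    'SA123-A1': '/data/cells/SA123-A1.bam',
    'SA123-B2': '/data/cells/SA123-B2.bam',
}
regions = ['1-1-100000', '1-100001-200000']
merged_bams = {r: '/data/regions/{}.bam'.format(r) for r in regions}
config = {
    'docker': {'single_cell_pipeline': 'scp:latest', 'samtools': 'samtools:latest'},
    'memory': {'med': 8},
    'max_cores': 8,
    'one_split_job': True,
}
wf = create_merge_bams_workflow(input_bams, merged_bams, regions, config)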
Example #13
def hmmcopy_workflow(args):
    config = inpututils.load_config(args)
    config = config['hmmcopy']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])
    cellids = inpututils.get_samples(args['input_yaml'])
    bam_files = inpututils.get_bams(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow()

    hmmcopy_dir = args["out_dir"]

    hmmcopy_files = get_output_files(hmmcopy_dir, lib)
    hmmcopy_meta = os.path.join(hmmcopy_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(hmmcopy_dir, 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='hmmcopy_workflow',
        func=hmmcopy.create_hmmcopy_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files, extensions=['.bai']),
            mgd.OutputFile(hmmcopy_files['reads_csvs']),
            mgd.OutputFile(hmmcopy_files['segs_csvs']),
            mgd.OutputFile(hmmcopy_files['metrics_csvs']),
            mgd.OutputFile(hmmcopy_files['params_csvs']),
            mgd.OutputFile(hmmcopy_files['igv_csvs']),
            mgd.OutputFile(hmmcopy_files['segs_pdf']),
            mgd.OutputFile(hmmcopy_files['bias_pdf']),
            mgd.OutputFile(hmmcopy_files['heatmap_pdf']),
            mgd.OutputFile(hmmcopy_files['metrics_pdf']),
            mgd.OutputFile(hmmcopy_files['kernel_density_pdf']),
            mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']),
            cellids,
            config,
            sampleinfo
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            hmmcopy_dir,
            list(hmmcopy_files.values()),
            mgd.OutputFile(hmmcopy_meta)
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': list(bam_files.keys()),
                'type': 'hmmcopy',
            }
        }
    )

    return workflow
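# The setobj/OutputChunks call above is what drives parallelism: registering
# list(bam_files.keys()) as the chunks of the 'cell_id' axis makes every task
# with axes=('cell_id',) (e.g. run_hmmcopy inside create_hmmcopy_workflow,
# shown below) run once per cell. A toy illustration with hypothetical cells:
bam_files = {
    'SA123-A1': '/data/bams/SA123-A1.bam',
    'SA123-B2': '/data/bams/SA123-B2.bam',
}
# -> the 'cell_id' axis gets chunks ['SA123-A1', 'SA123-B2'], so each
#    per-cell transform runs twice, once per key.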
def process_cells_destruct(
        destruct_config, cell_bam_files,
        reads_1, reads_2, sample_1, sample_2, stats,
        tag=False
):
    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1, }

    cells = list(cell_bam_files.keys())

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cells,
    )

    workflow.transform(
        name='bamdisc_and_numreads_cell',
        func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
        axes=('cell_id',),
        ctx={'io': 1, 'mem': 8},
        args=(
            destruct_config,
            mgd.InputFile('bam', 'cell_id', fnames=cell_bam_files),
            mgd.TempOutputFile('cell_stats', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.TempSpace('bamdisc_cell_tempspace', 'cell_id'),
        ),
    )

    workflow.transform(
        name='merge_reads_r1',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_1),
        ),
    )

    workflow.transform(
        name='merge_reads_r2',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_2),
        ),
    )

    workflow.transform(
        name='merge_sample',
        ctx={'io': 1, 'mem': 8, 'disk': 100},
        func="single_cell.workflows.destruct_singlecell.tasks.resample_fastqs",
        args=(
            mgd.TempInputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.OutputFile(sample_1),
            mgd.OutputFile(sample_2),
            destruct_config['num_read_samples'],
        ),
    )

    workflow.transform(
        name='merge_stats',
        ctx={'io': 1, 'mem': 8},
        func="single_cell.workflows.destruct_singlecell.tasks.merge_stats",
        args=(
            mgd.TempInputFile('cell_stats', 'cell_id'),
            mgd.OutputFile(stats),
        ),
    )

    return workflow
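# Hypothetical invocation sketch: the first transform fans out one
# bamdisc_and_numreads job per cell, and the four merge transforms fan the
# per-cell outputs back into the single files passed here. Paths and the
# num_read_samples value are placeholders.
destruct_config = {'num_read_samples': 100000}
cell_bams = {'C1': '/data/C1.bam', 'C2': '/data/C2.bam'}
wf = process_cells_destruct(
    destruct_config, cell_bams,
    'reads_1.fastq.gz', 'reads_2.fastq.gz',
    'sample_1.fastq.gz', 'sample_2.fastq.gz',
    'cell_stats.csv',
)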
Example #15
def lumpy_preprocess_workflow(bam_files, config, discordants, split_reads,
                              histogram, mean_stdev):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': config['docker']['single_cell_pipeline']
    }

    lumpydocker = {'docker_image': config['docker']['lumpy']}

    histogram_settings = dict(N=10000,
                              skip=0,
                              min_elements=100,
                              mads=10,
                              X=4,
                              read_length=101)
    histogram_settings.update(lumpydocker)

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_files, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_files.keys()),
        )
        workflow.set_filenames('cells.bam', 'cell_id', fnames=bam_files)
        workflow.subworkflow(
            name='process_cells',
            func='single_cell.workflows.lumpy.lumpy_preprocess_cells',
            args=(config,
                  mgd.InputFile('cells.bam',
                                'cell_id',
                                fnames=bam_files,
                                extensions=['.bai']),
                  mgd.OutputFile(discordants), mgd.OutputFile(split_reads),
                  mgd.OutputFile(histogram), mgd.OutputFile(mean_stdev)),
        )
    else:
        workflow.transform(
            name='process_bulk',
            ctx={
                'mem': 8,
                'ncpus': 1,
                'disk': 200
            },
            func='single_cell.workflows.lumpy.tasks.process_bam',
            args=(
                mgd.InputFile(bam_files, extensions=['.bai']),
                mgd.OutputFile(discordants),
                mgd.OutputFile(split_reads),
                mgd.TempOutputFile('hist_normal.csv'),
                mgd.TempSpace("lumpy_normal_processing"),
            ),
            kwargs=histogram_settings,
        )
        workflow.transform(
            name='format_histo_bulk',
            ctx={
                'mem': 8,
                'ncpus': 1
            },
            func='single_cell.workflows.lumpy.merge_histograms.merge_histograms',
            args=(mgd.TempInputFile('hist_normal.csv'),
                  mgd.OutputFile(histogram), mgd.OutputFile(mean_stdev)),
        )

    return workflow
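# lumpy_preprocess_workflow() dispatches on the type of bam_files: a dict of
# per-cell BAMs is routed through the lumpy_preprocess_cells subworkflow,
# while a single path is processed as one bulk BAM. Both call forms, with
# hypothetical paths and a minimal config:
config = {'docker': {'single_cell_pipeline': 'scp:latest', 'lumpy': 'lumpy:latest'}}
wf_cells = lumpy_preprocess_workflow(
    {'C1': '/data/C1.bam'}, config,
    'disc.bam', 'split.bam', 'hist.csv', 'mean_stdev.yaml')
wf_bulk = lumpy_preprocess_workflow(
    '/data/normal.bam', config,
    'disc.bam', 'split.bam', 'hist.csv', 'mean_stdev.yaml')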
Example #16
def somatic_calling_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    tumour_ids = helpers.get_values_from_input(inputs, 'tumour_id')
    normal_ids = helpers.get_values_from_input(inputs, 'normal_id')

    var_dir = os.path.join(args['out_dir'], 'somatic')
    museq_vcf = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_maf = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_museq_paired_annotated.maf')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_paired_museqportrait.pdf')

    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_snv_maf = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_strelka_snv_annotated.maf')
    strelka_indel_vcf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    strelka_indel_maf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.maf')

    mutect_vcf = os.path.join(var_dir, '{sample_id}',
                              '{sample_id}_mutect.vcf.gz')
    mutect_maf = os.path.join(var_dir, '{sample_id}', '{sample_id}_mutect.maf')

    consensus_somatic_maf = os.path.join(var_dir, '{sample_id}',
                                         '{sample_id}_consensus_somatic.maf')

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    workflow.subworkflow(name='variant_calling',
                         func=somatic_calling.create_somatic_calling_workflow,
                         args=(
                             samples,
                             mgd.InputFile("tumour.bam",
                                           'sample_id',
                                           fnames=tumours,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.InputFile("normal.bam",
                                           'sample_id',
                                           fnames=normals,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.OutputFile('museq_vcf',
                                            'sample_id',
                                            template=museq_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_maf',
                                            'sample_id',
                                            template=museq_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('museq_paired_pdf',
                                            'sample_id',
                                            template=museq_paired_pdf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_snv_vcf',
                                            'sample_id',
                                            template=strelka_snv_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_snv_maf',
                                            'sample_id',
                                            template=strelka_snv_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_indel_vcf',
                                            'sample_id',
                                            template=strelka_indel_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('strelka_indel_maf',
                                            'sample_id',
                                            template=strelka_indel_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('mutect_vcf',
                                            'sample_id',
                                            template=mutect_vcf,
                                            axes_origin=[]),
                             mgd.OutputFile('mutect_maf',
                                            'sample_id',
                                            template=mutect_maf,
                                            axes_origin=[]),
                             mgd.OutputFile('consensus_somatic_maf',
                                            'sample_id',
                                            template=consensus_somatic_maf,
                                            axes_origin=[]),
                             args['refdir'],
                             normal_ids,
                             tumour_ids,
                         ),
                         kwargs={
                             'single_node': args['single_node'],
                             'is_exome': args['is_exome'],
                         })

    filenames = [
        museq_vcf, museq_maf, museq_paired_pdf, strelka_snv_vcf,
        strelka_snv_maf, strelka_indel_vcf, strelka_indel_maf, mutect_vcf,
        mutect_maf, consensus_somatic_maf
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args['out_dir'],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'variant_calling'
                           }
                       })

    pyp.run(workflow)
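# Unlike the factory functions elsewhere in this file, somatic_calling_workflow
# builds its own Pypeline and runs it. Output templates such as museq_vcf embed
# '{sample_id}', which pypeliner expands once per 'sample_id' chunk; the
# helpers.expand_list call above performs the same expansion for the metadata
# task. A plain-Python illustration of that expansion:
template = '/out/somatic/{sample_id}/{sample_id}_museq_paired_annotated.vcf.gz'
print(template.format(sample_id='SA123'))
# -> /out/somatic/SA123/SA123_museq_paired_annotated.vcf.gz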
Example #17
def lumpy_preprocess_cells(config, bam_files, merged_discordants,
                           merged_splitters, hist_csv, mean_stdev_obj):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': config['docker']['single_cell_pipeline']
    }

    lumpydocker = {'docker_image': config['docker']['lumpy']}

    histogram_settings = dict(N=10000,
                              skip=0,
                              min_elements=100,
                              mads=10,
                              X=4,
                              read_length=101)
    histogram_settings.update(lumpydocker)

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='process_tumour_cells',
        axes=('cell_id', ),
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.tasks.process_bam',
        args=(
            mgd.InputFile('tumour_bam',
                          'cell_id',
                          fnames=bam_files,
                          extensions=['.bai']),
            mgd.TempOutputFile('tumour.discordants.sorted.bam', 'cell_id'),
            mgd.TempOutputFile('tumour.splitters.sorted.bam', 'cell_id'),
            mgd.TempOutputFile('hist.csv', 'cell_id'),
            mgd.TempSpace("lumpy_tumour_processing", "cell_id"),
        ),
        kwargs=dict(tag=mgd.InputInstance('cell_id'), **histogram_settings),
    )

    workflow.transform(
        name='merge_disc',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.tasks.merge_bams',
        args=(mgd.TempInputFile('tumour.discordants.sorted.bam',
                                'cell_id'), mgd.OutputFile(merged_discordants),
              mgd.TempSpace("merge_disc_temp")),
        kwargs=lumpydocker,
    )

    workflow.transform(
        name='merge_split',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.tasks.merge_bams',
        args=(mgd.TempInputFile('tumour.splitters.sorted.bam',
                                'cell_id'), mgd.OutputFile(merged_splitters),
              mgd.TempSpace("merge_split_temp")),
        kwargs=lumpydocker,
    )

    workflow.transform(
        name='merge_histo',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.merge_histograms.merge_histograms',
        args=(mgd.TempInputFile('hist.csv',
                                'cell_id'), mgd.OutputFile(hist_csv),
              mgd.OutputFile(mean_stdev_obj)),
    )

    return workflow
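# The kwargs construction for process_tumour_cells above merges per-task
# settings with the lumpy docker context: dict(tag=..., **histogram_settings)
# forwards the histogram parameters and docker_image to process_bam as keyword
# arguments. The same merge in plain Python, with a hypothetical tag:
histogram_settings = dict(N=10000, skip=0, min_elements=100, mads=10, X=4,
                          read_length=101, docker_image='lumpy:latest')
kwargs = dict(tag='SA123-A1', **histogram_settings)
assert kwargs['N'] == 10000 and kwargs['tag'] == 'SA123-A1'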
Example #18
def create_hmmcopy_workflow(bam_file, reads, segs, metrics, params,
                            igv_seg_filename, segs_pdf, bias_pdf,
                            plot_heatmap_ec_output, plot_metrics_output,
                            plot_kernel_density_output, hmmcopy_data_tar,
                            cell_ids, hmmparams, sample_info):
    chromosomes = hmmparams["chromosomes"]

    baseimage = hmmparams['docker']['single_cell_pipeline']
    hmmcopy_docker = hmmparams['docker']['hmmcopy']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(obj=mgd.TempOutputObj('sampleinfo',
                                          'cell_id',
                                          axes_origin=[]),
                    value=sample_info)

    workflow.transform(
        name='run_hmmcopy',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id', ),
        args=(mgd.InputFile('bam_markdups',
                            'cell_id',
                            fnames=bam_file,
                            extensions=['.bai']),
              mgd.TempOutputFile('reads.csv.gz',
                                 'cell_id',
                                 extensions=['.yaml']),
              mgd.TempOutputFile('segs.csv.gz',
                                 'cell_id',
                                 extensions=['.yaml']),
              mgd.TempOutputFile('params.csv.gz',
                                 'cell_id',
                                 extensions=['.yaml']),
              mgd.TempOutputFile('hmm_metrics.csv.gz',
                                 'cell_id',
                                 extensions=['.yaml']),
              mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'),
              mgd.InputInstance('cell_id'), hmmparams,
              mgd.TempSpace('hmmcopy_temp', 'cell_id'), hmmcopy_docker),
    )

    workflow.transform(
        name='merge_reads',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(mgd.TempInputFile('reads.csv.gz',
                                'cell_id',
                                axes_origin=[],
                                extensions=['.yaml']),
              mgd.TempOutputFile('reads_merged.csv.gz',
                                 extensions=['.yaml']), 'reads'),
        kwargs={'low_memory': True})

    workflow.transform(
        name='add_mappability_bool',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.get_mappability_col",
        args=(
            mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(reads, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_segs',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(mgd.TempInputFile('segs.csv.gz',
                                'cell_id',
                                axes_origin=[],
                                extensions=['.yaml']),
              mgd.OutputFile(segs, extensions=['.yaml']), 'segs'),
        kwargs={'low_memory': True})

    workflow.transform(
        name='merge_metrics',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(mgd.TempInputFile('hmm_metrics.csv.gz',
                                'cell_id',
                                axes_origin=[],
                                extensions=['.yaml']),
              mgd.TempOutputFile("hmm_metrics.csv.gz",
                                 extensions=['.yaml']), 'metrics'),
    )

    workflow.transform(
        name='merge_params',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(mgd.TempInputFile('params.csv.gz',
                                'cell_id',
                                axes_origin=[],
                                extensions=['.yaml']),
              mgd.OutputFile(params, extensions=['.yaml']), None),
    )

    workflow.transform(name='get_max_cn',
                       ctx={
                           'mem': hmmparams['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.hmmcopy.tasks.get_max_cn",
                       ret=mgd.TempOutputObj('max_cn'),
                       args=(mgd.InputFile(reads, extensions=['.yaml']), ))

    workflow.transform(name='hmmcopy_plots',
                       ctx={
                           'mem': hmmparams['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy",
                       axes=('cell_id', ),
                       args=(
                           mgd.TempInputFile('reads.csv.gz',
                                             'cell_id',
                                             axes_origin=[],
                                             extensions=['.yaml']),
                           mgd.TempInputFile('segs.csv.gz',
                                             'cell_id',
                                             axes_origin=[],
                                             extensions=['.yaml']),
                           mgd.TempInputFile('params.csv.gz',
                                             'cell_id',
                                             axes_origin=[],
                                             extensions=['.yaml']),
                           mgd.TempInputFile('hmm_metrics.csv.gz',
                                             'cell_id',
                                             axes_origin=[],
                                             extensions=['.yaml']),
                           hmmparams['ref_genome'],
                           mgd.TempOutputFile('segments.png',
                                              'cell_id',
                                              axes_origin=[]),
                           mgd.TempOutputFile('bias.png',
                                              'cell_id',
                                              axes_origin=[]),
                           mgd.InputInstance('cell_id'),
                       ),
                       kwargs={
                           'num_states': hmmparams['num_states'],
                           'sample_info':
                           mgd.TempInputObj('sampleinfo', 'cell_id'),
                           'max_cn': mgd.TempInputObj("max_cn")
                       })

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.add_clustering_order",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(metrics, extensions=['.yaml']),
        ),
        kwargs={
            'chromosomes': hmmparams["chromosomes"],
            'sample_info': sample_info
        })

    workflow.transform(name='merge_hmm_copy_plots',
                       ctx={
                           'mem': hmmparams['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
                       args=([
                           mgd.TempInputFile('segments.png', 'cell_id'),
                           mgd.TempInputFile('bias.png', 'cell_id'),
                       ], [
                           mgd.OutputFile(segs_pdf),
                           mgd.OutputFile(bias_pdf),
                       ], mgd.InputFile(metrics, extensions=['.yaml']), None,
                             mgd.TempSpace("hmmcopy_plot_merge_temp"),
                             ['segments', 'bias']))

    workflow.transform(
        name='create_igv_seg',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.InputFile(segs, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(igv_seg_filename),
            hmmparams,
        ))

    workflow.transform(name='plot_metrics',
                       ctx={
                           'mem': hmmparams['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
                       args=(
                           mgd.InputFile(metrics, extensions=['.yaml']),
                           mgd.OutputFile(plot_metrics_output),
                           'QC pipeline metrics',
                       ))

    workflow.transform(
        name='plot_kernel_density',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_kernel_density_output),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
        ))

    workflow.transform(name='plot_heatmap_ec',
                       ctx={
                           'mem': hmmparams['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
                       args=(
                           mgd.InputFile(reads, extensions=['.yaml']),
                           mgd.InputFile(metrics, extensions=['.yaml']),
                           mgd.OutputFile(plot_heatmap_ec_output),
                       ),
                       kwargs={
                           'plot_title': 'QC pipeline metrics',
                           'column_name': 'state',
                           'plot_by_col': 'experimental_condition',
                           'color_by_col': 'cell_call',
                           'chromosomes': chromosomes,
                           'max_cn': hmmparams['num_states'],
                           'scale_by_cells': False,
                           'mappability_threshold': hmmparams["map_cutoff"]
                       })

    workflow.transform(
        name='merge_hmmcopy_data_tars',
        ctx={
            'mem': hmmparams['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.utils.helpers.tar_files",
        args=(mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]),
              mgd.OutputFile(hmmcopy_data_tar),
              mgd.TempSpace("merge_tarballs")),
    )

    return workflow
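# The get_max_cn transform above uses ret=mgd.TempOutputObj('max_cn') to
# capture a task's return value as a managed object, which hmmcopy_plots then
# reads back via mgd.TempInputObj('max_cn'). A hedged sketch of the task-side
# contract (hypothetical body; the real task inspects the reads file):
def get_max_cn(reads_csv):
    # Whatever this returns is stored as the 'max_cn' object.
    return 11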
Example #19
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            snv_vcf_file,
                            snv_maf_file,
                            indel_vcf_file,
                            indel_maf_file,
                            reference,
                            reference_vep,
                            chromosomes,
                            normal_id,
                            tumour_id,
                            single_node=False,
                            is_exome=False):
    params = config.default_params('variant_calling')

    workflow = Workflow(ctx=helpers.get_default_ctx(memory=5,
                                                    walltime='4:00'), )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(name='strelka_one_node',
                           func="wgs.workflows.strelka.tasks.strelka_one_node",
                           args=(
                               pypeliner.managed.InputFile(normal_bam_file,
                                                           extensions=['.bai']),
                               pypeliner.managed.InputFile(tumour_bam_file,
                                                           extensions=['.bai']),
                               reference,
                               mgd.TempOutputFile('indels.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempOutputFile('snvs.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempSpace('call_genome_segment_tmp'),
                               mgd.InputChunks('regions'),
                               mgd.TempInputObj('known_sizes'),
                           ),
                           kwargs={
                               'is_exome': is_exome,
                           })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    workflow.transform(name='bcftools_normalize_snv',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('snvs.vcf.gz'),
                           mgd.TempOutputFile('normalized_snvs.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(name='bcftools_normalize_indel',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('indels.vcf.gz'),
                           mgd.TempOutputFile('normalized_indels.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="strelka_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(indel_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
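# Hypothetical invocation sketch: single_node=True routes everything through
# strelka_one_node in one multi-core job, while the default fans out one
# call_genome_segment job per region chunk and concatenates the VCFs before
# normalization, filtering, and MAF conversion.
wf = create_strelka_workflow(
    '/data/normal.bam', '/data/tumour.bam',
    'snvs.vcf.gz', 'snvs.maf', 'indels.vcf.gz', 'indels.maf',
    '/refs/genome.fa', '/refs/vep', ['1', '2', 'X'],
    'NORMAL123', 'TUMOUR123',
    single_node=False,
)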
Example #20
def create_alignment_workflow(
    fastq_1_filename,
    fastq_2_filename,
    bam_filename,
    alignment_metrics,
    gc_metrics,
    detailed_fastqscreen_metrics,
    plot_metrics,
    ref_genome,
    config,
    laneinfo,
    sample_info,
    cell_ids,
    metrics_tar,
    library_id,
):

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem': 7,
        'ncpus': 1,
        'docker_image': baseimage,
        'mem_retry_factor': 1
    }

    bam_filename = {cellid: bam_filename[cellid] for cellid in cell_ids}

    chromosomes = config["chromosomes"]

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(obj=mgd.TempOutputObj('sampleinfo',
                                          'cell_id',
                                          axes_origin=[]),
                    value=sample_info)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq_1_filename.keys()),
    )

    workflow.setobj(obj=mgd.TempOutputObj('laneinfo',
                                          'cell_id',
                                          'lane',
                                          axes_origin=[]),
                    value=laneinfo)

    workflow.transform(
        name='align_reads',
        axes=('cell_id', ),
        func="single_cell.workflows.align.align_tasks.align_lanes",
        args=(
            mgd.InputFile('fastq_1',
                          'cell_id',
                          'lane',
                          fnames=fastq_1_filename,
                          axes_origin=[]),
            mgd.InputFile('fastq_2',
                          'cell_id',
                          'lane',
                          fnames=fastq_2_filename,
                          axes_origin=[]),
            mgd.OutputFile('sorted_markdups',
                           'cell_id',
                           fnames=bam_filename,
                           extensions=['.bai']),
            mgd.TempOutputFile('fastqc_reports.tar.gz', 'cell_id'),
            mgd.TempSpace('alignment_temp', 'cell_id'),
            ref_genome,
            mgd.TempInputObj('laneinfo', 'cell_id', 'lane', axes_origin=[]),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
            mgd.InputInstance('cell_id'),
            library_id,
            config['aligner'],
            config['docker'],
            config['adapter'],
            config['adapter2'],
            mgd.TempOutputFile('organism_detailed_count_per_cell.csv.gz',
                               'cell_id'),
            mgd.TempOutputFile('organism_summary_count_per_cell.csv.gz',
                               'cell_id'),
            config['fastq_screen_params'],
        ))

    workflow.transform(
        name='merge_fastq_screen_metrics',
        func="single_cell.workflows.align.fastqscreen.merge_fastq_screen_counts",
        args=(
            mgd.TempInputFile('organism_detailed_count_per_cell.csv.gz',
                              'cell_id'),
            mgd.TempInputFile('organism_summary_count_per_cell.csv.gz',
                              'cell_id'),
            mgd.OutputFile(detailed_fastqscreen_metrics, extensions=['.yaml']),
            mgd.TempOutputFile('organism_summary_count_per_cell.csv.gz',
                               extensions=['.yaml']),
        ))

    workflow.subworkflow(
        name='metrics_subworkflow',
        func="single_cell.workflows.align.bam_metrics_workflow",
        args=(mgd.InputFile('sorted_markdups',
                            'cell_id',
                            fnames=bam_filename,
                            extensions=['.bai']),
              mgd.TempInputFile('organism_summary_count_per_cell.csv.gz',
                                extensions=['.yaml']),
              mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
              mgd.OutputFile(gc_metrics, extensions=['.yaml']),
              mgd.TempOutputFile('markdups_metrics.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('flagstat_metrics.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('wgs_metrics.txt', 'cell_id', axes_origin=[]),
              mgd.TempOutputFile('gc_metrics.txt', 'cell_id', axes_origin=[]),
              mgd.TempOutputFile('gc_metrics_summary.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('gc_metrics.pdf', 'cell_id', axes_origin=[]),
              mgd.TempOutputFile('insert_metrics.txt',
                                 'cell_id',
                                 axes_origin=[]),
              mgd.TempOutputFile('insert_metrics.pdf',
                                 'cell_id',
                                 axes_origin=[]), ref_genome, sample_info,
              config, cell_ids))

    workflow.transform(name='plot_metrics',
                       ctx={'mem': config['memory']['med']},
                       func="single_cell.workflows.align.tasks.plot_metrics",
                       args=(
                           mgd.InputFile(alignment_metrics,
                                         extensions=['.yaml']),
                           mgd.OutputFile(plot_metrics),
                           'QC pipeline metrics',
                           mgd.InputFile(gc_metrics, extensions=['.yaml']),
                           config['gc_windows'],
                       ))

    workflow.transform(name='tar_all_files',
                       ctx={'mem': config['memory']['med']},
                       func="single_cell.utils.helpers.tar_files",
                       args=([
                           mgd.TempInputFile('fastqc_reports.tar.gz',
                                             'cell_id'),
                           mgd.TempInputFile('markdups_metrics.txt',
                                             'cell_id'),
                           mgd.TempInputFile('flagstat_metrics.txt',
                                             'cell_id'),
                           mgd.TempInputFile('wgs_metrics.txt', 'cell_id'),
                           mgd.TempInputFile('gc_metrics.txt', 'cell_id'),
                           mgd.TempInputFile('gc_metrics_summary.txt',
                                             'cell_id'),
                           mgd.TempInputFile('gc_metrics.pdf', 'cell_id'),
                           mgd.TempInputFile('insert_metrics.txt', 'cell_id'),
                           mgd.TempInputFile('insert_metrics.pdf', 'cell_id'),
                       ], mgd.OutputFile(metrics_tar),
                             mgd.TempSpace("merge_metrics_tar")))

    return workflow
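# The fastq inputs above span two axes, so their fnames dicts are keyed by
# (cell_id, lane) tuples, while per-cell outputs like bam_filename are keyed
# by cell_id alone. Hypothetical shapes:
fastq_1_filename = {
    ('SA123-A1', 'L001'): '/fq/SA123-A1_L001_R1.fastq.gz',
    ('SA123-A1', 'L002'): '/fq/SA123-A1_L002_R1.fastq.gz',
}
bam_filename = {'SA123-A1': '/bams/SA123-A1.bam'}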
def merge_bams_workflow(args):
    config = helpers.load_config(args)
    config = config['merge_bams']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    bam_files = tumour_cells if tumour_cells else normal_cells
    wgs_bams = tumour_wgs if tumour_cells else normal_wgs

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    if isinstance(wgs_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(wgs_bams.keys()),
        )
        workflow.set_filenames("merged.bam", "region", fnames=wgs_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        workflow.set_filenames('merged.bam', 'region', template=wgs_bams)

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.InputFile('bam_markdups',
                                           'cell_id',
                                           fnames=bam_files,
                                           extensions=['.bai']),
                             mgd.OutputFile("merged.bam",
                                            "region",
                                            axes_origin=[],
                                            extensions=['.bai']),
                             mgd.TempInputObj("region"),
                             config,
                         ))

    workflow.transform(name="get_files",
                       ctx={'mem': config['memory']['med']},
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(pypeliner.managed.InputChunks('region'),
                             wgs_bams, 'region'))

    return workflow
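# merge_bams_workflow() accepts the wgs_bams side of the input either as a
# dict (region -> path), in which case the 'region' chunks come straight from
# its keys, or as a filename template containing '{region}', in which case
# get_regions derives the chunks from the reference. Hypothetical examples of
# both accepted forms:
wgs_bams_as_dict = {'1-1-100000': '/out/regions/1-1-100000.bam'}
wgs_bams_as_template = '/out/regions/{region}.bam'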
def pseudo_bulk_qc_workflow(args):
    data = inpututils.load_qc_input(args["input_yaml"])
    config = inpututils.load_config(args)
    config = config["qc"]

    out_dir = args["out_dir"]

    mutationreports = os.path.join(out_dir, 'patient', "mutationreport.html")
    grouplevelmafs = os.path.join(out_dir, 'patient', "grouplevelmaf.maf")
    grouplevel_high_impact_mafs = os.path.join(
        out_dir, 'patient', "grouplevel_high_impact_maf.maf")
    grouplevel_high_impact_merged_snvs = os.path.join(
        out_dir, 'patient', "grouplevel_high_impact_merged_snvs.csv")
    grouplevel_snvs = os.path.join(out_dir, 'patient', "grouplevel_snvs.csv")

    isabl_ids = {label: paths["isabl_id"] for label, paths in data.items()}

    mappability_files = {
        label: paths["mappability"]
        for label, paths in data.items()
    }
    strelka_files = {label: paths["strelka"] for label, paths in data.items()}
    museq_files = {label: paths["museq"] for label, paths in data.items()}
    cosmic_status_files = {
        label: paths["cosmic_status"]
        for label, paths in data.items()
    }
    snpeff_files = {label: paths["snpeff"] for label, paths in data.items()}
    dbsnp_status_files = {
        label: paths["dbsnp_status"]
        for label, paths in data.items()
    }
    trinuc_files = {label: paths["trinuc"] for label, paths in data.items()}
    counts_files = {label: paths["counts"] for label, paths in data.items()}
    breakpoint_counts_files = {
        label: paths["destruct_breakpoint_counts"]
        for label, paths in data.items()
    }
    destruct_breakpoint_annotation_files = {
        label: paths["destruct_breakpoint_annotation"]
        for label, paths in data.items()
    }
    lumpy_breakpoint_annotation_files = {
        label: paths["lumpy_breakpoint_annotation"]
        for label, paths in data.items()
    }
    lumpy_breakpoint_evidence_files = {
        label: paths["lumpy_breakpoint_evidence"]
        for label, paths in data.items()
    }
    haplotype_allele_data_files = {
        label: paths["haplotype_allele_data"]
        for label, paths in data.items()
    }
    annotation_metrics_files = {
        label: paths["annotation_metrics"]
        for label, paths in data.items()
    }
    hmmcopy_reads_files = {
        label: paths["hmmcopy_reads"]
        for label, paths in data.items()
    }
    hmmcopy_segs_files = {
        label: paths["hmmcopy_segs"]
        for label, paths in data.items()
    }
    hmmcopy_metrics_files = {
        label: paths["hmmcopy_metrics"]
        for label, paths in data.items()
    }
    alignment_metrics_files = {
        label: paths["alignment_metrics"]
        for label, paths in data.items()
    }
    gc_metrics_files = {
        label: paths["gc_metrics"]
        for label, paths in data.items()
    }
    indel_files = {label: paths["indel_file"] for label, paths in data.items()}

    label_dir = os.path.join(out_dir, '{patient}', '{sample_id}',
                             '{library_id}')
    sample_level_report_htmls = os.path.join(label_dir, "mainreport.html")
    sample_level_maf = os.path.join(label_dir, "samplelevelmaf.maf")
    snvs_all = os.path.join(label_dir, 'snvs_all.csv')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks(
            'patient',
            'sample_id',
            'library_id',
        ),
        value=list(data.keys()),
    )

    workflow.subworkflow(
        name='create_sample_level_plots',
        func="single_cell.workflows.pseudo_bulk_qc.create_sample_level_plots",
        axes=(
            'patient',
            'sample_id',
            'library_id',
        ),
        args=(mgd.InputInstance('patient'), mgd.InputInstance('sample_id'),
              mgd.InputInstance('library_id'),
              mgd.InputFile('mappability',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=mappability_files),
              mgd.InputFile('strelka',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=strelka_files),
              mgd.InputFile('museq',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=museq_files),
              mgd.InputFile('cosmic_status',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=cosmic_status_files),
              mgd.InputFile('snpeff',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=snpeff_files),
              mgd.InputFile('dbsnp_status',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=dbsnp_status_files),
              mgd.InputFile('trinuc',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=trinuc_files),
              mgd.InputFile('counts',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=counts_files),
              mgd.InputFile('destruct_breakpoint_annotation',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=destruct_breakpoint_annotation_files),
              mgd.InputFile('destruct_breakpoint_counts',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=breakpoint_counts_files),
              mgd.InputFile('lumpy_breakpoint_annotation',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=lumpy_breakpoint_annotation_files),
              mgd.InputFile('lumpy_breakpoint_evidence',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=lumpy_breakpoint_evidence_files),
              mgd.InputFile('haplotype_allele_data',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=haplotype_allele_data_files),
              mgd.InputFile('annotation_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=annotation_metrics_files),
              mgd.InputFile('hmmcopy_reads',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=hmmcopy_reads_files),
              mgd.InputFile('isabl_ids',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=isabl_ids),
              mgd.InputFile('hmmcopy_segs',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=hmmcopy_segs_files),
              mgd.InputFile('hmmcopy_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=hmmcopy_metrics_files),
              mgd.InputFile('alignment_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=alignment_metrics_files),
              mgd.InputFile('gc_metrics',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=gc_metrics_files),
              mgd.InputFile('indel_files',
                            'patient',
                            'sample_id',
                            'library_id',
                            fnames=indel_files),
              mgd.OutputFile('sample_level_report_htmls',
                             'patient',
                             'sample_id',
                             'library_id',
                             template=sample_level_report_htmls),
              mgd.OutputFile('mafs',
                             'patient',
                             'sample_id',
                             'library_id',
                             template=sample_level_maf),
              mgd.OutputFile('snvs_all',
                             'patient',
                             'sample_id',
                             'library_id',
                             template=snvs_all), out_dir, config),
    )
    workflow.subworkflow(
        name='create_patient_workflow',
        func="single_cell.workflows.pseudo_bulk_qc.create_patient_workflow",
        axes=('patient', ),
        args=(
            mgd.InputInstance('patient'),
            mgd.InputFile("mafs",
                          "patient",
                          "sample_id",
                          "library_id",
                          template=sample_level_maf,
                          axes_origin=[]),
            mgd.InputFile("snvs_all",
                          "patient",
                          "sample_id",
                          "library_id",
                          template=snvs_all,
                          axes_origin=[]),
            mgd.OutputFile('mutationreport',
                           'patient',
                           template=mutationreports),
            mgd.OutputFile('grouplevelmaf', 'patient',
                           template=grouplevelmafs),
            mgd.OutputFile('grouplevel_high_impact_maf',
                           'patient',
                           template=grouplevel_high_impact_mafs),
            mgd.OutputFile('grouplevel_snvs',
                           'patient',
                           template=grouplevel_snvs),
            mgd.OutputFile('grouplevel_high_impact_merged_snvs',
                           'patient',
                           template=grouplevel_high_impact_merged_snvs),
            config,
        ),
    )

    return workflow
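
# Hedged usage sketch: the templated outputs above are expanded once per
# (patient, sample_id, library_id) chunk. With a hypothetical out_dir, one
# chunk's report path resolves like this:
import os

out_dir = '/path/to/out'  # hypothetical
label_dir = os.path.join(out_dir, '{patient}', '{sample_id}', '{library_id}')
template = os.path.join(label_dir, 'mainreport.html')
print(template.format(patient='P1', sample_id='S1', library_id='L1'))
# -> /path/to/out/P1/S1/L1/mainreport.html
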
def create_ltm_workflow(hmmcopy, cn_matrix, output_gml, output_rooted_gml,
                        cnv_annots_csv, cnv_tree_edges_csv, cnv_data_csv,
                        output_rmd, config, root_id, root_id_file, number_jobs,
                        ploidy):

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('timepoint'),
        value=list(hmmcopy.keys()),
    )

    workflow.transform(
        name='generate_cn_matrices',
        axes=('timepoint', ),
        ctx={
            'mem': config['memory']['med'],
            'pool_id': config['pools']['standard'],
            'ncpus': 1
        },
        func='single_cell.workflows.ltm.tasks.generate_cn_matrices',
        args=(
            mgd.InputFile('hmmcopy.h5', 'timepoint', fnames=hmmcopy),
            mgd.TempOutputFile('cn_matrix.csv', 'timepoint'),
            str(ploidy),
        ),
    )

    # Generate copy number matrix
    workflow.transform(
        name='generate_cn_matrix',
        ctx={
            'mem': config['memory']['med'],
            'pool_id': config['pools']['standard'],
            'ncpus': 1
        },
        func='single_cell.workflows.ltm.tasks.merge_cn_matrices',
        args=(
            mgd.TempInputFile('cn_matrix.csv', 'timepoint'),
            mgd.OutputFile(cn_matrix),
        ),
    )

    node_pair_csvs = ['list_{}.csv'.format(job) for job in range(number_jobs)]

    workflow.transform(
        name='generate_input_csvs',
        ctx={
            'mem': config['memory']['med'],
            'pool_id': config['pools']['standard'],
            'ncpus': 1
        },
        func='single_cell.workflows.ltm.tasks.generate_node_pair_csvs',
        args=(
            mgd.InputFile(cn_matrix),
            number_jobs,
            [mgd.TempOutputFile(csv) for csv in node_pair_csvs],
        ),
    )

    distance_csvs = [
        'distance_list_{}.csv'.format(job) for job in range(number_jobs)
    ]

    workflow.transform(
        name='calculate_distances',
        ctx={
            'mem': config['memory']['med'],
            'pool_id': config['pools']['standard'],
            'ncpus': 1
        },
        func='single_cell.workflows.ltm.tasks.calculate_distances',
        args=(
            [mgd.TempInputFile(csv) for csv in node_pair_csvs],
            mgd.InputFile(cn_matrix),
            [mgd.TempOutputFile(csv) for csv in distance_csvs],
            config,
        ),
    )

    # Generates a minimum spanning tree
    workflow.transform(
        name='generate_tree',
        ctx={
            'mem': config['memory']['med'],
            'pool_id': config['pools']['standard'],
            'ncpus': 1
        },
        func=
        'single_cell.workflows.ltm.scripts.learn_CL_from_distance.learn_CL_from_distance',
        args=(
            [mgd.TempInputFile(csv) for csv in distance_csvs],
            mgd.OutputFile(output_gml),
        ),
    )

    workflow.transform(
        name='generate_cellscape_inputs',
        ctx={
            'mem': config['memory']['med'],
            'pool_id': config['pools']['standard'],
            'ncpus': 1
        },
        func='single_cell.workflows.ltm.tasks.generate_cellscape_inputs',
        args=(
            mgd.InputFile(cn_matrix),
            mgd.OutputFile(cnv_annots_csv),
            mgd.OutputFile(cnv_tree_edges_csv),
            mgd.OutputFile(cnv_data_csv),
            mgd.InputFile(output_gml),
            mgd.OutputFile(output_rooted_gml),
            root_id,
            mgd.OutputFile(root_id_file),
        ),
    )

    workflow.transform(
        name='create_cellscape_rmarkdown',
        ctx={
            'mem': config['memory']['med'],
            'pool_id': config['pools']['standard'],
            'ncpus': 1
        },
        func='single_cell.workflows.ltm.tasks.move_cellscape',
        args=(mgd.OutputFile(output_rmd), ),
    )

    return workflow
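
# Hedged sketch (not the pipeline's generate_node_pair_csvs) of one way to
# split all cell pairs across number_jobs CSVs for parallel distance
# calculation, assuming cells are the copy-number matrix columns:
import csv
import itertools

def write_node_pair_csvs(cell_ids, out_paths):
    pairs = list(itertools.combinations(cell_ids, 2))
    n_jobs = len(out_paths)
    for i, path in enumerate(out_paths):
        with open(path, 'w', newline='') as f:
            # round-robin assignment keeps the per-job workload balanced
            csv.writer(f).writerows(pairs[i::n_jobs])

write_node_pair_csvs(['c1', 'c2', 'c3', 'c4'],
                     ['list_0.csv', 'list_1.csv'])
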
Example #24
def breakpoint_calling_workflow(args):
    config = inpututils.load_config(args)
    config = config['breakpoint_calling']

    run_destruct = bool(args['destruct'])
    run_lumpy = bool(args['lumpy'])

    if not run_destruct and not run_lumpy:
        run_destruct = True
        run_lumpy = True

    normal_data, tumour_cells = inpututils.load_breakpoint_calling_input(
        args['input_yaml'])

    bkp_dir = args['out_dir']
    bkp_meta = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    out_files = get_output_files(bkp_dir, run_destruct, run_lumpy)

    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_data, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_data.keys()),
        )
        normal_bam = mgd.InputFile('normal_cells.bam',
                                   'normal_cell_id',
                                   extensions=['.bai'],
                                   fnames=normal_data)
    else:
        normal_bam = mgd.InputFile(normal_data, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            ctx={'docker_image': config['docker']['single_cell_pipeline']},
            func=
            "single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam',
                              'tumour_cell_id',
                              fnames=tumour_cells),
                config.get('destruct_config', {}),
                config,
                ref_data_directory,
                mgd.OutputFile(out_files['destruct_breakpoints_filename'],
                               extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_breakpoints_lib_filename'],
                               extensions=['.yaml']),
                mgd.OutputFile(out_files['destruct_cell_counts_filename'],
                               extensions=['.yaml']),
            ),
        )

    if run_lumpy:
        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                normal_bam,
                mgd.InputFile('tumour.bam',
                              'tumour_cell_id',
                              fnames=tumour_cells,
                              extensions=['.bai']),
                mgd.OutputFile(out_files['lumpy_breakpoints_csv'],
                               extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_evidence_csv'],
                               extensions=['.yaml']),
                mgd.OutputFile(out_files['lumpy_breakpoints_bed']),
            ),
        )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], bkp_dir, list(out_files.values()),
              mgd.OutputFile(bkp_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'breakpoint_calling'
            }
        })

    return workflow
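
# Hedged sketch of the input convention assumed above:
# load_breakpoint_calling_input returns either a single path (bulk normal)
# or a cell_id -> path mapping (cell-based normal), and the workflow
# dispatches on the type. Paths here are hypothetical.
normal_bulk = '/data/normal.bam'
normal_cells = {'cell_A': '/data/A.bam', 'cell_B': '/data/B.bam'}

for normal_data in (normal_bulk, normal_cells):
    if isinstance(normal_data, dict):
        print('cell-based normal:', sorted(normal_data))
    else:
        print('bulk normal:', normal_data)
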
Example #25
def create_aneufinder_workflow(bam_file,
                               cell_ids,
                               config,
                               aneufinder_output,
                               aneufinder_results_filename,
                               aneufinder_pdf_filename,
                               library_id,
                               ):

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    aneufinder_docker = helpers.get_container_ctx(config['containers'], 'aneufinder', docker_only=True)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='run_aneufinder_on_individual_cells',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.run_aneufinder",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_file', 'cell_id', fnames=bam_file),
            mgd.TempSpace('working_dir', 'cell_id'),
            mgd.InputInstance('cell_id'),
            aneufinder_output,
            mgd.TempOutputFile('segments.csv', 'cell_id'),
            mgd.TempOutputFile('reads.csv', 'cell_id'),
            mgd.TempOutputFile('dnacopy.pdf', 'cell_id'),
        ),
        kwargs={'docker_config': aneufinder_docker}
    )

    workflow.transform(
        name='merge_outputs',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_outputs_to_hdf",
        args=(
            mgd.TempInputFile('reads.csv', 'cell_id'),
            mgd.TempInputFile('segments.csv', 'cell_id'),
            mgd.OutputFile(aneufinder_results_filename),
            mgd.TempSpace("aneufinder_merge"),
        )
    )

    workflow.transform(
        name='merge_aneufinder_pdfs',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_pdf",
        args=(
            [mgd.TempInputFile('dnacopy.pdf', 'cell_id')],
            [mgd.OutputFile(aneufinder_pdf_filename)],
        )
    )

    return workflow
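
# Hedged sketch of the merge step: the real task (merge_outputs_to_hdf)
# concatenates the per-cell CSVs into HDF; a pandas version of the same
# idea, assuming the CSVs share one schema:
import pandas as pd

def merge_per_cell(csv_by_cell):
    frames = []
    for cell_id, path in sorted(csv_by_cell.items()):
        df = pd.read_csv(path)
        df['cell_id'] = cell_id  # tag each row with its originating cell
        frames.append(df)
    return pd.concat(frames, ignore_index=True)
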
Example #26
def hmmcopy_workflow(workflow, args):

    config = helpers.load_config(args)

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])
    lib = args['library_id']

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    for params_tag, params in config["hmmcopy_params"].items():
        params_tag = "hmmcopy_" + params_tag

        results_dir = os.path.join(args['out_dir'], 'results', params_tag)
        plots_dir = os.path.join(results_dir, "plots")

        info_file = os.path.join(results_dir, "info.yaml")

        igv_seg_file = os.path.join(results_dir,
                                    '{}_igv_segments.seg'.format(lib))

        hmmcopy_data = os.path.join(results_dir, '{}_hmmcopy.h5'.format(lib))

        segs_pdf = os.path.join(plots_dir, "segments", lib + '_segs.tar.gz')
        bias_pdf = os.path.join(plots_dir, "bias", lib + '_bias.tar.gz')

        heatmap_filt_pdf = os.path.join(
            plots_dir, '{}_heatmap_by_ec_filtered.pdf'.format(lib))
        heatmap_pdf = os.path.join(plots_dir,
                                   '{}_heatmap_by_ec.pdf'.format(lib))
        metrics_pdf = os.path.join(plots_dir, '{}_metrics.pdf'.format(lib))
        kernel_density_pdf = os.path.join(plots_dir,
                                          '{}_kernel_density.pdf'.format(lib))

        workflow.subworkflow(
            name='hmmcopy_workflow_' + params_tag,
            func=hmmcopy.create_hmmcopy_workflow,
            args=(mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files),
                  mgd.InputFile('bam_markdups_index',
                                'cell_id',
                                fnames=bai_files),
                  mgd.OutputFile(hmmcopy_data), mgd.OutputFile(igv_seg_file),
                  segs_pdf, bias_pdf, mgd.OutputFile(heatmap_pdf),
                  mgd.OutputFile(heatmap_filt_pdf),
                  mgd.OutputFile(metrics_pdf),
                  mgd.OutputFile(kernel_density_pdf), cellids, config, args,
                  params, params_tag, results_dir),
            kwargs={'alignment_metrics': args['alignment_metrics']})

        results = {
            'hmmcopy_metrics': helpers.format_file_yaml(hmmcopy_data),
            'segments_plot': helpers.format_file_yaml(segs_pdf),
            'bias_plot': helpers.format_file_yaml(bias_pdf),
            'filtered_heatmap_plot':
            helpers.format_file_yaml(heatmap_filt_pdf),
            'heatmap_plot': helpers.format_file_yaml(heatmap_pdf),
            'kde_plot': helpers.format_file_yaml(kernel_density_pdf),
            'metrics_plot': helpers.format_file_yaml(metrics_pdf)
        }

        input_datasets = {
            k: helpers.format_file_yaml(v)
            for k, v in bam_files.items()
        }

        metadata = {
            'hmmcopy': {
                'reads_table': '/hmmcopy/reads/0',
                'parameters_table': '/hmmcopy/params/0',
                'segments_table': '/hmmcopy/segments/0',
                'metrics_table': '/hmmcopy/metrics/0',
                'hmmcopy_params_tag': params_tag,
                'hmmcopy_params': params,
                'chromosomes': config['chromosomes'],
                'ref_genome': config['ref_genome'],
                'cell_filters': config["good_cells"],
                'version': single_cell.__version__,
                'results': results,
                'containers': config['containers'],
                'input_datasets': input_datasets,
                'output_datasets': None
            }
        }

        workflow.transform(name='generate_meta_yaml',
                           ctx=dict(mem=config['memory']['med'],
                                    pool_id=config['pools']['standard'],
                                    **ctx),
                           func="single_cell.utils.helpers.write_to_yaml",
                           args=(mgd.OutputFile(info_file), metadata))

    return workflow
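
# Hedged sketch of the per-parameter-set layout built above, with a
# hypothetical config: each hmmcopy parameter set gets its own tagged
# results directory and output files.
import os

hmmcopy_params = {'default': {'e': 0.99}, 'strict': {'e': 0.999}}
out_dir, lib = '/out', 'LIB01'
for tag in hmmcopy_params:
    params_tag = 'hmmcopy_' + tag
    results_dir = os.path.join(out_dir, 'results', params_tag)
    print(results_dir, os.path.join(results_dir, '{}_hmmcopy.h5'.format(lib)))
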
Example #27
def bam_metrics_workflow(bam_filename, summary_fastq_screen_count_per_cell,
                         alignment_metrics, gc_metrics,
                         markdups_metrics_percell, flagstat_metrics_percell,
                         wgs_metrics_percell, gc_metrics_percell,
                         gc_metrics_summary_percell, gc_metrics_pdf_percell,
                         insert_metrics_percell, insert_metrics_pdf_percell,
                         ref_genome, sample_info, config, cell_ids):
    # restrict each per-cell mapping to the requested cell ids
    markdups_metrics_percell = {
        cellid: markdups_metrics_percell[cellid] for cellid in cell_ids
    }

    flagstat_metrics_percell = {
        cellid: flagstat_metrics_percell[cellid] for cellid in cell_ids
    }

    wgs_metrics_percell = {
        cellid: wgs_metrics_percell[cellid] for cellid in cell_ids
    }

    gc_metrics_percell = {
        cellid: gc_metrics_percell[cellid] for cellid in cell_ids
    }

    gc_metrics_summary_percell = {
        cellid: gc_metrics_summary_percell[cellid] for cellid in cell_ids
    }

    gc_metrics_pdf_percell = {
        cellid: gc_metrics_pdf_percell[cellid] for cellid in cell_ids
    }

    insert_metrics_percell = {
        cellid: insert_metrics_percell[cellid] for cellid in cell_ids
    }

    insert_metrics_pdf_percell = {
        cellid: insert_metrics_pdf_percell[cellid] for cellid in cell_ids
    }

    baseimage = config['docker']['single_cell_pipeline']
    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(name='get_duplication_metrics',
                       axes=('cell_id', ),
                       func="single_cell.utils.picardutils.bam_markdups",
                       args=(
                           mgd.InputFile('sorted_markdups',
                                         'cell_id',
                                         fnames=bam_filename),
                           mgd.TempOutputFile("temp_markdup_bam.bam",
                                              'cell_id'),
                           mgd.OutputFile('markdups_metrics',
                                          'cell_id',
                                          fnames=markdups_metrics_percell),
                           mgd.TempSpace('tempdir_markdups', 'cell_id'),
                       ),
                       kwargs={'docker_image': config['docker']['picard']})

    workflow.transform(name='get_flagstat_metrics',
                       axes=('cell_id', ),
                       func="single_cell.utils.bamutils.bam_flagstat",
                       args=(
                           mgd.InputFile('sorted_markdups',
                                         'cell_id',
                                         fnames=bam_filename),
                           mgd.OutputFile('flagstat_metrics_percell',
                                          'cell_id',
                                          fnames=flagstat_metrics_percell),
                       ),
                       kwargs={'docker_image': config['docker']['samtools']})

    workflow.transform(
        name='bam_collect_wgs_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.utils.picardutils.bam_collect_wgs_metrics",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile('wgs_metrics_percell',
                           'cell_id',
                           fnames=wgs_metrics_percell),
            config['picard_wgs_params'],
            mgd.TempSpace('wgs_tempdir', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['picard']})

    workflow.transform(
        name='bam_collect_gc_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.utils.picardutils.bam_collect_gc_metrics",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile('gc_metrics_percell',
                           'cell_id',
                           fnames=gc_metrics_percell),
            mgd.OutputFile('gc_metrics_summary_percell',
                           'cell_id',
                           fnames=gc_metrics_summary_percell),
            mgd.OutputFile('gc_metrics_pdf_percell',
                           'cell_id',
                           fnames=gc_metrics_pdf_percell),
            mgd.TempSpace('gc_tempdir', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['picard']})

    workflow.transform(
        name='bam_collect_insert_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.utils.picardutils.bam_collect_insert_metrics",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.InputFile('flagstat_metrics_percell',
                          'cell_id',
                          fnames=flagstat_metrics_percell),
            mgd.OutputFile('insert_metrics_percell',
                           'cell_id',
                           fnames=insert_metrics_percell),
            mgd.OutputFile('insert_metrics_pdf_percell',
                           'cell_id',
                           fnames=insert_metrics_pdf_percell),
            mgd.TempSpace('insert_tempdir', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['picard']})

    workflow.transform(
        name="collect_gc_metrics",
        func="single_cell.workflows.align.tasks.collect_gc",
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        args=(mgd.InputFile('gc_metrics_percell',
                            'cell_id',
                            axes_origin=[],
                            fnames=gc_metrics_percell),
              mgd.OutputFile(gc_metrics,
                             extensions=['.yaml']), mgd.TempSpace("temp_gc")),
    )

    workflow.transform(
        name='collect_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.align.tasks.collect_metrics",
        args=(
            mgd.InputFile('flagstat_metrics',
                          'cell_id',
                          axes_origin=[],
                          fnames=flagstat_metrics_percell),
            mgd.InputFile('markdups_metrics',
                          'cell_id',
                          axes_origin=[],
                          fnames=markdups_metrics_percell),
            mgd.InputFile('insert_metrics_percell',
                          'cell_id',
                          axes_origin=[],
                          fnames=insert_metrics_percell),
            mgd.InputFile('wgs_metrics_percell',
                          'cell_id',
                          axes_origin=[],
                          fnames=wgs_metrics_percell),
            mgd.TempSpace("tempdir_collect_metrics"),
            mgd.TempOutputFile("alignment_metrics.csv.gz",
                               extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='annotate_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.annotate_csv",
        args=(
            mgd.TempInputFile("alignment_metrics.csv.gz",
                              extensions=['.yaml']),
            sample_info,
            mgd.TempOutputFile('alignment_metrics_annotated.csv.gz',
                               extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='add_fastqscreen_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.merge_csv",
        args=(
            [
                mgd.TempInputFile("alignment_metrics_annotated.csv.gz",
                                  extensions=['.yaml']),
                mgd.InputFile(summary_fastq_screen_count_per_cell),
            ],
            mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
            'outer',
            ['cell_id'],
        ),
    )

    return workflow
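
# Hedged sketch of the final step: merge_csv performs an outer join on
# cell_id, which pandas reproduces on in-memory frames like so:
import pandas as pd

metrics = pd.DataFrame({'cell_id': ['c1', 'c2'], 'coverage': [1.2, 0.8]})
screen = pd.DataFrame({'cell_id': ['c2', 'c3'], 'fastqscreen_hits': [10, 4]})
print(metrics.merge(screen, how='outer', on='cell_id'))
# cells missing from either table get NaN in the other table's columns
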
Example #28
def lumpy_multi_sample_workflow(config, normal_bam, tumour_cell_bams,
                                lumpy_breakpoints_csv,
                                lumpy_breakpoints_evidence,
                                lumpy_breakpoints_bed):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    keys = sorted({(sample_id, library_id)
                   for (sample_id, library_id, _) in tumour_cell_bams.keys()})

    lumpy_breakpoints_csv = {key: lumpy_breakpoints_csv(*key) for key in keys}
    lumpy_breakpoints_evidence = {
        key: lumpy_breakpoints_evidence(*key) for key in keys
    }
    lumpy_breakpoints_bed = {key: lumpy_breakpoints_bed(*key) for key in keys}

    workflow.set_filenames('tumour_cells.bam',
                           'sample_id',
                           'library_id',
                           'cell_id',
                           fnames=tumour_cell_bams)
    workflow.set_filenames('lumpy_breakpoints.csv.gz',
                           'sample_id',
                           'library_id',
                           fnames=lumpy_breakpoints_csv)
    workflow.set_filenames('lumpy_breakpoints_evidence.csv.gz',
                           'sample_id',
                           'library_id',
                           fnames=lumpy_breakpoints_evidence)
    workflow.set_filenames('lumpy_breakpoints.bed',
                           'sample_id',
                           'library_id',
                           fnames=lumpy_breakpoints_bed)

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(normal_bam, config,
              mgd.TempOutputFile('normal.discordants.sorted.bam'),
              mgd.TempOutputFile('normal.splitters.sorted.bam'),
              mgd.TempOutputFile('hist_normal_formatted.csv'),
              mgd.TempOutputFile('normal_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        axes=('sample_id', 'library_id'),
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(mgd.InputFile('tumour_cells.bam',
                            'sample_id',
                            'library_id',
                            'cell_id',
                            extensions=['.bai']), config,
              mgd.TempOutputFile('tumour.discordants.sorted.bam', 'sample_id',
                                 'library_id'),
              mgd.TempOutputFile('tumour.splitters.sorted.bam', 'sample_id',
                                 'library_id'),
              mgd.TempOutputFile('hist_tumour_formatted.csv', 'sample_id',
                                 'library_id'),
              mgd.TempOutputFile('tumour_mean_stdev.yaml', 'sample_id',
                                 'library_id')),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        axes=('sample_id', 'library_id'),
        func="single_cell.workflows.lumpy.create_lumpy_workflow",
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour.splitters.sorted.bam', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('hist_tumour_formatted.csv', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour_mean_stdev.yaml', 'sample_id',
                              'library_id'),
            mgd.OutputFile('lumpy_breakpoints.bed', 'sample_id', 'library_id'),
            mgd.OutputFile('lumpy_breakpoints.csv.gz', 'sample_id',
                           'library_id'),
            mgd.OutputFile('lumpy_breakpoints_evidence.csv.gz', 'sample_id',
                           'library_id'),
        ),
        kwargs={
            'sample_id': mgd.InputInstance('sample_id'),
            'library_id': mgd.InputInstance('library_id')
        })

    return workflow
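
# Hedged sketch of the callable-template convention assumed above:
# lumpy_breakpoints_csv and friends map (sample_id, library_id) to a
# concrete path before being rebound to dicts. A hypothetical template
# callable:
def lumpy_breakpoints_csv_template(sample_id, library_id):
    return '/out/{}/{}/lumpy_breakpoints.csv.gz'.format(sample_id, library_id)

keys = [('S1', 'L1'), ('S1', 'L2')]
print({key: lumpy_breakpoints_csv_template(*key) for key in keys})
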
Example #29
def create_destruct_workflow(
    normal_bam,
    tumour_bam_files,
    destruct_config,
    config,
    destruct_ref_data_dir,
    breakpoints_csv,
    breakpoints_library_csv,
    cell_counts_csv,
    normal_sample_id='normal',
):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_bam_files.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_destruct',
        func=
        'single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(normal_bam, mgd.TempOutputFile('normal_stats'),
              mgd.TempOutputFile('normal_reads_1.fastq.gz'),
              mgd.TempOutputFile('normal_reads_2.fastq.gz'),
              mgd.TempOutputFile('normal_sample_1.fastq.gz'),
              mgd.TempOutputFile('normal_sample_2.fastq.gz'),
              destruct_ref_data_dir, destruct_config, config),
    )

    workflow.subworkflow(
        name='tumour_preprocess_destruct',
        func=
        'single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(mgd.InputFile('tumour_cells.bam',
                            'cell_id',
                            extensions=['.bai'],
                            fnames=tumour_bam_files),
              mgd.TempOutputFile('tumour_stats'),
              mgd.TempOutputFile('tumour_reads_1.fastq.gz'),
              mgd.TempOutputFile('tumour_reads_2.fastq.gz'),
              mgd.TempOutputFile('tumour_sample_1.fastq.gz'),
              mgd.TempOutputFile('tumour_sample_2.fastq.gz'),
              destruct_ref_data_dir, destruct_config, config),
        kwargs={'tag': True})

    workflow.subworkflow(
        name='run_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_workflow',
        args=(
            mgd.TempInputFile('normal_stats'),
            mgd.TempInputFile('normal_reads_1.fastq.gz'),
            mgd.TempInputFile('normal_reads_2.fastq.gz'),
            mgd.TempInputFile('normal_sample_1.fastq.gz'),
            mgd.TempInputFile('normal_sample_2.fastq.gz'),
            mgd.TempInputFile('tumour_stats'),
            mgd.TempInputFile('tumour_reads_1.fastq.gz'),
            mgd.TempInputFile('tumour_reads_2.fastq.gz'),
            mgd.TempInputFile('tumour_sample_1.fastq.gz'),
            mgd.TempInputFile('tumour_sample_2.fastq.gz'),
            destruct_config,
            config,
            destruct_ref_data_dir,
            mgd.OutputFile(breakpoints_csv),
            mgd.OutputFile(breakpoints_library_csv),
            mgd.OutputFile(cell_counts_csv),
            mgd.TempSpace("raw_data_dir"),
        ),
    )

    return workflow

def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells.
    """

    strelka_vcf, museq_vcf, tumour_cell_bams = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_output = os.path.join(args['out_dir'], "counts.csv.gz")

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(name='merge_snvs_museq',
                       func='single_cell.utils.vcfutils.merge_vcf',
                       args=([
                           mgd.InputFile('museq.vcf',
                                         'sample_id',
                                         'library_id',
                                         fnames=museq_vcf,
                                         extensions=['.tbi', '.csi'],
                                         axes_origin=[]),
                           mgd.InputFile('strelka.vcf',
                                         'sample_id',
                                         'library_id',
                                         fnames=strelka_vcf,
                                         extensions=['.tbi', '.csi'],
                                         axes_origin=[]),
                       ],
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi', '.csi']),
                             mgd.TempSpace("merge_vcf_temp")),
                       kwargs={'docker_image': config['docker']['vcftools']})

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(counts_output),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'], [counts_output],
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping'
            }
        })

    return workflow
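
# Hedged usage sketch: these factory functions return pypeliner workflows
# that a scheduler executes; a typical driver looks along these lines
# (argument plumbing is hypothetical):
import pypeliner.app

def main(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = create_variant_counting_workflow(args)
    pyp.run(workflow)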