Example #1
def build_pipeline():

    pipe = Pipeline("my_pipeline")

    pipe.originate(
        name="create_three_new_files",
        task_func=create_new_file,
        output=[os.path.join(WORK_DIR, f"file{i}.csv") for i in range(1, 4)],
    )

    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )

    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )

    return pipe
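
These snippets come from real projects and omit their imports and task functions. As a hedged sketch, Example #1 becomes self-contained with definitions like the following; WORK_DIR and the three task bodies are illustrative stand-ins, not the original project's code:

import hashlib
import os

from ruffus import Pipeline, output_from, suffix

WORK_DIR = "work"  # assumed working directory
os.makedirs(WORK_DIR, exist_ok=True)

def create_new_file(output_file):
    # originate task functions receive only their output file
    with open(output_file, "w") as f:
        f.write("a,b,c\n")

def csv_to_tsv(input_file, output_file):
    # transform task functions receive (input, output) for each job
    with open(input_file) as src, open(output_file, "w") as dst:
        for line in src:
            dst.write(line.replace(",", "\t"))

def md5(input_file, output_file):
    with open(input_file, "rb") as src:
        digest = hashlib.md5(src.read()).hexdigest()
    with open(output_file, "w") as dst:
        dst.write(digest + "\n")

With these in place, build_pipeline().run() executes the three stages in dependency order.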
Example #2
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name consists of only alphanumeric
        # characters and underscores.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))


    return pipeline
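
A quick way to check what the align_bwa formatter produces is to replay its regex by hand; the file name here is purely illustrative:

import re

m = re.match(r'.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz',
             'fastqs/tumour_A_R1.fastq.gz')
sample = m.group('sample')                    # 'tumour_A'
# ruffus builds the second input and the output from the same captures:
r2 = 'fastqs/{0}_R2.fastq.gz'.format(sample)  # added by add_inputs
bam = 'fastqs/{0}.bam'.format(sample)         # the declared output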
Example #3
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and
    # turns them into GATK and undr_rover VCFs.
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        # splitlines() avoids a trailing empty entry from the final newline
        passed_files = [line for line in inputf.read().splitlines() if line]

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### sort and compress undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
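
safe_make_dir, used here and in Examples #6 and #15, is not shown in any of these snippets; a plausible implementation (an assumption, not the original code):

import os

def safe_make_dir(path):
    # Create the directory, including parents, if it does not already exist.
    os.makedirs(path, exist_ok=True)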
Example #4
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='twin ion')
    # Get a list of paths to all the MZML files
    mzml_files = state.config.get_option('mzml')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original MZML files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_mzml,
        name='original_mzml',
        output=mzml_files)

    pipeline.transform(
        task_func=stages.resample,
        name='resample',
        input=output_from('original_mzml'),
        filter=suffix('.mzML'),
        output='.resample.mzML')

    pipeline.transform(
        task_func=stages.noise_filter_sgolay,
        name='noise_filter_sgolay',
        input=output_from('resample'),
        filter=suffix('.resample.mzML'),
        output='.denoise.mzML')

    pipeline.transform(
        task_func=stages.baseline_filter,
        name='baseline_filter',
        input=output_from('noise_filter_sgolay'),
        filter=suffix('.denoise.mzML'),
        output='.baseline.mzML')

    pipeline.transform(
        task_func=stages.peak_picker_hires,
        name='peak_picker_hires',
        input=output_from('baseline_filter'),
        filter=suffix('.baseline.mzML'),
        output='.peaks.mzML')

    pipeline.transform(
        task_func=stages.feature_finder_centroid,
        name='feature_finder_centroid',
        input=output_from('peak_picker_hires'),
        filter=suffix('.peaks.mzML'),
        output='.featureXML')

    return pipeline
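
Because every stage uses suffix(), each input is threaded through the chain by plain filename substitution. For an illustrative input sample1.mzML the intermediate files would be:

# sample1.mzML
#   -> sample1.resample.mzML   (resample)
#   -> sample1.denoise.mzML    (noise_filter_sgolay)
#   -> sample1.baseline.mzML   (baseline_filter)
#   -> sample1.peaks.mzML      (peak_picker_hires)
#   -> sample1.featureXML      (feature_finder_centroid)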
Example #5
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='test_pipeline')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    pipeline.transform(
        task_func=stages.stage1,
        name='stage1',
        input=output_from('original_files'),
        filter=suffix('.0'),
        output='.1')

    pipeline.transform(
        task_func=stages.stage2,
        name='stage2',
        input=output_from('stage1'),
        filter=suffix('.1'),
        output='.2')

    pipeline.transform(
        task_func=stages.stage3,
        name='stage3',
        input=output_from('stage2'),
        filter=suffix('.2'),
        output='.3')

    pipeline.transform(
        task_func=stages.stage4,
        name='stage4',
        input=output_from('stage3'),
        filter=suffix('.3'),
        output='.4')

    pipeline.transform(
        task_func=stages.stage5,
        name='stage5',
        input=output_from('stage4'),
        filter=suffix('.4'),
        output='.5')

    return pipeline
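
Most of the make_pipeline(state) examples in this listing share the same scaffolding: a state object carrying parsed configuration, and a Stages class whose bound methods serve as task functions. A minimal sketch of that pattern, with illustrative names only:

import shutil

class Config:
    def __init__(self, options):
        self.options = options

    def get_option(self, name):
        return self.options[name]

class State:
    def __init__(self, options):
        self.config = Config(options)

class Stages:
    def __init__(self, state):
        self.state = state

    def original_files(self, output):
        pass  # dummy stage: the files already exist on disk

    def stage1(self, input, output):
        shutil.copy(input, output)  # a real stage would run a tool here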
Example #6
def make_pipeline_call(state):
    # This part of the pipeline takes the summary results of "map" and
    # turns them into GATK VCFs.
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        # splitlines() avoids a trailing empty entry from the final newline
        passed_files = [line for line in inputf.read().splitlines() if line]

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
Example #7
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='md5')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original input files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    # Compute an MD5 checksum for each input file
    pipeline.transform(
        task_func=stages.md5_checksum,
        name='md5_checksum',
        input=output_from('original_files'),
        filter=suffix(''),
        output='.md5')


    return pipeline
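
Since suffix('') matches every input name unchanged, output='.md5' simply appends the new extension, e.g. (illustrative):

# data/reads.fastq.gz  ->  data/reads.fastq.gz.md5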
Example #8
def make_pipeline1(
        pipeline_name,  # Pipelines need to have a unique name
        starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
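
With the head and tail tasks set, callers can attach this sub-pipeline without knowing its internal task names. A hedged sketch, in which the downstream pipeline and its file names are assumptions:

pipeline1 = make_pipeline1("pipe1", [tempdir + "/a.1", tempdir + "/b.1"])

downstream = Pipeline("downstream")
downstream.transform(task_func=task_1_to_1,
                     name="44_to_55",
                     # a Pipeline given as input resolves to its tail tasks
                     input=pipeline1,
                     filter=suffix(".44"),
                     output=".55")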
Example #9
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(task_func=task_m_to_1,
                            name="add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input=task_originate,
                            # requires an anchor from 3.7 onwards, see
                            # https://bugs.python.org/issue34982
                            filter=regex(r"^(.*)"),
                            add_inputs=add_inputs(
                                tempdir + "/testdir/whatever.txt"),
                            output=r"\1.22")
    test_pipeline.transform(task_func=task_1_to_1,
                            name="22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input=output_from("add_input"),
                            filter=suffix(".22"),
                            output=".33")
    tail_task = test_pipeline.transform(task_func=task_1_to_1,
                                        name="33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input=test_pipeline["22_to_33"],
                                        filter=suffix(".33"),
                                        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
Example #10
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    return pipeline
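
Note the pattern in the align stage above: the .validation file is the first input (present only to enforce ordering after validation), and add_inputs re-attaches the BAM produced by fastq2bam, so the task function receives both paths. With illustrative names:

# inputs seen by stages.align for sample S1:
#   ('data/out/S1.validation', 'data/out/S1.bam')
# output:
#   'data/out/S1.mapped.bam'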
Example #11
def handle_input(in_, in_key, formatter_key):
    formatter_inputs, simple_inputs, generator_inputs = \
        aggregate_task_inputs(
            ensure_list(task.get(in_, []), tuple_ok=False))
    if len(generator_inputs) > 0:  # generator inputs go to the unnamed arg
        task_args.extend(generator_inputs)
    # simple inputs get a common general formatter
    if len(simple_inputs) > 0:
        formatter_inputs.append((simple_inputs, r'.+'))
    # handle formatter_inputs
    task_inputs = []
    task_formatters = []
    for in_, reg in formatter_inputs:
        in_ = [
            i['name'] if isinstance(i, dict) else i
            for i in ensure_list(in_)
        ]
        temp_in = []
        temp_reg = reg
        for i in in_:
            if i in task_name_list:
                temp_in.append(output_from(i))
                continue
            elif not os.path.isabs(i):  # prepend default io dir
                i = os.path.join(config.task_io_default_dir, i)
                temp_reg = r'(?:[^/]*/)*' + reg
            if re.search(r'[?*,\[\]{}]', i) is not None:
                config.logger.info('{0:^27s} (glob): {1}'.format(task_name, i))
            else:
                config.logger.info('{0:^27s} (file): {1}'.format(task_name, i))
            temp_in.append(i)
        task_inputs.append(temp_in)  # list of lists
        task_formatters.append(temp_reg)  # list of regexes
    if len(task_inputs) > 0:
        task_inputs = reduce(lambda a, b: a + b, task_inputs)  # flatten
        if len(task_inputs) > 0:
            task_kwargs[in_key] = unwrap_if_len_one(task_inputs)
        if task['pipe'] != 'merge':  # require formatter for non-merge pipe
            task_kwargs[formatter_key] = formatter(*task_formatters)
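
handle_input leans on helpers from its enclosing module (aggregate_task_inputs, ensure_list, unwrap_if_len_one) and on reduce, which is no longer a builtin on Python 3; the flatten step can also be written without it:

from functools import reduce  # required on Python 3

# equivalent flatten without reduce:
# task_inputs = [path for group in task_inputs for path in group]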
Example #12
                               "{subpath[0][0][0]}",
                               "{subdir[0][0][0]}"]).follows("WOWWWEEE").follows(gen_task1).follows(generate_initial_files1).follows("generate_initial_files1")
test_pipeline1.merge(task_func=check_product_merged_task,
                     input=check_product_task,
                     output=tempdir + "/merged.results")
test_pipeline1.product(task_func=check_product_misspelt_capture_error_task,
                       input=gen_task1,
                       filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                       output="{path[0][0]}/{FILEPART[0][0]}.tmp2")
test_pipeline1.product(task_func=check_product_out_of_range_formatter_ref_error_task,
                       input=generate_initial_files1,  #
                       filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                       output="{path[2][0]}/{basename[0][0]}.tmp2",
                       extras=["{FILE_PART[0][0]}"])
test_pipeline1.product(task_func=check_product_formatter_ref_index_error_task,
                       input=output_from("generate_initial_files1"),
                       filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                       output="{path[0][0][1000]}/{basename[0][0]}.tmp2",
                       extras=["{FILE_PART[0][0]}"])
test_pipeline1.combinations(task_func=check_combinations2_task,
                            input=generate_initial_files1,      # gen_task1
                            filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                            tuple_size=2,
                            output="{path[0][0]}/{FILE_PART[0][0]}.{basename[1][0]}.tmp2",
                            extras=["{basename[0][0][0]}{basename[1][0][0]}",       # extra: prefices
                                    # extra: path for 2nd input, 1st file
                                    "{subpath[0][0][0]}",
                                    "{subdir[0][0][0]}"])
test_pipeline1.merge(task_func=check_combinations2_merged_task,
                     input=check_combinations2_task,
                     output=tempdir + "/merged.results")
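
The *_error_task jobs in this example deliberately misuse formatter substitutions to exercise ruffus's error reporting. Roughly, the failures they provoke are:

# "{FILEPART[0][0]}"     the capture is named FILE_PART -> unknown name
# "{path[2][0]}"         only inputs [0] and [1] exist  -> out of range
# "{path[0][0][1000]}"   far fewer path components      -> index error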
Example #13
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='methylation_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Run bismark genome preparation on the reference genome
    pipeline.originate(task_func=stages.bismark_genome_prepare,
                       name='bismark_genome_prepare',
                       output='reference/Bisulfite_Genome')

    # Run FASTQC on the input fastq files
    pipeline.transform(
        task_func=stages.fastqc,
        name='fastqc',
        input=output_from('original_fastqs'),
        filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'),
        output='{path[0]}/{filename[0]}_fastqc')

    # Run bismark on the input fastq files
    (pipeline.transform(
        task_func=stages.bismark,
        name='bismark',
        input=output_from('original_fastqs'),
        filter=formatter(
            '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+).fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{filename[0]}_R2_{num[0]}.fastq.gz'),
        extras=['{path[0]}/bismark_output/'],
        output=
        '{path[0]}/bismark_output/{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz'
    )).follows('bismark_genome_prepare')

    # Run bismark methylation extractor on the bismark output
    pipeline.transform(
        task_func=stages.bismark_methylation_extractor,
        name='bismark_methylation_extractor',
        input=output_from('bismark'),
        filter=formatter(
            '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+)_bismark_bt2_pe.sam.gz'
        ),
        extras=['{path[0]}'],
        output=
        '{path[0]}/CpG_context_{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz.txt'
    )

    # Run methpat on the bismark methylation extractor output
    pipeline.transform(
        task_func=stages.methpat,
        name='methpat',
        input=output_from('bismark_methylation_extractor'),
        filter=formatter('(?P<path>.+)/CpG_context_(?P<filename>.+)'),
        extras=['{path[0]}', '{filename[0]}'],
        output='{path[0]}/CpG_context_{filename[0]}.methpat.html')

    return pipeline
Example #14
test_pipeline1.merge(task_func=test_product_merged_task,
                     input=test_product_task,
                     output=tempdir + "/merged.results")
test_pipeline1.product(task_func=test_product_misspelt_capture_error_task,
                       input=gen_task1,
                       filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                       output="{path[0][0]}/{FILEPART[0][0]}.tmp2")
test_pipeline1.product(
    task_func=test_product_out_of_range_formatter_ref_error_task,
    input=generate_initial_files1,  #
    filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
    output="{path[2][0]}/{basename[0][0]}.tmp2",
    extras=["{FILE_PART[0][0]}"])
test_pipeline1.product(task_func=test_product_formatter_ref_index_error_task,
                       input=output_from("generate_initial_files1"),
                       filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                       output="{path[0][0][1000]}/{basename[0][0]}.tmp2",
                       extras=["{FILE_PART[0][0]}"])
test_pipeline1.combinations(
    task_func=test_combinations2_task,
    input=generate_initial_files1,  # gen_task1
    filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
    tuple_size=2,
    output="{path[0][0]}/{FILE_PART[0][0]}.{basename[1][0]}.tmp2",
    extras=[
        "{basename[0][0][0]}{basename[1][0][0]}",  # extra: prefices
        "{subpath[0][0][0]}",  # extra: path for 2nd input, 1st file
        "{subdir[0][0][0]}"
    ])
test_pipeline1.merge(task_func=test_combinations2_merged_task,
                     input=test_combinations2_task,
                     output=tempdir + "/merged.results")
Example #15
def make_pipeline_process(state):
    # Post-process the per-run outputs into combined variant calls

    # Define empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    #grab files from each of the processed directories in "runs"
    gatk_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))

    stages = Stages(state)

    #dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='processed/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    #Apply VEP
    pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')

    ####### vardict stuff

    vardict_files = []
    for directory in run_directories:
        vardict_files.extend(
            glob.glob(directory + '/variants/vardict/*sorted.vcf.gz'))

    #dummy stage to take the globbed outputs of each run that is to be processed
    pipeline.originate(task_func=stages.glob_vardict,
                       name='glob_vardict',
                       output=vardict_files)

    safe_make_dir('processed/vardict')

    #concatenate all vardict vcfs
    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_vardict'),
                   output='processed/vardict/combined.vcf.gz')

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise_vardict',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise_vardict'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep_vardict',
        input=output_from('vt_decompose_normalise_vardict'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
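
One caveat worth stating explicitly for this example:

# The glob.glob() calls run when make_pipeline_process(state) is called,
# so the per-run *.g.vcf and *sorted.vcf.gz files must already exist;
# the pipeline has to be rebuilt to pick up runs completed later.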
Example #16
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name consists of only alphanumeric
        # characters and underscores.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(task_func=stages.chrom_intervals_gatk,
                       name='chrom_intervals_gatk',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam').follows(
            'mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam').follows(
            'local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(task_func=stages.call_variants_gatk,
                       name='call_variants_gatk',
                       input=output_from('print_reads_gatk'),
                       filter=suffix('.sort.dedup.realn.recal.bam'),
                       output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_variants_gatk'),
                   output='COMPLEXO.mergedgvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.mergedgvcf.vcf'),
                       output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.genotyped.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.snp_recal', 'COMPLEXO.snp_tranches']),
        output='.recal_SNP.vcf').follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['COMPLEXO.indel_recal', 'COMPLEXO.indel_tranches']),
        output='.recal_INDEL.vcf').follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['COMPLEXO.recal_INDEL.vcf']),
        output='.combined.vcf').follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(task_func=stages.select_variants_gatk,
                       name='select_variants_gatk',
                       input=output_from('combine_variants_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.selected.vcf')

    return pipeline
Example #17
File: __main__.py  Project: TomHarrop/uvb
def main():
    # prepare the ruffus pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(description='UV-B analysis pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    # need a dictionary of species to genome URL and species to gff.
    # supply this in a text file
    fasta_urls = {}
    annotation_urls = {}
    with open('data/genomeUrls.txt') as tsv:
        genome_urls = csv.reader(tsv, delimiter='\t')
        next(genome_urls, None)
        for row in genome_urls:
            fasta_urls[row[0]] = row[1]
            annotation_urls[row[0]] = row[2]

    # iterate over fasta_urls keys to run jobs
    for species in fasta_urls.keys():
        # call download script
        main_pipeline.originate(
            name=species + "_genome",
            task_func=download_genome,
            output="data/genome/" + species + "/METADATA.csv",
            extras=[species, fasta_urls[species], annotation_urls[species],
                    jgi_logon, jgi_password])
        # generate a star genome for each species
        main_pipeline.transform(
            name=species + "_index",
            task_func=generate_index,
            input=ruffus.output_from(species + "_genome"),
            filter=ruffus.regex(r"data/genome/(.*)/METADATA.csv"),
            output=r"output/\1/star-index/METADATA.csv",
            extras=[r"\1"])
        # define the reads
        main_pipeline.originate(name=species + "_reads",
                                task_func=define_reads,
                                output="ruffus/" + species + "_reads",
                                extras=[species])
        # first mapping step
        main_pipeline.collate(
            name=species + "_mapped_reads",
            task_func=star,
            input=[[ruffus.output_from(species + "_reads"),
                    ruffus.output_from(species + "_index")]],
            filter=ruffus.formatter(),
            output=["output/{subdir[1][1]}/star/METADATA.csv"],
            extras=["{subdir[1][1]}"])
    # FOR LOOP ENDS

    # parse the mapping stats
    mapping_stats = main_pipeline.merge(
        task_func=parse_star_stats_R,
        input=ruffus.output_from(
            list(species + "_mapped_reads" for species in fasta_urls.keys())),
        output="output/mapping_stats/SessionInfo.txt")

    # generate plots for mapping
    mapping_plots = main_pipeline.transform(
        task_func=plot_reads_in_genes_R,
        input=mapping_stats,
        filter=ruffus.formatter(),
        output="{subpath[0][0]}/Figure S1.pdf")

    # gather the per-species mapping results in the input field
    deseq_results = main_pipeline.transform(
        task_func=deseq2_R,
        input=ruffus.output_from(
            list(species + "_mapped_reads" for species in fasta_urls.keys())),
        filter=ruffus.formatter(),
        output=[r"output/{subdir[0][1]}/deseq2/SessionInfo.txt"],
        extras=[r"{subdir[0][1]}"])

    # combine the deseq results
    de_lists = main_pipeline.merge(
        task_func=list_de_genes_R,
        input=deseq_results,
        output="output/merged/deseq2/SessionInfo.de_genes.txt")

    # run clustering
    mfuzz_results = main_pipeline.transform(
        task_func=mfuzz_R,
        input=deseq_results,
        filter=ruffus.formatter(),
        output='output/{subdir[0][1]}/mfuzz/SessionInfo.mfuzz.txt',
        extras=['{subdir[0][1]}'])

    # combine mfuzz_results
    mfuzz_plot = main_pipeline.merge(
        task_func=combine_mfuzz_results_R,
        input=mfuzz_results,
        output='output/merged/mfuzz/SessionInfo.mfuzz.txt')

    # compare flavonoid synthesis genes
    flavonoid_genes = main_pipeline.transform(
        task_func=compare_saito_genes_R,
        input=de_lists,
        filter=ruffus.formatter(),
        output='{path[0]}/SessionInfo.flavonoid_synthesis.txt')

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)

    # print the flowchart
    ruffus.pipeline_printout_graph("ruffus/flowchart.pdf", "pdf",
                                   pipeline_name="UV-B analysis pipeline")
Example #18
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='thepipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters and hyphens.
        # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'),
        # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz
        # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz
        add_inputs=add_inputs(
            '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'],
        # extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Local realignment using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('realigner_target_creator'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        filter=formatter(
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'),
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Merge lane bams to sample bams
    pipeline.collate(
        task_func=stages.merge_sample_bams,
        name='merge_sample_bams',
        filter=formatter(
            # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'),
            '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'),
        # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'),
        input=output_from('print_reads_gatk'),
        output='alignments/{sample[0]}/{sample[0]}.merged.bam')
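
collate groups every input whose substituted output string is identical, which is what merges the per-lane BAMs into a single per-sample BAM. With illustrative file names:

# alignments/S1/run1_libA_L001_S1.sort.dedup.realn.recal.bam
# alignments/S1/run2_libA_L002_S1.sort.dedup.realn.recal.bam
#   -> both substitute to alignments/S1/S1.merged.bam -> one merge job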

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard2',
        input=output_from('merge_sample_bams'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'),
        filter=suffix('.merged.bam'),
        # XXX should make metricsup an extra output?
        output=['.merged.dedup.bam', '.metricsdup'])

    # Local realignment2 using GATK
    # Generate RealignerTargetCreator using GATK
    pipeline.transform(
        task_func=stages.realigner_target_creator,
        name='realigner_target_creator2',
        input=output_from('mark_duplicates_picard2'),
        filter=suffix('.dedup.bam'),
        output='.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk2',
        input=output_from('realigner_target_creator2'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'),
        # filter=formatter(
        # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'),
        # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        add_inputs=add_inputs(
            'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'),
        output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam')
        .follows('mark_duplicates_picard2'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('local_realignment_gatk2'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'),
        output='variants/{sample[0]}.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_haplotypecaller_gatk'),
        output='variants/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.combined.vcf'),
        output='.raw.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.raw.vcf'),
        add_inputs=add_inputs(
            ['ALL.indel_recal', 'ALL.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['ALL.recal_INDEL.vcf']),
        # output='.combined.vcf')
        output='ALL.raw.vqsr.vcf')
        .follows('apply_indel_recalibrate_gatk'))
    #
    # # Select variants using GATK
    # pipeline.transform(
    #     task_func=stages.select_variants_gatk,
    #     name='select_variants_gatk',
    #     input=output_from('combine_variants_gatk'),
    #     filter=suffix('.combined.vcf'),
    #     output='.selected.vcf')


    return pipeline
Example #19
File: pipeline.py  Project: pearg/radpipe
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="radpipe")

    # Stages are dependent on the state
    stages = PipelineStages(state)

    # Get a list of library objects.
    libraries = parse_libraries(
        libraries=state.config.get_options("libraries"))

    # Get a list of input files
    input_files = [l.files for l in libraries]
    # input_files = [item for sublist in input_files for item in sublist]
    state.logger.info("Input files: " + str(input_files))

    # Get a list of all samples for each library
    samples_dict = OrderedDict()
    for l in libraries:
        samples_dict[l.name] = l.samples
    state.logger.debug("Samples: " + str(samples_dict))

    # Make sure that there are no duplicate samples
    sample_list = [
        item for sublist in samples_dict.values() for item in sublist
    ]
    sample_counts = Counter(sample_list)
    for sample in sample_counts:
        if sample_counts[sample] > 1:
            print("Sample {} appears {} times in the barcodes files. "
                  "Sample names must be unique".format(sample,
                                                       sample_counts[sample]))
            sys.exit(radpipe.error_codes.INVALID_INPUT_FILE)

    # Define output directories
    output_dir = get_output_paths(state)
    state.logger.debug(output_dir)

    # Allow multiple comma-separated tasks
    if len(state.options.target_tasks) == 1:
        state.options.target_tasks = state.options.target_tasks[0].split(",")
    if len(state.options.forced_tasks) == 1:
        state.options.forced_tasks = state.options.forced_tasks[0].split(",")
    state.logger.debug("Target tasks: " + str(state.options.target_tasks))
    state.logger.debug("Forced tasks: " + str(state.options.forced_tasks))

    # Check if alignment_method is valid
    alignment_method = state.config.get_options(
        "alignment_method").strip().lower()
    if alignment_method not in ["bwa mem", "bowtie"]:
        print("Error: Invalid alignment_method in config file. " \
              "Valid options are ['bwa mem', 'bowtie'].")
        sys.exit(radpipe.error_codes.INVALID_ARGUMENT)
    if alignment_method == "bwa mem":
        align_task_name = "bwa_mem"
        index_task_name = "bwa_index"
    else:
        align_task_name = "bowtie"
        index_task_name = "bowtie_index"

    # TODO: Refactor this
    # If 'alignment' is in target_tasks or forced_tasks, specify which
    # type of alignment job
    if "alignment" in state.options.target_tasks:
        index = state.options.target_tasks.index("alignment")
        state.options.target_tasks[index] = align_task_name
    if "alignment" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("alignment")
        state.options.forced_tasks[index] = align_task_name

    # If 'build_index' is in target_tasks or forced_tasks, specify which
    # type of index job
    if "build_index" in state.options.target_tasks:
        index = state.options.target_tasks.index("build_index")
        state.options.target_tasks[index] = index_task_name
    if "build_index" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("build_index")
        state.options.forced_tasks[index] = index_task_name
    state.logger.debug(state)

    # Whether to include filter_bam stage or not
    filter_bams = False
    try:
        samtools_view_options = state.config.get_options(
            "samtools_view_options")
        if samtools_view_options:
            filter_bams = True
    except Exception:
        pass
    state.logger.info("Filter bams: {}".format(filter_bams))

    # Population map filenames
    popmap_file = "{output_dir}/{name}_popmap.txt".format(
        output_dir=output_dir["populations"],
        name=state.config.get_options("analysis_id"))
    try:
        config_popmap_file = state.config.get_options("popmap_file")
        if config_popmap_file:
            state.logger.info(
                "Using popmap file: {}".format(config_popmap_file))
        else:
            raise Exception
    except Exception:
        config_popmap_file = None
        state.logger.info("Creating new popmap file: {}".format(popmap_file))

    # Population r values
    populations_r = state.config.get_options("populations_r")
    assert isinstance(populations_r, list)

    # Dummy stages. These do nothing except provide a node at the beginning
    # for the pipeline graph, giving the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=input_files)

    pipeline.originate(task_func=stages.do_nothing,
                       name="reference_genome",
                       output=state.config.get_options("reference_genome"))

    # Create a copy of the population map file needed for stacks, or create
    # one denovo using the sample list.
    pipeline.originate(task_func=stages.create_popmap_file,
                       name="create_popmap_file",
                       output=[popmap_file],
                       extras=[config_popmap_file, sample_list])

    # Create index for reference genome based on alignment method.
    if alignment_method == "bwa mem":
        pipeline.transform(
            task_func=stages.bwa_index,
            name="bwa_index",
            input=output_from("reference_genome"),
            filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
            output=path_list_join(output_dir["reference"],
                                  ["reference.fa.bwt", "reference.fa.sa"]),
            extras=[output_dir["reference"]])

    if alignment_method == "bowtie":
        pipeline.transform(task_func=stages.bowtie_index,
                           name="bowtie_index",
                           input=output_from("reference_genome"),
                           filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
                           output=path_list_join(
                               output_dir["reference"],
                               ["reference.1.ebwt", "reference.rev.1.ebwt"]),
                           extras=[output_dir["reference"]])

    # FastQC
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=formatter(".+/(?P<lib>[^/]+)/(?P<fn>[^/]+).(fastq|fq).gz"),
        output="%s/{lib[0]}/{fn[0]}_fastqc.zip" % output_dir["fastqc"],
        extras=[output_dir["fastqc"], "{lib[0]}"])

    # MultiQC: FastQC
    pipeline.merge(task_func=stages.multiqc_fastqc,
                   name="multiqc_fastqc",
                   input=output_from("fastqc"),
                   output="%s/multiqc_fastqc_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["fastqc"]])

    # Stacks: Process RAD-Tags
    pipeline.transform(task_func=stages.process_radtags,
                       name="process_radtags",
                       input=output_from("original_fastqs"),
                       filter=formatter(".+/(?P<lib>[^/]+)/[^/]+"),
                       output="%s/{lib[0]}/{lib[0]}.success" %
                       output_dir["process_radtags"],
                       extras=[
                           output_dir["process_radtags"], "{lib[0]}",
                           state.config.get_options("renz_1"),
                           state.config.get_options("renz_2"),
                           state.config.get_options("process_radtags_options")
                       ])

    # Create a list for alignment with the input fastq files from process_radtags
    process_radtags_outputs = []
    for l in libraries:
        for s in l.samples:
            base = "{dir}/{lib}/{sample}".format(
                dir=output_dir["process_radtags"], lib=l.lib_id, sample=s)
            process_radtags_outputs.append(
                [base + ".1.fq.gz", base + ".2.fq.gz"])
    # print(process_radtags_outputs)

    # Alignment
    if align_task_name == "bwa_mem":
        (pipeline.transform(
            task_func=stages.bwa_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bwa.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference.fa"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bwa_index").follows("process_radtags")

    if align_task_name == "bowtie":
        (pipeline.transform(
            task_func=stages.bowtie_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bowtie.bam" % output_dir["alignments"],
            extras=[
                os.path.join(output_dir["reference"], "reference"),
                "{path[0]}", output_dir["alignments"], "{sm[0]}",
                state.config.get_options("alignment_options")
            ])).follows("bowtie_index").follows("process_radtags")

    # Sort BAM and index
    pipeline.transform(task_func=stages.sort_bam,
                       name="sort_bam",
                       input=output_from(align_task_name),
                       filter=suffix(".bam"),
                       output=".sorted.bam")

    if filter_bams:
        final_bam_task_name = "filter_bam"
        pipeline.transform(
            task_func=stages.filter_bam,
            name="filter_bam",
            input=output_from("sort_bam"),
            filter=suffix(".sorted.bam"),
            output=".sorted.filtered.bam",
            extras=[state.config.get_options("samtools_view_options")])
    else:
        final_bam_task_name = "sort_bam"

    # Samtools flagstat
    pipeline.transform(task_func=stages.flagstat,
                       name="flagstat",
                       input=output_from(final_bam_task_name),
                       filter=suffix(".bam"),
                       output=".flagstat.txt",
                       output_dir=output_dir["flagstat"])

    # MultiQC: flagstat
    pipeline.merge(task_func=stages.multiqc_flagstat,
                   name="multiqc_flagstat",
                   input=output_from("flagstat"),
                   output="%s/multiqc_flagstat_report.html" % output_dir["qc"],
                   extras=[output_dir["qc"], output_dir["flagstat"]])

    # Stacks: gstacks
    pipeline.merge(task_func=stages.gstacks,
                   name="gstacks",
                   input=output_from(final_bam_task_name),
                   output="%s/catalog.fa.gz" % output_dir["gstacks"],
                   extras=[
                       output_dir["alignments"], output_dir["gstacks"],
                       align_task_name, final_bam_task_name, sample_list,
                       state.config.get_options("gstacks_options")
                   ])

    # Define outputs from each run of populations
    populations_outputs = []
    for r in populations_r:
        dir_name = "{pop_dir}/{analysis_name}_r{r}".format(
            pop_dir=output_dir["populations"],
            analysis_name=state.config.get_options("analysis_id"),
            r=r)
        populations_outputs.append(
            os.path.join(dir_name, "populations.snps.vcf"))
    # print(populations_outputs)

    # Stacks: populations
    pipeline.originate(task_func=stages.populations,
                       name="popluations",
                       output=populations_outputs,
                       extras=[
                           output_dir["gstacks"], output_dir["populations"],
                           popmap_file,
                           state.config.get_options("populations_options")
                       ]).follows("gstacks").follows("create_popmap_file")

    return pipeline
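
The dummy stages and the path_list_join call above rely on radpipe support code that is not shown in this listing. A plausible minimal sketch of those helpers (assumed for illustration; the real radpipe definitions may differ):

import os

def do_nothing(*args):
    '''Task body for dummy originate stages: the output files already
    exist on disk, so there is nothing to compute. Ruffus still calls
    the function, so accept and ignore whatever it passes.'''
    pass

def path_list_join(dir_path, file_names):
    '''Join a directory onto each file name in a list.'''
    return [os.path.join(dir_path, name) for name in file_names]
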
Example #20
def make_pipeline_process(state):
    # Define empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    # Grab files from each of the processed directories in "runs"
    gatk_files = []
    undr_rover_files = []
    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))
        undr_rover_files.extend(
            glob.glob(directory + '/variants/undr_rover/*sorted.vcf.gz'))

    # Stages are dependent on the state
    stages = Stages(state)

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    # Dummy stage to grab the undr_rover files
    pipeline.originate(task_func=stages.glob_undr_rover,
                       name='glob_undr_rover',
                       output=undr_rover_files)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')

    pipeline.merge(task_func=stages.concatenate_vcfs,
                   name='concatenate_vcfs',
                   input=output_from('glob_undr_rover'),
                   output='variants/undr_rover/combined_undr_rover.vcf.gz')

    pipeline.transform(task_func=stages.index_final_vcf,
                       name='index_final_vcf',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    #Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        add_inputs=add_inputs(
            ['variants/undr_rover/combined_undr_rover.vcf.gz']),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf').follows(
            'index_final_vcf'))

    return pipeline
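
By ruffus convention, the task function behind a merge stage such as concatenate_vcfs receives the full list of upstream outputs as its first argument and the single output path as its second. A hedged sketch of such a function (stand-alone with subprocess; the real hiplexpipe stage method also takes self and runs through the pipeline's job-submission machinery, which is not shown here):

import subprocess

def concatenate_vcfs(vcf_files_in, vcf_out):
    '''Combine many sorted, indexed VCFs into one compressed VCF.'''
    command = ['bcftools', 'concat', '-O', 'z', '-o', vcf_out]
    command.extend(vcf_files_in)
    subprocess.check_call(command)
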
Example #21
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name).
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a
        # .decompose.normalize.vcf extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
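
This example mixes the two ruffus filter styles: suffix for plain extension swaps, and formatter when named regex captures are needed in the output template, add_inputs, or extras. A toy sketch of the difference (invented file names and task bodies; assumes a data/ directory exists when actually run):

from ruffus import Pipeline, formatter, output_from, suffix

def make_vcf(output):
    open(output, 'w').close()

def convert(vcf_in, vcf_out, *extras):
    open(vcf_out, 'w').close()

demo = Pipeline('filter_demo')
demo.originate(task_func=make_vcf, name='vcfs', output=['data/sampleA.vcf'])
# suffix: data/sampleA.vcf -> data/sampleA.vep.vcf
demo.transform(task_func=convert, name='by_suffix',
               input=output_from('vcfs'), filter=suffix('.vcf'),
               output='.vep.vcf')
# formatter: the named capture is reusable in both output and extras
demo.transform(task_func=convert, name='by_formatter',
               input=output_from('vcfs'),
               filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
               output='{path[0]}/{sample[0]}.snpeff.vcf',
               extras=['{sample[0]}'])
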
Example #22
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'
        ),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.clipped.sort.hq.bam')

    # generate mapping metrics.
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])

    summary_file = 'all_sample.summary.txt'

    (pipeline.originate(task_func=stages.grab_summary_file,
                        name='grab_summary_file',
                        output=summary_file).follows('generate_stats'))

    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])

    return pipeline
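
The collate stage above is the many-to-one grouping step of this pipeline: the first capture group of the regex (the sample name) acts as the grouping key, so the four metric files belonging to one sample arrive together as a single input list. A stand-alone toy version of the same pattern (invented file names and task body):

from ruffus import Pipeline, regex

def summarise(metric_files, output_file, sample, summary_name):
    # metric_files holds every input whose first regex group matched `sample`.
    with open(output_file, 'w') as out:
        out.write('{}: {} metric files\n'.format(sample, len(metric_files)))

demo = Pipeline('collate_demo')
demo.collate(
    task_func=summarise,
    input=['metrics/S1.mapped_to_genome.txt',
           'metrics/S1.total_raw_reads.txt',
           'metrics/S2.mapped_to_genome.txt'],
    filter=regex(r'.+/(.+)\.(mapped_to_genome|total_raw_reads)\.txt'),
    output=r'metrics/all_sample.summary.\1.txt',
    extras=[r'\1', 'all_sample.summary.txt'])
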
Example #23
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard 
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard 
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK 
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK 
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK 
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK 
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK 
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK 
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK  
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK 
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
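
None of these make_pipeline functions runs anything by itself; each returns the assembled Pipeline to a caller. A hedged sketch of a typical driver, assuming ruffus's object-oriented interface (the real tools wrap this in option parsing, logging, and cluster-submission setup not shown here):

pipeline = make_pipeline(state)
# Draw the task graph for inspection, then run up to four jobs at once.
pipeline.printout_graph('flowchart.svg', 'svg')
pipeline.run(multiprocess=4)
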
Example #24
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    #
    # performs fastqc on fastq inputs
    #
    pipeline.transform(
        task_func=stages.fastqc,
        name='fastqc',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'),
        output='{path[0]}/{filename[0]}_fastqc')

    #
    # converts the fastq inputs to pre-aligned bams
    #
    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/{sample[0]}.bam')

    #
    # validates pre-aligned bams x.bam -> x.validation
    #
    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    # aligns pre-aligned bam x.bam -> x.mapped.bam
    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    # generates stats about an aligned bam
    pipeline.transform(
        task_func=stages.align_stats_bedtools,
        name='align_stats_bedtools',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.genomecov.stats')

    # generates stats about an aligned bam
    pipeline.transform(
        task_func=stages.align_stats_picard,
        name='align_stats_picard',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.picard.stats')

    #
    # runs the Sanger variant calling pipeline
    #
    #pipeline.transform(
    #    task_func=stages.analyse_wgs,
    #    name='analyse_wgs',
    #    input=output_from('align'),
    #    filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
    #    output='{path[0]}/{sample[0]}.wgs/manifest')

    # runs the components of the Sanger variant calling pipeline
    pipeline.transform(
        task_func=stages.analyse_wgs_prepare,
        name='analyse_wgs_prepare',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.prepare')

    pipeline.transform(
        task_func=stages.analyse_wgs_reference_files,
        name='analyse_wgs_reference_files',
        input=[output_from('align'),
               output_from('analyse_wgs_prepare')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.reference_files')

    pipeline.transform(
        task_func=stages.analyse_wgs_init,
        name='analyse_wgs_init',
        input=[
            output_from('align'),
            output_from('analyse_wgs_reference_files')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.init')

    # block 1
    pipeline.transform(
        task_func=stages.analyse_wgs_verify_WT,
        name='analyse_wgs_verify_WT',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.verify_WT')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel_input,
        name='analyse_wgs_cgpPindel_input',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_input')

    pipeline.transform(
        task_func=stages.analyse_wgs_alleleCount,
        name='analyse_wgs_alleleCount',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.alleleCount')

    # block 2
    pipeline.transform(
        task_func=stages.analyse_wgs_ascat,
        name='analyse_wgs_ascat',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.ascat')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel,
        name='analyse_wgs_cgpPindel',
        input=[
            output_from('align'),
            output_from('analyse_wgs_cgpPindel_input')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS_input,
        name='analyse_wgs_BRASS_input',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS_input')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS_cover,
        name='analyse_wgs_BRASS_cover',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS_cover')

    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan_split,
        name='analyse_wgs_CaVEMan_split',
        input=[output_from('align'),
               output_from('analyse_wgs_init')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_split')

    # after block 2
    pipeline.transform(
        task_func=stages.analyse_wgs_ascat_prep,
        name='analyse_wgs_ascat_prep',
        input=[output_from('align'),
               output_from('analyse_wgs_ascat')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.ascat_prep')

    pipeline.transform(
        task_func=stages.analyse_wgs_pindel_prep,
        name='analyse_wgs_pindel_prep',
        input=[output_from('align'),
               output_from('analyse_wgs_cgpPindel')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.pindel_prep')

    # parallel block 3
    pipeline.transform(
        task_func=stages.analyse_wgs_verify_MT,
        name='analyse_wgs_verify_MT',
        input=[
            output_from('align'),
            output_from('analyse_wgs_verify_WT'),
            output_from('analyse_wgs_ascat_prep')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.verify_MT')

    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan,
        name='analyse_wgs_CaVEMan',
        input=[
            output_from('align'),
            output_from('analyse_wgs_CaVEMan_split'),
            output_from('analyse_wgs_ascat_prep'),
            output_from('analyse_wgs_cgpPindel')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan')

    pipeline.transform(
        task_func=stages.analyse_wgs_BRASS,
        name='analyse_wgs_BRASS',
        input=[
            output_from('align'),
            output_from('analyse_wgs_BRASS_cover'),
            output_from('analyse_wgs_BRASS_input'),
            output_from('analyse_wgs_ascat_prep')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.BRASS')

    pipeline.transform(
        task_func=stages.analyse_wgs_cgpPindel_annot,
        name='analyse_wgs_cgpPindel_annot',
        input=[output_from('align'),
               output_from('analyse_wgs_pindel_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_annot')

    # pre block 4
    pipeline.transform(
        task_func=stages.analyse_wgs_caveman_prep,
        name='analyse_wgs_caveman_prep',
        input=[output_from('align'),
               output_from('analyse_wgs_CaVEMan')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.caveman_prep')

    # block 4
    pipeline.transform(
        task_func=stages.analyse_wgs_CaVEMan_annot,
        name='analyse_wgs_CaVEMan_annot',
        input=[output_from('align'),
               output_from('analyse_wgs_caveman_prep')],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_annot')

    # done
    pipeline.transform(
        task_func=stages.analyse_wgs_finish,
        name='analyse_wgs_finish',
        input=[
            output_from('align'),
            output_from('analyse_wgs_CaVEMan_annot'),
            output_from('analyse_wgs_BRASS'),
            output_from('analyse_wgs_cgpPindel_annot'),
            output_from('analyse_wgs_alleleCount'),
            output_from('analyse_wgs_verify_MT')
        ],
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.wgs/completed.finish')

    #
    # runs the delly singularity container
    #

    pipeline.transform(
        task_func=stages.delly,
        name='delly',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.delly.completed')

    pipeline.transform(
        task_func=stages.gridss,
        name='gridss',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.gridss.completed')

    pipeline.transform(
        task_func=stages.muse,
        name='muse',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.muse.completed')

    pipeline.transform(
        task_func=stages.mutect2,
        name='mutect2',
        input=output_from('align'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
        output='{path[0]}/{sample[0]}.mutect2.completed')

    return pipeline
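
Example #24 encodes the Sanger WGS block structure entirely through input lists: each analyse_wgs_* stage names output_from('align') plus the blocks it must wait for, while the formatter only matches the .mapped.bam path, so the extra inputs act as pure ordering edges rather than data. A toy sketch of that idiom (invented task body; assumes a work/ directory exists when actually run):

from ruffus import Pipeline, formatter, output_from

def touch(*args):
    # Toy task body: the last argument ruffus passes is the output path.
    open(args[-1], 'w').close()

demo = Pipeline('blocks_demo')
demo.originate(task_func=touch, name='align', output=['work/a.mapped.bam'])
demo.originate(task_func=touch, name='wgs_init', output=['work/a.init.flag'])
demo.transform(
    task_func=touch,
    name='wgs_block',
    # Listing both parents makes this node wait for align AND wgs_init;
    # the filter only matches the .mapped.bam file, so the flag file is
    # an ordering edge, not an input that reaches the task body.
    input=[output_from('align'), output_from('wgs_init')],
    filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'),
    output='{path[0]}/{sample[0]}.block1.flag')
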
Example #25
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    pipeline.transform(task_func=stages.run_connor,
                       name='run_connor',
                       input=output_from('align_bwa'),
                       filter=suffix('.sort.hq.bam'),
                       output='.sort.hq.connor.bam')

    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_raw',
                       input=output_from('intersect_bed_raw'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads_connor',
                       input=output_from('intersect_bed_connor'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])

    safe_make_dir('variants')
    safe_make_dir('variants/vardict')

    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz').follows('index_vcfs'))

    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('concatenate_vcfs'),
                       filter=suffix('.vcf.gz'),
                       output='.decomp.norm.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_final_vcf',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.decomp.norm.vcf.gz'),
                       output='.decomp.norm.vcf.gz.tbi')

    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf').follows('index_final_vcf'))

    return pipeline
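
A detail worth noting in this example: the Connor metrics reuse the same stage methods as the raw-BAM metrics, registered under different task names (intersect_bed_raw / intersect_bed_connor, and so on). In ruffus the task name, not the Python function, identifies a node, so one function can back several pipeline nodes. A minimal sketch of that pattern (toy functions and file names):

from ruffus import Pipeline, output_from, suffix

def make_bam(output):
    open(output, 'w').close()

def count_reads(bam_in, txt_out):
    # Shared stage body, used by both nodes below.
    with open(txt_out, 'w') as out:
        out.write('counted {}\n'.format(bam_in))

demo = Pipeline('reuse_demo')
demo.originate(task_func=make_bam, name='raw_bam', output=['sample.raw.bam'])
demo.originate(task_func=make_bam, name='dedup_bam', output=['sample.dedup.bam'])
demo.transform(task_func=count_reads, name='count_raw',
               input=output_from('raw_bam'), filter=suffix('.bam'),
               output='.counts.txt')
demo.transform(task_func=count_reads, name='count_dedup',
               input=output_from('dedup_bam'), filter=suffix('.bam'),
               output='.counts.txt')
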
Example #26
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam'
    )

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        extras=['{sample[0]}', '{readid[0]}'],

        # The output file name is the sample name with a .vcf extension.
        output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # index bam file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam').follows('index_bam'))

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-_]+).primary.primerclipped.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
    # .follows('index_sort_bam_picard'))

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(task_func=stages.apply_variant_filtration_gatk,
                       name='apply_variant_filtration_gatk',
                       input=output_from('variant_annotator_gatk'),
                       filter=suffix('.raw.annotate.vcf'),
                       output='.raw.annotate.filtered.vcf')

    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('apply_variant_filtration_gatk'),
        filter=suffix('.raw.annotate.filtered.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.raw.annotate.filtered.norm.vcf').follows(
            'apply_variant_filtration_gatk'))

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.raw.annotate.filtered.norm.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.raw.annotate.filtered.norm.vep.vcf').follows('apply_vt'))

    # Apply SnpEff
    (pipeline.transform(
        task_func=stages.apply_snpeff,
        name='apply_snpeff',
        input=output_from('apply_vep'),
        filter=suffix('.raw.annotate.filtered.norm.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.raw.annotate.filtered.norm.vep.snpeff.vcf').follows(
            'apply_vep'))

    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_snpeff'),
        filter=suffix('.raw.annotate.filtered.norm.vep.snpeff.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.annotated.vcf').follows('apply_snpeff'))

    # Concatenate undr_rover vcf files
    pipeline.merge(task_func=stages.apply_cat_vcf,
                   name='apply_cat_vcf',
                   input=output_from('apply_undr_rover'),
                   output='variants/undr_rover/ur.vcf.gz')

    # # Apply VEP on concatenated undr_rover vcf file
    # (pipeline.transform(
    #     task_func=stages.apply_vep,
    #     name='apply_vep_ur',
    #     input=output_from('apply_cat_vcf'),
    #     filter=suffix('.vcf.gz'),
    #     output='.vep.vcf')
    #     .follows('apply_cat_vcf'))
    #
    # # Apply vcfanno on concatenated/vep undr_rover vcf file
    # (pipeline.transform(
    #     task_func=stages.apply_vcfanno,
    #     name='apply_vcfanno_ur',
    #     input=output_from('apply_vep_ur'),
    #     filter=suffix('.vep.vcf'),
    #     output='.vep.anno.vcf')
    #     .follows('apply_vep_ur'))
    #
    # # Apply snpeff
    # (pipeline.transform(
    #     task_func=stages.apply_snpeff,
    #     name='apply_snpeff_ur',
    #     input=output_from('apply_vcfanno_ur'),
    #     filter=suffix('.vep.anno.vcf'),
    #     output='.vep.anno.snpeff.vcf.gz')
    #     .follows('apply_vcfanno_ur'))
    #
    # Apply tabix
    pipeline.transform(task_func=stages.apply_tabix,
                       name='apply_tabix',
                       input=output_from('apply_cat_vcf'),
                       filter=suffix('.vcf.gz'),
                       output='.vcf.gz.tbi')

    # # Apply HomopolymerRun
    # (pipeline.transform(
    #     task_func=stages.apply_homopolymer_ann,
    #     name='apply_homopolymer_ann',
    #     input=output_from('apply_snpeff_ur'),
    #     filter=suffix('.vep.anno.snpeff.vcf.gz'),
    #     output='.annotated.vcf')
    #     .follows('apply_tabix'))

    # # Apply summarize multi coverage
    # (pipeline.merge(
    #     task_func=stages.apply_multicov,
    #     name='apply_multicov',
    #     input=output_from('primary_bam'),
    #     # filter=suffix('.primary.bam'),
    #     output='coverage/all.multicov.txt')
    #     .follows('index_bam'))

    # Apply summarize picard coverage
    # (pipeline.merge(
    #     task_func=stages.apply_summarize_picard,
    #     name='apply_summarize_picard',
    #     input=output_from('target_coverage'),
    #     output='coverage/all.hsmetrics.txt')
    #     .follows('target_coverage'))

    # # Apply summarize multicov coverage plots
    # (pipeline.merge(
    #     task_func=stages.apply_multicov_plots,
    #     name='apply_multicov_plots',
    #     input=output_from('apply_multicov'),
    #     output='coverage/coverage_analysis_main.html')
    #     .follows('apply_multicov'))

    return pipeline
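
A quick note on the Stages methods used above: they are defined elsewhere in hiplexpipe and are not part of this excerpt. As a rough illustration, a shell-wrapping task function such as apply_tabix could look like the sketch below (names and structure assumed, not the project's actual code):

import subprocess

def apply_tabix(vcf_gz_in, tbi_out):
    '''Index a bgzipped VCF with tabix. Ruffus passes the tracked input
    and output paths; tabix itself writes <input>.tbi, which is exactly
    the output path declared by the transform() above, so tbi_out is
    only needed for dependency tracking.'''
    subprocess.check_call(['tabix', '-p', 'vcf', vcf_gz_in])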
Example #27
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="rnapipe")

    # Get the details of the experiment (samples, config, inputs, ...)
    experiment = Experiment(state)

    # Get reference file locations
    reference_genome = state.config.get_options("reference_genome")
    gene_ref = state.config.get_options("gene_ref")

    # Print out samples
    sample_text = [s.info() for s in experiment.sample_list]
    logging.info("Analysis samples:\n{}".format("\n".join(sample_text)))

    # Stages are dependent on the state. Experiment object is also passed so
    # we can access metadata later.
    stages = PipelineStages(state, experiment=experiment)

    # Make directories
    output_dir = get_output_paths(
        results_dir=state.config.get_options("results_dir"),
        default_paths=OUTPUT_PATHS)
    make_output_dirs(output_dir)
    logging.debug(output_dir)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=experiment.R1_files)

    # Create reference index for alignment
    if not experiment.index_provided:
        pipeline.originate(task_func=stages.do_nothing,
                           name="reference_genome",
                           output=reference_genome)

        if experiment.alignment_method == "star":
            # Create reference index for STAR
            pipeline.transform(task_func=stages.create_star_index,
                               name="create_star_index",
                               input=output_from("reference_genome"),
                               filter=formatter(".*"),
                               add_inputs=add_inputs(gene_ref),
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]),
                               extras=[output_dir["star_index"]])

        elif experiment.alignment_method == "hisat2":
            # Create reference index for HISAT2
            hisat_basename = path.join(output_dir["hisat_index"], "genome")
            pipeline.transform(
                task_func=stages.create_hisat_index,
                name="create_hisat_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(output_dir["hisat_index"],
                                      ["genome.1.ht2", "genome.2.ht2"]),
                extras=[hisat_basename])
    else:
        # Don't create index if index is supplied
        if experiment.alignment_method == "star":
            output_dir["star_index"] = state.config.get_options("star_index")
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_star_index",
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]))
        elif experiment.alignment_method == "hisat2":
            hisat_basename = state.config.get_options("hisat_index")
            output_dir["hisat_index"] = path.dirname(hisat_basename)
            prefix = path.basename(hisat_basename)
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_hisat_index",
                               output=path_list_join(
                                   output_dir["hisat_index"], [
                                       "{prefix}.1.ht2".format(prefix=prefix),
                                       "{prefix}.2.ht2".format(prefix=prefix)
                                   ]))

    # Pre-trim FastQC
    if experiment.paired_end:
        pipeline.transform(
            task_func=stages.fastqc,
            name="fastqc",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(
                output_dir["fastqc"],
                ["{sample[0]}_R1_fastqc.zip", "{sample[0]}_R2_fastqc.zip"]),
            extras=[output_dir["fastqc"]])
    else:
        pipeline.transform(task_func=stages.fastqc,
                           name="fastqc",
                           input=output_from("original_fastqs"),
                           filter=suffix(".fastq.gz"),
                           output="_fastqc.zip",
                           output_dir=output_dir["fastqc"],
                           extras=[output_dir["fastqc"]])

    # Trimmomatic
    if experiment.trim_reads and experiment.paired_end:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            # Get R1 file and the corresponding R2 file
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(output_dir["seq"], [
                "{sample[0]}_R1.trimmed.fastq.gz",
                "{sample[0]}_R2.trimmed.fastq.gz"
            ]),
            extras=path_list_join(output_dir["seq"], [
                "{sample[0]}_R1.unpaired.fastq.gz",
                "{sample[0]}_R2.unpaired.fastq.gz"
            ]))
    elif experiment.trim_reads:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            output=path.join(output_dir["seq"],
                             "{sample[0]}_R1.trimmed.fastq.gz"))

    # Post-trim FastQC
    if experiment.paired_end and experiment.trim_reads:
        pipeline.transform(
            task_func=stages.fastqc,
            name="post_trim_fastqc",
            input=output_from("trim_reads"),
            filter=formatter(
                ".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.trimmed.fastq.gz"),
            output=path_list_join(output_dir["post_trim_fastqc"], [
                "{sample[0]}_R1.trimmed_fastqc.gz",
                "{sample[0]}_R2.trimmed_fastqc.gz"
            ]),
            extras=["results/qc/post_trim_fastqc/"])
    elif experiment.trim_reads:
        pipeline.transform(task_func=stages.fastqc,
                           name="post_trim_fastqc",
                           input=output_from("trim_reads"),
                           filter=suffix(".trimmed.fastq.gz"),
                           output=".trimmed_fastqc.gz",
                           output_dir=output_dir["post_trim_fastqc"],
                           extras=[output_dir["post_trim_fastqc"]])

    # If there are technical replicates, each is mapped independently.
    # This is so each technical replicate maintains a separate read group.
    if experiment.alignment_method == "star":
        align_task_name = "star_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam" \
                        % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
            ).follows("create_star_index")
        else:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam" \
                        % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
            ).follows("create_star_index")

    if experiment.alignment_method == "hisat2":
        align_task_name = "hisat_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam" \
                        % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
            ).follows("create_hisat_index")
        else:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)" \
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam" \
                        % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
            ).follows("create_hisat_index")

    # Sort BAM by coordinates
    pipeline.transform(
        task_func=stages.sort_bam_by_coordinate,
        name="sort_bam_by_coordinate",
        input=output_from(align_task_name),
        filter=formatter(
            ".+/(?P<sample>[a-zA-Z0-9-_]+)\.(?P<method>(star|hisat2))\..*bam"),
        output=[
            "{path[0]}/{sample[0]}.{method[0]}.sorted.bam",
            "{path[0]}/{sample[0]}.{method[0]}.sorted.bam.bai"
        ])

    # Merge files with the same sample name
    if experiment.multiple_technical_replicates:
        pipeline.collate(
            task_func=stages.merge_bams,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"
            ),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))
    else:
        pipeline.transform(
            task_func=stages.create_symlinks,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"
            ),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))

    # Sort BAM by name for counting features
    pipeline.transform(task_func=stages.sort_bam_by_name,
                       name="sort_bam_by_name",
                       input=output_from("merge_bams"),
                       filter=suffix(".bam"),
                       output=".nameSorted.bam")

    # Count features with HTSeq-count
    pipeline.transform(task_func=stages.htseq_count,
                       name="htseq_count",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".htseq.txt")

    # Count features with featureCounts
    pipeline.transform(task_func=stages.featurecounts,
                       name="featurecounts",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".featureCounts.txt")

    # TODO: add multiqc step

    #     # Stringtie assembly
    #     pipeline.transform(
    #         task_func=stages.stringtie_assembly,
    #         name="stringtie_assembly",
    #         input=output_from("merge_bams"),
    #         filter=suffix(".bam"),
    #         output_dir=output_dir["stringtie_assembly"],
    #         output=".gtf")

    # Stringtie estimates
    pipeline.transform(
        task_func=stages.stringtie_estimates,
        name="stringtie_estimates",
        input=output_from("merge_bams"),
        filter=formatter(
            ".+/(?P<sm>[a-zA-Z0-9-]+)\.(?P<method>(star|hisat2)).bam"),
        output=path_list_join(output_dir["stringtie_estimates"],
                              ["{sm[0]}/{sm[0]}.gtf", "{sm[0]}/e_data.ctab"]))

    # Stringtie counts
    pipeline.collate(
        task_func=stages.stringtie_prepDE,
        name="stringtie_prepDE",
        input=output_from("stringtie_estimates"),
        filter=formatter(".+\.gtf"),
        output=path_list_join(
            output_dir["stringtie_estimates"],
            ["gene_count_matrix.csv", "transcript_count_matrix.csv"]))
    return pipeline
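
path_list_join and the output_dir mapping come from rnapipe's utility code, which is not shown in this excerpt. Its behaviour is presumably just joining one directory onto many filenames, along these lines (an assumed helper, not the project's actual definition):

from os import path

def path_list_join(dir_path, file_names):
    '''Join a directory onto each filename in a list.'''
    return [path.join(dir_path, name) for name in file_names]

# path_list_join('counts', ['a.htseq.txt', 'b.htseq.txt'])
# -> ['counts/a.htseq.txt', 'counts/b.htseq.txt']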
Example #28
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='haloplexpipe')
    # Get a list of paths to all the FASTQ files
    #fastq_files = state.config.get_option('fastqs')
    fastq_files = glob.glob("fastqs/*.gz")
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('processed_fastqs')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/pass_samples')
    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/vardict')

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    pipeline.transform(
        task_func=stages.run_surecalltrimmer,
        name='run_surecalltrimmer',
        input=output_from('original_fastqs'),
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'),
        #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'),
        extras=['{sample[0]}'],
        # The output only needs to name one file for Ruffus to track progress, but the second file certainly exists after this step.
        output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz')
    #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('run_surecalltrimmer'),
        filter=formatter(
            'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz'
        ),
        add_inputs=add_inputs(
            'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'),
        #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'),
        #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'),
        extras=['{sample[0]}'],
        output='alignments/{sample[0]}.bam')

    # Run LocatIt from Agilent. This should produce sorted BAM files, so no sorting is needed at the next step.
    pipeline.collate(task_func=stages.run_locatit,
                     name='run_locatit',
                     input=output_from('align_bwa', 'original_fastqs'),
                     filter=regex(r'.+/(.+_L\d\d\d).+'),
                     output=r'alignments/\1.locatit.bam')

    pipeline.transform(task_func=stages.sort_bam,
                       name='sort_bam',
                       input=output_from('run_locatit'),
                       filter=suffix('.locatit.bam'),
                       output='.sorted.locatit.bam')

    # # # # # Metrics stages # # # # #
    # generate mapping metrics (post locatit)
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    # Intersect the bam file with the region of interest
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    # Calculate coverage metrics from the intersected bam file
    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    # Count the number of mapped reads
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    # Count the number of on-target reads
    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    # Count the number of total reads
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('sort_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    # Generate summary metrics from the stats files produced
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
    # # # # # Metrics stages end # # # # #

    # # # # # Checking metrics and calling # # # # #
    # Originate to set the location of the metrics summary file
    (pipeline.originate(
        task_func=stages.grab_summary_file,
        name='grab_summary_file',
        output='all_sample.summary.txt').follows('generate_stats'))

    # Awk command to produce a list of bam files passing filters
    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt')

    # Place the BAMs that passed filtering into the pass_samples folder and pass the glob of that folder to HaplotypeCaller
    pipeline.subdivide(name='passed_filter_files',
                       task_func=stages.read_samples,
                       input=output_from('filter_stats'),
                       filter=formatter(),
                       output="metrics/pass_samples/*.bam")

    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam'))

    # Call variants with vardict
    (pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}']).follows('sort_bam'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    return pipeline
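
The collate() stages above group inputs by their first regex capture. The run_locatit grouping can be checked in isolation with plain re and made-up filenames (illustrative only; real HaloPlex names carry extra fields):

import re
from collections import defaultdict

pattern = re.compile(r'.+/(.+_L\d\d\d).+')

# The BAM from align_bwa and the raw FASTQs share the '<sample>_L<lane>'
# stem, so all three collapse into a single run_locatit invocation.
files = ['alignments/sampleA_L001.bam',
         'fastqs/sampleA_L001_R1.fastq.gz',
         'fastqs/sampleA_L001_R2.fastq.gz']

groups = defaultdict(list)
for name in files:
    match = pattern.match(name)
    if match:
        groups['alignments/{}.locatit.bam'.format(match.group(1))].append(name)

print(dict(groups))
# {'alignments/sampleA_L001.locatit.bam': [...all three inputs...]}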
Example #29
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)-(?P<tumor>[TN]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}-{tumor[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{tumor[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{sample[0]}_{tumor[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(
        task_func=stages.primary_bam,
        name='primary_bam',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        output='.primary.bam')

    # index bam file
    pipeline.transform(
        task_func=stages.index_sort_bam_picard,
        name='index_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
        .follows('index_bam'))

    ###### GATK VARIANT CALLING - MuTect2 ######

    # Call somatic variants using MuTect2
    pipeline.transform(
        task_func=stages.call_mutect2_gatk,
        name='call_mutect2_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_N.primary.primerclipped.bam'),
        # extras=['{sample[0]}'],
        output='variants/mutect2/{sample[0]}.mutect2.vcf')
        # .follows('clip_bam')

    ###### GATK VARIANT CALLING - MuTect2 ######

    # -------- VEP ----------
    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('call_mutect2_gatk'),
        filter=suffix('.mutect2.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vcf')
        .follows('call_mutect2_gatk'))
    #
    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.mutect2.vt.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vep.vcf')
        .follows('apply_vt'))
    #
    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_vep'),
        filter=suffix('.mutect2.vt.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.annotated.vcf')
        .follows('apply_vep'))

    return pipeline
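
The MuTect2 stage above pairs tumour and normal purely by filename convention: formatter() matches the _T BAM, and add_inputs() rewrites the match into the path of the matching _N BAM. The substitution can be sanity-checked with plain re (filenames here are made up):

import re

pattern = re.compile(r'.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam')
tumour = 'alignments/OHI031002-P02F04/OHI031002-P02F04_T.primary.primerclipped.bam'

sample = pattern.match(tumour).group('sample')
# add_inputs('{path[0]}/{sample[0]}_N.primary.primerclipped.bam') resolves to:
normal = 'alignments/{0}/{0}_N.primary.primerclipped.bam'.format(sample)
print(normal)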
Example #30
File: pipeline.py, Project: khalidm/crpipe
def make_pipeline(state):
    """Build the pipeline by constructing stages and connecting them together"""
    # Build an empty pipeline
    pipeline = Pipeline(name="crpipe")
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option("fastqs")
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs, name="original_fastqs", output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run FastQC on the FASTQ files
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=suffix(".fastq.gz"),
        output="_fastqc",
    )

    # Index the reference using BWA
    # pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name="align_bwa",
        input=output_from("original_fastqs"),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz"),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=["{sample[0]}"],
        # The output file name is the sample name with a .bam extension.
        output="{path[0]}/{sample[0]}.bam",
    )

    # Sort alignment with sambamba
    pipeline.transform(
        task_func=stages.sort_bam_sambamba,
        name="sort_alignment",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.sorted.bam",
    )

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name="extract_genes_bedtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam",
    )

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name="extract_chromosomes_samtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.chroms.bam",
    )

    # Index the MMR genes bam file with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_mmr_alignment",
        input=output_from("extract_genes_bedtools"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).mmr.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam.bai",
    )

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_alignment",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.sorted.bam.bai",
    )

    # Generate alignment stats with bamtools
    pipeline.transform(
        task_func=stages.bamtools_stats,
        name="bamtools_stats",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.stats.txt",
    )

    # Extract the discordant paired-end alignments
    pipeline.transform(
        task_func=stages.extract_discordant_alignments,
        name="extract_discordant_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.discordants.unsorted.bam",
    )

    # Extract split-read alignments
    pipeline.transform(
        task_func=stages.extract_split_read_alignments,
        name="extract_split_read_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.splitters.unsorted.bam",
    )

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_discordants",
        input=output_from("extract_discordant_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.discordants"],
        output="{path[0]}/{sample[0]}.discordants.bam",
    )

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_splitters",
        input=output_from("extract_split_read_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.splitters"],
        output="{path[0]}/{sample[0]}.splitters.bam",
    )

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (
        pipeline.transform(
            task_func=stages.structural_variants_lumpy,
            name="structural_variants_lumpy",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            add_inputs=add_inputs(["{path[0]}/{sample[0]}.splitters.bam", "{path[0]}/{sample[0]}.discordants.bam"]),
            output="{path[0]}/{sample[0]}.lumpy.vcf",
        )
        .follows("index_alignment")
        .follows("sort_splitters")
        .follows("sort_discordants")
    )

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (
        pipeline.transform(
            task_func=stages.structural_variants_socrates,
            name="structural_variants_socrates",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            # output goes to {path[0]}/socrates/
            output="{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt",
            extras=["{path[0]}"],
        )
    )

    # Call DELs with DELLY
    pipeline.merge(
        task_func=stages.deletions_delly,
        name="deletions_delly",
        input=output_from("sort_alignment"),
        output="delly.DEL.vcf",
    )

    # Call DUPs with DELLY
    pipeline.merge(
        task_func=stages.duplications_delly,
        name="duplications_delly",
        input=output_from("sort_alignment"),
        output="delly.DUP.vcf",
    )

    # Call INVs with DELLY
    pipeline.merge(
        task_func=stages.inversions_delly,
        name="inversions_delly",
        input=output_from("sort_alignment"),
        output="delly.INV.vcf",
    )

    # Call TRAs with DELLY
    pipeline.merge(
        task_func=stages.translocations_delly,
        name="translocations_delly",
        input=output_from("sort_alignment"),
        output="delly.TRA.vcf",
    )

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    # (pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
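
As the sort_discordants and sort_splitters comments explain, sort_bam has to juggle two names: the full output path for Ruffus and a bare prefix for samtools. A minimal sketch of such a task function, assuming the legacy samtools (0.1.x) sort interface:

import subprocess

def sort_bam(bam_in, bam_out, output_prefix):
    '''Coordinate-sort a BAM file. Legacy samtools sort takes an output
    *prefix* and appends ".bam" itself, so Ruffus tracks bam_out while
    the command line receives output_prefix.'''
    subprocess.check_call(['samtools', 'sort', bam_in, output_prefix])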
Example #31
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    #pipeline.originate(
    #    task_func=stages.original_reference,
    #    name='original_reference',
    #    output=reference_file)

    # Run FastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    #pipeline.transform(
    #    task_func=stages.index_reference_bwa,
    #    name='index_reference_bwa',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #    name='index_reference_samtools',
    #    input=output_from('original_reference'),
    #    filter=suffix('.fa'),
    #    output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    #pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs([
            '{path[0]}/{sample[0]}.splitters.bam',
            '{path[0]}/{sample[0]}.discordants.bam'
        ]),
        output='{path[0]}/{sample[0]}.lumpy.vcf').follows('index_alignment').
     follows('sort_splitters').follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    #(pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output=
        '{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    #pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    #(pipeline.transform(
    #    task_func=stages.structural_variants_pindel,
    #    name='structural_variants_pindel',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #    output='{path[0]}/{sample[0]}.pindel')
    #    .follows('index_reference_bwa')
    #    .follows('index_reference_samtools'))

    return pipeline
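
The index_bam task is shared by the index_alignment and index_mmr_alignment stages above. An implementation consistent with the declared outputs would simply shell out to samtools index, roughly as sketched here (assumed, not the project's actual code):

import subprocess

def index_bam(bam_in, bai_out):
    '''Index a coordinate-sorted BAM. samtools writes <bam_in>.bai,
    matching the .bam.bai output declared in the pipeline, so bai_out
    exists only for Ruffus's dependency tracking.'''
    subprocess.check_call(['samtools', 'index', bam_in])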
Example #32
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam'
    )

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'
        ),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        extras=['{sample[0]}', '{readid[0]}'],

        # The output file name is the sample name with a .bam extension.
        output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # index bam file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam').follows('index_bam'))

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-_]+).primary.primerclipped.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
    # .follows('index_sort_bam_picard'))

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(task_func=stages.apply_variant_filtration_gatk_lenient,
                       name='apply_variant_filtration_gatk_lenient',
                       input=output_from('variant_annotator_gatk'),
                       filter=suffix('.raw.annotate.vcf'),
                       output='.raw.annotate.filtered_lenient.vcf')

    return pipeline
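
None of these examples includes its driver code. With Ruffus's object-oriented syntax, a built pipeline is typically executed roughly as follows (a sketch; how state is constructed varies from project to project):

# Build the pipeline, then let Ruffus work out which stages are out of
# date and run only those, using a small pool of worker processes.
pipeline = make_pipeline(state)
pipeline.run(multiprocess=4)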