Example #1
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='md5')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original input files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    # Compute an MD5 checksum for each input file
    pipeline.transform(
        task_func=stages.md5_checksum,
        name='md5_checksum',
        input=output_from('original_files'),
        filter=suffix(''),
        output='.md5')


    return pipeline
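For context, a minimal sketch of what an md5_checksum stage could compute (hypothetical standalone function; the real body lives on the Stages class):

import hashlib

def md5_checksum(input_file, output_file):
    # Hash the input and write "<digest>  <name>" to the .md5 output
    # produced by the transform above.
    with open(input_file, "rb") as src:
        digest = hashlib.md5(src.read()).hexdigest()
    with open(output_file, "w") as out:
        out.write("%s  %s\n" % (digest, input_file))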
Example #2
    def test_newstyle_mkdir (self):
        test_pipeline = Pipeline("test")
        test_pipeline.follows(task_which_makes_directories,
                              mkdir(directories),
                              mkdir(tempdir + 'c'),
                              mkdir(tempdir + 'd', tempdir + 'e'),
                              mkdir(tempdir + 'e'))
        test_pipeline.run(multiprocess = 10, verbose = 0)

        for d in 'abcde':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
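Here directories and tempdir come from the enclosing test module; a plausible setup, assuming the values implied by the 'abcde' loop above:

import tempfile

tempdir = tempfile.mkdtemp() + "/"
directories = [tempdir + 'a', tempdir + 'b']  # assumed; 'c', 'd', 'e' are made explicitly above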
Example #3
 def test_newstyle_ruffus (self):
     test_pipeline = Pipeline("test")
     test_pipeline.parallel(parallel_task, [['A', 1], ['B',3], ['C',3], ['D',4], ['E',4], ['F',4]])
     try:
         test_pipeline.run(multiprocess = 50, verbose = 0)
     except ruffus.ruffus_exceptions.RethrownJobError:
         return
     raise Exception("Missing exception")
Example #4
 def test_newstyle_ruffus(self):
     test_pipeline = Pipeline("test")
     test_pipeline.parallel(parallel_task, [["A", 1], ["B", 3], ["C", 3], ["D", 4], ["E", 4], ["F", 4]])
     try:
         test_pipeline.run(multiprocess=50, verbose=0)
     except ruffus.ruffus_exceptions.RethrownJobError:
         return
     raise Exception("Missing exception")
Example #5
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")
        test_pipeline.files(task1, a)

        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(multiprocess=10, logger=save_to_str_logger, verbose=1)
        self.assertTrue("@files() was empty" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
Example #6
    def create_pipeline(self):
        """
        Create new pipeline on the fly without using decorators
        """
        global count_pipelines
        count_pipelines = count_pipelines + 1
        test_pipeline = Pipeline("test %d" % count_pipelines)

        test_pipeline.transform(task_func=transform1,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.transform(task_func=transform_raise_error,
                                input=input_file,
                                filter=suffix('.txt'),
                                output='.output',
                                extras=[runtime_data])

        test_pipeline.split(task_func=split1,
                            input=input_file,
                            output=split1_outputs)

        test_pipeline.merge(task_func=merge2,
                            input=split1,
                            output=merge2_output)
        return test_pipeline
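A sketch of a task function compatible with these transform() calls (hypothetical body): Ruffus appends the extras list after the input and output arguments, so runtime_data arrives as a trailing positional parameter.

def transform1(input_file, output_file, runtime_data):
    # extras=[runtime_data] is passed through as a trailing argument.
    with open(input_file) as src, open(output_file, "w") as out:
        out.write(src.read())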
Example #7
 def test_newstyle_simpler (self):
     test_pipeline = Pipeline("test")
     test_pipeline.originate(task1, input_file_names, extras = [logger_proxy, logging_mutex])
     test_pipeline.transform(task2, task1, suffix(".1"), ".2", extras = [logger_proxy, logging_mutex])
     test_pipeline.transform(task3, task2, suffix(".2"), ".3", extras = [logger_proxy, logging_mutex])
     test_pipeline.merge(task4, task3, final_file_name, extras = [logger_proxy, logging_mutex])
     #test_pipeline.merge(task4, task3, final_file_name, extras = {"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
     test_pipeline.run(multiprocess = 500, verbose = 0)
Example #8
 def test_newstyle_no_re_match (self):
     try:
         test_pipeline = Pipeline("test")
         test_pipeline.transform(task_func = task_2,
                                 input = None,
                                 filter = regex(tempdir + "b"),
                                 replace_inputs = inputs(tempdir + "a", tempdir + "b"),
                                 output = "task_1.output")
         test_pipeline.run(multiprocess = 10, verbose = 0)
     except ruffus.ruffus_exceptions.error_task_transform_inputs_multiple_args:
         print("\tExpected exception thrown 1")
         return
     except ruffus.ruffus_exceptions.error_inputs_multiple_args:
         print("\tExpected exception thrown 2")
         return
     raise Exception("Inputs(...) with multiple arguments should have thrown an exception")
Example #9
    def test_newstyle_ruffus (self):

        test_pipeline = Pipeline("test")

        test_pipeline.follows(setup_simulation_data, mkdir(gene_data_dir, simulation_data_dir))

        test_pipeline.files(gwas_simulation, generate_simulation_params)\
            .follows(setup_simulation_data)\
            .follows(mkdir(working_dir, os.path.join(working_dir, "simulation_results")))

        test_pipeline.collate(statistical_summary, gwas_simulation, regex(r"simulation_results/(\d+).\d+.simulation_res"), r"\1.mean")\
            .posttask(lambda : sys.stdout.write("\nOK\n"))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        for oo in "000.mean", "001.mean":
            results_file_name = os.path.join(working_dir, oo)
            if not os.path.exists(results_file_name):
                raise Exception("Missing %s" % results_file_name)
Example #10
    def test_newstyle_no_re_match (self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
        test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")


        save_to_str_logger = t_save_to_str_logger()
        test_pipeline.run(multiprocess = 10, logger = save_to_str_logger, verbose = 1)
        print(save_to_str_logger.warning_str)
        self.assertTrue("no file names matched" in save_to_str_logger.warning_str)
        print("\n    Warning printed out correctly", file=sys.stderr)
Example #11
def make_pipeline2( pipeline_name = "pipeline2"):
    test_pipeline2 = Pipeline(pipeline_name)
    test_pipeline2.transform(task_func   = task_1_to_1,
                             # task name
                             name        = "44_to_55",
                             # placeholder: will be replaced later with set_input()
                             input       = None,
                             filter      = suffix(".44"),
                             output      = ".55")
    test_pipeline2.merge(   task_func   = task_m_to_1,
                            input       = test_pipeline2["44_to_55"],
                            output      = tempdir + "/final.output")

    # Set head and tail
    test_pipeline2.set_tail_tasks([test_pipeline2[task_m_to_1]])
    if not DEBUG_do_not_define_head_task:
        test_pipeline2.set_head_tasks([test_pipeline2["44_to_55"]])

    return test_pipeline2
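Callers can then feed data into the sub-pipeline through its declared head task without knowing internal task names, e.g. (file names assumed; set_input() routes to the head tasks):

test_pipeline2 = make_pipeline2()
# Replace the placeholder input=None on the head task "44_to_55"
test_pipeline2.set_input(input=[tempdir + "/a.44", tempdir + "/b.44"])
test_pipeline2.run(multiprocess=2, verbose=0)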
Example #12
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")
        test_pipeline.split(task_func=prepare_files,
                            input=None,
                            output=tempdir + '*.animal')\
            .follows(mkdir(tempdir, tempdir + "test"))\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 1 Done\n"))

        test_pipeline.collate(task_func=summarise_by_grouping,
                              input=prepare_files,
                              filter=regex(r'(.*/).*\.(.*)\.animal'),
                              output=r'\1\2.results')\
            .posttask(lambda: do_write(tempdir + "task.done", "Task 2 Done\n"))

        test_pipeline.run(multiprocess=10, verbose=0)
        check_species_correct()
Example #13
    def test_transform_with_missing_formatter_args_b(self):
        test_pipeline = Pipeline("test")


        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = [os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
            .mkdir(tempdir)


        test_pipeline.transform(task_func   = transform_with_missing_formatter_args,
                                input       = generate_initial_files,
                                filter      = formatter(),
                                output      = "{path[0]}/{basename[0]}.task1",
                                extras      =['echo {dynamic_message} > {some_file}'])
        s = StringIO()
        test_pipeline.printout(s, [transform_with_missing_formatter_args], verbose=4, wrap_width = 10000, pipeline= "test")
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())

        #log to stream
        s = StringIO()
        logger = t_stream_logger(s)
        test_pipeline.run([transform_with_missing_formatter_args], verbose=5, pipeline= "test", logger=logger)
        self.assertIn("Missing key = {dynamic_message}", s.getvalue())
Example #14
    def test_newstyle_mkdir (self):
        test_pipeline = Pipeline("test")

        test_pipeline.follows(task_which_makes_directories,
                         mkdir(directories),
                         mkdir(unicode(tempdir + "c")),
                         mkdir(unicode(tempdir + "d"),
                               unicode(tempdir + "e")),
                         mkdir(unicode(tempdir + "e")))\
            .posttask(touch_file(unicode(tempdir + "f")))

        test_pipeline.originate(task_which_makes_files, [tempdir + "g", tempdir + "h"])
        test_pipeline.run(multiprocess = 10, verbose = 0)

        for d in 'abcdefgh':
            fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
            self.assertTrue(os.path.exists(fullpath))
Example #15
    def test_newstyle_mkdir_run(self):
        test_pipeline = Pipeline("test")

        test_pipeline.split(task_func = generate_initial_files1,
                            input = 1,
                            output = [tempdir +  "/" + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.transform( task_func = test_transform,
                                 input     = generate_initial_files1,
                                 filter    = formatter(),
                                 output    = "{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2")\
            .mkdir(tempdir + "/test1")\
            .mkdir(tempdir + "/test2")\
            .mkdir(generate_initial_files1, formatter(),
                        ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"])

        test_pipeline.mkdir(test_transform2, tempdir + "/test3")\
            .mkdir(generate_initial_files1, formatter(),
                    "{path[0]}/{basename[0]}.dir2")
        cleanup_tmpdir()
        pipeline_run([test_transform, test_transform2], verbose=0, multiprocess = 2, pipeline= "main")
Example #16
        outf.write(prefices + ",")


def check_combinations_with_replacement3_merged_task(infiles, outfile):
    with open(outfile, "w") as p:
        for infile in sorted(infiles):
            with open(infile) as ii:
                p.write(ii.read())


def cleanup_tmpdir():
    os.system('rm -f %s %s' %
              (os.path.join(tempdir, '*'), RUFFUS_HISTORY_FILE))


test_pipeline1 = Pipeline("test1")
test_pipeline2 = Pipeline("test2")
gen_task1 = test_pipeline1.originate(task_func=generate_initial_files1,
                                     name="WOWWWEEE",
                                     output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"])
test_pipeline1.originate(task_func=generate_initial_files2,
                         output=[tempdir + "/e_name.tmp1", tempdir + "/f_name.tmp1"])
test_pipeline1.originate(task_func=generate_initial_files3,
                         output=[tempdir + "/g_name.tmp1", tempdir + "/h_name.tmp1"])
test_pipeline1.product(task_func=check_product_task,
                       input=[tempdir + "/" + prefix +
                              "_name.tmp1" for prefix in "abcd"],
                       filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
                       input2=generate_initial_files2,
                       filter2=formatter(),
                       input3=generate_initial_files3,
Example #17
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'),

        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),

        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.clipped.bam')


    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'),
        extras=['{sample[0]}'],

        # The output file name is the sample name with a .bam extension.
        output='variants/undr_rover/{sample[0]}.vcf')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.clipped.bam'),
        output='.clipped.sort.bam')

    # High quality and primary alignments
    pipeline.transform(
        task_func=stages.primary_bam,
        name='primary_bam',
        input=output_from('sort_bam_picard'),
        filter=suffix('.clipped.sort.bam'),
        output='.clipped.sort.hq.bam')

    # index bam file
    pipeline.transform(
        task_func=stages.index_sort_bam_picard,
        name='index_bam',
        input=output_from('primary_bam'),
        filter=suffix('.clipped.sort.hq.bam'),
        output='.clipped.sort.hq.bam.bai')

    # Generate mapping metrics.
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('primary_bam'),
        filter=suffix('.clipped.sort.hq.bam'),
        output='.intersectbed.bam')

    pipeline.transform(
        task_func=stages.coverage_bed,
        name='coverage_bed',
        input=output_from('intersect_bed'),
        filter=suffix('.intersectbed.bam'),
        output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('primary_bam'),
        filter=suffix('.clipped.sort.hq.bam'),
        output='.mapped_to_genome.txt')

    pipeline.transform(
        task_func=stages.target_reads,
        name='target_reads',
        input=output_from('intersect_bed'),
        filter=suffix('.intersectbed.bam'),
        output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=suffix('.clipped.bam'),
        output='.total_raw_reads.txt')

#    pipeline.transform(
#        task_func=stages.generate_stats,
#        name='generate_stats',
#        input=output_from(['coverage_bed', 'genome_reads', 'target_reads', 'total_reads']), 
#        filter=formatter('.+/(?P<sample>.+).txt'),
#        extras=['{sample[0]}'],
#        output='all_sample.summary.txt')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    (pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('primary_bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
        .follows('index_bam'))

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_haplotypecaller_gatk'),
        output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.combined.vcf'),
        output='.raw.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(
       task_func=stages.variant_annotator_gatk,
       name='variant_annotator_gatk',
       input=output_from('genotype_gvcf_gatk'),
       filter=suffix('.raw.vcf'),
       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(
        task_func=stages.apply_variant_filtration_gatk,
        name='apply_variant_filtration_gatk',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.annotate.vcf'),
        output='.raw.annotate.filtered.vcf')

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_variant_filtration_gatk'),
        filter=suffix('.raw.annotate.filtered.vcf'),
        output='.raw.annotate.filtered.vep.vcf')
        .follows('apply_variant_filtration_gatk'))

    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=suffix('.vcf'),
        output='.sorted.vcf')

    pipeline.transform(
        task_func=stages.index_vcfs,
        name='index_vcfs',
        input=output_from('sort_vcfs'),
        filter=suffix('.sorted.vcf'),     
        output='.sorted.vcf.tbi')         

    (pipeline.transform(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        filter=suffix('.sorted.vcf'),
        output='variants/undr_rover/combined_undr_rover.vcf')
        .follows('index_vcfs'))

    pipeline.transform(
        task_func=stages.index_final_vcf,
        name='index_final_vcf',
        input=output_from('concatenate_vcfs'),
        filter=suffix('.vcf'),
        output='.vcf.tbi')

    return pipeline
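A typical driver for a make_pipeline(state) factory like this one (sketch; the 'jobs' option name is assumed):

pipeline = make_pipeline(state)
pipeline.run(multiprocess=state.config.get_option('jobs'), verbose=1)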
Example #18
 def test_newstyle_simpler(self):
     test_pipeline = Pipeline("test")
     test_pipeline.originate(task1,
                             input_file_names,
                             extras=[logger_proxy, logging_mutex])
     test_pipeline.transform(task2,
                             task1,
                             suffix(".1"),
                             ".2",
                             extras=[logger_proxy, logging_mutex])
     test_pipeline.transform(task3,
                             task2,
                             suffix(".2"),
                             ".3",
                             extras=[logger_proxy, logging_mutex])
     test_pipeline.merge(task4,
                         task3,
                         final_file_name,
                         extras=[logger_proxy, logging_mutex])
     #test_pipeline.merge(task4, task3, final_file_name, extras = {"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
     test_pipeline.run(multiprocess=500, verbose=0)
Example #19
        "test_active_if/b.3" -> "test_active_if/b.4"
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""

expected_inactive_text = """null -> "test_active_if/a.1"
    "test_active_if/a.1" -> "test_active_if/a.2"
        "test_active_if/a.2" -> "test_active_if/a.4"
null -> "test_active_if/b.1"
    "test_active_if/b.1" -> "test_active_if/b.2"
        "test_active_if/b.2" -> "test_active_if/b.4"
            "test_active_if/b.4" -> "test_active_if/summary.5"
"""


# alternative syntax
test_pipeline = Pipeline("test")
test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\
    .follows(mkdir("test_active_if"))
test_pipeline.transform(task2, task1, suffix(".1"), ".2")
test_pipeline.transform(task3, task1, suffix(".1"), ".3")\
    .active_if(lambda: pipeline_active_if)
test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4")
test_pipeline.merge(task5, task4, "test_active_if/summary.5")
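active_if() re-evaluates its callable on every run, so flipping the module-level flag switches task3 (and its .3 outputs) on or off between runs; a sketch, assuming the flag defined by the test module:

pipeline_active_if = False
test_pipeline.run(verbose=0)   # task3 skipped; task4 collates only the .2 files
pipeline_active_if = True
test_pipeline.run(verbose=0)   # task3 runs and contributes the .3 files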


class Test_ruffus(unittest.TestCase):
    def setUp(self):
        try:
            shutil.rmtree(tempdir)
        except:
            pass
Example #20
        with open(output_file, "w") as oo: pass

#---------------------------------------------------------------
#   first task
@transform(create_initial_file_pairs, suffix(".start"), ".output.1")
def first_task(input_files, output_file):
    with open(output_file, "w"): pass


#---------------------------------------------------------------
#   second task
@transform(first_task, suffix(".output.1"), ".output.2")
def second_task(input_files, output_file):
    with open(output_file, "w"): pass

test_pipeline = Pipeline("test")
test_pipeline.originate(task_func = create_initial_file_pairs,
                        output = [[tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start', tempdir + 'job1.b.start'],
                                  [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'],
                                  [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start']])
test_pipeline.transform(task_func = first_task, input = create_initial_file_pairs, filter = suffix(".start"), output = ".output.1")
test_pipeline.transform(input = first_task, filter = suffix(".output.1"), output = ".output.2", task_func= second_task)


decorator_syntax = 0
oop_syntax = 1

class Test_verbosity(unittest.TestCase):
    #___________________________________________________________________________
    #
    #   test_printout_abbreviated_path1
Example #21
    def test_newstyle_ruffus (self):

        test_pipeline = Pipeline("test")


        test_pipeline.split(task_func   = split_fasta_file,
                            input       = tempdir  + "original.fa",
                            output      = [tempdir  + "files.split.success",
                                           tempdir + "files.split.*.fa"])\
            .posttask(lambda: verbose_output.write("    Split into %d files\n" % 10))


        test_pipeline.transform(task_func   = align_sequences,
                                input       = split_fasta_file,
                                filter      = suffix(".fa"),
                                output      = ".aln"                     # fa -> aln
                                )\
            .posttask(lambda: verbose_output.write("    Sequences aligned\n"))

        test_pipeline.transform(task_func   = percentage_identity,
                                input       = align_sequences,      # find all results from align_sequences
                                filter      = suffix(".aln"),       # replace suffix with:
                                output      = [r".pcid",            #   .pcid suffix for the result
                                               r".pcid_success"]    #   .pcid_success to indicate job completed
                                )\
            .posttask(lambda: verbose_output.write("    %Identity calculated\n"))


        test_pipeline.merge(task_func   = combine_results,
                            input       = percentage_identity,
                            output      = [tempdir + "all.combine_results",
                                           tempdir + "all.combine_results_success"])\
            .posttask(lambda: verbose_output.write("    Results recombined\n"))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        if not os.path.exists(tempdir + "all.combine_results"):
            raise Exception("Missing %s" % (tempdir + "all.combine_results"))
Example #22
def make_pipeline_process(state):
    # Define empty pipeline
    pipeline = Pipeline(name='genericpipe')
    # Get a list of paths to all the directories to be combined for variant calling
    run_directories = state.config.get_option('runs')
    # Grab files from each of the processed directories in "runs"
    gatk_files = []

    for directory in run_directories:
        gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf'))

    # Stages are dependent on the state
    stages = Stages(state)

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.glob_gatk,
                       name='glob_gatk',
                       output=gatk_files)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('glob_gatk'),
                   output='ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Apply GT filters to genotyped vcf
    pipeline.transform(task_func=stages.genotype_filter_gatk,
                       name='genotype_filter_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.gt-filter.vcf')

    # Decompose and normalise multiallelic sites
    pipeline.transform(task_func=stages.vt_decompose_normalise,
                       name='vt_decompose_normalise',
                       input=output_from('genotype_filter_gatk'),
                       filter=suffix('.raw.gt-filter.vcf'),
                       output='.raw.gt-filter.decomp.norm.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('vt_decompose_normalise'),
                       filter=suffix('.raw.gt-filter.decomp.norm.vcf'),
                       output='.raw.gt-filter.decomp.norm.annotate.vcf')

    # Filter vcf
    pipeline.transform(
        task_func=stages.gatk_filter,
        name='gatk_filter',
        input=output_from('variant_annotator_gatk'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vcf')

    # Apply VEP
    pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('gatk_filter'),
        filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'),
        output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf')

    return pipeline
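The glob-then-originate idiom generalises to any per-run layout; a minimal sketch of the gathering step as a standalone helper (hypothetical name):

import glob
import os

def gather_gvcfs(run_directories):
    # Collect every GATK g.vcf under <run>/variants/gatk/.
    files = []
    for directory in run_directories:
        files.extend(glob.glob(os.path.join(directory, 'variants/gatk/*.g.vcf')))
    return files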
Example #23
def make_pipeline1(
        pipeline_name,  # Pipelines need to have a unique name
        starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
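With head and tail tasks declared, sub-pipelines compose without exposing internal task names; a sketch of the intended wiring (make_pipeline2 as in Example #11, file names assumed):

pipeline1 = make_pipeline1("pipeline1", [tempdir + "/a.1", tempdir + "/b.1"])
pipeline2 = make_pipeline2()
pipeline2.set_input(input=pipeline1)  # tail tasks of pipeline1 feed pipeline2's head
pipeline2.run(multiprocess=2, verbose=0)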
Example #24
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name.
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
Example #25
                                             extension):
    raise Exception("Should blow up first")


# ___________________________________________________________________________
#
#   check_regex_out_of_range_regex_reference_error_task
# ___________________________________________________________________________
def check_regex_out_of_range_regex_reference_error_task(infiles, outfile,
                                                        prefix1,
                                                        prefix2,
                                                        extension):
    raise Exception("Should blow up first")


test_pipeline = Pipeline("test")

test_pipeline.originate(task_func=generate_initial_files1,
                        output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcdefghi"])

test_pipeline.transform(task_func=check_regex_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"),
                        output=r"\1/\g<PREFIX>\3.tmp2",  # output file
                        extras=[r"\2",                # extra: prefix = \2
                                r"\g<PREFIX>",        # extra: prefix = \2
                                r"\4"])               # extra: extension
test_pipeline.transform(task_func=check_regex_unmatched_task,
                        input=generate_initial_files1,
                        filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"),
                        output=r"\1/\g<PREFIXA>\3.tmp2",  # output file
Example #26
        outf.write(prefices + ",")


def test_combinations_with_replacement3_merged_task(infiles, outfile):
    with open(outfile, "w") as p:
        for infile in sorted(infiles):
            with open(infile) as ii:
                p.write(ii.read())


def cleanup_tmpdir():
    os.system('rm -f %s %s' %
              (os.path.join(tempdir, '*'), RUFFUS_HISTORY_FILE))


test_pipeline1 = Pipeline("test1")
test_pipeline2 = Pipeline("test2")
gen_task1 = test_pipeline1.originate(
    task_func=generate_initial_files1,
    name="WOWWWEEE",
    output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"])
test_pipeline1.originate(
    task_func=generate_initial_files2,
    output=[tempdir + "/e_name.tmp1", tempdir + "/f_name.tmp1"])
test_pipeline1.originate(
    task_func=generate_initial_files3,
    output=[tempdir + "/g_name.tmp1", tempdir + "/h_name.tmp1"])
test_pipeline1.product(
    task_func=test_product_task,
    input=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"],
    filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"),
Example #27
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func   = generate_initial_files,
                                output      = original_files)\
            .mkdir(tempdir, tempdir+"/test")


        test_pipeline.subdivide(    task_func   = split_fasta_file,
                                    input       = generate_initial_files,
                                    filter      = regex(r".*\/original_(\d+).fa"),       # match original files
                                    output      = [tempdir + r"/files.split.\1.success", # flag file for each original file
                                                   tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                    extras      = [r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))


        test_pipeline.transform(task_func   = align_sequences,
                                input       = split_fasta_file,
                                filter      = suffix(".fa"),
                                output      = ".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func   = percentage_identity,
                                input       = align_sequences,             # find all results from align_sequences
                                filter      = suffix(".aln"),             # replace suffix with:
                                output      = [r".pcid",                  #   .pcid suffix for the result
                                               r".pcid_success"]         #   .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))


        test_pipeline.collate(task_func   = combine_results,
                              input       = percentage_identity,
                              filter      = regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output      = [tempdir + r"/\1.all.combine_results",
                                             tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5,
                               wrap_width=10000)
        self.assertTrue(
            re.search('Job needs update:.*Missing files.*', s.getvalue(),
                      re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
Example #28
    def create_pipeline(self):
        # each pipeline has a different name
        global cnt_pipelines
        cnt_pipelines = cnt_pipelines + 1
        test_pipeline = Pipeline("test %d" % cnt_pipelines)

        test_pipeline.originate(task_func=generate_initial_files1,
                                output=[tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

        test_pipeline.originate(task_func=generate_initial_files2,
                                output=[tempdir + "e_name.tmp1", tempdir + "f_name.tmp1"])

        test_pipeline.originate(task_func=generate_initial_files3,
                                output=[tempdir + "g_name.tmp1", tempdir + "h_name.tmp1"])

        test_pipeline.originate(task_func=generate_initial_files4,
                                output=tempdir + "i_name.tmp1")

        test_pipeline.collate(task_func=check_task2,
                              input=[generate_initial_files1,
                                     generate_initial_files2,
                                     generate_initial_files3,
                                     generate_initial_files4],
                              filter=formatter(),
                              output="{path[0]}/all.tmp2")

        test_pipeline.transform(task_func=check_task3,
                                input=check_task2,
                                filter=suffix(".tmp2"),
                                output=".tmp3")

        test_pipeline.transform(task_func=check_task4,
                                input=check_task3,
                                filter=suffix(".tmp3"),
                                output=".tmp4")
        return test_pipeline
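A plausible body for check_task2 (hypothetical): collate() hands every matching input to the task in a single call, so all nine .tmp1 files arrive together.

def check_task2(infiles, outfile):
    # Inputs grouped by the constant formatter() output arrive as one list.
    with open(outfile, "w") as out:
        out.write("\n".join(sorted(infiles)) + "\n")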
Example #29
    def test_newstyle_ruffus (self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))


        test_pipeline.split(task_func = step_4_split_numbers_into_chunks,
                       input = tempdir + "random_numbers.list",
                       output = tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func = step_5_calculate_sum_of_squares,
                           input = step_4_split_numbers_into_chunks,
                           filter = suffix(".chunks"),
                           output = ".sums")

        test_pipeline.merge(task_func = step_6_calculate_variance,
                            input = step_5_calculate_sum_of_squares,
                            output = os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again, touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess = 50, verbose = 0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists (output_file):
            raise Exception("Missing %s" % output_file)
Example #30
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    #   We can change the starting files later using
    #          set_input() for transform etc.
    #       or set_output() for originate
    #   But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(task_func   = task_m_to_1,
                            name        = "add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input       = task_originate,
                            filter      = regex(r"(.*)"),
                            add_inputs  = add_inputs(tempdir + "/testdir/whatever.txt"),
                            output      = r"\1.22")
    test_pipeline.transform(task_func   = task_1_to_1,
                            name        = "22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input       = output_from("add_input"),
                            filter      = suffix(".22"),
                            output      = ".33")
    tail_task = test_pipeline.transform(task_func   = task_1_to_1,
                                        name        = "33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input       = test_pipeline["22_to_33"],
                                        filter      = suffix(".33"),
                                        output      = ".44")

    #   Set the tail task so that users of my sub pipeline can use it as a dependency
    #       without knowing the details of task names
    #
    #   Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    #   If we try to connect a Pipeline without tail tasks defined, we have to
    #       specify the exact task within the Pipeline.
    #   Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
Example #31
 def test_newstyle_ruffus (self):
     test_pipeline = Pipeline("test")
     test_pipeline.originate(start_task, ["a.1", "b.1"])
     test_pipeline.transform(same_file_name_task, start_task, suffix(".1"), ".1")
     test_pipeline.transform(linked_file_name_task, start_task, suffix(".1"), ".linked.1")
     test_pipeline.transform(final_task, [linked_file_name_task, same_file_name_task], suffix(".1"), ".3")
     test_pipeline.run(log_exceptions = True, verbose = 0)
Example #32
    def test_newstyle_collate(self):
        """
        As above but create pipeline on the fly using object orientated syntax rather than decorators
        """

        #
        # Create pipeline on the fly, joining up tasks
        #
        test_pipeline = Pipeline("test")

        test_pipeline.originate(task_func=generate_initial_files,
                                output=original_files)\
            .mkdir(tempdir, tempdir+"/test")

        test_pipeline.subdivide(task_func=split_fasta_file,
                                input=generate_initial_files,
                                # match original files
                                filter=regex(r".*\/original_(\d+).fa"),
                                output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                        tempdir + r"/files.split.\1.*.fa"],   # glob pattern
                                extras=[r"\1"])\
            .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

        test_pipeline.transform(task_func=align_sequences,
                                input=split_fasta_file,
                                filter=suffix(".fa"),
                                output=".aln")  \
            .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

        test_pipeline.transform(task_func=percentage_identity,
                                input=align_sequences,             # find all results from align_sequences
                                # replace suffix with:
                                filter=suffix(".aln"),
                                output=[r".pcid",  # .pcid suffix for the result
                                        r".pcid_success"]  # .pcid_success to indicate job completed
                                )\
            .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

        test_pipeline.collate(task_func=combine_results,
                              input=percentage_identity,
                              filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                              output=[tempdir + r"/\1.all.combine_results",
                                      tempdir + r"/\1.all.combine_results_success"])\
            .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

        #
        # Cleanup, printout and run
        #
        self.cleanup_tmpdir()
        s = StringIO()
        test_pipeline.printout(s, [combine_results],
                               verbose=5, wrap_width=10000)
        self.assertTrue(re.search(
            'Job needs update:.*Missing files.*', s.getvalue(), re.DOTALL) is not None)
        test_pipeline.run(verbose=0)
Example #33
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard 
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard 
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK 
    pipeline.transform(
        task_func=stages.chrom_intervals_gatk,
        name='chrom_intervals_gatk',
        input=output_from('mark_duplicates_picard'),
        filter=suffix('.sort.dedup.bam'),
        output='.chr.intervals')

    # Local realignment using GATK 
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK 
    pipeline.transform(
        task_func=stages.base_recalibration_gatk,
        name='base_recalibration_gatk',
        input=output_from('local_realignment_gatk'),
        filter=suffix('.sort.dedup.realn.bam'),
        output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK 
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK 
    pipeline.transform(
        task_func=stages.call_variants_gatk,
        name='call_variants_gatk',
        input=output_from('print_reads_gatk'),
        filter=suffix('.sort.dedup.realn.recal.bam'),
        output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(
        task_func=stages.combine_gvcf_gatk,
        name='combine_gvcf_gatk',
        input=output_from('call_variants_gatk'),
        output='PCExomes.mergegvcf.vcf')

    # Genotype G.VCF files using GATK 
    pipeline.transform(
        task_func=stages.genotype_gvcf_gatk,
        name='genotype_gvcf_gatk',
        input=output_from('combine_gvcf_gatk'),
        filter=suffix('.mergegvcf.vcf'),
        output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(
        task_func=stages.snp_recalibrate_gatk,
        name='snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK  
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK  
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK 
    pipeline.transform(
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
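
The function above only assembles the stages; a separate driver has to build and run the pipeline. Below is a minimal sketch of such a driver, assuming a hypothetical State helper loaded from a config file; Pipeline.run() and printout_graph() are the standard Ruffus entry points.

# Hypothetical driver for the pipeline factory above. 'State' and
# 'pipeline.config' are assumptions, not part of the original code.
def main():
    state = State(config_file='pipeline.config')
    exome_pipeline = make_pipeline(state)
    # Optional: render the task graph before running anything
    # (printout_graph is assumed to mirror ruffus.pipeline_printout_graph).
    exome_pipeline.printout_graph('flowchart.svg', 'svg')
    # Run up to 8 jobs concurrently.
    exome_pipeline.run(multiprocess=8, verbose=1)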
Example no. 34
@transform(create_initial_file_pairs, suffix(".start"), ".output.1")
def first_task(input_files, output_file):
    with open(output_file, "w"):
        pass


# ---------------------------------------------------------------
#   second task
@transform(first_task, suffix(".output.1"), ".output.2")
def second_task(input_files, output_file):
    with open(output_file, "w"):
        pass


test_pipeline = Pipeline("test")
# task_func below is an assumption: the originate stage that creates the
# .start pairs consumed by first_task above.
test_pipeline.originate(task_func=create_initial_file_pairs, output=[
    [
        tempdir +
        'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start',
        tempdir + 'job1.b.start'
    ],
    [
        tempdir +
        'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start',
        tempdir + 'job2.b.start'
    ],
    [
        tempdir +
        'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start',
        tempdir + 'job3.b.start'
    ],
])
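# Note: each sub-list passed to originate() defines the output of one job, so
# first_task runs once per pair; suffix('.start') is matched against the first
# file in each pair when deriving the '.output.1' name.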
Example no. 35
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='test_pipeline')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    pipeline.transform(
        task_func=stages.stage1,
        name='stage1',
        input=output_from('original_files'),
        filter=suffix('.0'),
        output='.1')

    pipeline.transform(
        task_func=stages.stage2,
        name='stage2',
        input=output_from('stage1'),
        filter=suffix('.1'),
        output='.2')

    pipeline.transform(
        task_func=stages.stage3,
        name='stage3',
        input=output_from('stage2'),
        filter=suffix('.2'),
        output='.3')

    pipeline.transform(
        task_func=stages.stage4,
        name='stage4',
        input=output_from('stage3'),
        filter=suffix('.3'),
        output='.4')

    pipeline.transform(
        task_func=stages.stage5,
        name='stage5',
        input=output_from('stage4'),
        filter=suffix('.4'),
        output='.5')

    return pipeline
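
The chain above is the canonical suffix() pattern: each stage strips the extension matched by its filter and appends its own. A self-contained sketch of the same mechanics with trivial touch-style stages (all names here are illustrative):

from ruffus import Pipeline, suffix, output_from

def make_seed(output):
    # originate-style stage: just create the starting file
    open(output, 'w').close()

def advance(input, output):
    # transform-style stage: stand-in for real processing
    open(output, 'w').close()

demo = Pipeline(name='suffix_chain_demo')
demo.originate(task_func=make_seed, name='seed', output=['sample.0'])
demo.transform(task_func=advance, name='step1',
               input=output_from('seed'), filter=suffix('.0'), output='.1')
demo.transform(task_func=advance, name='step2',
               input=output_from('step1'), filter=suffix('.1'), output='.2')
# demo.run(verbose=0)  # would produce sample.0 -> sample.1 -> sample.2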
Example no. 36
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    reference_file = state.config.get_option('reference')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_reference,
        name='original_reference',
        output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(
        task_func=stages.fastqc,
        name='fastqc',
        input=output_from('original_fastqs'),
        filter=suffix('.fastq.gz'),
        output='_fastqc')

    # Index the reference using BWA 
    pipeline.transform(
        task_func=stages.index_reference_bwa,
        name='index_reference_bwa',
        input=output_from('original_reference'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])
    
    # Index the reference using samtools 
    pipeline.transform(
        task_func=stages.index_reference_samtools,
        name='index_reference_samtools',
        input=output_from('original_reference'),
        filter=suffix('.fa'),
        output='.fa.fai')

    # Index the reference using bowtie 2 
    pipeline.transform(
        task_func=stages.index_reference_bowtie2,
        name='index_reference_bowtie2',
        input=output_from('original_reference'),
        filter=formatter(r'.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
        output=['{path[0]}/{refname[0]}.1.bt2',
                '{path[0]}/{refname[0]}.2.bt2',
                '{path[0]}/{refname[0]}.3.bt2',
                '{path[0]}/{refname[0]}.4.bt2',
                '{path[0]}/{refname[0]}.rev.1.bt2',
                '{path[0]}/{refname[0]}.rev.2.bt2'],
        extras=['{path[0]}/{refname[0]}'])

    # Create a FASTA sequence dictionary for the reference using picard
    pipeline.transform(
        task_func=stages.reference_dictionary_picard,
        name='reference_dictionary_picard',
        input=output_from('original_reference'),
        filter=suffix('.fa'),
        output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name. 
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add two more inputs to the stage:
        #    1. The corresponding R2 FASTQ file
        #    2. The reference genome file
        add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fastq.gz', reference_file]),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        # Ensure the reference is indexed before we run this stage
        .follows('index_reference_bwa')
        .follows('index_reference_samtools'))

    # Sort alignment with sambamba
    pipeline.transform(
        task_func=stages.sort_bam_sambamba,
        name='sort_alignment',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools 
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_mmr_alignment',
        input=output_from('extract_genes_bedtools'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    #pipeline.transform(
    #    task_func=stages.alignment_coverage_gatk,
    #    name='alignment_coverage_gatk',
    #    input=output_from('sort_alignment'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #    add_inputs=add_inputs([reference_file]),
    #    output='{path[0]}/{sample[0]}.coverage_summary',
    #    extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools 
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(
        task_func=stages.bamtools_stats,
        name='bamtools_stats',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(
        task_func=stages.extract_discordant_alignments,
        name='extract_discordant_alignments',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(
        task_func=stages.extract_split_read_alignments,
        name='extract_split_read_alignments',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')
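    # Sketch of the stage body this convention supports (hypothetical):
    #   def sort_bam(self, inputs, output, prefix):
    #       # hand samtools the prefix; it appends '.bam' itself
    #       run('samtools sort {inputs} {prefix}')
    # Ruffus checks the full 'output' path for up-to-dateness, while the
    # command line only ever sees the prefix.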

    # Index the sorted discordant bam with samtools 
    # pipeline.transform(
    #   task_func=stages.index_bam,
    #   name='index_discordants',
    #   input=output_from('sort_discordants'),
    #   filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #   output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools 
    # pipeline.transform(
    #    task_func=stages.index_bam,
    #    name='index_splitters',
    #    input=output_from('sort_splitters'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #    output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs(['{path[0]}/{sample[0]}.splitters.bam', '{path[0]}/{sample[0]}.discordants.bam']),
        output='{path[0]}/{sample[0]}.lumpy.vcf')
        .follows('sort_splitters')
        .follows('sort_discordants')
        .follows('index_alignment'))
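    # With add_inputs() given a list, the stage's input is nested:
    #   (sorted_bam, [splitters_bam, discordants_bam])
    # The three .follows() calls enforce ordering only; they add no inputs.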

    # Call genotypes on lumpy output using SVTyper 
    #(pipeline.transform(
    #    task_func=stages.genotype_svtyper,
    #    name='genotype_svtyper',
    #    input=output_from('structural_variants_lumpy'),
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #    output='{path[0]}/{sample[0]}.svtyper.vcf')
    #    .follows('align_bwa')
    #    .follows('sort_splitters')
    #    .follows('index_alignment')
    #    .follows('index_splitters')
    #    .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}'])
        .follows('index_reference_bowtie2'))

    # Join both read pair files using gustaf_mate_joining
    #pipeline.transform(
    #    task_func=stages.gustaf_mate_joining,
    #    name='gustaf_mate_joining',
    #    input=output_from('fastq_to_fasta'),
    #    # Match the R1 (read 1) FASTA file and grab the path and sample name. 
    #    # This will be the first input to the stage.
    #    # We assume the sample name may consist of only alphanumeric
    #    # characters.
    #    filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #    # Add one more input to the stage:
    #    #    1. The corresponding R2 FASTA file
    #    add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #    output='{path[0]}/{sample[0]}.joined_mates.fasta')


    # Call structural variants with pindel 
    (pipeline.transform(
        task_func=stages.structural_variants_pindel,
        name='structural_variants_pindel',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
        output='{path[0]}/{sample[0]}.pindel')
        .follows('index_reference_bwa')
        .follows('index_reference_samtools'))

    return pipeline
Example no. 37
    def test_newstyle_ruffus(self):

        test_pipeline = Pipeline("test")
        test_pipeline.originate(task_func = make_start, output = [tempdir + 'start'])
        test_pipeline.split(task_func = split_start, input = make_start, output = tempdir + '*.split')
        test_pipeline.subdivide(task_func = subdivide_start, input = split_start, filter = formatter(), output = tempdir + '{basename[0]}_*.subdivided', extras = [tempdir + '{basename[0]}'])

        expected_files_after_1_runs = ["start", "0.split", "0_0.subdivided"]
        expected_files_after_2_runs = ["1.split", "0_1.subdivided", "1_0.subdivided"]
        expected_files_after_3_runs = ["2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"]
        expected_files_after_4_runs = ["3.split", "0_3.subdivided", "1_2.subdivided", "2_1.subdivided", "3_0.subdivided"]

        print("     Run pipeline normally...")
        test_pipeline.run(multiprocess = 10, verbose=0)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                 expected_files_after_2_runs)

        print("     Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess = 10, verbose=0)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                                 expected_files_after_2_runs)

        print("     Running again with forced tasks to generate more files...")
        test_pipeline.run(forcedtorun_tasks = ["test::make_start"], multiprocess = 10, verbose=0)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs,
                                                 expected_files_after_3_runs)

        print("     Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess = 10, verbose=0)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs,
                                                 expected_files_after_3_runs)


        print("     Running again with forced tasks to generate even more files...")
        test_pipeline.run(forcedtorun_tasks = make_start, multiprocess = 10, verbose=0)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs
                                                 + expected_files_after_3_runs,
                                                 expected_files_after_4_runs)
        print("     Check that running again does nothing. (All up to date).")
        test_pipeline.run(multiprocess = 10, verbose=0)
        self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                                 + expected_files_after_2_runs
                                                 + expected_files_after_3_runs,
                                                 expected_files_after_4_runs)
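
Note that forcedtorun_tasks is passed in two forms above: first as a pipeline-qualified task name ("test::make_start"), then as the task function object itself; Ruffus accepts either.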
Example no. 38
    def test_newstyle_task(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(task1, [[None, tempdir + "a.1"], [None, tempdir + "b.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.files(task2, [[None, tempdir + "c.1"], [None, tempdir + "d.1"]])\
            .follows(mkdir(tempdir))

        test_pipeline.transform(task_func=task3,
                                input=task1,
                                filter=regex(r"(.+)"),
                                replace_inputs=ruffus.inputs(
                                    ((r"\1"), task2, "test_transform_inputs.*y")),
                                output=r"\1.output")
        test_pipeline.merge(task4, (task3), tempdir + "final.output")

        test_pipeline.run([task4], multiprocess=10, verbose=0)

        correct_output = "{tempdir}a.1.output:test_transform_inputs.py,{tempdir}a.1,{tempdir}c.1,{tempdir}d.1;{tempdir}b.1.output:test_transform_inputs.py,{tempdir}b.1,{tempdir}c.1,{tempdir}d.1;".format(
            tempdir=tempdir)
        with open(tempdir + "final.output") as ff:
            real_output = ff.read()
        self.assertEqual(correct_output, real_output)
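
Unlike add_inputs() in the earlier examples, replace_inputs=ruffus.inputs(...) substitutes the matched input wholesale: here the r"\1" backreference deliberately re-adds the original file, alongside task2's outputs and the 'test_transform_inputs.*y' glob, which is exactly what the expected final output string checks.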
Example no. 39
    def test_newstyle_graphviz_dot(self):
        test_pipeline = Pipeline("test")
        test_pipeline.check_if_uptodate(Up_to_date_task1, lambda: (False, ""))
        test_pipeline.follows(Up_to_date_task2, Up_to_date_task1)\
            .check_if_uptodate(lambda: (False, ""))\
            .graphviz(URL='"http://cnn.com"', fillcolor='"#FFCCCC"',
                      color='"#FF0000"', pencolor='"#FF0000"', fontcolor='"#4B6000"',
                      label_suffix="???", label_prefix="What is this?<BR/> ",
                      label="<What <FONT COLOR=\"red\">is</FONT>this>",
                      shape="component", height=1.5, peripheries=5,
                      style="dashed")
        test_pipeline.follows(Up_to_date_task3, Up_to_date_task2)\
            .check_if_uptodate(lambda: (False, ""))
        test_pipeline.follows(Up_to_date_final_target, Up_to_date_task3)\
            .check_if_uptodate(lambda: (False, ""))
        test_pipeline.follows(Explicitly_specified_task, Up_to_date_task1)\
            .check_if_uptodate(lambda: (False, ""))
        test_pipeline.follows(Task_to_run1, Explicitly_specified_task)
        test_pipeline.follows(Task_to_run2, Task_to_run1)
        test_pipeline.follows(Task_to_run3, Task_to_run2)
        test_pipeline.follows(Up_to_date_task_forced_to_rerun, Task_to_run2)\
            .check_if_uptodate(lambda: (False, ""))
        test_pipeline.follows(Final_target, Up_to_date_task_forced_to_rerun, Task_to_run3)
        test_pipeline.follows(Downstream_task1_ignored, Final_target)
        test_pipeline.follows(Downstream_task2_ignored, Final_target)

        if sys.hexversion >= 0x03000000:
            # everything is unicode in python3
            s = BytesIO()
        else:
            s = StringIO()


        test_pipeline.printout_graph (
                                        s,
                                        # use flowchart file name extension to decide flowchart format
                                        #   e.g. svg, jpg etc.
                                        "dot",
                                        [Final_target, Up_to_date_final_target])
        self.assertTrue('[URL="http://cnn.com", color="#FF0000", fillcolor="#FFCCCC", fontcolor="#4B6000", height=1.5, label=<What is this?<BR/> What <FONT COLOR="red">is</FONT>this???>, pencolor="#FF0000", peripheries=5, shape=component, style=dashed]' in s.getvalue().decode())
Example no. 40
    def test_newstyle_ruffus(self):
        test_pipeline = Pipeline("test")

        test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
            .follows(mkdir(tempdir))

        test_pipeline.split(task_func=step_4_split_numbers_into_chunks,
                            input=tempdir + "random_numbers.list",
                            output=tempdir + "*.chunks")\
            .follows(create_random_numbers)

        test_pipeline.transform(task_func=step_5_calculate_sum_of_squares,
                                input=step_4_split_numbers_into_chunks,
                                filter=suffix(".chunks"),
                                output=".sums")

        test_pipeline.merge(task_func=step_6_calculate_variance,
                            input=step_5_calculate_sum_of_squares,
                            output=os.path.join(tempdir, "variance.result"))\
            .posttask(lambda: sys.stdout.write("     hooray\n"))\
            .posttask(print_hooray_again, print_whoppee_again,
                      touch_file(os.path.join(tempdir, "done")))

        test_pipeline.run(multiprocess=50, verbose=0)
        output_file = os.path.join(tempdir, "variance.result")
        if not os.path.exists(output_file):
            raise Exception("Missing %s" % output_file)
Example no. 41
    def test_newstyle_task(self):
        """
        Same as above but construct a new pipeline on the fly without decorators
        """
        test_pipeline = Pipeline("test")
        test_pipeline.files(task1, None, tempdir + 'a.1')\
            .follows(mkdir(tempdir))
        test_pipeline.transform(task_func=task2,
                                input=task1,
                                filter=regex(r".*"),
                                output=tempdir + 'b.1')
        test_pipeline.files(task3, task2, tempdir + 'c.1')
        test_pipeline.files(task4, [[None, tempdir + 'd.1'], [None, tempdir + 'e.1']])\
            .follows(task3)
        test_pipeline.files(task5, task4, tempdir + "f.1")
        test_pipeline.run(multiprocess=10, verbose=0)
Example no. 42
def make_pipeline_map(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='genericpipe')
    # Stages are dependent on the state
    stages = Stages(state)

    safe_make_dir('alignments')
    safe_make_dir('metrics')
    safe_make_dir('metrics/amplicon')
    safe_make_dir('metrics/summary')

    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')

    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')

    # Generate amplicon mapping metrics.
    pipeline.transform(
        task_func=stages.generate_amplicon_metrics,
        name='generate_amplicon_metrics',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt',
        extras=['{sample[0]}'])

    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')

    pipeline.transform(task_func=stages.coverage_bed,
                       name='coverage_bed',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.bedtools_hist_all.txt')

    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')

    pipeline.transform(task_func=stages.target_reads,
                       name='target_reads',
                       input=output_from('intersect_bed'),
                       filter=suffix('.intersectbed.bam'),
                       output='.mapped_to_target.txt')

    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')

    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats',
        input=output_from('coverage_bed', 'genome_reads', 'target_reads',
                          'total_reads'),
        #filter=regex(r'.+/(.+BS\d{4,6}.+)\..+\.txt'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'
        ),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'all_sample.summary.txt'])
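    # collate + regex groups inputs by capture group 1 (the sample name), so
    # the four per-sample metric files above feed one generate_stats job per
    # sample; \1 in the output pattern re-inserts that sample name.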

    summary_file = 'all_sample.summary.txt'

    (pipeline.originate(task_func=stages.grab_summary_file,
                        name='grab_summary_file',
                        output=summary_file).follows('generate_stats'))
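    # An originate() task normally has no upstream dependencies; chaining
    # .follows('generate_stats') forces it to wait until the summary file has
    # actually been written by the collate stage above.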

    pipeline.transform(task_func=stages.filter_stats,
                       name='filter_stats',
                       input=output_from('grab_summary_file'),
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])

    return pipeline