def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(start_task, ["a.1", "b.1"])
    test_pipeline.transform(same_file_name_task, start_task,
                            suffix(".1"), ".1")
    test_pipeline.transform(linked_file_name_task, start_task,
                            suffix(".1"), ".linked.1")
    test_pipeline.transform(final_task,
                            [linked_file_name_task, same_file_name_task],
                            suffix(".1"), ".3")
    test_pipeline.run(log_exceptions=True, verbose=0)
def build_pipeline():
    pipe = Pipeline("my_pipeline")
    pipe.originate(
        name="create_three_new_files",
        task_func=create_new_file,
        output=[os.path.join(WORK_DIR, f"file{i}.csv") for i in range(1, 4)],
    )
    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )
    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )
    return pipe
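# A minimal driver sketch for build_pipeline() above. printout() and run() are
# standard Ruffus Pipeline methods (used throughout these snippets); the
# two-process worker count is an arbitrary illustrative choice.
import sys

if __name__ == "__main__":
    pipe = build_pipeline()
    pipe.printout(sys.stdout, verbose=3)   # dry run: list out-of-date jobs
    pipe.run(multiprocess=2, verbose=1)    # execute the create/convert/md5 tasks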
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='md5')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original input files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    # Compute an MD5 checksum for each input file
    pipeline.transform(
        task_func=stages.md5_checksum,
        name='md5_checksum',
        input=output_from('original_files'),
        filter=suffix(''),
        output='.md5')

    return pipeline
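# A hedged sketch of what the md5_checksum stage referenced above might look
# like; the Stages method signature (input path, output path) and the use of
# hashlib are assumptions for illustration, not taken from the source.
import hashlib

class Stages(object):
    # ... other stage methods elided ...
    def md5_checksum(self, md5_in, md5_out):
        '''Write the MD5 digest of md5_in to md5_out.'''
        with open(md5_in, 'rb') as reader:
            digest = hashlib.md5(reader.read()).hexdigest()
        with open(md5_out, 'w') as writer:
            writer.write('%s  %s\n' % (digest, md5_in))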
def test_newstyle_ruffus(self):
    # alternative syntax
    test_pipeline = Pipeline("test")
    test_pipeline.mkdir(data_dir, work_dir)
    test_pipeline.originate(task_func=task1,
                            output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])
    test_pipeline.mkdir(filter=suffix(".1"), output=".dir",
                        output_dir=work_dir)
    test_pipeline.transform(task_func=task2,
                            input=task1,
                            filter=suffix(".1"),
                            output=[".1", ".bak"],
                            extras=["extra.tst", 4, r"orig_dir=\1"],
                            output_dir=work_dir)
    test_pipeline.subdivide(task3, task2, suffix(".1"), r"\1.*.2",
                            [r"\1.a.2", r"\1.b.2"], output_dir=data_dir)
    test_pipeline.transform(task4, task3, suffix(".2"), ".3",
                            output_dir=work_dir)
    test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
    test_pipeline.run(multiprocess=50, verbose=0)

    with open(os.path.join(data_dir, "summary.5")) as ii:
        active_text = ii.read()
    if active_text != expected_active_text:
        raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n"
                        % (expected_active_text, active_text))
def test_transform_with_missing_formatter_args_b(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=[os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
        .mkdir(tempdir)
    test_pipeline.transform(
        task_func=transform_with_missing_formatter_args,
        input=generate_initial_files,
        filter=formatter(),
        output="{path[0]}/{basename[0]}.task1",
        extras=['echo {dynamic_message} > {some_file}'])
    s = StringIO()
    test_pipeline.printout(s, [transform_with_missing_formatter_args],
                           verbose=4, wrap_width=10000, pipeline="test")
    self.assertIn("Unmatched field {dynamic_message}", s.getvalue())

    # log to stream
    s = StringIO()
    logger = t_stream_logger(s)
    test_pipeline.run([transform_with_missing_formatter_args], verbose=5,
                      pipeline="test", logger=logger)
    self.assertIn("Unmatched field {dynamic_message}", s.getvalue())
def test_newstyle_ruffus(self):
    print(" Run pipeline normally...")
    test_pipeline = Pipeline("test")
    test_pipeline.originate(make_start, [tempdir + 'start'])
    test_pipeline.split(split_start, make_start, tempdir + '*.split')
    test_pipeline.subdivide(subdivide_start, split_start, formatter(),
                            tempdir + '{basename[0]}_*.subdivided',
                            tempdir + '{basename[0]}')
    if self.graph_viz_present:
        test_pipeline.printout_graph(tempdir + "flowchart.dot")
        test_pipeline.printout_graph(tempdir + "flowchart.jpg",
                                     target_tasks=[subdivide_start],
                                     forcedtorun_tasks=[split_start],
                                     no_key_legend=True)
        test_pipeline.printout_graph(tempdir + "flowchart.svg",
                                     no_key_legend=False)
        # Unknown format
        try:
            test_pipeline.printout_graph(tempdir + "flowchart.unknown",
                                         no_key_legend=False)
            raise Exception("Failed to throw exception for "
                            "test_pipeline.printout_graph unknown extension ")
        except CalledProcessError as err:
            pass
        test_pipeline.printout_graph(tempdir + "flowchart.unknown", "svg",
                                     no_key_legend=False)
    else:
        test_pipeline.printout_graph(tempdir + "flowchart.dot",
                                     target_tasks=[subdivide_start],
                                     forcedtorun_tasks=[split_start],
                                     no_key_legend=True)
def make_pipeline_call(state):
    # this part of the pipeline will take the summary results of "map"
    # and turn them into gatk and undr_rover vcfs
    pipeline = Pipeline(name='genericpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
def test_newstyle_simpler(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task1, input_file_names,
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.transform(task2, task1, suffix(".1"), ".2",
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.transform(task3, task2, suffix(".2"), ".3",
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.merge(task4, task3, final_file_name,
                        extras=[logger_proxy, logging_mutex])
    # test_pipeline.merge(task4, task3, final_file_name,
    #                     extras={"logger_proxy": logger_proxy,
    #                             "logging_mutex": logging_mutex})
    test_pipeline.run(multiprocess=500, verbose=0)
def test_newstyle_collate(self):
    """
    As above but create pipeline on the fly using object orientated syntax
    rather than decorators
    """
    #
    # Create pipeline on the fly, joining up tasks
    #
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=original_files)\
        .mkdir(tempdir, tempdir + "/test")

    test_pipeline.subdivide(
        task_func=split_fasta_file,
        input=generate_initial_files,
        # match original files
        filter=regex(r".*\/original_(\d+).fa"),
        output=[tempdir + r"/files.split.\1.success",   # flag file for each original file
                tempdir + r"/files.split.\1.*.fa"],     # glob pattern
        extras=[r"\1"])\
        .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

    test_pipeline.transform(task_func=align_sequences,
                            input=split_fasta_file,
                            filter=suffix(".fa"),
                            output=".aln")\
        .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

    test_pipeline.transform(task_func=percentage_identity,
                            input=align_sequences,      # find all results from align_sequences
                            filter=suffix(".aln"),      # replace suffix with:
                            output=[r".pcid",           # .pcid suffix for the result
                                    r".pcid_success"]   # .pcid_success to indicate job completed
                            )\
        .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

    test_pipeline.collate(task_func=combine_results,
                          input=percentage_identity,
                          filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                          output=[tempdir + r"/\1.all.combine_results",
                                  tempdir + r"/\1.all.combine_results_success"])\
        .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

    #
    # Cleanup, printout and run
    #
    self.cleanup_tmpdir()
    s = StringIO()
    test_pipeline.printout(s, [combine_results], verbose=5, wrap_width=10000)
    self.assertTrue(re.search('Job needs update:.*Missing files.*',
                              s.getvalue(), re.DOTALL) is not None)
    test_pipeline.run(verbose=0)
def make_pipeline1(pipeline_name,          # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    # We can change the starting files later using
    #   set_input() for transform etc.
    #   or set_output() for originate
    # But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))
    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")
    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")
    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    # Set the tail task so that users of my sub pipeline can use it as a dependency
    #   without knowing the details of task names
    #
    # Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    # If we try to connect a Pipeline without tail tasks defined, we have to
    #   specify the exact task within the Pipeline.
    # Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
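# A hedged sketch of how a caller might join this sub-pipeline to a downstream
# one using the head/tail tasks set above. make_pipeline2() is hypothetical;
# Pipeline.set_input() forwards new input to a pipeline's head tasks in
# Ruffus's object-oriented syntax.
pipeline1 = make_pipeline1("pipeline1", [tempdir + "a.1", tempdir + "b.1"])
pipeline2 = make_pipeline2("pipeline2")   # hypothetical downstream pipeline
pipeline2.set_input(input=pipeline1)      # tail tasks of pipeline1 feed its head tasks
pipeline2.run(multiprocess=4, verbose=0)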
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))

    return pipeline
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=make_start,
                            output=[tempdir + 'start'])
    test_pipeline.split(task_func=split_start,
                        input=make_start,
                        output=tempdir + '*.split')
    test_pipeline.subdivide(task_func=subdivide_start,
                            input=split_start,
                            filter=formatter(),
                            output=tempdir + '{basename[0]}_*.subdivided',
                            extras=[tempdir + '{basename[0]}'])

    expected_files_after_1_runs = ["start", "0.split", "0_0.subdivided"]
    expected_files_after_2_runs = [
        "1.split", "0_1.subdivided", "1_0.subdivided"]
    expected_files_after_3_runs = [
        "2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"]
    expected_files_after_4_runs = [
        "3.split", "0_3.subdivided", "1_2.subdivided",
        "2_1.subdivided", "3_0.subdivided"]

    print(" 1 Run pipeline normally...")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                              expected_files_after_2_runs)
    print(" 2 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                              expected_files_after_2_runs)
    time.sleep(2)

    print(" 3 Running again with forced tasks to generate more files...")
    test_pipeline.run(forcedtorun_tasks=["test::make_start"],
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs,
                                              expected_files_after_3_runs)
    print(" 4 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs,
                                              expected_files_after_3_runs)
    time.sleep(2)

    print(" 5 Running again with forced tasks to generate even more files...")
    test_pipeline.run(forcedtorun_tasks=make_start,
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs
                                              + expected_files_after_3_runs,
                                              expected_files_after_4_runs)
    print(" 6 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs
                                              + expected_files_after_3_runs,
                                              expected_files_after_4_runs)
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='test_pipeline')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    pipeline.transform(
        task_func=stages.stage1,
        name='stage1',
        input=output_from('original_files'),
        filter=suffix('.0'),
        output='.1')

    pipeline.transform(
        task_func=stages.stage2,
        name='stage2',
        input=output_from('stage1'),
        filter=suffix('.1'),
        output='.2')

    pipeline.transform(
        task_func=stages.stage3,
        name='stage3',
        input=output_from('stage2'),
        filter=suffix('.2'),
        output='.3')

    pipeline.transform(
        task_func=stages.stage4,
        name='stage4',
        input=output_from('stage3'),
        filter=suffix('.3'),
        output='.4')

    pipeline.transform(
        task_func=stages.stage5,
        name='stage5',
        input=output_from('stage4'),
        filter=suffix('.4'),
        output='.5')

    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='twin ion')
    # Get a list of paths to all the MZML files
    mzml_files = state.config.get_option('mzml')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original MZML files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_mzml,
        name='original_mzml',
        output=mzml_files)

    pipeline.transform(
        task_func=stages.resample,
        name='resample',
        input=output_from('original_mzml'),
        filter=suffix('.mzML'),
        output='.resample.mzML')

    pipeline.transform(
        task_func=stages.noise_filter_sgolay,
        name='noise_filter_sgolay',
        input=output_from('resample'),
        filter=suffix('.resample.mzML'),
        output='.denoise.mzML')

    pipeline.transform(
        task_func=stages.baseline_filter,
        name='baseline_filter',
        input=output_from('noise_filter_sgolay'),
        filter=suffix('.denoise.mzML'),
        output='.baseline.mzML')

    pipeline.transform(
        task_func=stages.peak_picker_hires,
        name='peak_picker_hires',
        input=output_from('baseline_filter'),
        filter=suffix('.baseline.mzML'),
        output='.peaks.mzML')

    pipeline.transform(
        task_func=stages.feature_finder_centroid,
        name='feature_finder_centroid',
        input=output_from('peak_picker_hires'),
        filter=suffix('.peaks.mzML'),
        output='.featureXML')

    return pipeline
def make_pipeline_call(state):
    # this part of the pipeline will take the summary results of "map"
    # and turn them into gatk and undr_rover vcfs
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    # #### concatenate undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    # ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
def test_newstyle_no_re_match(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_1, tempdir + "a").mkdir(tempdir)
    test_pipeline.transform(task_2, task_1, regex("b"), "task_2.output")

    save_to_str_logger = t_save_to_str_logger()
    test_pipeline.run(multiprocess=10,
                      logger=save_to_str_logger,
                      verbose=1)
    print(save_to_str_logger.warning_str)
    self.assertTrue("no file names matched" in save_to_str_logger.warning_str)
    print("\n Warning printed out correctly", file=sys.stderr)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=task1,
                            output=[tempdir + 'a.1'] + runtime_files)
    test_pipeline.transform(task2, task1, suffix(".1"), ".2")
    test_pipeline.transform(task_func=task3,
                            input=task2,
                            filter=suffix(".2"),
                            output=".3")
    test_pipeline.transform(task_func=task4,
                            input=runtime_parameter("a"),
                            filter=suffix(".3"),
                            output=".4").follows(task3)
    test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
def test_newstyle_mkdir(self):
    test_pipeline = Pipeline("test")

    test_pipeline.follows(task_which_makes_directories,
                          mkdir(directories),
                          mkdir(unicode(tempdir + "c")),
                          mkdir(unicode(tempdir + "d"),
                                unicode(tempdir + "e")),
                          mkdir(unicode(tempdir + "e")))\
        .posttask(touch_file(unicode(tempdir + "f")))
    test_pipeline.originate(task_which_makes_files,
                            [tempdir + "g", tempdir + "h"])
    test_pipeline.run(multiprocess=10, verbose=0)

    for d in 'abcdefgh':
        fullpath = os.path.join(os.path.dirname(__file__), tempdir, d)
        self.assertTrue(os.path.exists(fullpath))
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    return pipeline
def test_transform_with_missing_formatter_args_b(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=[os.path.join(tempdir, ff + ".tmp") for ff in "abcd"])\
        .mkdir(tempdir)
    test_pipeline.transform(
        task_func=transform_with_missing_formatter_args,
        input=generate_initial_files,
        filter=formatter(),
        output="{path[0]}/{basename[0]}.task1",
        extras=['echo {dynamic_message} > {some_file}'])
    s = StringIO()
    test_pipeline.printout(s, [transform_with_missing_formatter_args],
                           verbose=4, wrap_width=10000, pipeline="test")
    self.assertIn("Missing key = {dynamic_message}", s.getvalue())

    # log to stream
    s = StringIO()
    logger = t_stream_logger(s)
    test_pipeline.run([transform_with_missing_formatter_args], verbose=5,
                      pipeline="test", logger=logger)
    self.assertIn("Missing key = {dynamic_message}", s.getvalue())
def create_pipeline(self):
    # each pipeline has a different name
    global cnt_pipelines
    cnt_pipelines = cnt_pipelines + 1
    test_pipeline = Pipeline("test %d" % cnt_pipelines)

    test_pipeline.originate(
        task_func=generate_initial_files1,
        output=[tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

    test_pipeline.originate(
        task_func=generate_initial_files2,
        output=[tempdir + "e_name.tmp1", tempdir + "f_name.tmp1"])

    test_pipeline.originate(
        task_func=generate_initial_files3,
        output=[tempdir + "g_name.tmp1", tempdir + "h_name.tmp1"])

    test_pipeline.originate(task_func=generate_initial_files4,
                            output=tempdir + "i_name.tmp1")

    test_pipeline.collate(task_func=test_task2,
                          input=[generate_initial_files1,
                                 generate_initial_files2,
                                 generate_initial_files3,
                                 generate_initial_files4],
                          filter=formatter(),
                          output="{path[0]}/all.tmp2")

    test_pipeline.transform(task_func=test_task3,
                            input=test_task2,
                            filter=suffix(".tmp2"),
                            output=".tmp3")

    test_pipeline.transform(task_func=test_task4,
                            input=test_task3,
                            filter=suffix(".tmp3"),
                            output=".tmp4")
    return test_pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)-(?P<tumor>[TN]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}-{tumor[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}', '{tumor[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{sample[0]}_{tumor[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(
        task_func=stages.primary_bam,
        name='primary_bam',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        output='.primary.bam')

    # index bam file
    pipeline.transform(
        task_func=stages.index_sort_bam_picard,
        name='index_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.bam.bai')

    # Clip the primer_seq from BAM File
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
        .follows('index_bam'))

    # ###### GATK VARIANT CALLING - MuTect2 ######

    # Call somatic variants using MuTect2
    pipeline.transform(
        task_func=stages.call_mutect2_gatk,
        name='call_mutect2_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_N.primary.primerclipped.bam'),
        # extras=['{sample[0]}'],
        output='variants/mutect2/{sample[0]}.mutect2.vcf')
    # .follows('clip_bam')

    # ###### GATK VARIANT CALLING - MuTect2 ######

    # -------- VEP ----------
    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('call_mutect2_gatk'),
        filter=suffix('.mutect2.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vcf')
        .follows('call_mutect2_gatk'))

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.mutect2.vt.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vep.vcf')
        .follows('apply_vt'))

    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_vep'),
        filter=suffix('.mutect2.vt.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.annotated.vcf')
        .follows('apply_vep'))

    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(task_func=stages.chrom_intervals_gatk,
                       name='chrom_intervals_gatk',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(task_func=stages.call_variants_gatk,
                       name='call_variants_gatk',
                       input=output_from('print_reads_gatk'),
                       filter=suffix('.sort.dedup.realn.recal.bam'),
                       output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_variants_gatk'),
                   output='COMPLEXO.mergedgvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.mergedgvcf.vcf'),
                       output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.genotyped.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.snp_recal', 'COMPLEXO.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['COMPLEXO.indel_recal', 'COMPLEXO.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['COMPLEXO.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(task_func=stages.select_variants_gatk,
                       name='select_variants_gatk',
                       input=output_from('combine_variants_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.selected.vcf')

    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="rnapipe")
    # Get the details of the experiment (samples, config, inputs, ...)
    experiment = Experiment(state)
    # Get reference file locations
    reference_genome = state.config.get_options("reference_genome")
    gene_ref = state.config.get_options("gene_ref")
    # Print out samples
    sample_text = [s.info() for s in experiment.sample_list]
    logging.info("Analysis samples:\n{}".format("\n".join(sample_text)))
    # Stages are dependent on the state. Experiment object is also passed so
    # we can access metadata later.
    stages = PipelineStages(state, experiment=experiment)
    # Make directories
    output_dir = get_output_paths(
        results_dir=state.config.get_options("results_dir"),
        default_paths=OUTPUT_PATHS)
    make_output_dirs(output_dir)
    logging.debug(output_dir)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=experiment.R1_files)

    # Create reference index for alignment
    if not experiment.index_provided:
        pipeline.originate(task_func=stages.do_nothing,
                           name="reference_genome",
                           output=reference_genome)
        if experiment.alignment_method == "star":
            # Create reference index for STAR
            pipeline.transform(task_func=stages.create_star_index,
                               name="create_star_index",
                               input=output_from("reference_genome"),
                               filter=formatter(".*"),
                               add_inputs=add_inputs(gene_ref),
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]),
                               extras=[output_dir["star_index"]])
        elif experiment.alignment_method == "hisat2":
            # Create reference index for HISAT2
            hisat_basename = path.join(output_dir["hisat_index"], "genome")
            pipeline.transform(
                task_func=stages.create_hisat_index,
                name="create_hisat_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(output_dir["hisat_index"],
                                      ["genome.1.ht2", "genome.2.ht2"]),
                extras=[hisat_basename])
    else:
        # Don't create index if index is supplied
        if experiment.alignment_method == "star":
            output_dir["star_index"] = state.config.get_options("star_index")
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_star_index",
                               output=path_list_join(
                                   output_dir["star_index"],
                                   ["SA", "Genome", "genomeParameters.txt"]))
        elif experiment.alignment_method == "hisat2":
            hisat_basename = state.config.get_options("hisat_index")
            output_dir["hisat_index"] = path.dirname(hisat_basename)
            prefix = path.basename(hisat_basename)
            pipeline.originate(task_func=stages.do_nothing,
                               name="create_hisat_index",
                               output=path_list_join(
                                   output_dir["hisat_index"],
                                   ["{prefix}.1.ht2".format(prefix=prefix),
                                    "{prefix}.2.ht2".format(prefix=prefix)]))

    # Pre-trim FastQC
    if experiment.paired_end:
        pipeline.transform(
            task_func=stages.fastqc,
            name="fastqc",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(
                output_dir["fastqc"],
                ["{sample[0]}_R1_fastqc.zip", "{sample[0]}_R2_fastqc.zip"]),
            extras=[output_dir["fastqc"]])
    else:
        pipeline.transform(task_func=stages.fastqc,
                           name="fastqc",
                           input=output_from("original_fastqs"),
                           filter=suffix(".fastq.gz"),
                           output="_fastqc.zip",
                           output_dir=output_dir["fastqc"],
                           extras=[output_dir["fastqc"]])

    # Trimmomatic
    if experiment.trim_reads and experiment.paired_end:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            # Get R1 file and the corresponding R2 file
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(output_dir["seq"],
                                  ["{sample[0]}_R1.trimmed.fastq.gz",
                                   "{sample[0]}_R2.trimmed.fastq.gz"]),
            extras=path_list_join(output_dir["seq"],
                                  ["{sample[0]}_R1.unpaired.fastq.gz",
                                   "{sample[0]}_R2.unpaired.fastq.gz"]))
    elif experiment.trim_reads:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            output=path.join(output_dir["seq"],
                             "{sample[0]}_R1.trimmed.fastq.gz"))

    # Post-trim FastQC
    if experiment.paired_end and experiment.trim_reads:
        pipeline.transform(
            task_func=stages.fastqc,
            name="post_trim_fastqc",
            input=output_from("trim_reads"),
            filter=formatter(
                ".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.trimmed.fastq.gz"),
            output=path_list_join(output_dir["post_trim_fastqc"],
                                  ["{sample[0]}_R1.trimmed_fastqc.gz",
                                   "{sample[0]}_R2.trimmed_fastqc.gz"]),
            extras=["results/qc/post_trim_fastqc/"])
    elif experiment.trim_reads:
        pipeline.transform(task_func=stages.fastqc,
                           name="post_trim_fastqc",
                           input=output_from("trim_reads"),
                           filter=suffix(".trimmed.fastq.gz"),
                           output=".trimmed_fastqc.gz",
                           output_dir=output_dir["post_trim_fastqc"],
                           extras=[output_dir["post_trim_fastqc"]])

    # If there are technical replicates, each is mapped independently.
    # This is so each technical replicate maintains a separate read group.
    if experiment.alignment_method == "star":
        align_task_name = "star_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)"
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam"
                       % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
             ).follows("create_star_index")
        else:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)"
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam"
                       % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
             ).follows("create_star_index")

    if experiment.alignment_method == "hisat2":
        align_task_name = "hisat_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("trim_reads"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)"
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam"
                       % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
             ).follows("create_hisat_index")
        else:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("original_fastqs"),
                filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)"
                                 "_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam"
                       % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
             ).follows("create_hisat_index")

    # Sort BAM by coordinates
    pipeline.transform(
        task_func=stages.sort_bam_by_coordinate,
        name="sort_bam_by_coordinate",
        input=output_from(align_task_name),
        filter=formatter(
            ".+/(?P<sample>[a-zA-Z0-9-_]+)\.(?P<method>(star|hisat2))\..*bam"),
        output=["{path[0]}/{sample[0]}.{method[0]}.sorted.bam",
                "{path[0]}/{sample[0]}.{method[0]}.sorted.bam.bai"])

    # Merge files with the same sample name
    if experiment.multiple_technical_replicates:
        pipeline.collate(
            task_func=stages.merge_bams,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))
    else:
        pipeline.transform(
            task_func=stages.create_symlinks,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))

    # Sort BAM by name for counting features
    pipeline.transform(task_func=stages.sort_bam_by_name,
                       name="sort_bam_by_name",
                       input=output_from("merge_bams"),
                       filter=suffix(".bam"),
                       output=".nameSorted.bam")

    # Count features with HTSeq-count
    pipeline.transform(task_func=stages.htseq_count,
                       name="htseq_count",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".htseq.txt")

    # Count features with featureCounts
    pipeline.transform(task_func=stages.featurecounts,
                       name="featurecounts",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".featureCounts.txt")

    # TODO: add multiqc step

    # # Stringtie assembly
    # pipeline.transform(
    #     task_func=stages.stringtie_assembly,
    #     name="stringtie_assembly",
    #     input=output_from("merge_bams"),
    #     filter=suffix(".bam"),
    #     output_dir=output_dir["stringtie_assembly"],
    #     output=".gtf")

    # Stringtie estimates
    pipeline.transform(
        task_func=stages.stringtie_estimates,
        name="stringtie_estimates",
        input=output_from("merge_bams"),
        filter=formatter(
            ".+/(?P<sm>[a-zA-Z0-9-]+)\.(?P<method>(star|hisat2)).bam"),
        output=path_list_join(output_dir["stringtie_estimates"],
                              ["{sm[0]}/{sm[0]}.gtf", "{sm[0]}/e_data.ctab"]))

    # Stringtie counts
    pipeline.collate(
        task_func=stages.stringtie_prepDE,
        name="stringtie_prepDE",
        input=output_from("stringtie_estimates"),
        filter=formatter(".+\.gtf"),
        output=path_list_join(
            output_dir["stringtie_estimates"],
            ["gene_count_matrix.csv", "transcript_count_matrix.csv"]))

    return pipeline
def make_pipeline(state):
    """Build the pipeline by constructing stages and connecting them together"""
    # Build an empty pipeline
    pipeline = Pipeline(name="crpipe")
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option("fastqs")
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name="original_fastqs",
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #     task_func=stages.original_reference,
    #     name='original_reference',
    #     output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=suffix(".fastq.gz"),
        output="_fastqc",
    )

    # Index the reference using BWA
    # pipeline.transform(
    #     task_func=stages.index_reference_bwa,
    #     name='index_reference_bwa',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #     name='index_reference_samtools',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name="align_bwa",
        input=output_from("original_fastqs"),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz"),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=["{sample[0]}"],
        # The output file name is the sample name with a .bam extension.
        output="{path[0]}/{sample[0]}.bam",
    )

    # Sort alignment with sambamba
    pipeline.transform(
        task_func=stages.sort_bam_sambamba,
        name="sort_alignment",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.sorted.bam",
    )

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name="extract_genes_bedtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam",
    )

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name="extract_chromosomes_samtools",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.chroms.bam",
    )

    # Index the MMR genes bam file with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_mmr_alignment",
        input=output_from("extract_genes_bedtools"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).mmr.bam"),
        output="{path[0]}/{sample[0]}.mmr.bam.bai",
    )

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #     task_func=stages.alignment_coverage_gatk,
    #     name='alignment_coverage_gatk',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs([reference_file]),
    #     output='{path[0]}/{sample[0]}.coverage_summary',
    #     extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name="index_alignment",
        input=output_from("sort_alignment"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
        output="{path[0]}/{sample[0]}.sorted.bam.bai",
    )

    # Generate alignment stats with bamtools
    pipeline.transform(
        task_func=stages.bamtools_stats,
        name="bamtools_stats",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.stats.txt",
    )

    # Extract the discordant paired-end alignments
    pipeline.transform(
        task_func=stages.extract_discordant_alignments,
        name="extract_discordant_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.discordants.unsorted.bam",
    )

    # Extract split-read alignments
    pipeline.transform(
        task_func=stages.extract_split_read_alignments,
        name="extract_split_read_alignments",
        input=output_from("align_bwa"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"),
        output="{path[0]}/{sample[0]}.splitters.unsorted.bam",
    )

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_discordants",
        input=output_from("extract_discordant_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.discordants"],
        output="{path[0]}/{sample[0]}.discordants.bam",
    )

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_discordants',
    #     input=output_from('sort_discordants'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #     output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_splitters",
        input=output_from("extract_split_read_alignments"),
        filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam"),
        extras=["{path[0]}/{sample[0]}.splitters"],
        output="{path[0]}/{sample[0]}.splitters.bam",
    )

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_splitters',
    #     input=output_from('sort_splitters'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #     output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (
        pipeline.transform(
            task_func=stages.structural_variants_lumpy,
            name="structural_variants_lumpy",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            add_inputs=add_inputs(["{path[0]}/{sample[0]}.splitters.bam",
                                   "{path[0]}/{sample[0]}.discordants.bam"]),
            output="{path[0]}/{sample[0]}.lumpy.vcf",
        )
        .follows("index_alignment")
        .follows("sort_splitters")
        .follows("sort_discordants")
    )

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #     task_func=stages.genotype_svtyper,
    #     name='genotype_svtyper',
    #     input=output_from('structural_variants_lumpy'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam',
    #                            '{path[0]}/{sample[0]}.splitters.bam']),
    #     output='{path[0]}/{sample[0]}.svtyper.vcf')
    #     .follows('align_bwa')
    #     .follows('sort_splitters')
    #     .follows('index_alignment')
    #     .follows('index_splitters')
    #     .follows('index_discordants'))

    # Call SVs with Socrates
    (
        pipeline.transform(
            task_func=stages.structural_variants_socrates,
            name="structural_variants_socrates",
            input=output_from("sort_alignment"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"),
            # output goes to {path[0]}/socrates/
            output="{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt",
            extras=["{path[0]}"],
        )
    )

    # Call DELs with DELLY
    pipeline.merge(
        task_func=stages.deletions_delly,
        name="deletions_delly",
        input=output_from("sort_alignment"),
        output="delly.DEL.vcf",
    )

    # Call DUPs with DELLY
    pipeline.merge(
        task_func=stages.duplications_delly,
        name="duplications_delly",
        input=output_from("sort_alignment"),
        output="delly.DUP.vcf",
    )

    # Call INVs with DELLY
    pipeline.merge(
        task_func=stages.inversions_delly,
        name="inversions_delly",
        input=output_from("sort_alignment"),
        output="delly.INV.vcf",
    )

    # Call TRAs with DELLY
    pipeline.merge(
        task_func=stages.translocations_delly,
        name="translocations_delly",
        input=output_from("sort_alignment"),
        output="delly.TRA.vcf",
    )

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #
task_func=stages.gustaf_mate_joining, # name='gustaf_mate_joining', # input=output_from('fastq_to_fasta'), # # Match the R1 (read 1) FASTA file and grab the path and sample name. # # This will be the first input to the stage. # # We assume the sample name may consist of only alphanumeric # # characters. # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'), # # Add one more input to the stage: # # 1. The corresponding R2 FASTA file # add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']), # output='{path[0]}/{sample[0]}.joined_mates.fasta') # Call structural variants with pindel # (pipeline.transform( # task_func=stages.structural_variants_pindel, # name='structural_variants_pindel', # input=output_from('sort_alignment'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'), # add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]), # output='{path[0]}/{sample[0]}.pindel') # .follows('index_reference_bwa') # .follows('index_reference_samtools')) return pipeline
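The lumpy stage above wraps its transform in parentheses so that extra .follows() dependencies can be chained on: the splitters and discordants BAMs are injected via add_inputs, so Ruffus would not otherwise know to finish producing them first. A minimal, self-contained sketch of that pattern, with hypothetical task and file names:

from ruffus import Pipeline, formatter, suffix, add_inputs

def make_sorted_bam(output_file):
    open(output_file, "w").close()

def make_splitters(input_file, output_file):
    open(output_file, "w").close()

def call_variants(input_files, output_file):
    # Receives [sorted bam, splitters bam] because of add_inputs below.
    open(output_file, "w").close()

demo = Pipeline("follows_demo")
demo.originate(task_func=make_sorted_bam, output=["sample.sorted.bam"])
demo.transform(task_func=make_splitters, input=make_sorted_bam,
               filter=suffix(".sorted.bam"), output=".splitters.bam")
(demo.transform(
    task_func=call_variants,
    input=make_sorted_bam,
    filter=formatter(r"(?P<sample>.+)\.sorted\.bam"),
    # The splitters file is an extra input, not the task's trigger ...
    add_inputs=add_inputs("{sample[0]}.splitters.bam"),
    output="{sample[0]}.lumpy.vcf")
 # ... so the dependency on the task that makes it is declared explicitly.
 .follows(make_splitters))
# demo.run(verbose=0)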
def make_pipeline_map(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='haloplexpipe') # Get a list of paths to all the FASTQ files #fastq_files = state.config.get_option('fastqs') fastq_files = glob.glob("fastqs/*.gz") # Stages are dependent on the state stages = Stages(state) safe_make_dir('alignments') safe_make_dir('processed_fastqs') safe_make_dir('metrics') safe_make_dir('metrics/amplicon') safe_make_dir('metrics/summary') safe_make_dir('metrics/pass_samples') safe_make_dir('variants') safe_make_dir('variants/gatk') safe_make_dir('variants/vardict') # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) pipeline.transform( task_func=stages.run_surecalltrimmer, name='run_surecalltrimmer', input=output_from('original_fastqs'), filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'), add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'), #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'), #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'), extras=['{sample[0]}'], # output only needs to know about one file to track progress of the pipeline, but the second certainly exists after this step. output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz') #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz') # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('run_surecalltrimmer'), filter=formatter( 'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz' ), add_inputs=add_inputs( 'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'), #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'), #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'), extras=['{sample[0]}'], output='alignments/{sample[0]}.bam') # Run locatit from agilent. 
this should produce sorted bam files, so no sorting needed at the next step pipeline.collate(task_func=stages.run_locatit, name='run_locatit', input=output_from('align_bwa', 'original_fastqs'), filter=regex(r'.+/(.+_L\d\d\d).+'), output=r'alignments/\1.locatit.bam') pipeline.transform(task_func=stages.sort_bam, name='sort_bam', input=output_from('run_locatit'), filter=suffix('.locatit.bam'), output='.sorted.locatit.bam') # # # # # Metrics stages # # # # # # generate mapping metrics (post locatit) pipeline.transform( task_func=stages.generate_amplicon_metrics, name='generate_amplicon_metrics', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt', extras=['{sample[0]}']) # Intersect the bam file with the region of interest pipeline.transform( task_func=stages.intersect_bed, name='intersect_bed', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/summary/{sample[0]}.intersectbed.bam') # Calculate coverage metrics from the intersected bam file pipeline.transform(task_func=stages.coverage_bed, name='coverage_bed', input=output_from('intersect_bed'), filter=suffix('.intersectbed.bam'), output='.bedtools_hist_all.txt') # Count the number of mapped reads pipeline.transform( task_func=stages.genome_reads, name='genome_reads', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/summary/{sample[0]}.mapped_to_genome.txt') # Count the number of on-target reads pipeline.transform(task_func=stages.target_reads, name='target_reads', input=output_from('intersect_bed'), filter=suffix('.intersectbed.bam'), output='.mapped_to_target.txt') # Count the number of total reads pipeline.transform( task_func=stages.total_reads, name='total_reads', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/summary/{sample[0]}.total_raw_reads.txt') # Generate summary metrics from the stats files produces pipeline.collate( task_func=stages.generate_stats, name='generate_stats', input=output_from('coverage_bed', 'genome_reads', 'target_reads', 'total_reads'), #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'), filter=regex( r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt' ), output=r'metrics/summary/all_sample.summary.\1.txt', extras=[r'\1', 'all_sample.summary.txt']) # # # # # Metrics stages end # # # # # # # # # # Checking metrics and calling # # # # # # Originate to set the location of the metrics summary file (pipeline.originate( task_func=stages.grab_summary_file, name='grab_summary_file', output='all_sample.summary.txt').follows('generate_stats')) # Awk command to produce a list of bam files passing filters pipeline.transform(task_func=stages.filter_stats, name='filter_stats', input=output_from('grab_summary_file'), filter=suffix('.summary.txt'), output='.passed.summary.txt') # Touch passed bams to the pass_samples folder and pass the glob of that folder to HaplotypeCaller pipeline.subdivide(name='passed_filter_files', task_func=stages.read_samples, input=output_from('filter_stats'), filter=formatter(), output="metrics/pass_samples/*.bam") # Call variants using GATK (pipeline.transform( task_func=stages.call_haplotypecaller_gatk, name='call_haplotypecaller_gatk', input=output_from('passed_filter_files'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'), 
output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam')) # Call variants with vardict (pipeline.transform( task_func=stages.run_vardict, name='run_vardict', input=output_from('passed_filter_files'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'), output='variants/vardict/{sample[0]}.vcf', extras=['{sample[0]}']).follows('sort_bam')) pipeline.transform( task_func=stages.sort_vcfs, name='sort_vcfs', input=output_from('run_vardict'), filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'), output='variants/vardict/{sample[0]}.sorted.vcf.gz') pipeline.transform(task_func=stages.index_vcfs, name='index_vcfs', input=output_from('sort_vcfs'), filter=suffix('.sorted.vcf.gz'), output='.sorted.vcf.gz.tbi') return (pipeline)
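The run_locatit step earlier in make_pipeline_map relies on collate: every input whose regex capture group (the sample/lane prefix) is identical is delivered to one job. A small self-contained sketch of that grouping behaviour, with hypothetical file names:

from ruffus import Pipeline, regex

def make_file(output_file):
    open(output_file, "w").close()

def group_by_prefix(input_files, output_file):
    # All inputs sharing the captured "\1" prefix arrive together as a list.
    with open(output_file, "w") as out:
        out.write("\n".join(sorted(input_files)))

demo = Pipeline("collate_demo")
demo.originate(task_func=make_file,
               output=["S1_L001.bam", "S1_L001_R1.fastq.gz",
                       "S2_L001.bam", "S2_L001_R1.fastq.gz"])
demo.collate(task_func=group_by_prefix,
             input=make_file,
             filter=regex(r"(.+_L\d\d\d).+"),
             output=r"\1.locatit.txt")   # one output per shared prefix
# demo.run(verbose=0)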
final task """ with open(tempdir + "jobs.start", "a") as oo: oo.write('job = %s\n' % json.dumps([infiles, outfiles])) test_job_io(infiles, outfiles, extra_params) with open(tempdir + "jobs.finish", "a") as oo: oo.write('job = %s\n' % json.dumps([infiles, outfiles])) # # Use equivalent but new style syntax # test_pipeline = Pipeline("test") test_pipeline.originate(task_func = task1, output = [tempdir + d for d in ('a.1', 'b.1', 'c.1')])\ .follows(mkdir(tempdir))\ .posttask(lambda: do_write(test_file, "Task 1 Done\n")) test_pipeline.transform(task_func = task2, input = task1, filter = suffix(".1"), output = ".2") \ .posttask(lambda: do_write(test_file, "Task 2 Done\n")) test_pipeline.transform(task3, task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')\ .posttask(lambda: do_write(test_file, "Task 3 Done\n")) test_pipeline.transform(task4, tempdir + "*.1", suffix(".1"), ".4")\ .follows(task1)\ .posttask(lambda: do_write(test_file, "Task 4 Done\n"))\
# ___________________________________________________________________________ # # check_regex_out_of_range_regex_reference_error_task # ___________________________________________________________________________ def check_regex_out_of_range_regex_reference_error_task(infiles, outfile, prefix1, prefix2, extension): raise Exception("Should blow up first") test_pipeline = Pipeline("test") test_pipeline.originate(task_func=generate_initial_files1, output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcdefghi"]) test_pipeline.transform(task_func=check_regex_task, input=generate_initial_files1, filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"), output=r"\1/\g<PREFIX>\3.tmp2", # output file extras=[r"\2", # extra: prefix = \2 r"\g<PREFIX>", # extra: prefix = \2 r"\4"]) # extra: extension test_pipeline.transform(task_func=check_regex_unmatched_task, input=generate_initial_files1, filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"), output=r"\1/\g<PREFIXA>\3.tmp2", # output file extras=[r"\2", # extra: prefix = \2 r"\g<PREFIX>", # extra: prefix = \2 r"\4"]) # extra: extension
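The output and extras strings in check_regex_task above are ordinary regular-expression substitution templates. A quick illustration with plain re of how they resolve for one of the generated files (the directory here is hypothetical); Ruffus performs the equivalent substitution when it builds the job's output and extra arguments:

import re

pattern = r"(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"
filename = "/tmp/test_dir/a_name.tmp1"

print(re.sub(pattern, r"\1/\g<PREFIX>\3.tmp2", filename))  # /tmp/test_dir/a_name.tmp2
print(re.sub(pattern, r"\2", filename))                    # a      (the prefix)
print(re.sub(pattern, r"\4", filename))                    # .tmp1  (the extension)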
oo.write('job = %s\n' % json.dumps([infiles, outfiles])) test_job_io(infiles, outfiles, extra_params) with open(tempdir + "jobs.finish", "a") as oo: oo.write('job = %s\n' % json.dumps([infiles, outfiles])) # # Use equivalent but new style syntax # test_pipeline = Pipeline("test") test_pipeline.originate(task_func = task1, output = [tempdir + d for d in ('a.1', 'b.1', 'c.1')])\ .follows(mkdir(tempdir))\ .posttask(lambda: do_write(test_file, "Task 1 Done\n")) test_pipeline.transform(task_func = task2, input = task1, filter = suffix(".1"), output = ".2") \ .posttask(lambda: do_write(test_file, "Task 2 Done\n")) test_pipeline.transform(task3, task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')\ .posttask(lambda: do_write(test_file, "Task 3 Done\n")) test_pipeline.transform(task4, tempdir + "*.1", suffix(".1"), ".4")\ .follows(task1)\ .posttask(lambda: do_write(test_file, "Task 4 Done\n"))\
def check_combinations_with_replacement3_merged_task(infiles, outfile): with open(outfile, "w") as p: for infile in sorted(infiles): with open(infile) as ii: p.write(ii.read()) def cleanup_tmpdir(): os.system('rm -f %s %s' % (os.path.join(tempdir, '*'), RUFFUS_HISTORY_FILE)) test_pipeline1 = Pipeline("test1") test_pipeline2 = Pipeline("test2") gen_task1 = test_pipeline1.originate(task_func=generate_initial_files1, name="WOWWWEEE", output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"]) test_pipeline1.originate(task_func=generate_initial_files2, output=[tempdir + "/e_name.tmp1", tempdir + "/f_name.tmp1"]) test_pipeline1.originate(task_func=generate_initial_files3, output=[tempdir + "/g_name.tmp1", tempdir + "/h_name.tmp1"]) test_pipeline1.product(task_func=check_product_task, input=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"], filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"), input2=generate_initial_files2, filter2=formatter(), input3=generate_initial_files3, filter3=formatter(r"tmp1$"), output="{path[0][0]}/{FILE_PART[0][0]}.{basename[1][0]}.{basename[2][0]}.tmp2", extras=["{basename[0][0][0]}{basename[1][0][0]}{basename[2][0][0]}", # extra: prefices only (abcd etc)
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='methylation_pipeline') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = PipelineStages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Run bismark genome preparation on the reference genome pipeline.originate(task_func=stages.bismark_genome_prepare, name='bismark_genome_prepare', output='reference/Bisulfite_Genome') # Run FASTQC on the input fastq files pipeline.transform( task_func=stages.fastqc, name='fastqc', input=output_from('original_fastqs'), filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'), output='{path[0]}/{filename[0]}_fastqc') # Run bismark on the input fastq files (pipeline.transform( task_func=stages.bismark, name='bismark', input=output_from('original_fastqs'), filter=formatter( '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+).fastq.gz'), add_inputs=add_inputs('{path[0]}/{filename[0]}_R2_{num[0]}.fastq.gz'), extras=['{path[0]}/bismark_output/'], output= '{path[0]}/bismark_output/{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz' )).follows('bismark_genome_prepare') # Run bismark methylation extractor on the bismark output pipeline.transform( task_func=stages.bismark_methylation_extractor, name='bismark_methylation_extractor', input=output_from('bismark'), filter=formatter( '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+)_bismark_bt2_pe.sam.gz' ), extras=['{path[0]}'], output= '{path[0]}/CpG_context_{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz.txt' ) # Run methpt on the bismark methylation extractor output pipeline.transform( task_func=stages.methpat, name='methpat', input=output_from('bismark_methylation_extractor'), filter=formatter('(?P<path>.+)/CpG_context_(?P<filename>.+)'), extras=['{path[0]}', '{filename[0]}'], output='{path[0]}/CpG_context_{filename[0]}.methpat.html') return pipeline
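The bismark stage above pairs each R1 file with its R2 mate purely through the named groups in the formatter pattern. A plain-re illustration (not Ruffus itself, and with a hypothetical input path) of how those groups produce the add_inputs and output strings:

import re

m = re.search(r"(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+).fastq.gz",
              "fastqs/sampleA_R1_001.fastq.gz")
fields = m.groupdict()

r2 = "{path}/{filename}_R2_{num}.fastq.gz".format(**fields)
out = "{path}/bismark_output/{filename}_R1_{num}_bismark_bt2_pe.sam.gz".format(**fields)
print(r2)    # fastqs/sampleA_R2_001.fastq.gz
print(out)   # fastqs/bismark_output/sampleA_R1_001_bismark_bt2_pe.sam.gz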
def test_combinations_with_replacement3_merged_task(infiles, outfile): with open(outfile, "w") as p: for infile in sorted(infiles): with open(infile) as ii: p.write(ii.read()) def cleanup_tmpdir(): os.system('rm -f %s %s' % (os.path.join(tempdir, '*'), RUFFUS_HISTORY_FILE)) test_pipeline1 = Pipeline("test1") test_pipeline2 = Pipeline("test2") gen_task1 = test_pipeline1.originate( task_func=generate_initial_files1, name="WOWWWEEE", output=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"]) test_pipeline1.originate( task_func=generate_initial_files2, output=[tempdir + "/e_name.tmp1", tempdir + "/f_name.tmp1"]) test_pipeline1.originate( task_func=generate_initial_files3, output=[tempdir + "/g_name.tmp1", tempdir + "/h_name.tmp1"]) test_pipeline1.product( task_func=test_product_task, input=[tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"], filter=formatter(".*/(?P<FILE_PART>.+).tmp1$"), input2=generate_initial_files2, filter2=formatter(), input3=generate_initial_files3, filter3=formatter(r"tmp1$"),
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='thepipeline') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate( task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'), # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz filter=formatter( '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz add_inputs=add_inputs( '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'], # extras=['{sample[0]}'], # The output file name is the sample name with a .bam extension. output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam') # Sort the BAM file using Picard pipeline.transform( task_func=stages.sort_bam_picard, name='sort_bam_picard', input=output_from('align_bwa'), filter=suffix('.bam'), output='.sort.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), # XXX should make metricsup an extra output? 
output=['.sort.dedup.bam', '.metricsdup']) # Local realignment using GATK # Generate RealignerTargetCreator using GATK pipeline.transform( task_func=stages.realigner_target_creator, name='realigner_target_creator', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('realigner_target_creator'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), filter=formatter( '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'), output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'), # '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'), # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Merge lane bams to sample bams pipeline.collate( task_func=stages.merge_sample_bams, name='merge_sample_bams', filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'), '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'), # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'), input=output_from('print_reads_gatk'), output='alignments/{sample[0]}/{sample[0]}.merged.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard2', input=output_from('merge_sample_bams'), # filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'), filter=suffix('.merged.bam'), # XXX should make metricsup an extra output? 
output=['.merged.dedup.bam', '.metricsdup']) # Local realignment2 using GATK # Generate RealignerTargetCreator using GATK pipeline.transform( task_func=stages.realigner_target_creator, name='realigner_target_creator2', input=output_from('mark_duplicates_picard2'), filter=suffix('.dedup.bam'), output='.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk2', input=output_from('realigner_target_creator2'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'), # filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'), output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam') .follows('mark_duplicates_picard2')) # Call variants using GATK pipeline.transform( task_func=stages.call_haplotypecaller_gatk, name='call_haplotypecaller_gatk', input=output_from('local_realignment_gatk2'), # filter=suffix('.merged.dedup.realn.bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'), output='variants/{sample[0]}.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_haplotypecaller_gatk'), output='variants/ALL.combined.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.combined.vcf'), output='.raw.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), add_inputs=add_inputs( ['ALL.indel_recal', 'ALL.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['ALL.recal_INDEL.vcf']), # output='.combined.vcf') output='ALL.raw.vqsr.vcf') .follows('apply_indel_recalibrate_gatk')) # # # Select variants using GATK # pipeline.transform( # task_func=stages.select_variants_gatk, # name='select_variants_gatk', # input=output_from('combine_variants_gatk'), # filter=suffix('.combined.vcf'), # output='.selected.vcf') return pipeline
def make_pipeline_process(state): #originate process pipeline state # Define empty pipeline pipeline = Pipeline(name='haloplexpipe') # Get a list of paths to all the directories to be combined for variant calling run_directories = state.config.get_option('runs') #grab files from each of the processed directories in "runs" gatk_files = [] for directory in run_directories: gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf')) stages = Stages(state) #dummy stage to take the globbed outputs of each run that is to be processed pipeline.originate(task_func=stages.glob_gatk, name='glob_gatk', output=gatk_files) # Combine G.VCF files for all samples using GATK pipeline.merge(task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('glob_gatk'), output='processed/gatk/ALL.combined.vcf') # Genotype G.VCF files using GATK pipeline.transform(task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.combined.vcf'), output='.raw.vcf') # Apply GT filters to genotyped vcf pipeline.transform(task_func=stages.genotype_filter_gatk, name='genotype_filter_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output='.raw.gt-filter.vcf') # Decompose and normalise multiallelic sites pipeline.transform(task_func=stages.vt_decompose_normalise, name='vt_decompose_normalise', input=output_from('genotype_filter_gatk'), filter=suffix('.raw.gt-filter.vcf'), output='.raw.gt-filter.decomp.norm.vcf') # Annotate VCF file using GATK pipeline.transform(task_func=stages.variant_annotator_gatk, name='variant_annotator_gatk', input=output_from('vt_decompose_normalise'), filter=suffix('.raw.gt-filter.decomp.norm.vcf'), output='.raw.gt-filter.decomp.norm.annotate.vcf') # Filter vcf pipeline.transform( task_func=stages.gatk_filter, name='gatk_filter', input=output_from('variant_annotator_gatk'), filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'), output='.raw.gt-filter.decomp.norm.annotate.filter.vcf') #Apply VEP pipeline.transform( task_func=stages.apply_vep, name='apply_vep', input=output_from('gatk_filter'), filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'), output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf') ####### vardict stuff vardict_files = [] for directory in run_directories: vardict_files.extend( glob.glob(directory + '/variants/vardict/*sorted.vcf.gz')) #dummy stage to take the globbed outputs of each run that is to be processed pipeline.originate(task_func=stages.glob_vardict, name='glob_vardict', output=vardict_files) safe_make_dir('processed/vardict') #concatenate all vardict vcfs pipeline.merge(task_func=stages.concatenate_vcfs, name='concatenate_vcfs', input=output_from('glob_vardict'), output='processed/vardict/combined.vcf.gz') pipeline.transform(task_func=stages.vt_decompose_normalise, name='vt_decompose_normalise_vardict', input=output_from('concatenate_vcfs'), filter=suffix('.vcf.gz'), output='.decomp.norm.vcf.gz') pipeline.transform(task_func=stages.index_vcfs, name='index_final_vcf', input=output_from('vt_decompose_normalise_vardict'), filter=suffix('.decomp.norm.vcf.gz'), output='.decomp.norm.vcf.gz.tbi') (pipeline.transform( task_func=stages.apply_vep, name='apply_vep_vardict', input=output_from('vt_decompose_normalise_vardict'), filter=suffix('.decomp.norm.vcf.gz'), output='.decomp.norm.vep.vcf').follows('index_final_vcf')) return pipeline
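safe_make_dir is called throughout these haloplexpipe builders but is not defined in this excerpt. Assuming it simply creates a directory tree when it is missing and does nothing otherwise, a one-line sketch:

import os

def safe_make_dir(path):
    # Assumed behaviour: create the directory (and parents) unless it already exists.
    os.makedirs(path, exist_ok=True)

safe_make_dir("processed/vardict")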
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name="radpipe") # Stages are dependent on the state stages = PipelineStages(state) # Get a list of library objects. libraries = parse_libraries( libraries=state.config.get_options("libraries")) # Get a list of input files input_files = [l.files for l in libraries] # input_files = [item for sublist in input_files for item in sublist] state.logger.info("Input files: " + str(input_files)) # Get a list of all samples for each library samples_dict = OrderedDict() for l in libraries: samples_dict[l.name] = l.samples state.logger.debug("Samples: " + str(samples_dict)) # Make sure that there are no duplicate samples sample_list = [ item for sublist in samples_dict.values() for item in sublist ] sample_counts = Counter(sample_list) for sample in sample_counts: if sample_counts[sample] > 1: print("Sample {} appears {} times in the barcodes files. " "Sample names must be unique".format(sample, sample_counts[sample])) sys.exit(radpipe.error_codes.INVALID_INPUT_FILE) # Define output directories output_dir = get_output_paths(state) state.logger.debug(output_dir) # Allow multiple comma-separated tasks if len(state.options.target_tasks) == 1: state.options.target_tasks = state.options.target_tasks[0].split(",") if len(state.options.forced_tasks) == 1: state.options.forced_tasks = state.options.forced_tasks[0].split(",") state.logger.debug("Target tasks: " + str(state.options.target_tasks)) state.logger.debug("Forced tasks: " + str(state.options.forced_tasks)) # Check if alignment_method is valid alignment_method = state.config.get_options( "alignment_method").strip().lower() if alignment_method not in ["bwa mem", "bowtie"]: print("Error: Invalid alignment_method in config file. 
" \ "Valid options are ['bwa mem', 'bowtie'].") sys.exit(radpipe.error_codes.INVALID_ARGUMENT) if alignment_method == "bwa mem": align_task_name = "bwa_mem" index_task_name = "bwa_index" else: align_task_name = "bowtie" index_task_name = "bowtie_index" # TODO: Refactor this # If 'alignment' is in target_tasks or forced_tasks, specify which # type of alignment job if "alignment" in state.options.target_tasks: index = state.options.target_tasks.index("alignment") state.options.target_tasks[index] = align_task_name if "alignment" in state.options.forced_tasks: index = state.options.forced_tasks.index("alignment") state.options.forced_tasks[index] = align_task_name # If 'build_index' is in target_tasks or forced_tasks, specify which # type of index job if "build_index" in state.options.target_tasks: index = state.options.target_tasks.index("build_index") state.options.target_tasks[index] = index_task_name if "build_index" in state.options.forced_tasks: index = state.options.forced_tasks.index("build_index") state.options.forced_tasks[index] = index_task_name state.logger.debug(state) # Whether to include filter_bam stage or not filter_bams = False try: samtools_view_options = state.config.get_options( "samtools_view_options") if samtools_view_options: filter_bams = True except: pass state.logger.info("Filter bams: {}".format(filter_bams)) # Population map filenames popmap_file = "{output_dir}/{name}_popmap.txt".format( output_dir=output_dir["populations"], name=state.config.get_options("analysis_id")) try: config_popmap_file = state.config.get_options("popmap_file") if config_popmap_file: state.logger.info( "Using popmap file: {}".format(config_popmap_file)) else: raise (Exception) except Exception: config_popmap_file = None state.logger.info("Creating new popmap file: {}".format(popmap_file)) # Population r values populations_r = state.config.get_options("populations_r") assert (isinstance(populations_r, list)) # Dummy stages. These do nothing except provide a node at the beginning # for the pipeline graph, giving the pipeline an obvious starting point. pipeline.originate(task_func=stages.do_nothing, name="original_fastqs", output=input_files) pipeline.originate(task_func=stages.do_nothing, name="reference_genome", output=state.config.get_options("reference_genome")) # Create a copy of the population map file needed for stacks, or create # one denovo using the sample list. pipeline.originate(task_func=stages.create_popmap_file, name="create_popmap_file", output=[popmap_file], extras=[config_popmap_file, sample_list]) # Create index for reference genome based on alignment method. 
if alignment_method == "bwa mem": pipeline.transform( task_func=stages.bwa_index, name="bwa_index", input=output_from("reference_genome"), filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"), output=path_list_join(output_dir["reference"], ["reference.fa.bwt", "reference.fa.sa"]), extras=[output_dir["reference"]]) if alignment_method == "bowtie": pipeline.transform(task_func=stages.bowtie_index, name="bowtie_index", input=output_from("reference_genome"), filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"), output=path_list_join( output_dir["reference"], ["reference.1.ebwt", "reference.rev.1.ebwt"]), extras=[output_dir["reference"]]) # FastQC pipeline.transform( task_func=stages.fastqc, name="fastqc", input=output_from("original_fastqs"), filter=formatter(".+/(?P<lib>[^/]+)/(?P<fn>[^/]+).(fastq|fq).gz"), output="%s/{lib[0]}/{fn[0]}_fastqc.zip" % output_dir["fastqc"], extras=[output_dir["fastqc"], "{lib[0]}"]) # MultiQC: FastQC pipeline.merge(task_func=stages.multiqc_fastqc, name="multiqc_fastqc", input=output_from("fastqc"), output="%s/multiqc_fastqc_report.html" % output_dir["qc"], extras=[output_dir["qc"], output_dir["fastqc"]]) # Stacks: Process RAD-Tags pipeline.transform(task_func=stages.process_radtags, name="process_radtags", input=output_from("original_fastqs"), filter=formatter(".+/(?P<lib>[^/]+)/[^/]+"), output="%s/{lib[0]}/{lib[0]}.success" % output_dir["process_radtags"], extras=[ output_dir["process_radtags"], "{lib[0]}", state.config.get_options("renz_1"), state.config.get_options("renz_2"), state.config.get_options("process_radtags_options") ]) # Create a list for alignment with the input fastq files from process_radtags process_radtags_outputs = [] for l in libraries: for s in l.samples: base = "{dir}/{lib}/{sample}".format( dir=output_dir["process_radtags"], lib=l.lib_id, sample=s) process_radtags_outputs.append( [base + ".1.fq.gz", base + ".2.fq.gz"]) # print(process_radtags_outputs) # Alignment if align_task_name == "bwa_mem": (pipeline.transform( task_func=stages.bwa_align, name=align_task_name, input=process_radtags_outputs, filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"), output="%s/{sm[0]}.bwa.bam" % output_dir["alignments"], extras=[ os.path.join(output_dir["reference"], "reference.fa"), "{path[0]}", output_dir["alignments"], "{sm[0]}", state.config.get_options("alignment_options") ])).follows("bwa_index").follows("process_radtags") if align_task_name == "bowtie": (pipeline.transform( task_func=stages.bowtie_align, name=align_task_name, input=process_radtags_outputs, filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"), output="%s/{sm[0]}.bowtie.bam" % output_dir["alignments"], extras=[ os.path.join(output_dir["reference"], "reference"), "{path[0]}", output_dir["alignments"], "{sm[0]}", state.config.get_options("alignment_options") ])).follows("bowtie_index").follows("process_radtags") # Sort BAM and index pipeline.transform(task_func=stages.sort_bam, name="sort_bam", input=output_from(align_task_name), filter=suffix(".bam"), output=".sorted.bam") if filter_bams: final_bam_task_name = "filter_bam" pipeline.transform( task_func=stages.filter_bam, name="filter_bam", input=output_from("sort_bam"), filter=suffix(".sorted.bam"), output=".sorted.filtered.bam", extras=[state.config.get_options("samtools_view_options")]) else: final_bam_task_name = "sort_bam" # Samtools flagstat pipeline.transform(task_func=stages.flagstat, name="flagstat", input=output_from(final_bam_task_name), filter=suffix(".bam"), output=".flagstat.txt", output_dir=output_dir["flagstat"]) # MultiQC: flagstat 
pipeline.merge(task_func=stages.multiqc_flagstat, name="multiqc_flagstat", input=output_from("flagstat"), output="%s/multiqc_flagstat_report.html" % output_dir["qc"], extras=[output_dir["qc"], output_dir["flagstat"]]) # Stacks: gstacks pipeline.merge(task_func=stages.gstacks, name="gstacks", input=output_from(final_bam_task_name), output="%s/catalog.fa.gz" % output_dir["gstacks"], extras=[ output_dir["alignments"], output_dir["gstacks"], align_task_name, final_bam_task_name, sample_list, state.config.get_options("gstacks_options") ]) # Define outputs from each run of populations populations_outputs = [] for r in populations_r: dir_name = "{pop_dir}/{analysis_name}_r{r}".format( pop_dir=output_dir["populations"], analysis_name=state.config.get_options("analysis_id"), r=r) populations_outputs.append( os.path.join(dir_name, "populations.snps.vcf")) # print(populations_outputs) # Stacks: populations pipeline.originate(task_func=stages.populations, name="populations", output=populations_outputs, extras=[ output_dir["gstacks"], output_dir["populations"], popmap_file, state.config.get_options("populations_options") ]).follows("gstacks").follows("create_popmap_file") return pipeline
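path_list_join appears in several of these builders (bwa_index and bowtie_index above, and the RNA-seq stages earlier) but is not defined in this excerpt. Assuming it simply joins each output file name onto the given directory, a minimal sketch is below. Note also that output strings such as "%s/{lib[0]}/{fn[0]}_fastqc.zip" % output_dir["fastqc"] mix two substitution passes: the "%s" is filled in by Python while the pipeline is being built, whereas the "{lib[0]}"-style placeholders are left intact for Ruffus's formatter to fill per input file at run time.

import os

def path_list_join(directory, filenames):
    # Assumed behaviour of the helper used in the output= arguments above:
    # join each output file name onto the stage's output directory.
    return [os.path.join(directory, name) for name in filenames]

# e.g. ['reference/reference.fa.bwt', 'reference/reference.fa.sa']
print(path_list_join("reference", ["reference.fa.bwt", "reference.fa.sa"]))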
#--------------------------------------------------------------- # first task @transform(create_initial_file_pairs, suffix(".start"), ".output.1") def first_task(input_files, output_file): with open(output_file, "w"): pass #--------------------------------------------------------------- # second task @transform(first_task, suffix(".output.1"), ".output.2") def second_task(input_files, output_file): with open(output_file, "w"): pass test_pipeline = Pipeline("test") test_pipeline.originate(output = [ [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start', tempdir + 'job1.b.start'], [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'], [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start'] ], task_func = create_initial_file_pairs) test_pipeline.transform(task_func = first_task, input = create_initial_file_pairs, filter = suffix(".start"), output = ".output.1") test_pipeline.transform(input = first_task, filter = suffix(".output.1"), output = ".output.2", task_func= second_task) decorator_syntax = 0 oop_syntax = 1 class Test_verbosity(unittest.TestCase): #___________________________________________________________________________ # # test_printout_abbreviated_path1 #___________________________________________________________________________ def test_printout_abbreviated_path1(self): """Input file exists, output doesn't exist"""
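A hedged sketch (not part of the original test body) of what this verbosity test appears to exercise: printing the "test" pipeline's status while truncating the deeply nested job1/job2/job3 paths created above. If I read the Ruffus API correctly, verbose_abbreviated_path limits how many trailing path components are shown:

import sys
from ruffus import pipeline_printout

pipeline_printout(sys.stdout, [second_task], verbose=4,
                  verbose_abbreviated_path=2, pipeline="test")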
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='vcf_annotation') # Get a list of paths to all the VCF files vcf_files = state.config.get_option('vcfs') # Stages are dependent on the state stages = Stages(state) # The original VCF files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate( task_func=stages.original_vcf, name='original_vcf', output=vcf_files) # Decompose VCF using Vt pipeline.transform( task_func=stages.decompose_vcf, name='decompose_vcf', input=output_from('original_vcf'), # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the VCF file name (e.g. study/family name). # This is needed within the stage for finding out sample specific # configuration options extras=['{sample[0]}'], # The output file name is the sample name with a .decompose.normalize.vcf extension. output='{path[0]}/{sample[0]}.decompose.normalize.vcf') # FILTER COMMON VARIANTS # ADD FILTER COMMON VARIANTS USING VEP # Annotate using VEP pipeline.transform( task_func=stages.annotate_vep, name='annotate_vep', input=output_from('decompose_vcf'), filter=suffix('.vcf'), output='.vep.vcf') # Annotate using SnpEff pipeline.transform( task_func=stages.annotate_snpeff, name='annotate_snpeff', input=output_from('annotate_vep'), filter=suffix('.vcf'), output='.snpeff.vcf') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), # XXX should make metricsup an extra output? 
output=['.sort.dedup.bam', '.metricsdup']) # Generate chromosome intervals using GATK pipeline.transform( task_func=stages.chrom_intervals_gatk, name='chrom_intervals_gatk', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.chr.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('chrom_intervals_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Call variants using GATK pipeline.transform( task_func=stages.call_variants_gatk, name='call_variants_gatk', input=output_from('print_reads_gatk'), filter=suffix('.sort.dedup.realn.recal.bam'), output='.raw.snps.indels.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_variants_gatk'), output='PCExomes.mergegvcf.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.mergegvcf.vcf'), output='.genotyped.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs( ['PCExomes.indel_recal', 'PCExomes.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']), output='.combined.vcf') .follows('apply_indel_recalibrate_gatk')) # Select variants using GATK pipeline.transform( 
task_func=stages.select_variants_gatk, name='select_variants_gatk', input=output_from('combine_variants_gatk'), filter=suffix('.combined.vcf'), output='.selected.vcf') return pipeline
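suffix() in these stages does a literal tail replacement on the upstream file name. A string-level illustration for the final step (Ruffus performs the equivalent matching internally), using the file name this pipeline would actually produce:

input_file = "PCExomes.combined.vcf"
output_file = input_file[:-len(".combined.vcf")] + ".selected.vcf"
print(output_file)   # PCExomes.selected.vcf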
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='complexo') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate( task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{sample[0]}'], # The output file name is the sample name with a .bam extension. output='{path[0]}/{sample[0]}.bam') # Sort the BAM file using Picard pipeline.transform( task_func=stages.sort_bam_picard, name='sort_bam_picard', input=output_from('align_bwa'), filter=suffix('.bam'), output='.sort.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), # XXX should make metricsup an extra output? 
output=['.sort.dedup.bam', '.metricsdup']) # Generate chromosome intervals using GATK pipeline.transform( task_func=stages.chrom_intervals_gatk, name='chrom_intervals_gatk', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.chr.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('chrom_intervals_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Call variants using GATK pipeline.transform( task_func=stages.call_variants_gatk, name='call_variants_gatk', input=output_from('print_reads_gatk'), filter=suffix('.sort.dedup.realn.recal.bam'), output='.raw.snps.indels.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_variants_gatk'), output='PCExomes.mergegvcf.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.mergegvcf.vcf'), output='.genotyped.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']), output='.combined.vcf') .follows('apply_indel_recalibrate_gatk')) # Select variants using GATK pipeline.transform( 
task_func=stages.select_variants_gatk, name='select_variants_gatk', input=output_from('combine_variants_gatk'), filter=suffix('.combined.vcf'), output='.selected.vcf') return pipeline
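combine_gvcf_gatk above uses merge(), which gathers every file produced by the upstream task into a single job and writes one output. A minimal self-contained sketch of that behaviour, with hypothetical task and file names:

from ruffus import Pipeline

def write_gvcf(output_file):
    open(output_file, "w").close()

def combine_gvcfs(input_files, output_file):
    # Receives all three upstream outputs in one call.
    with open(output_file, "w") as out:
        out.write("\n".join(sorted(input_files)))

demo = Pipeline("merge_demo")
demo.originate(task_func=write_gvcf, output=["s1.g.vcf", "s2.g.vcf", "s3.g.vcf"])
demo.merge(task_func=combine_gvcfs, input=write_gvcf, output="ALL.mergegvcf.vcf")
# demo.run(verbose=0)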
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)
    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)
    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')
    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #     task_func=stages.original_reference,
    #     name='original_reference',
    #     output=reference_file)
    # Run fastQC on the FASTQ files
    pipeline.transform(
        task_func=stages.fastqc,
        name='fastqc',
        input=output_from('original_fastqs'),
        filter=suffix('.fastq.gz'),
        output='_fastqc')
    # Index the reference using BWA
    # pipeline.transform(
    #     task_func=stages.index_reference_bwa,
    #     name='index_reference_bwa',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])
    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #     name='index_reference_samtools',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.fa.fai')
    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])
    # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')
    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #     1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options.
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
    # Sort alignment with sambamba
    pipeline.transform(
        task_func=stages.sort_bam_sambamba,
        name='sort_alignment',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.sorted.bam')
    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')
    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')
    # Index the MMR genes bam file with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_mmr_alignment',
        input=output_from('extract_genes_bedtools'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam.bai')
    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #     task_func=stages.alignment_coverage_gatk,
    #     name='alignment_coverage_gatk',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs([reference_file]),
    #     output='{path[0]}/{sample[0]}.coverage_summary',
    #     extras=['{path[0]}/{sample[0]}_coverage'])
    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')
    # Generate alignment stats with bamtools
    pipeline.transform(
        task_func=stages.bamtools_stats,
        name='bamtools_stats',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.stats.txt')
    # Extract the discordant paired-end alignments
    pipeline.transform(
        task_func=stages.extract_discordant_alignments,
        name='extract_discordant_alignments',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.discordants.unsorted.bam')
    # Extract split-read alignments
    pipeline.transform(
        task_func=stages.extract_split_read_alignments,
        name='extract_split_read_alignments',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.splitters.unsorted.bam')
    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full
    # name of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')
    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_discordants',
    #     input=output_from('sort_discordants'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #     output='{path[0]}/{sample[0]}.discordants.bam.bai')
    # Sort split-read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full
    # name of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')
    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_splitters',
    #     input=output_from('sort_splitters'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #     output='{path[0]}/{sample[0]}.splitters.bam.bai')
    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs([
            '{path[0]}/{sample[0]}.splitters.bam',
            '{path[0]}/{sample[0]}.discordants.bam']),
        output='{path[0]}/{sample[0]}.lumpy.vcf')
        .follows('index_alignment')
        .follows('sort_splitters')
        .follows('sort_discordants'))
    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #     task_func=stages.genotype_svtyper,
    #     name='genotype_svtyper',
    #     input=output_from('structural_variants_lumpy'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam',
    #                            '{path[0]}/{sample[0]}.splitters.bam']),
    #     output='{path[0]}/{sample[0]}.svtyper.vcf')
    #     .follows('align_bwa')
    #     .follows('sort_splitters')
    #     .follows('index_alignment')
    #     .follows('index_splitters')
    #     .follows('index_discordants'))
    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))
    # Call DELs with DELLY
    pipeline.merge(
        task_func=stages.deletions_delly,
        name='deletions_delly',
        input=output_from('sort_alignment'),
        output='delly.DEL.vcf')
    # Call DUPs with DELLY
    pipeline.merge(
        task_func=stages.duplications_delly,
        name='duplications_delly',
        input=output_from('sort_alignment'),
        output='delly.DUP.vcf')
    # Call INVs with DELLY
    pipeline.merge(
        task_func=stages.inversions_delly,
        name='inversions_delly',
        input=output_from('sort_alignment'),
        output='delly.INV.vcf')
    # Call TRAs with DELLY
    pipeline.merge(
        task_func=stages.translocations_delly,
        name='translocations_delly',
        input=output_from('sort_alignment'),
        output='delly.TRA.vcf')
    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #     task_func=stages.gustaf_mate_joining,
    #     name='gustaf_mate_joining',
    #     input=output_from('fastq_to_fasta'),
    #     # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #     # This will be the first input to the stage.
    #     # We assume the sample name may consist of only alphanumeric
    #     # characters.
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #     # Add one more input to the stage:
    #     #     1. The corresponding R2 FASTA file
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #     output='{path[0]}/{sample[0]}.joined_mates.fasta')
    # Call structural variants with pindel
    # (pipeline.transform(
    #     task_func=stages.structural_variants_pindel,
    #     name='structural_variants_pindel',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #     output='{path[0]}/{sample[0]}.pindel')
    #     .follows('index_reference_bwa')
    #     .follows('index_reference_samtools'))
    return pipeline
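# A short sketch (not part of the crpipe source above) of how the returned pipeline
# might be inspected and run. Pipeline.printout performs a dry run and, at
# verbose=3, reports each job and whether it is up to date; Pipeline.run then
# executes the out-of-date jobs. Building `state` (configuration, options, logging)
# is project specific and assumed to happen elsewhere, so the calls are left
# commented out here.
# import sys
# pipeline = make_pipeline(state)
# pipeline.printout(sys.stdout, verbose=3)   # dry run: list out-of-date jobs only
# pipeline.run(multiprocess=4, verbose=1)    # run up to 4 jobs in parallel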
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='cellfree_seq')
    # Stages are dependent on the state
    stages = Stages(state)
    safe_make_dir('alignments')
    # The original FASTQ files
    fastq_files = glob.glob('fastqs/*')
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)
    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #     1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options.
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}.sort.hq.bam')
    pipeline.transform(
        task_func=stages.run_connor,
        name='run_connor',
        input=output_from('align_bwa'),
        filter=suffix('.sort.hq.bam'),
        output='.sort.hq.connor.bam')
    safe_make_dir('metrics')
    safe_make_dir('metrics/summary')
    safe_make_dir('metrics/connor')
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.intersectbed.bam')
    pipeline.transform(
        task_func=stages.coverage_bed,
        name='coverage_bed_raw',
        input=output_from('intersect_bed_raw'),
        filter=suffix('.intersectbed.bam'),
        output='.bedtools_hist_all.txt')
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')
    pipeline.transform(
        task_func=stages.target_reads,
        name='target_reads_raw',
        input=output_from('intersect_bed_raw'),
        filter=suffix('.intersectbed.bam'),
        output='.mapped_to_target.txt')
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_raw',
        input=output_from('align_bwa'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_raw',
        input=output_from('coverage_bed_raw', 'genome_reads_raw',
                          'target_reads_raw', 'total_reads_raw'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/summary/all_sample.summary.\1.txt',
        extras=[r'\1', 'summary.txt'])
    pipeline.transform(
        task_func=stages.intersect_bed,
        name='intersect_bed_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/connor/{sample[0]}.intersectbed.bam')
    pipeline.transform(
        task_func=stages.coverage_bed,
        name='coverage_bed_connor',
        input=output_from('intersect_bed_connor'),
        filter=suffix('.intersectbed.bam'),
        output='.bedtools_hist_all.txt')
    pipeline.transform(
        task_func=stages.genome_reads,
        name='genome_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.mapped_to_genome.txt')
    pipeline.transform(
        task_func=stages.target_reads,
        name='target_reads_connor',
        input=output_from('intersect_bed_connor'),
        filter=suffix('.intersectbed.bam'),
        output='.mapped_to_target.txt')
    pipeline.transform(
        task_func=stages.total_reads,
        name='total_reads_connor',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='metrics/summary/{sample[0]}.total_raw_reads.txt')
    pipeline.collate(
        task_func=stages.generate_stats,
        name='generate_stats_connor',
        input=output_from('coverage_bed_connor', 'genome_reads_connor',
                          'target_reads_connor', 'total_reads_connor'),
        filter=regex(
            r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt'),
        output=r'metrics/connor/all_sample.summary.\1.txt',
        extras=[r'\1', 'connor.summary.txt'])
    safe_make_dir('variants')
    safe_make_dir('variants/vardict')
    pipeline.transform(
        task_func=stages.run_vardict,
        name='run_vardict',
        input=output_from('run_connor'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'),
        output='variants/vardict/{sample[0]}.vcf',
        extras=['{sample[0]}'])
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('run_vardict'),
        filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/vardict/{sample[0]}.sorted.vcf.gz')
    pipeline.transform(
        task_func=stages.index_vcfs,
        name='index_vcfs',
        input=output_from('sort_vcfs'),
        filter=suffix('.sorted.vcf.gz'),
        output='.sorted.vcf.gz.tbi')
    (pipeline.merge(
        task_func=stages.concatenate_vcfs,
        name='concatenate_vcfs',
        input=output_from('sort_vcfs'),
        output='variants/vardict/combined.vcf.gz')
        .follows('index_vcfs'))
    pipeline.transform(
        task_func=stages.vt_decompose_normalise,
        name='vt_decompose_normalise',
        input=output_from('concatenate_vcfs'),
        filter=suffix('.vcf.gz'),
        output='.decomp.norm.vcf.gz')
    pipeline.transform(
        task_func=stages.index_vcfs,
        name='index_final_vcf',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vcf.gz.tbi')
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('vt_decompose_normalise'),
        filter=suffix('.decomp.norm.vcf.gz'),
        output='.decomp.norm.vep.vcf')
        .follows('index_final_vcf'))
    return pipeline
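# A self-contained sketch (hypothetical file names, not part of the cell-free
# pipeline above) of how pipeline.collate groups inputs: every input whose regex
# substitution yields the same output path lands in one job, which is how the four
# per-sample metric files above collapse into a single per-sample summary.
from ruffus import Pipeline, regex, output_from

def make_metric(outfile):
    # originate: create placeholder per-sample metric files
    open(outfile, 'w').close()

def summarise(inputs, outfile, sample):
    # inputs holds every metric file that shares the same \1 capture (the sample)
    with open(outfile, 'w') as out:
        out.write('%s: %d metric files\n' % (sample, len(inputs)))

demo = Pipeline('collate_demo')
demo.originate(task_func=make_metric, name='make_metric',
               output=['sampleA.depth.txt', 'sampleA.reads.txt',
                       'sampleB.depth.txt', 'sampleB.reads.txt'])
demo.collate(task_func=summarise, name='summarise',
             input=output_from('make_metric'),
             # \1 (the sample name) decides the grouping; \2 is the metric type
             filter=regex(r'(.+)\.(depth|reads)\.txt'),
             output=r'\1.summary.txt',
             extras=[r'\1'])

if __name__ == '__main__':
    demo.run(verbose=0)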
"test_active_if/b.3" -> "test_active_if/b.4" "test_active_if/b.4" -> "test_active_if/summary.5" """ expected_inactive_text = """null -> "test_active_if/a.1" "test_active_if/a.1" -> "test_active_if/a.2" "test_active_if/a.2" -> "test_active_if/a.4" null -> "test_active_if/b.1" "test_active_if/b.1" -> "test_active_if/b.2" "test_active_if/b.2" -> "test_active_if/b.4" "test_active_if/b.4" -> "test_active_if/summary.5" """ # alternative syntax test_pipeline = Pipeline("test") test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\ .follows(mkdir("test_active_if")) test_pipeline.transform(task2, task1, suffix(".1"), ".2") test_pipeline.transform(task3, task1, suffix(".1"), ".3").active_if(lambda: pipeline_active_if) test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4") test_pipeline.merge(task5, task4, "test_active_if/summary.5") class Test_ruffus(unittest.TestCase): def setUp(self): try: shutil.rmtree(tempdir) except: pass os.makedirs(tempdir)
"test_active_if/b.4" -> "test_active_if/summary.5" """ expected_inactive_text = """null -> "test_active_if/a.1" "test_active_if/a.1" -> "test_active_if/a.2" "test_active_if/a.2" -> "test_active_if/a.4" null -> "test_active_if/b.1" "test_active_if/b.1" -> "test_active_if/b.2" "test_active_if/b.2" -> "test_active_if/b.4" "test_active_if/b.4" -> "test_active_if/summary.5" """ # alternative syntax test_pipeline = Pipeline("test") test_pipeline.originate(task1, ['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter")\ .follows(mkdir("test_active_if")) test_pipeline.transform(task2, task1, suffix(".1"), ".2") test_pipeline.transform(task3, task1, suffix( ".1"), ".3").active_if(lambda: pipeline_active_if) test_pipeline.collate(task4, [task2, task3], regex(r"(.+)\.[23]"), r"\1.4") test_pipeline.merge(task5, task4, "test_active_if/summary.5") class Test_ruffus(unittest.TestCase): def setUp(self): try: shutil.rmtree(tempdir) except: pass os.makedirs(tempdir)