def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(start_task, ["a.1", "b.1"])
    test_pipeline.transform(same_file_name_task, start_task, suffix(".1"), ".1")
    test_pipeline.transform(linked_file_name_task, start_task, suffix(".1"), ".linked.1")
    test_pipeline.transform(final_task, [linked_file_name_task, same_file_name_task],
                            suffix(".1"), ".3")
    test_pipeline.run(log_exceptions=True, verbose=0)
def build_pipeline():
    pipe = Pipeline("my_pipeline")
    pipe.originate(
        name="create_three_new_files",
        task_func=create_new_file,
        output=[os.path.join(WORK_DIR, f"file{i}.csv") for i in range(1, 4)],
    )
    pipe.transform(
        name="convert_csv_files_to_tsv",
        task_func=csv_to_tsv,
        input=output_from("create_three_new_files"),
        filter=suffix(".csv"),
        output=".tsv",
    )
    pipe.transform(
        name="calculate_md5",
        task_func=md5,
        input=output_from("convert_csv_files_to_tsv"),
        filter=suffix(".tsv"),
        output=".md5sum",
    )
    return pipe
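# A hedged usage sketch for the factory above: WORK_DIR, create_new_file,
# csv_to_tsv and md5 are assumed to be defined elsewhere in this module.
if __name__ == "__main__":
    build_pipeline().run(multiprocess=2, verbose=1)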
def create_pipeline(self):
    """
    Create new pipeline on the fly without using decorators
    """
    global count_pipelines
    count_pipelines = count_pipelines + 1
    test_pipeline = Pipeline("test %d" % count_pipelines)

    test_pipeline.transform(task_func=transform1,
                            input=input_file,
                            filter=suffix('.txt'),
                            output='.output',
                            extras=[runtime_data])

    test_pipeline.transform(task_func=transform_raise_error,
                            input=input_file,
                            filter=suffix('.txt'),
                            output='.output',
                            extras=[runtime_data])

    test_pipeline.split(task_func=split1,
                        input=input_file,
                        output=split1_outputs)

    test_pipeline.merge(task_func=merge2,
                        input=split1,
                        output=merge2_output)
    return test_pipeline
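# A hedged sketch of how a test might drive create_pipeline() above. This is a
# hypothetical method, not part of the original suite: transform_raise_error
# (defined elsewhere in this file) raises unless the shared runtime_data list
# contains 'okay', so the flag is appended before running.
def test_runtime_data_flag(self):
    test_pipeline = self.create_pipeline()
    runtime_data.append('okay')
    test_pipeline.run(multiprocess=1, verbose=0)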
def setUp(self):
    # list of executed tasks
    manager = multiprocessing.managers.SyncManager()
    manager.start()
    global mutex_proxy
    global executed_tasks_proxy
    mutex_proxy = manager.Lock()
    executed_tasks_proxy = manager.dict()

    pipeline = Pipeline.pipelines["main"]
    pipeline.originate(task_func=start_task,
                       output=[tempdir + "a.1", tempdir + "b.1"],
                       extras=[executed_tasks_proxy, mutex_proxy])\
        .mkdir(tempdir)
    pipeline.transform(task_func=same_file_name_task,
                       input=start_task,
                       filter=suffix(".1"),
                       output=".1",
                       extras=[executed_tasks_proxy, mutex_proxy])
    pipeline.transform(task_func=linked_file_name_task,
                       input=start_task,
                       filter=suffix(".1"),
                       output=".linked.1",
                       extras=[executed_tasks_proxy, mutex_proxy])
    pipeline.transform(task_func=final_task,
                       input=[linked_file_name_task, same_file_name_task],
                       filter=suffix(".1"),
                       output=".3",
                       extras=[executed_tasks_proxy, mutex_proxy])
    self.cleanUp()
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")

    test_pipeline.split(task_func=split_fasta_file,
                        input=tempdir + "original.fa",
                        output=[tempdir + "files.split.success",
                                tempdir + "files.split.*.fa"])\
        .posttask(lambda: verbose_output.write(" Split into %d files\n" % 10))

    test_pipeline.transform(task_func=align_sequences,
                            input=split_fasta_file,
                            filter=suffix(".fa"),
                            output=".aln"  # fa -> aln
                            )\
        .posttask(lambda: verbose_output.write(" Sequences aligned\n"))

    test_pipeline.transform(task_func=percentage_identity,
                            input=align_sequences,     # find all results from align_sequences
                            # replace suffix with:
                            filter=suffix(".aln"),
                            output=[r".pcid",          # .pcid suffix for the result
                                    r".pcid_success"]  # .pcid_success to indicate job completed
                            )\
        .posttask(lambda: verbose_output.write(" %Identity calculated\n"))

    test_pipeline.merge(task_func=combine_results,
                        input=percentage_identity,
                        output=[tempdir + "all.combine_results",
                                tempdir + "all.combine_results_success"])\
        .posttask(lambda: verbose_output.write(" Results recombined\n"))

    test_pipeline.run(multiprocess=50, verbose=0)

    if not os.path.exists(tempdir + "all.combine_results"):
        raise Exception("Missing %s" % (tempdir + "all.combine_results"))
def test_newstyle_collate(self):
    """
    As above but create the pipeline on the fly using object-oriented syntax rather than decorators
    """

    #
    # Create pipeline on the fly, joining up tasks
    #
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=original_files)\
        .mkdir(tempdir, tempdir + "/test")

    test_pipeline.subdivide(task_func=split_fasta_file,
                            input=generate_initial_files,
                            # match original files
                            filter=regex(r".*\/original_(\d+).fa"),
                            output=[tempdir + r"/files.split.\1.success",  # flag file for each original file
                                    tempdir + r"/files.split.\1.*.fa"],    # glob pattern
                            extras=[r"\1"])\
        .posttask(lambda: sys.stderr.write("\tSplit into %d files each\n" % JOBS_PER_TASK))

    test_pipeline.transform(task_func=align_sequences,
                            input=split_fasta_file,
                            filter=suffix(".fa"),
                            output=".aln") \
        .posttask(lambda: sys.stderr.write("\tSequences aligned\n"))

    test_pipeline.transform(task_func=percentage_identity,
                            input=align_sequences,     # find all results from align_sequences
                            # replace suffix with:
                            filter=suffix(".aln"),
                            output=[r".pcid",          # .pcid suffix for the result
                                    r".pcid_success"]  # .pcid_success to indicate job completed
                            )\
        .posttask(lambda: sys.stderr.write("\t%Identity calculated\n"))

    test_pipeline.collate(task_func=combine_results,
                          input=percentage_identity,
                          filter=regex(r".*files.split\.(\d+)\.\d+.pcid"),
                          output=[tempdir + r"/\1.all.combine_results",
                                  tempdir + r"/\1.all.combine_results_success"])\
        .posttask(lambda: sys.stderr.write("\tResults recombined\n"))

    #
    # Cleanup, printout and run
    #
    self.cleanup_tmpdir()
    s = StringIO()
    test_pipeline.printout(s, [combine_results], verbose=5, wrap_width=10000)
    self.assertTrue(re.search(
        'Job needs update:.*Missing files.*', s.getvalue(), re.DOTALL) is not None)
    test_pipeline.run(verbose=0)
def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    # We can change the starting files later using
    #   set_input() for transform etc.
    #   or set_output() for originate
    # But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))

    test_pipeline.transform(task_func=task_m_to_1,
                            name="add_input",
                            # Lookup Task from function name task_originate()
                            #   So long as this is unique in the pipeline
                            input=task_originate,
                            # requires an anchor from 3.7 onwards, see
                            # https://bugs.python.org/issue34982
                            filter=regex(r"^(.*)"),
                            add_inputs=add_inputs(
                                tempdir + "/testdir/whatever.txt"),
                            output=r"\1.22")

    test_pipeline.transform(task_func=task_1_to_1,
                            name="22_to_33",
                            # Lookup Task from Task name
                            #   Function name is not unique in the pipeline
                            input=output_from("add_input"),
                            filter=suffix(".22"),
                            output=".33")

    tail_task = test_pipeline.transform(task_func=task_1_to_1,
                                        name="33_to_44",
                                        # Ask Pipeline to lookup Task from Task name
                                        input=test_pipeline["22_to_33"],
                                        filter=suffix(".33"),
                                        output=".44")

    # Set the tail task so that users of my sub pipeline can use it as a dependency
    #   without knowing the details of task names
    #
    # Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    # If we try to connect a Pipeline without tail tasks defined, we have to
    #   specify the exact task within the Pipeline.
    # Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    #   without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='test_pipeline')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    pipeline.transform(
        task_func=stages.stage1,
        name='stage1',
        input=output_from('original_files'),
        filter=suffix('.0'),
        output='.1')

    pipeline.transform(
        task_func=stages.stage2,
        name='stage2',
        input=output_from('stage1'),
        filter=suffix('.1'),
        output='.2')

    pipeline.transform(
        task_func=stages.stage3,
        name='stage3',
        input=output_from('stage2'),
        filter=suffix('.2'),
        output='.3')

    pipeline.transform(
        task_func=stages.stage4,
        name='stage4',
        input=output_from('stage3'),
        filter=suffix('.3'),
        output='.4')

    pipeline.transform(
        task_func=stages.stage5,
        name='stage5',
        input=output_from('stage4'),
        filter=suffix('.4'),
        output='.5')

    return pipeline
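# A hedged usage sketch for the factory above; the `state` object (config and
# logging container) is assumed to be built by the surrounding driver script.
def run_pipeline(state):
    make_pipeline(state).run(multiprocess=4, verbose=1)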
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='twin ion')
    # Get a list of paths to all the MZML files
    mzml_files = state.config.get_option('mzml')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original MZML files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_mzml,
        name='original_mzml',
        output=mzml_files)

    pipeline.transform(
        task_func=stages.resample,
        name='resample',
        input=output_from('original_mzml'),
        filter=suffix('.mzML'),
        output='.resample.mzML')

    pipeline.transform(
        task_func=stages.noise_filter_sgolay,
        name='noise_filter_sgolay',
        input=output_from('resample'),
        filter=suffix('.resample.mzML'),
        output='.denoise.mzML')

    pipeline.transform(
        task_func=stages.baseline_filter,
        name='baseline_filter',
        input=output_from('noise_filter_sgolay'),
        filter=suffix('.denoise.mzML'),
        output='.baseline.mzML')

    pipeline.transform(
        task_func=stages.peak_picker_hires,
        name='peak_picker_hires',
        input=output_from('baseline_filter'),
        filter=suffix('.baseline.mzML'),
        output='.peaks.mzML')

    pipeline.transform(
        task_func=stages.feature_finder_centroid,
        name='feature_finder_centroid',
        input=output_from('peak_picker_hires'),
        filter=suffix('.peaks.mzML'),
        output='.featureXML')

    return pipeline
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=task1,
                            output=[tempdir + 'a.1'] + runtime_files)
    test_pipeline.transform(task2, task1, suffix(".1"), ".2")
    test_pipeline.transform(task_func=task3,
                            input=task2,
                            filter=suffix(".2"),
                            output=".3")
    test_pipeline.transform(task_func=task4,
                            input=runtime_parameter("a"),
                            filter=suffix(".3"),
                            output=".4").follows(task3)
    test_pipeline.run(verbose=0, runtime_data={"a": runtime_files})
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='md5')
    # Get a list of paths to all the input files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_files,
        name='original_files',
        output=input_files)

    # Compute an MD5 checksum for each input file
    pipeline.transform(
        task_func=stages.md5_checksum,
        name='md5_checksum',
        input=output_from('original_files'),
        filter=suffix(''),
        output='.md5')

    return pipeline
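# Hedged illustration only: a standalone sketch of what a stage like
# md5_checksum above might boil down to, using the md5sum CLI via subprocess.
# It is not the project's actual Stages implementation.
import subprocess

def md5_checksum(input_file, output_file):
    # write "<digest>  <filename>" in the usual md5sum output format
    with open(output_file, "w") as out:
        subprocess.run(["md5sum", input_file], stdout=out, check=True)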
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.files(create_random_numbers, None, tempdir + "random_numbers.list")\
        .follows(mkdir(tempdir))

    test_pipeline.split(task_func=step_4_split_numbers_into_chunks,
                        input=tempdir + "random_numbers.list",
                        output=tempdir + "*.chunks")\
        .follows(create_random_numbers)

    test_pipeline.transform(task_func=step_5_calculate_sum_of_squares,
                            input=step_4_split_numbers_into_chunks,
                            filter=suffix(".chunks"),
                            output=".sums")

    test_pipeline.merge(task_func=step_6_calculate_variance,
                        input=step_5_calculate_sum_of_squares,
                        output=os.path.join(tempdir, "variance.result"))\
        .posttask(lambda: sys.stdout.write(" hooray\n"))\
        .posttask(print_hooray_again,
                  print_whoppee_again,
                  touch_file(os.path.join(tempdir, "done")))

    test_pipeline.run(multiprocess=50, verbose=0)

    output_file = os.path.join(tempdir, "variance.result")
    if not os.path.exists(output_file):
        raise Exception("Missing %s" % output_file)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options, args = P.initialize(argv, config_file="template.yml",
                                 defaults={
                                     "min_value": 0.0,
                                     "num_samples": 1000,
                                     "mu": 0.0,
                                     "sigma": 1.0})

    pipeline = ruffus.Pipeline("template_pipeline")

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(
        task_func=compute_mean,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".mean")

    task_combine_means = pipeline.merge(
        task_func=combine_means,
        input=task_compute_mean,
        output="means.txt")

    # primary targets
    pipeline.merge(
        task_func=P.EmptyRunner("all"),
        input=task_combine_means,
        output="all")

    E.debug("starting workflow")
    return P.run_workflow(options, args)
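# Conventional entry point for the main() above (a hedged sketch, assuming
# this module is run as a script).
if __name__ == "__main__":
    sys.exit(main(sys.argv))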
def build_pipeline(self, pipeline_name, **kwargs):
    # fudge: clear all previous pipelines
    ruffus.Pipeline.clear_all()
    pipeline = ruffus.Pipeline(pipeline_name)

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(
        task_func=compute_mean,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".mean")

    task_combine_means = pipeline.merge(
        task_func=combine_means,
        input=task_compute_mean,
        output="means.txt")

    task_run_local_job1 = pipeline.transform(
        task_func=run_local_job1,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".local1")

    # test jobs_limit with local running
    task_run_local_job2 = pipeline.transform(
        task_func=run_local_job2,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".local2").jobs_limit(NUM_CORES // 2)

    # multiprocessing and DRMAA do not work at the moment; the likely
    # cause is the shared session object.
    if not HAVE_DRMAA or (kwargs.get("multiprocess", 1) > 1):
        return

    task_run_remote_job1 = pipeline.transform(
        task_func=run_remote_job1,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".remote1")

    # test jobs_limit with remote running
    task_run_remote_job2 = pipeline.transform(
        task_func=run_remote_job2,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".remote2").jobs_limit(NUM_CORES // 2)
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #   1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))

    return pipeline
def create_pipeline(self):
    # each pipeline has a different name
    global cnt_pipelines
    cnt_pipelines = cnt_pipelines + 1
    test_pipeline = Pipeline("test %d" % cnt_pipelines)

    test_pipeline.originate(task_func=generate_initial_files1,
                            output=[tempdir + prefix + "_name.tmp1" for prefix in "abcd"])

    test_pipeline.originate(task_func=generate_initial_files2,
                            output=[tempdir + "e_name.tmp1", tempdir + "f_name.tmp1"])

    test_pipeline.originate(task_func=generate_initial_files3,
                            output=[tempdir + "g_name.tmp1", tempdir + "h_name.tmp1"])

    test_pipeline.originate(task_func=generate_initial_files4,
                            output=tempdir + "i_name.tmp1")

    test_pipeline.collate(task_func=test_task2,
                          input=[generate_initial_files1,
                                 generate_initial_files2,
                                 generate_initial_files3,
                                 generate_initial_files4],
                          filter=formatter(),
                          output="{path[0]}/all.tmp2")

    test_pipeline.transform(task_func=test_task3,
                            input=test_task2,
                            filter=suffix(".tmp2"),
                            output=".tmp3")

    test_pipeline.transform(task_func=test_task4,
                            input=test_task3,
                            filter=suffix(".tmp3"),
                            output=".tmp4")
    return test_pipeline
def make_pipeline_call(state):
    # this part of the pipeline will take the summary results of "map"
    # and turn them into gatk and undr_rover vcfs
    pipeline = Pipeline(name='hiplexpipe')

    with open("all_sample.passed.summary.txt", 'r') as inputf:
        passed_files = inputf.read().split('\n')

    stages = Stages(state)

    safe_make_dir('variants')
    safe_make_dir('variants/gatk')
    safe_make_dir('variants/undr_rover')
    safe_make_dir('variants/undr_rover/coverdir')

    pipeline.originate(task_func=stages.passed_filter_files,
                       name='passed_filter_files',
                       output=passed_files)

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('passed_filter_files'),
        # Match the filtered BAM file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'),
        output='variants/undr_rover/{sample[0]}.vcf',
        extras=['{sample[0]}'])

    #### concatenate undr_rover vcfs ####
    pipeline.transform(
        task_func=stages.sort_vcfs,
        name='sort_vcfs',
        input=output_from('apply_undr_rover'),
        filter=formatter('variants/undr_rover/(?P<sample>[a-zA-Z0-9_-]+).vcf'),
        output='variants/undr_rover/{sample[0]}.sorted.vcf.gz')

    pipeline.transform(task_func=stages.index_vcfs,
                       name='index_vcfs',
                       input=output_from('sort_vcfs'),
                       filter=suffix('.sorted.vcf.gz'),
                       output='.sorted.vcf.gz.tbi')

    ###### GATK VARIANT CALLING ######
    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('passed_filter_files'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).clipped.sort.hq.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')

    return pipeline
def test_newstyle_simpler(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task1, input_file_names,
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.transform(task2, task1, suffix(".1"), ".2",
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.transform(task3, task2, suffix(".2"), ".3",
                            extras=[logger_proxy, logging_mutex])
    test_pipeline.merge(task4, task3, final_file_name,
                        extras=[logger_proxy, logging_mutex])
    # test_pipeline.merge(task4, task3, final_file_name,
    #                     extras={"logger_proxy": logger_proxy, "logging_mutex": logging_mutex})
    test_pipeline.run(multiprocess=500, verbose=0)
def test_newstyle_ruffus(self):
    # alternative syntax
    test_pipeline = Pipeline("test")
    test_pipeline.mkdir(data_dir, work_dir)
    test_pipeline.originate(
        task_func=task1,
        output=[os.path.join(data_dir, "%s.1" % aa) for aa in "abcd"])
    test_pipeline.mkdir(filter=suffix(".1"),
                        output=".dir",
                        output_dir=work_dir)
    test_pipeline.transform(task_func=task2,
                            input=task1,
                            filter=suffix(".1"),
                            output=[".1", ".bak"],
                            extras=["extra.tst", 4, r"orig_dir=\1"],
                            output_dir=work_dir)
    test_pipeline.subdivide(task3, task2, suffix(".1"),
                            r"\1.*.2", [r"\1.a.2", r"\1.b.2"],
                            output_dir=data_dir)
    test_pipeline.transform(task4, task3, suffix(".2"), ".3",
                            output_dir=work_dir)
    test_pipeline.merge(task5, task4, os.path.join(data_dir, "summary.5"))
    test_pipeline.run(multiprocess=50, verbose=0)

    with open(os.path.join(data_dir, "summary.5")) as ii:
        active_text = ii.read()
    if active_text != expected_active_text:
        raise Exception("Error:\n\tExpected\n%s\nInstead\n%s\n" %
                        (expected_active_text, active_text))
def build_pipeline(self, pipeline_name):
    # fudge: clear all previous pipelines
    ruffus.Pipeline.clear_all()
    pipeline = ruffus.Pipeline(pipeline_name)

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(
        task_func=compute_mean,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".mean")

    task_combine_means = pipeline.merge(
        task_func=combine_means,
        input=task_compute_mean,
        output="means.txt")
def make_pipeline2(pipeline_name="pipeline2"):
    test_pipeline2 = Pipeline(pipeline_name)
    test_pipeline2.transform(task_func=task_1_to_1,
                             # task name
                             name="44_to_55",
                             # placeholder: will be replaced later with set_input()
                             input=None,
                             filter=suffix(".44"),
                             output=".55")
    test_pipeline2.merge(task_func=task_m_to_1,
                         input=test_pipeline2["44_to_55"],
                         output=tempdir + "/final.output",)

    # Set head and tail
    test_pipeline2.set_tail_tasks([test_pipeline2[task_m_to_1]])
    if not DEBUG_do_not_define_head_task:
        test_pipeline2.set_head_tasks([test_pipeline2["44_to_55"]])

    return test_pipeline2
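# A hedged sketch of how these two factories are typically joined (following
# Ruffus' sub-pipeline pattern: the tail tasks of the upstream pipelines feed
# the head task of the downstream one). File names under tempdir are assumptions.
def run_joined_pipelines():
    pipeline1a = make_pipeline1(pipeline_name="pipeline1a",
                                starting_file_names=[tempdir + "/" + ss for ss in ("a.1", "b.1")])
    pipeline1b = make_pipeline1(pipeline_name="pipeline1b",
                                starting_file_names=[tempdir + "/" + ss for ss in ("c.1", "d.1")])
    pipeline2 = make_pipeline2()
    # Connect the sub-pipelines and run everything via the downstream pipeline
    pipeline2.set_input(input=[pipeline1a, pipeline1b])
    pipeline2.run(multiprocess=10, verbose=0)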
    with iotools.open_file(fn, "w") as outf:
        df.to_csv(outf, sep="\t", index=True)


@merge(runFastQC, "fastqc_status_summary.tsv.gz")
def buildFastQCSummaryStatus(infiles, outfile):
    '''load FastQC status summaries into a single table.'''
    readqc.buildFastQCSummaryStatus(
        infiles,
        outfile,
        "fastqc.dir")


@jobs_limit(P.get_params().get("jobs_limit_db", 1), "db")
@transform((summarizeFastQC, buildFastQCSummaryStatus),
           suffix(".tsv.gz"), ".load")
def loadFastQC(infile, outfile):
    '''load FASTQC stats into database.'''

    # a check to make sure the file isn't empty
    n = 0
    with iotools.open_file(infile) as f:
        for i, line in enumerate(f):
            n += 1

    if n > 0:
        P.load(infile, outfile, options="--add-index=track")
    else:
        table_name = infile.replace(".tsv.gz", "")
        database_sql = P.get_params()["database"]["url"]
        database_name = os.path.basename(database_sql)
        statement = """sqlite3 %(database_name)s
">;)", ] ) UNHAPPY_SMILIES = list(set(SMILIES) - set(HAPPY_SMILIES)) def detect_language(text): # details is 3x (langName, langCode, percent, score) lang_is_reliable, _, lang_details = cld2.detect(text) lang_details = lang_details[0] # take only the first lang detected lang_name, lang_code, lang_percent, lang_score = lang_details return lang_name, lang_code, lang_score, lang_is_reliable @ruffus.transform(os.path.join(tweets_dir, "tweets_100k.json.gz"), ruffus.suffix(".json.gz"), ".english.json.gz") def extract_english_tweets(input_file, output_file): tokenizer = WordPunctTokenizer() n_happy = 0 n_sad = 0 labelled_tweets = [] with gzip.open(input_file) as input: for line in input: tweet_info = json.loads(line) if "limit" in tweet_info: continue # TODO: care about unicode
        m = PipelineMapping.FastQc(nogroup=PARAMS["readqc_no_group"],
                                   outdir=PARAMS["exportdir"] + "/fastqc",
                                   contaminants=PARAMS['contaminants'])
    else:
        m = PipelineMapping.FastQc(nogroup=PARAMS["readqc_no_group"],
                                   outdir=PARAMS["exportdir"] + "/fastqc")

    if PARAMS["general_reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles,), outfile)
    P.run()


@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
@transform(runFastqc, suffix(".fastqc"), "_fastqc.load")
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''

    track = P.snip(infile, ".fastqc")
    filename = os.path.join(
        PARAMS["exportdir"], "fastqc", track + "*_fastqc", "fastqc_data.txt")

    PipelineReadqc.loadFastqc(filename,
                              backend=PARAMS["database_backend"],
                              database=PARAMS["database_name"],
                              host=PARAMS["database_host"],
                              username=PARAMS["database_username"],
                              password=PARAMS["database_password"],
                              port=PARAMS["database_port"])
    P.touch(outfile)
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="radpipe")

    # Stages are dependent on the state
    stages = PipelineStages(state)

    # Get a list of library objects.
    libraries = parse_libraries(
        libraries=state.config.get_options("libraries"))

    # Get a list of input files
    input_files = [l.files for l in libraries]
    # input_files = [item for sublist in input_files for item in sublist]
    state.logger.info("Input files: " + str(input_files))

    # Get a list of all samples for each library
    samples_dict = OrderedDict()
    for l in libraries:
        samples_dict[l.name] = l.samples
    state.logger.debug("Samples: " + str(samples_dict))

    # Make sure that there are no duplicate samples
    sample_list = [item for sublist in samples_dict.values() for item in sublist]
    sample_counts = Counter(sample_list)
    for sample in sample_counts:
        if sample_counts[sample] > 1:
            print("Sample {} appears {} times in the barcodes files. "
                  "Sample names must be unique".format(sample, sample_counts[sample]))
            sys.exit(radpipe.error_codes.INVALID_INPUT_FILE)

    # Define output directories
    output_dir = get_output_paths(state)
    state.logger.debug(output_dir)

    # Allow multiple comma-separated tasks
    if len(state.options.target_tasks) == 1:
        state.options.target_tasks = state.options.target_tasks[0].split(",")
    if len(state.options.forced_tasks) == 1:
        state.options.forced_tasks = state.options.forced_tasks[0].split(",")
    state.logger.debug("Target tasks: " + str(state.options.target_tasks))
    state.logger.debug("Forced tasks: " + str(state.options.forced_tasks))

    # Check if alignment_method is valid
    alignment_method = state.config.get_options("alignment_method").strip().lower()
    if alignment_method not in ["bwa mem", "bowtie"]:
        print("Error: Invalid alignment_method in config file. "
              "Valid options are ['bwa mem', 'bowtie'].")
        sys.exit(radpipe.error_codes.INVALID_ARGUMENT)
    if alignment_method == "bwa mem":
        align_task_name = "bwa_mem"
        index_task_name = "bwa_index"
    else:
        align_task_name = "bowtie"
        index_task_name = "bowtie_index"

    # TODO: Refactor this
    # If 'alignment' is in target_tasks or forced_tasks, specify which
    # type of alignment job
    if "alignment" in state.options.target_tasks:
        index = state.options.target_tasks.index("alignment")
        state.options.target_tasks[index] = align_task_name
    if "alignment" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("alignment")
        state.options.forced_tasks[index] = align_task_name

    # If 'build_index' is in target_tasks or forced_tasks, specify which
    # type of index job
    if "build_index" in state.options.target_tasks:
        index = state.options.target_tasks.index("build_index")
        state.options.target_tasks[index] = index_task_name
    if "build_index" in state.options.forced_tasks:
        index = state.options.forced_tasks.index("build_index")
        state.options.forced_tasks[index] = index_task_name
    state.logger.debug(state)

    # Whether to include filter_bam stage or not
    filter_bams = False
    try:
        samtools_view_options = state.config.get_options("samtools_view_options")
        if samtools_view_options:
            filter_bams = True
    except:
        pass
    state.logger.info("Filter bams: {}".format(filter_bams))

    # Population map filenames
    popmap_file = "{output_dir}/{name}_popmap.txt".format(
        output_dir=output_dir["populations"],
        name=state.config.get_options("analysis_id"))
    try:
        config_popmap_file = state.config.get_options("popmap_file")
        if config_popmap_file:
            state.logger.info("Using popmap file: {}".format(config_popmap_file))
        else:
            raise Exception
    except Exception:
        config_popmap_file = None
        state.logger.info("Creating new popmap file: {}".format(popmap_file))

    # Population r values
    populations_r = state.config.get_options("populations_r")
    assert isinstance(populations_r, list)

    # Dummy stages. These do nothing except provide a node at the beginning
    # for the pipeline graph, giving the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=input_files)

    pipeline.originate(task_func=stages.do_nothing,
                       name="reference_genome",
                       output=state.config.get_options("reference_genome"))

    # Create a copy of the population map file needed for stacks, or create
    # one denovo using the sample list.
    pipeline.originate(task_func=stages.create_popmap_file,
                       name="create_popmap_file",
                       output=[popmap_file],
                       extras=[config_popmap_file, sample_list])

    # Create index for reference genome based on alignment method.
    if alignment_method == "bwa mem":
        pipeline.transform(
            task_func=stages.bwa_index,
            name="bwa_index",
            input=output_from("reference_genome"),
            filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
            output=path_list_join(output_dir["reference"],
                                  ["reference.fa.bwt", "reference.fa.sa"]),
            extras=[output_dir["reference"]])

    if alignment_method == "bowtie":
        pipeline.transform(
            task_func=stages.bowtie_index,
            name="bowtie_index",
            input=output_from("reference_genome"),
            filter=formatter(".+/(?P<ref>[^/]+).(fa|fasta)"),
            output=path_list_join(output_dir["reference"],
                                  ["reference.1.ebwt", "reference.rev.1.ebwt"]),
            extras=[output_dir["reference"]])

    # FastQC
    pipeline.transform(
        task_func=stages.fastqc,
        name="fastqc",
        input=output_from("original_fastqs"),
        filter=formatter(".+/(?P<lib>[^/]+)/(?P<fn>[^/]+).(fastq|fq).gz"),
        output="%s/{lib[0]}/{fn[0]}_fastqc.zip" % output_dir["fastqc"],
        extras=[output_dir["fastqc"], "{lib[0]}"])

    # MultiQC: FastQC
    pipeline.merge(
        task_func=stages.multiqc_fastqc,
        name="multiqc_fastqc",
        input=output_from("fastqc"),
        output="%s/multiqc_fastqc_report.html" % output_dir["qc"],
        extras=[output_dir["qc"], output_dir["fastqc"]])

    # Stacks: Process RAD-Tags
    pipeline.transform(
        task_func=stages.process_radtags,
        name="process_radtags",
        input=output_from("original_fastqs"),
        filter=formatter(".+/(?P<lib>[^/]+)/[^/]+"),
        output="%s/{lib[0]}/{lib[0]}.success" % output_dir["process_radtags"],
        extras=[output_dir["process_radtags"],
                "{lib[0]}",
                state.config.get_options("renz_1"),
                state.config.get_options("renz_2"),
                state.config.get_options("process_radtags_options")])

    # Create a list for alignment with the input fastq files from process_radtags
    process_radtags_outputs = []
    for l in libraries:
        for s in l.samples:
            base = "{dir}/{lib}/{sample}".format(
                dir=output_dir["process_radtags"],
                lib=l.lib_id,
                sample=s)
            process_radtags_outputs.append([base + ".1.fq.gz", base + ".2.fq.gz"])
    # print(process_radtags_outputs)

    # Alignment
    if align_task_name == "bwa_mem":
        (pipeline.transform(
            task_func=stages.bwa_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bwa.bam" % output_dir["alignments"],
            extras=[os.path.join(output_dir["reference"], "reference.fa"),
                    "{path[0]}",
                    output_dir["alignments"],
                    "{sm[0]}",
                    state.config.get_options("alignment_options")])
         ).follows("bwa_index").follows("process_radtags")

    if align_task_name == "bowtie":
        (pipeline.transform(
            task_func=stages.bowtie_align,
            name=align_task_name,
            input=process_radtags_outputs,
            filter=formatter(".+/(?P<sm>[^/]+).1.fq.gz"),
            output="%s/{sm[0]}.bowtie.bam" % output_dir["alignments"],
            extras=[os.path.join(output_dir["reference"], "reference"),
                    "{path[0]}",
                    output_dir["alignments"],
                    "{sm[0]}",
                    state.config.get_options("alignment_options")])
         ).follows("bowtie_index").follows("process_radtags")

    # Sort BAM and index
    pipeline.transform(
        task_func=stages.sort_bam,
        name="sort_bam",
        input=output_from(align_task_name),
        filter=suffix(".bam"),
        output=".sorted.bam")

    if filter_bams:
        final_bam_task_name = "filter_bam"
        pipeline.transform(
            task_func=stages.filter_bam,
            name="filter_bam",
            input=output_from("sort_bam"),
            filter=suffix(".sorted.bam"),
            output=".sorted.filtered.bam",
            extras=[state.config.get_options("samtools_view_options")])
    else:
        final_bam_task_name = "sort_bam"

    # Samtools flagstat
    pipeline.transform(
        task_func=stages.flagstat,
        name="flagstat",
        input=output_from(final_bam_task_name),
        filter=suffix(".bam"),
        output=".flagstat.txt",
        output_dir=output_dir["flagstat"])

    # MultiQC: flagstat
    pipeline.merge(
        task_func=stages.multiqc_flagstat,
        name="multiqc_flagstat",
        input=output_from("flagstat"),
        output="%s/multiqc_flagstat_report.html" % output_dir["qc"],
        extras=[output_dir["qc"], output_dir["flagstat"]])

    # Stacks: gstacks
    pipeline.merge(
        task_func=stages.gstacks,
        name="gstacks",
        input=output_from(final_bam_task_name),
        output="%s/catalog.fa.gz" % output_dir["gstacks"],
        extras=[output_dir["alignments"],
                output_dir["gstacks"],
                align_task_name,
                final_bam_task_name,
                sample_list,
                state.config.get_options("gstacks_options")])

    # Define outputs from each run of populations
    populations_outputs = []
    for r in populations_r:
        dir_name = "{pop_dir}/{analysis_name}_r{r}".format(
            pop_dir=output_dir["populations"],
            analysis_name=state.config.get_options("analysis_id"),
            r=r)
        populations_outputs.append(os.path.join(dir_name, "populations.snps.vcf"))
    # print(populations_outputs)

    # Stacks: populations
    pipeline.originate(
        task_func=stages.populations,
        name="populations",
        output=populations_outputs,
        extras=[output_dir["gstacks"],
                output_dir["populations"],
                popmap_file,
                state.config.get_options("populations_options")]
    ).follows("gstacks").follows("create_popmap_file")

    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #     task_func=stages.original_reference,
    #     name='original_reference',
    #     output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    # pipeline.transform(
    #     task_func=stages.index_reference_bwa,
    #     name='index_reference_bwa',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #     name='index_reference_samtools',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #   1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the stage (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #     task_func=stages.alignment_coverage_gatk,
    #     name='alignment_coverage_gatk',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs([reference_file]),
    #     output='{path[0]}/{sample[0]}.coverage_summary',
    #     extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_discordants',
    #     input=output_from('sort_discordants'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #     output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_splitters',
    #     input=output_from('sort_splitters'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #     output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs(['{path[0]}/{sample[0]}.splitters.bam',
                               '{path[0]}/{sample[0]}.discordants.bam']),
        output='{path[0]}/{sample[0]}.lumpy.vcf')
        .follows('index_alignment')
        .follows('sort_splitters')
        .follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #     task_func=stages.genotype_svtyper,
    #     name='genotype_svtyper',
    #     input=output_from('structural_variants_lumpy'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']),
    #     output='{path[0]}/{sample[0]}.svtyper.vcf')
    #     .follows('align_bwa')
    #     .follows('sort_splitters')
    #     .follows('index_alignment')
    #     .follows('index_splitters')
    #     .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #     task_func=stages.gustaf_mate_joining,
    #     name='gustaf_mate_joining',
    #     input=output_from('fastq_to_fasta'),
    #     # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #     # This will be the first input to the stage.
    #     # We assume the sample name may consist of only alphanumeric
    #     # characters.
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #     # Add one more input to the stage:
    #     #   1. The corresponding R2 FASTA file
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #     output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    # (pipeline.transform(
    #     task_func=stages.structural_variants_pindel,
    #     name='structural_variants_pindel',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #     output='{path[0]}/{sample[0]}.pindel')
    #     .follows('index_reference_bwa')
    #     .follows('index_reference_samtools'))

    return pipeline
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
#
#   First task
#
@originate(["a.1", "b.1"])
def start_task(output_file_name):
    with open(output_file_name, "w") as f:
        pass


#
#   Forwards file names, is always as up to date as its input files...
#
@transform(start_task, suffix(".1"), ".1")
def same_file_name_task(input_file_name, output_file_name):
    pass


#
#   Links file names, is always up to date if links are not missing
#
@transform(start_task, suffix(".1"), ".linked.1")
def linked_file_name_task(input_file_name, output_file_name):
    try:
        os.symlink(input_file_name, output_file_name)
    except:
        print(input_file_name, output_file_name)
        raise
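# A hedged sketch of the downstream "final_task" that the new-style pipeline
# wiring earlier in this section refers to (the real definition lives elsewhere
# in the test module): it simply touches a .3 file for each .1 input.
@transform([linked_file_name_task, same_file_name_task], suffix(".1"), ".3")
def final_task(input_file_name, output_file_name):
    with open(output_file_name, "w") as f:
        pass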
    try:
        with open(input_file, 'rb') as f:
            signature = f.read(4)
            if signature == b'%PDF':
                re_symlink(input_file, output_file)
                return
    except EnvironmentError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)

    triage_image_file(input_file, output_file, log)


@transform(
    input=triage,
    filter=suffix('.pdf'),
    output='.repaired.pdf',
    output_dir=work_folder,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    qpdf.repair(input_file, output_file, log)
    with pdfinfo_lock:
        pdfinfo.extend(pdf_get_all_pageinfo(output_file))
        log.debug(pdfinfo)
    # This also updates timestamps. Ruffus doesn't recognize these files as
    # complete results unless the timestamp is up to date.
    sh.mv("testdata.manual.2009.06.14.csv", "sentiment140.test.csv")
    sh.mv("training.1600000.processed.noemoticon.csv", "sentiment140.train.csv")

    # Re-encode the files as utf8. They look like utf8 already (e.g. file thinks they're utf8)
    # but they are actually encoded as latin1. This doesn't make a difference for the test data
    # (the utf8 and latin1 encoded test data are identical files) but the train data has some
    # byte sequences that are invalid utf8 and this makes simplejson really upset.
    for output_file in output_file_names:
        sh.mv(output_file, "temp")
        sh.iconv("-f", "latin1", "-t", "utf8", "temp", _out=output_file)
        sh.rm("temp")


@ruffus.transform(extract_data, ruffus.suffix(".csv"), ".json")
def reformat_data(input_file_name, output_file_name):
    df = pd.io.parsers.read_csv(
        input_file_name,
        names=["polarity", "id", "date", "query", "user", "text"],
        encoding='utf8')

    # drop columns we don't care about
    df = df[["text", "polarity"]]

    # remove neutral class
    df = df[df.polarity != 2]
    assert all((df.polarity == 4) | (df.polarity == 0))

    # re-map polarity to smilies
    df.polarity = df.polarity.apply(lambda x: ':)' if x == 4 else ':(')
                    CHECKSUM_HISTORY_TIMESTAMPS,
                    CHECKSUM_FUNCTIONS,
                    CHECKSUM_FUNCTIONS_AND_PARAMS)
from ruffus.ruffus_exceptions import RethrownJobError

possible_chksms = range(CHECKSUM_FUNCTIONS_AND_PARAMS + 1)

workdir = 'tmp_test_job_completion/'
input_file = os.path.join(workdir, 'input.txt')
transform1_out = input_file.replace('.txt', '.output')
split1_outputs = [os.path.join(workdir, 'split.out1.txt'),
                  os.path.join(workdir, 'split.out2.txt')]
merge2_output = os.path.join(workdir, 'merged.out')

runtime_data = []


@transform(input_file, suffix('.txt'), '.output', runtime_data)
def transform1(in_name, out_name, how_many):
    with open(out_name, 'w') as outfile:
        outfile.write(open(in_name).read())


@transform(input_file, suffix('.txt'), '.output', runtime_data)
def transform_raise_error(in_name, out_name, how_many):
    # raise an error unless runtime_data has 'okay' in it
    with open(out_name, 'w') as outfile:
        outfile.write(open(in_name).read())
    if 'okay' not in runtime_data:
        raise RuntimeError("'okay' wasn't in runtime_data!")


@split(input_file, split1_outputs)
def split1(in_name, out_names):
    for n in out_names:
def task1(outfile, *extra_params): """ First task """ with open(tempdir + "jobs.start", "a") as oo: oo.write('job = %s\n' % json.dumps([None, outfile])) test_job_io(None, outfile, extra_params) with open(tempdir + "jobs.finish", "a") as oo: oo.write('job = %s\n' % json.dumps([None, outfile])) # # task2 # @posttask(lambda: do_write(test_file, "Task 2 Done\n")) @transform(task1, suffix(".1"), ".2") def task2(infiles, outfiles, *extra_params): """ Second task """ with open(tempdir + "jobs.start", "a") as oo: oo.write('job = %s\n' % json.dumps([infiles, outfiles])) test_job_io(infiles, outfiles, extra_params) with open(tempdir + "jobs.finish", "a") as oo: oo.write('job = %s\n' % json.dumps([infiles, outfiles])) # # task3 # @transform(task2, regex('(.*).2'), inputs([r"\1.2", tempdir + "a.1"]), r'\1.3')
@follows(mkdir("test_active_if")) @originate(['test_active_if/a.1', 'test_active_if/b.1'], "an extra_parameter") def task1(outfile, extra): """ First task """ # N.B. originate works with an extra parameter helper(None, outfile) # # task2 # @transform(task1, suffix(".1"), ".2") def task2(infile, outfile): """ Second task """ helper(infile, outfile) # # task3 # @active_if(lambda: pipeline_active_if) @transform(task1, suffix(".1"), ".3") def task3(infile, outfile): """ Third task
# for i in range(JOBS_PER_TASK): with open(tempdir + "/files.split.%s.%03d.fa" % (original_index, i), "w") as oo: pass with open(success_flag, "w") as oo: pass # 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 # # align_sequences # @posttask(lambda: sys.stderr.write("\tSequences aligned\n")) # fa -> aln @transform(split_fasta_file, suffix(".fa"), ".aln") def align_sequences(input_file, output_filename): with open(output_filename, "w") as oo: oo.write("%s\n" % output_filename) # 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 # # percentage_identity # @posttask(lambda: sys.stderr.write("\t%Identity calculated\n")) @transform(align_sequences, # find all results from align_sequences suffix(".aln"), # replace suffix with: [r".pcid", # .pcid suffix for the result r".pcid_success"]) # .pcid_success to indicate job completed def percentage_identity(input_file, output_files):
# @originate([tempdir + 'a.1'] + runtime_files) def task1(outfile): """ First task """ output_text = "" output_text += " -> " + json.dumps(outfile) + "\n" with open(outfile, "w") as oo: oo.write(output_text) # # task2 # @transform(task1, suffix(".1"), ".2") def task2(infile, outfile): """ Second task """ if infile: with open(infile) as ii: output_text = ii.read() else: output_text = "" output_text += json.dumps(infile) + " -> " + json.dumps(outfile) + "\n" with open(outfile, "w") as oo: oo.write(output_text) #
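# Hedged sketch of a possible downstream step (the task name and the summary file
# are hypothetical, not from the source): @merge collects every output of task2
# into a single job, concatenating the recorded infile -> outfile provenance
# lines written above.
from ruffus import merge


@merge(task2, tempdir + "provenance.summary")
def summarise_provenance(input_files, output_file):
    with open(output_file, "w") as oo:
        for input_file in input_files:
            with open(input_file) as ii:
                oo.write(ii.read())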
formatter(), "{path[0]}/all.tmp2") #@transform([generate_initial_files1, generate_initial_files2, generate_initial_files3, # generate_initial_files4], # formatter( ), # "{path[0]}/{basename[0]}.tmp2") def test_task2( infiles, outfile): with open(outfile, "w") as p: pass #print >>sys.stderr, "8" * 80, "\n", " task2 :%s %s " % (infiles, outfile) #___________________________________________________________________________ # # test_task3 #___________________________________________________________________________ @transform(test_task2, suffix(".tmp2"), ".tmp3") def test_task3( infile, outfile): global throw_exception if throw_exception != None: throw_exception = not throw_exception if throw_exception: #print >>sys.stderr, "Throw exception for ", infile, outfile raise Exception("oops") else: #print >>sys.stderr, "No throw exception for ", infile, outfile pass with open(outfile, "w") as p: pass #print >>sys.stderr, "8" * 80, "\n", " task3 :%s %s " % (infile, outfile) #___________________________________________________________________________ #
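# Hedged driver sketch (assumed, not from the original test): ruffus gathers job
# failures and re-raises them wrapped in a single RethrownJobError, so the
# alternating "oops" raised by test_task3 above would surface to a caller like
# this.
def run_until_first_failure():
    from ruffus import pipeline_run
    from ruffus.ruffus_exceptions import RethrownJobError
    try:
        pipeline_run([test_task3], multiprocess=2, verbose=0)
    except RethrownJobError as err:
        print("pipeline stopped on a job error:\n%s" % err)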
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        #    1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(
        task_func=stages.sort_bam_picard,
        name='sort_bam_picard',
        input=output_from('align_bwa'),
        filter=suffix('.bam'),
        output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
output=['.sort.dedup.bam', '.metricsdup']) # Generate chromosome intervals using GATK pipeline.transform( task_func=stages.chrom_intervals_gatk, name='chrom_intervals_gatk', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.chr.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('chrom_intervals_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Call variants using GATK pipeline.transform( task_func=stages.call_variants_gatk, name='call_variants_gatk', input=output_from('print_reads_gatk'), filter=suffix('.sort.dedup.realn.recal.bam'), output='.raw.snps.indels.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_variants_gatk'), output='PCExomes.mergegvcf.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.mergegvcf.vcf'), output='.genotyped.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']), output='.combined.vcf') .follows('apply_indel_recalibrate_gatk')) # Select variants using GATK pipeline.transform( 
task_func=stages.select_variants_gatk, name='select_variants_gatk', input=output_from('combine_variants_gatk'), filter=suffix('.combined.vcf'), output='.selected.vcf') return pipeline
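# Hedged usage sketch (assumed driver; constructing "state" is project specific and
# not shown here): the Pipeline object returned by make_pipeline() can be executed
# directly, mirroring the new-style run() calls used elsewhere in this collection.
def run_complexo(state, jobs=4):
    pipeline = make_pipeline(state)
    pipeline.run(multiprocess=jobs, verbose=1, log_exceptions=True)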
    statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); } {for (i = 1; i <= NF; i++) freq[$i]++} END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }' < %(infile)s > %(outfile)s'''

    # execute command in variable statement.
    #
    # The command will be sent to the cluster. The statement will be
    # interpolated with any options that are defined in the
    # configuration files or variables that are declared in the calling
    # function. For example, %(infile)s will be substituted with the
    # contents of the variable "infile".
    P.run(statement)


@transform(count_words, suffix(".counts"), "_counts.load")
def load_word_counts(infile, outfile):
    '''load results of word counting into database.'''
    P.load(infile, outfile, "--add-index=word")


# ---------------------------------------------------
# Generic pipeline tasks
@follows(load_word_counts)
def full():
    pass


def main(argv=None):
    if argv is None:
        argv = sys.argv
r"\g<PREFIX>", # extra: prefix = \2 r"\4") # extra: extension def test_regex_unmatched_task(infiles, outfile, prefix1, prefix2, extension): raise Exception("Should blow up first") #___________________________________________________________________________ # # test_suffix_task #___________________________________________________________________________ @transform( generate_initial_files1, suffix(".tmp1"), r".tmp2", # output file r"\1") # extra: basename def test_suffix_task(infile, outfile, basename): with open (outfile, "w") as f: pass #___________________________________________________________________________ # # test_suffix_unmatched_task #___________________________________________________________________________ @transform( generate_initial_files1, suffix(".tmp1"), r".tmp2", # output file
filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.tmp1)"), output=r"\1/\g<PREFIX>\3.tmp2", # output file extras=[r"\2", # extra: prefix = \2 r"\g<PREFIX>", # extra: prefix = \2 r"\4"]) # extra: extension test_pipeline.transform(task_func=check_regex_unmatched_task, input=generate_initial_files1, filter=regex("(.*)/(?P<PREFIX>[abcd])(_name)(.xxx)"), output=r"\1/\g<PREFIXA>\3.tmp2", # output file extras=[r"\2", # extra: prefix = \2 r"\g<PREFIX>", # extra: prefix = \2 r"\4"]) # extra: extension test_pipeline.transform(task_func=check_suffix_task, input=generate_initial_files1, filter=suffix(".tmp1"), output=r".tmp2", # output file extras=[r"\1"]) # extra: basename test_pipeline.transform(task_func=check_suffix_unmatched_task, input=generate_initial_files1, filter=suffix(".tmp1"), output=r".tmp2", # output file extras=[r"\2"]) # extra: unknown test_pipeline.transform(task_func=check_suffix_unmatched_task2, input=generate_initial_files1, filter=suffix(".tmp2"), output=r".tmp2") # output file test_pipeline.transform(task_func=check_regex_misspelt_capture_error_task,
with open(input_file_name) as ii: for i, line in enumerate(ii): if i % CHUNK_SIZE == 0: cnt_files += 1 if output_file: output_file.close() output_file = open(tempdir + "%d.chunks" % cnt_files, "w") output_file.write(line) if output_file: output_file.close() #--------------------------------------------------------------- # # Calculate sum and sum of squares for each chunk file # @transform(step_4_split_numbers_into_chunks, suffix(".chunks"), ".sums") def step_5_calculate_sum_of_squares (input_file_name, output_file_name): with open(output_file_name, "w") as oo: sum_squared, sum = [0.0, 0.0] cnt_values = 0 with open(input_file_name) as ii: for line in ii: cnt_values += 1 val = float(line.rstrip()) sum_squared += val * val sum += val oo.write("%s\n%s\n%d\n" % (repr(sum_squared), repr(sum), cnt_values)) def print_hooray_again(): print(" hooray again")
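# Hedged sketch of the natural follow-on step (mirroring the classic Ruffus
# parallel-maths tutorial; the task name and result file are assumptions, not
# copied from this source file): a @merge job combines every per-chunk ".sums"
# file into a single variance figure.
from ruffus import merge, posttask


@posttask(print_hooray_again)
@merge(step_5_calculate_sum_of_squares, tempdir + "variance.result")
def step_6_calculate_variance(input_file_names, output_file_name):
    all_sum_squared, all_sum, all_cnt = 0.0, 0.0, 0
    for input_file_name in input_file_names:
        with open(input_file_name) as ii:
            sum_squared, total, cnt = (float(x) for x in ii.read().split())
        all_sum_squared += sum_squared
        all_sum += total
        all_cnt += int(cnt)
    # population variance:  E[x^2] - (E[x])^2
    variance = (all_sum_squared / all_cnt) - (all_sum / all_cnt) ** 2 if all_cnt else 0.0
    with open(output_file_name, "w") as oo:
        oo.write("%s\n" % variance)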
# # First task # @originate(["a.1", "b.1"]) def start_task(output_file_name): with open(output_file_name, "w") as f: pass # # Forwards file names, is always as up to date as its input files... # @transform(start_task, suffix(".1"), ".1") def same_file_name_task(input_file_name, output_file_name): pass # # Links file names, is always as up to date if links are not missing # @transform(start_task, suffix(".1"), ".linked.1") def linked_file_name_task(input_file_name, output_file_name): try: os.symlink(input_file_name, output_file_name) except: print(input_file_name, output_file_name)
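        # (completing this truncated excerpt to match the identical task shown
        #  earlier in this collection: re-raise after reporting the file names)
        raise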
from ruffus import (transform, follows, collate, files, split, merge, suffix, mkdir, jobs_limit, output_from) from ruffus.task import active_if from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log, main_mutex as log_mtx) from hts_waterworks.bootstrap import cfg from hts_waterworks.utils.common import parseFastq # filtering original_reads = '*.fastq' prev_output = original_reads prev_suffix = '.fastq' @active_if(cfg.getboolean('filtering', 'convert_sanger_to_illumina')) @transform(prev_output, suffix(prev_suffix), '.fastq_illumina') def convert_fastq(in_fastq, out_fastq): 'convert sanger fastq format (phred-33) to illumina format (phred-64)' base_out = os.path.splitext(out_fastq)[0] records = SeqIO.parse(in_fastq, "fastq") with open(base_out, 'w') as outfile: SeqIO.write(records, outfile, "fastq-illumina") check_call('gzip %s' % base_out, shell=True) if cfg.getboolean('filtering', 'convert_sanger_to_illumina'): prev_output = convert_fastq prev_suffix = '' @active_if(cfg.getboolean('filtering', 'clip_adapter')) @transform(prev_output, suffix(prev_suffix), '.noAdapter')
def triage(input_file, output_file, log): try: with open(input_file, 'rb') as f: signature = f.read(4) if signature == b'%PDF': re_symlink(input_file, output_file) return except EnvironmentError as e: log.error(e) sys.exit(ExitCode.input_file) triage_image_file(input_file, output_file, log) @transform(input=triage, filter=suffix('.pdf'), output='.repaired.pdf', output_dir=work_folder, extras=[_log, _pdfinfo, _pdfinfo_lock]) def repair_pdf(input_file, output_file, log, pdfinfo, pdfinfo_lock): qpdf.repair(input_file, output_file, log) with pdfinfo_lock: pdfinfo.extend(pdf_get_all_pageinfo(output_file)) log.debug(pdfinfo) def get_pageinfo(input_file, pdfinfo, pdfinfo_lock): pageno = int(os.path.basename(input_file)[0:6]) - 1 with pdfinfo_lock: pageinfo = pdfinfo[pageno].copy()
import gzip import simplejson as json data_dir = os.environ['DATA'] words_dir = os.path.join(data_dir, "words") # /usr/share/dict/words is a text file full of words on most unix systems @ruffus.follows(ruffus.mkdir(words_dir)) @ruffus.originate(os.path.join(words_dir, "words.txt")) def get_words(output_file): sh.cp("/usr/share/dict/words", output_file) sh.chmod("u+w", output_file) @ruffus.transform(get_words, ruffus.suffix(".txt"), ".alphabet.json") def build_alphabet_dictionary(input_file, output_file): characters = set() with open(input_file) as f: for line in f: characters = characters.union(line.rstrip()) alphabet = list(sorted(characters)) + ['PADDING', 'START', 'END'] with open(output_file, 'w') as f: f.write(json.dumps(alphabet)) @ruffus.transform(build_alphabet_dictionary, ruffus.suffix(".alphabet.json"), ".alphabet.encoding.json") def encode_alphabet_dictionary(input_file, output_file): alphabet = dict() with open(input_file) as alphabet_file:
def build_pipeline(options, work_folder, log, context): main_pipeline = Pipeline.pipelines['main'] # Triage task_triage = main_pipeline.transform( task_func=triage, input=os.path.join(work_folder, 'origin'), filter=formatter('(?i)'), output=os.path.join(work_folder, 'origin.pdf'), extras=[log, context]) task_repair_pdf = main_pipeline.transform(task_func=repair_pdf, input=task_triage, filter=suffix('.pdf'), output='.repaired.pdf', output_dir=work_folder, extras=[log, context]) # Split (kwargs for split seems to be broken, so pass plain args) task_split_pages = main_pipeline.split(split_pages, task_repair_pdf, os.path.join( work_folder, '*.page.pdf'), extras=[log, context]) # Rasterize preview task_rasterize_preview = main_pipeline.transform( task_func=rasterize_preview, input=task_split_pages, filter=suffix('.page.pdf'), output='.preview.jpg', output_dir=work_folder, extras=[log, context]) task_rasterize_preview.active_if(options.rotate_pages) # Orient task_orient_page = main_pipeline.collate( task_func=orient_page, input=[task_split_pages, task_rasterize_preview], filter=regex( r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"), output=os.path.join(work_folder, r'\1\2.oriented.pdf'), extras=[log, context]) # Rasterize actual task_rasterize_with_ghostscript = main_pipeline.transform( task_func=rasterize_with_ghostscript, input=task_orient_page, filter=suffix('.ocr.oriented.pdf'), output='.page.png', output_dir=work_folder, extras=[log, context]) # Preprocessing subpipeline task_preprocess_remove_background = main_pipeline.transform( task_func=preprocess_remove_background, input=task_rasterize_with_ghostscript, filter=suffix(".page.png"), output=".pp-background.png", extras=[log, context]) task_preprocess_deskew = main_pipeline.transform( task_func=preprocess_deskew, input=task_preprocess_remove_background, filter=suffix(".pp-background.png"), output=".pp-deskew.png", extras=[log, context]) task_preprocess_clean = main_pipeline.transform( task_func=preprocess_clean, input=task_preprocess_deskew, filter=suffix(".pp-deskew.png"), output=".pp-clean.png", extras=[log, context]) task_select_ocr_image = main_pipeline.collate( task_func=select_ocr_image, input=[task_preprocess_clean], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r"\1.ocr.png"), extras=[log, context]) # HOCR OCR task_ocr_tesseract_hocr = main_pipeline.transform( task_func=ocr_tesseract_hocr, input=task_select_ocr_image, filter=suffix(".ocr.png"), output=".hocr", extras=[log, context]) task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"') task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr') if tesseract.v4(): task_ocr_tesseract_hocr.jobs_limit(2) # Uses multi-core on its own task_select_visible_page_image = main_pipeline.collate( task_func=select_visible_page_image, input=[ task_rasterize_with_ghostscript, task_preprocess_remove_background, task_preprocess_deskew, task_preprocess_clean ], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r'\1.image'), extras=[log, context]) task_select_visible_page_image.graphviz(shape='diamond') task_select_image_layer = main_pipeline.collate( task_func=select_image_layer, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.image-layer.pdf'), extras=[log, context]) task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond') task_select_image_layer.active_if(options.pdf_renderer == 
'hocr' or options.pdf_renderer == 'tess4') task_render_hocr_page = main_pipeline.transform( task_func=render_hocr_page, input=task_ocr_tesseract_hocr, filter=suffix('.hocr'), output='.text.pdf', extras=[log, context]) task_render_hocr_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_page.active_if(options.pdf_renderer == 'hocr') task_render_hocr_debug_page = main_pipeline.collate( task_func=render_hocr_debug_page, input=[task_select_visible_page_image, task_ocr_tesseract_hocr], filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"), output=os.path.join(work_folder, r'\1.debug.pdf'), extras=[log, context]) task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr') task_render_hocr_debug_page.active_if(options.debug_rendering) # Tesseract OCR + text only PDF task_ocr_tesseract_textonly_pdf = main_pipeline.collate( task_func=ocr_tesseract_textonly_pdf, input=[task_select_ocr_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.text.pdf'), extras=[log, context]) task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"') task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4') if tesseract.v4(): task_ocr_tesseract_textonly_pdf.jobs_limit(2) task_combine_layers = main_pipeline.collate( task_func=combine_layers, input=[ task_render_hocr_page, task_ocr_tesseract_textonly_pdf, task_select_image_layer ], filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[log, context]) task_combine_layers.graphviz(fillcolor='"#00cc66"') task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'tess4') # Tesseract OCR+PDF task_ocr_tesseract_and_render_pdf = main_pipeline.collate( task_func=ocr_tesseract_and_render_pdf, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[log, context]) task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"') task_ocr_tesseract_and_render_pdf.active_if( options.pdf_renderer == 'tesseract') if tesseract.v4(): task_ocr_tesseract_and_render_pdf.jobs_limit(2) # Uses multi-core # PDF/A task_generate_postscript_stub = main_pipeline.transform( task_func=generate_postscript_stub, input=task_repair_pdf, filter=formatter(r'\.repaired\.pdf'), output=os.path.join(work_folder, 'pdfa.ps'), extras=[log, context]) task_generate_postscript_stub.active_if(options.output_type == 'pdfa') # Bypass valve task_skip_page = main_pipeline.transform( task_func=skip_page, input=task_orient_page, filter=suffix('.skip.oriented.pdf'), output='.done.pdf', output_dir=work_folder, extras=[log, context]) # Merge pages task_merge_pages_ghostscript = main_pipeline.merge( task_func=merge_pages_ghostscript, input=[ task_combine_layers, task_render_hocr_debug_page, task_skip_page, task_ocr_tesseract_and_render_pdf, task_generate_postscript_stub ], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) task_merge_pages_ghostscript.active_if(options.output_type == 'pdfa') task_merge_pages_qpdf = main_pipeline.merge( task_func=merge_pages_qpdf, input=[ task_combine_layers, task_render_hocr_debug_page, task_skip_page, task_ocr_tesseract_and_render_pdf, task_repair_pdf ], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) task_merge_pages_qpdf.active_if(options.output_type == 'pdf') # Finalize 
task_copy_final = main_pipeline.merge( task_func=copy_final, input=[task_merge_pages_ghostscript, task_merge_pages_qpdf], output=options.output_file, extras=[log, context])
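# Hedged driver sketch (the entry-point name and option handling are assumptions,
# not from the source): once build_pipeline() has wired its tasks into the 'main'
# pipeline, the caller hands control to ruffus to execute them.
def run_pipeline(options, work_folder, log, context):
    from ruffus import pipeline_run
    build_pipeline(options, work_folder, log, context)
    pipeline_run(multiprocess=getattr(options, 'jobs', 1), verbose=1)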
#--------------------------------------------------------------- # create initial files # @mkdir(tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven') @originate([ [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start', tempdir + 'job1.b.start'], [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'], [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start'] ]) def create_initial_file_pairs(output_files): # create both files as necessary for output_file in output_files: with open(output_file, "w") as oo: pass #--------------------------------------------------------------- # first task @transform(create_initial_file_pairs, suffix(".start"), ".output.1") def first_task(input_files, output_file): with open(output_file, "w"): pass #--------------------------------------------------------------- # second task @transform(first_task, suffix(".output.1"), ".output.2") def second_task(input_files, output_file): with open(output_file, "w"): pass test_pipeline = Pipeline("test") test_pipeline.originate(output = [ [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job1.a.start', tempdir + 'job1.b.start'], [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job2.a.start', tempdir + 'job2.b.start'], [tempdir + 'data/scratch/lg/what/one/two/three/four/five/six/seven/job3.a.start', tempdir + 'job3.b.start'] ], task_func = create_initial_file_pairs)