def test_newstyle_ruffus(self):
    print(" Run pipeline normally...")
    test_pipeline = Pipeline("test")
    test_pipeline.originate(make_start, [tempdir + 'start'])
    test_pipeline.split(split_start, make_start, tempdir + '*.split')
    test_pipeline.subdivide(subdivide_start, split_start, formatter(),
                            tempdir + '{basename[0]}_*.subdivided',
                            tempdir + '{basename[0]}')

    if self.graph_viz_present:
        test_pipeline.printout_graph(tempdir + "flowchart.dot")
        test_pipeline.printout_graph(tempdir + "flowchart.jpg",
                                     target_tasks=[subdivide_start],
                                     forcedtorun_tasks=[split_start],
                                     no_key_legend=True)
        test_pipeline.printout_graph(tempdir + "flowchart.svg", no_key_legend=False)
        # Unknown format
        try:
            test_pipeline.printout_graph(tempdir + "flowchart.unknown", no_key_legend=False)
            raise Exception("Failed to throw exception for test_pipeline.printout_graph unknown extension ")
        except CalledProcessError as err:
            pass
        test_pipeline.printout_graph(tempdir + "flowchart.unknown", "svg", no_key_legend=False)
    else:
        test_pipeline.printout_graph(tempdir + "flowchart.dot",
                                     target_tasks=[subdivide_start],
                                     forcedtorun_tasks=[split_start],
                                     no_key_legend=True)
def test_newstyle_ruffus(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=make_start,
                            output=[tempdir + 'start'])
    test_pipeline.split(task_func=split_start,
                        input=make_start,
                        output=tempdir + '*.split')
    test_pipeline.subdivide(task_func=subdivide_start,
                            input=split_start,
                            filter=formatter(),
                            output=tempdir + '{basename[0]}_*.subdivided',
                            extras=[tempdir + '{basename[0]}'])

    expected_files_after_1_runs = ["start", "0.split", "0_0.subdivided"]
    expected_files_after_2_runs = [
        "1.split", "0_1.subdivided", "1_0.subdivided"]
    expected_files_after_3_runs = [
        "2.split", "0_2.subdivided", "1_1.subdivided", "2_0.subdivided"]
    expected_files_after_4_runs = [
        "3.split", "0_3.subdivided", "1_2.subdivided",
        "2_1.subdivided", "3_0.subdivided"]

    print(" 1 Run pipeline normally...")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                              expected_files_after_2_runs)

    print(" 2 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs,
                                              expected_files_after_2_runs)
    time.sleep(2)

    print(" 3 Running again with forced tasks to generate more files...")
    test_pipeline.run(forcedtorun_tasks=["test::make_start"],
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs,
                                              expected_files_after_3_runs)

    print(" 4 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs,
                                              expected_files_after_3_runs)
    time.sleep(2)

    print(" 5 Running again with forced tasks to generate even more files...")
    test_pipeline.run(forcedtorun_tasks=make_start,
                      multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs
                                              + expected_files_after_3_runs,
                                              expected_files_after_4_runs)

    print(" 6 Check that running again does nothing. (All up to date).")
    test_pipeline.run(multiprocess=10, verbose=TEST_VERBOSITY)
    self.check_file_exists_or_not_as_expected(expected_files_after_1_runs
                                              + expected_files_after_2_runs
                                              + expected_files_after_3_runs,
                                              expected_files_after_4_runs)
def test_newstyle_mkdir_run(self):
    test_pipeline = Pipeline("test")
    test_pipeline.split(task_func=generate_initial_files1,
                        input=1,
                        output=[tempdir + "/" + prefix + "_name.tmp1"
                                for prefix in "abcd"])
    test_pipeline.transform(
        task_func=test_transform,
        input=generate_initial_files1,
        filter=formatter(),
        output="{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2")\
        .mkdir(tempdir + "/test1")\
        .mkdir(tempdir + "/test2")\
        .mkdir(generate_initial_files1, formatter(),
               ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"])

    test_pipeline.mkdir(test_transform2, tempdir + "/test3")\
        .mkdir(generate_initial_files1, formatter(),
               "{path[0]}/{basename[0]}.dir2")
    cleanup_tmpdir()
    pipeline_run([test_transform, test_transform2], verbose=0,
                 multiprocess=2, pipeline="main")
def test_transform_with_missing_formatter_args_b(self):
    test_pipeline = Pipeline("test")
    test_pipeline.originate(task_func=generate_initial_files,
                            output=[os.path.join(tempdir, ff + ".tmp")
                                    for ff in "abcd"])\
        .mkdir(tempdir)
    test_pipeline.transform(
        task_func=transform_with_missing_formatter_args,
        input=generate_initial_files,
        filter=formatter(),
        output="{path[0]}/{basename[0]}.task1",
        extras=['echo {dynamic_message} > {some_file}'])

    s = StringIO()
    test_pipeline.printout(s, [transform_with_missing_formatter_args],
                           verbose=4, wrap_width=10000, pipeline="test")
    self.assertIn("Missing key = {dynamic_message}", s.getvalue())

    # log to stream
    s = StringIO()
    logger = t_stream_logger(s)
    test_pipeline.run([transform_with_missing_formatter_args], verbose=5,
                      pipeline="test", logger=logger)
    self.assertIn("Missing key = {dynamic_message}", s.getvalue())
def create_pipeline(self):
    # each pipeline has a different name
    global cnt_pipelines
    cnt_pipelines = cnt_pipelines + 1
    test_pipeline = Pipeline("test %d" % cnt_pipelines)

    test_pipeline.originate(task_func=generate_initial_files1,
                            output=[tempdir + prefix + "_name.tmp1"
                                    for prefix in "abcd"])
    test_pipeline.originate(task_func=generate_initial_files2,
                            output=[tempdir + "e_name.tmp1",
                                    tempdir + "f_name.tmp1"])
    test_pipeline.originate(task_func=generate_initial_files3,
                            output=[tempdir + "g_name.tmp1",
                                    tempdir + "h_name.tmp1"])
    test_pipeline.originate(task_func=generate_initial_files4,
                            output=tempdir + "i_name.tmp1")
    test_pipeline.collate(task_func=test_task2,
                          input=[generate_initial_files1,
                                 generate_initial_files2,
                                 generate_initial_files3,
                                 generate_initial_files4],
                          filter=formatter(),
                          output="{path[0]}/all.tmp2")
    test_pipeline.transform(task_func=test_task3,
                            input=test_task2,
                            filter=suffix(".tmp2"),
                            output=".tmp3")
    test_pipeline.transform(task_func=test_task4,
                            input=test_task3,
                            filter=suffix(".tmp3"),
                            output=".tmp4")
    return test_pipeline
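# --- Added example (not part of the original source) -----------------------
# create_pipeline() above only wires tasks together; nothing runs until the
# returned Pipeline object is executed. A minimal sketch of a companion test
# method (hypothetical name) that would live on the same test class, using
# Pipeline.printout() and Pipeline.run() as seen elsewhere in this corpus:
def test_run_created_pipeline(self):
    test_pipeline = self.create_pipeline()
    # Preview what would be run, without running anything.
    test_pipeline.printout(sys.stdout, verbose=3)
    # Execute the tasks, up to two jobs in parallel.
    test_pipeline.run(multiprocess=2, verbose=0)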
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='thepipeline') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate( task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'), # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz filter=formatter( '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz add_inputs=add_inputs( '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'], # extras=['{sample[0]}'], # The output file name is the sample name with a .bam extension. output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam') # Sort the BAM file using Picard pipeline.transform( task_func=stages.sort_bam_picard, name='sort_bam_picard', input=output_from('align_bwa'), filter=suffix('.bam'), output='.sort.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), # XXX should make metricsup an extra output? 
output=['.sort.dedup.bam', '.metricsdup']) # Local realignment using GATK # Generate RealignerTargetCreator using GATK pipeline.transform( task_func=stages.realigner_target_creator, name='realigner_target_creator', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('realigner_target_creator'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), filter=formatter( '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'), output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'), # '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'), # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Merge lane bams to sample bams pipeline.collate( task_func=stages.merge_sample_bams, name='merge_sample_bams', filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'), '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'), # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'), input=output_from('print_reads_gatk'), output='alignments/{sample[0]}/{sample[0]}.merged.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard2', input=output_from('merge_sample_bams'), # filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'), filter=suffix('.merged.bam'), # XXX should make metricsup an extra output? 
output=['.merged.dedup.bam', '.metricsdup']) # Local realignment2 using GATK # Generate RealignerTargetCreator using GATK pipeline.transform( task_func=stages.realigner_target_creator, name='realigner_target_creator2', input=output_from('mark_duplicates_picard2'), filter=suffix('.dedup.bam'), output='.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk2', input=output_from('realigner_target_creator2'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'), # filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'), output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam') .follows('mark_duplicates_picard2')) # Call variants using GATK pipeline.transform( task_func=stages.call_haplotypecaller_gatk, name='call_haplotypecaller_gatk', input=output_from('local_realignment_gatk2'), # filter=suffix('.merged.dedup.realn.bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'), output='variants/{sample[0]}.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_haplotypecaller_gatk'), output='variants/ALL.combined.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.combined.vcf'), output='.raw.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), add_inputs=add_inputs( ['ALL.indel_recal', 'ALL.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['ALL.recal_INDEL.vcf']), # output='.combined.vcf') output='ALL.raw.vqsr.vcf') .follows('apply_indel_recalibrate_gatk')) # # # Select variants using GATK # pipeline.transform( # task_func=stages.select_variants_gatk, # name='select_variants_gatk', # input=output_from('combine_variants_gatk'), # filter=suffix('.combined.vcf'), # output='.selected.vcf') return pipeline
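# --- Added example (not part of the original source) -----------------------
# make_pipeline(state) above returns a fully wired Ruffus Pipeline without
# running it. A minimal sketch of a driver, assuming a `state` object built
# elsewhere from the pipeline's configuration (names here are hypothetical):
def run_variant_calling_pipeline(state):
    pipeline = make_pipeline(state)
    # Optionally draw the flowchart (format given explicitly).
    pipeline.printout_graph('flowchart.svg', 'svg')
    # Run up to four jobs concurrently.
    pipeline.run(multiprocess=4, verbose=1)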
def main(): ######### # SETUP # ######### # catch jgi logon and password from cli parser = ruffus.cmdline.get_argparse( description='5 accessions variant calling pipeline.') parser.add_argument('--email', '-e', help='Logon email address for JGI', type=str, dest='jgi_logon') parser.add_argument('--password', '-p', help='JGI password', type=str, dest='jgi_password') options = parser.parse_args() jgi_logon = options.jgi_logon jgi_password = options.jgi_password ################## # PIPELINE STEPS # ################## # test function for checking input/output passed to job_script and parsing # by io_parser test_job_function = functions.generate_job_function( job_script='src/sh/io_parser', job_name='test') # initialise pipeline main_pipeline = ruffus.Pipeline.pipelines["main"] # bamfiles raw_files = [x.path for x in os.scandir('data/bam') if x.name.endswith('.bam') and x.is_file] # subset the files while the pipeline is in development. Make this equal # to the raw_files to run the whole pipline. # active_raw_files = [x for x in raw_files if # 'G1' in x or 'G4' in x or 'J1' in x or 'J4' in x] active_raw_files = raw_files # species short names for vcf splitting species_short_names = list(set( [os.path.basename(x)[0] for x in active_raw_files])) # check that the files exist mapped_raw = main_pipeline.originate( name='mapped_raw', task_func=os.path.isfile, output=active_raw_files) # genome fasta ref_fa = main_pipeline.originate( name='ref_fa', task_func=functions.generate_job_function( job_script='src/sh/download_genome', job_name='ref_fa', job_type='download'), output='data/genome/Osativa_323_v7.0.fa', extras=[jgi_logon, jgi_password]) # indexes fa_idx = main_pipeline.transform( name='fa_idx', task_func=functions.generate_job_function( job_script='src/sh/fa_idx', job_name='fa_idx', job_type='transform', cpus_per_task=6), input=ref_fa, filter=ruffus.suffix(".fa"), output=['.dict', '.fa.fai']) # annotation annot = main_pipeline.originate( name='annot', task_func=functions.generate_job_function( job_script='src/sh/download_genome', job_name='annot', job_type='download'), output=('data/genome/' 'Osativa_323_v7.0.gene_exons.gffread.rRNAremoved.gtf'), extras=[jgi_logon, jgi_password]) # convert annotation to .bed annot_bed = main_pipeline.transform( name='annot_bed', task_func=functions.generate_job_function( job_script='src/sh/annot_bed', job_name='annot_bed', job_type='transform', cpus_per_task=7), input=annot, filter=ruffus.suffix('.gtf'), output='.bed') # mark duplicates with picard deduped = main_pipeline.transform( name='dedupe', task_func=functions.generate_job_function( job_script='src/sh/mark_duplicates_and_sort', job_name='dedupe', job_type='transform', cpus_per_task=2), input=mapped_raw, filter=ruffus.regex(r"data/bam/(.*).Aligned.out.bam"), output=(r"output/mark_duplicates_and_sort/\1.deduped.bam")) # Split'N'Trim and reassign mapping qualities split_and_trimmed = main_pipeline.transform( name='split_trim', task_func=functions.generate_job_function( job_script='src/sh/split_trim', job_name='split_trim', job_type='transform', cpus_per_task=2), input=deduped, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.formatter( "output/mark_duplicates_and_sort/(?P<LIB>.+).deduped.bam"), output=["{subdir[0][1]}/split_trim/{LIB[0]}.split.bam"])\ .follows(fa_idx) # we're going to recycle call_variants, merge_variants, filter_variants # and analyze_covar so we'll get the functions in advance call_variants = functions.generate_queue_job_function( job_script='src/sh/call_variants', 
job_name='call_variants') merge_variants = functions.generate_job_function( job_script='src/sh/merge_variants', job_name='merge_variants', job_type='transform', cpus_per_task=8) filter_variants = functions.generate_job_function( job_script='src/sh/filter_variants', job_name='filter_variants', job_type='transform', cpus_per_task=1) analyze_covar = functions.generate_queue_job_function( job_script='src/sh/analyze_covar', job_name='analyze_covar') # call variants without recalibration tables uncalibrated_variants = main_pipeline.transform( name='uncalibrated_variants', task_func=call_variants, input=split_and_trimmed, add_inputs=ruffus.add_inputs([ref_fa, annot_bed]), filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'), output='{subdir[0][1]}/variants_uncalibrated/{LIB[0]}.g.vcf.gz') # merge gVCF variants uncalibrated_variants_merged = main_pipeline.merge( name='uncalibrated_variants_merged', task_func=merge_variants, input=[uncalibrated_variants, ref_fa], output='output/variants_uncalibrated/variants_uncalibrated.vcf.gz') # filter variants on un-corrected bamfiles uncalibrated_variants_filtered = main_pipeline.transform( name='uncalibrated_variants_filtered', task_func=filter_variants, input=uncalibrated_variants_merged, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.suffix('_uncalibrated.vcf.gz'), output='_uncalibrated_filtered.vcf.gz') # select variant (only recalibrate using passed SNPs) uncalibrated_variants_selected = main_pipeline.transform( name='uncalibrated_variants_selected', task_func=functions.generate_job_function( job_script='src/sh/select_variants', job_name='select_variants', job_type='transform'), input=uncalibrated_variants_filtered, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.suffix('_uncalibrated_filtered.vcf.gz'), output='_uncalibrated_selected.vcf.gz') # create recalibration report with filtered variants covar_report = main_pipeline.merge( name='covar_report', task_func=analyze_covar, input=[split_and_trimmed, ref_fa, annot_bed, uncalibrated_variants_selected], output="output/covar_analysis/recal_data.table") # second pass to analyze covariation remaining after recalibration second_pass_covar_report = main_pipeline.merge( name='second_pass_covar_report', task_func=analyze_covar, input=[split_and_trimmed, ref_fa, annot_bed, uncalibrated_variants_filtered, covar_report], output="output/covar_analysis/post_recal_data.table") # plot effect of base recalibration recal_plot = main_pipeline.transform( name='recal_plot', task_func=functions.generate_job_function( job_script='src/R/recal_plot.R', job_name='recal_plot', job_type='transform', cpus_per_task=1), input=second_pass_covar_report, filter=ruffus.suffix('post_recal_data.table'), add_inputs=ruffus.add_inputs(covar_report), output='recalibration_plots.pdf') # recalibrate bases using recalibration report recalibrated = main_pipeline.transform( name='recalibrate', task_func=functions.generate_job_function( job_script='src/sh/recalibrate', job_name='recalibrate', job_type='transform', cpus_per_task=2), input=split_and_trimmed, add_inputs=ruffus.add_inputs([ref_fa, covar_report]), filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'), output='{subdir[0][1]}/recal/{LIB[0]}.recal.bam') # final variant calling variants = main_pipeline.transform( name='variants', task_func=call_variants, input=recalibrated, add_inputs=ruffus.add_inputs(ref_fa, annot_bed), filter=ruffus.formatter('output/recal/(?P<LIB>.+).recal.bam'), output='{subdir[0][1]}/variants/{LIB[0]}.g.vcf.gz') # merge gVCF variants 
variants_merged = main_pipeline.merge( name='variants_merged', task_func=merge_variants, input=[variants, ref_fa], output='output/variants/variants.vcf.gz') # variant filtering variants_filtered = main_pipeline.transform( name='variants_filtered', task_func=filter_variants, input=variants_merged, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.suffix('.vcf.gz'), output='_filtered.vcf.gz') # variants by species split_variants = main_pipeline.subdivide( name='split_variants', task_func=functions.generate_job_function( job_script='src/sh/split_variants', job_name='split_variants', job_type='transform', cpus_per_task=1, ntasks=len(species_short_names)), input=variants_filtered, filter=ruffus.formatter(), add_inputs=ruffus.add_inputs(ref_fa), output=[('output/split_variants/' + x + '.variants_filtered.vcf.gz') for x in species_short_names]) # count variants per gene per species cds_variants = main_pipeline.transform( name='cds_variants', task_func=functions.generate_job_function( job_script='src/R/cds_variants.R', job_name='cds_variants', job_type='transform'), input=split_variants, add_inputs=ruffus.add_inputs([ref_fa, annot]), filter=ruffus.formatter( 'output/split_variants/(?P<LIB>.+).variants_filtered.vcf.gz'), output='{subdir[0][1]}/cds_variants/{LIB[0]}.cds_variants.Rds') # merge counted variants variants_per_gene = main_pipeline.merge( name='cds_merge', task_func=functions.generate_job_function( job_script='src/R/cds_merge.R', job_name='cds_merge', job_type='transform'), input=cds_variants, output='output/cds_variants/cds_variants.Rds') ################### # RUFFUS COMMANDS # ################### # print the flowchart ruffus.pipeline_printout_graph( "ruffus/flowchart.pdf", "pdf", pipeline_name="5 accessions variant calling pipeline") # run the pipeline ruffus.cmdline.run(options, multithread=8)
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")


@atexit.register
def cleanup_working_files(*args):
    if options.keep_temporary_files:
        print("Temporary working files saved at:")
        print(work_folder)
    else:
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)


@transform(
    input=options.input_file,
    filter=formatter(r'(?i)\.pdf'),
    output=work_folder + '{basename[0]}.repaired.pdf',
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def repair_pdf(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    args_qpdf = [
        'qpdf', input_file, output_file
    ]
    try:
        out = check_output(args_qpdf, stderr=STDOUT, universal_newlines=True)
    except CalledProcessError as e:
        exit_with_error = True
def build_pipeline(options, work_folder, log, context): main_pipeline = Pipeline.pipelines['main'] # Triage task_triage = main_pipeline.transform( task_func=triage, input=os.path.join(work_folder, 'origin'), filter=formatter('(?i)'), output=os.path.join(work_folder, 'origin.pdf'), extras=[log, context]) task_repair_and_parse_pdf = main_pipeline.transform( task_func=repair_and_parse_pdf, input=task_triage, filter=suffix('.pdf'), output='.repaired.pdf', output_dir=work_folder, extras=[log, context]) # Split (kwargs for split seems to be broken, so pass plain args) task_pre_split_pages = main_pipeline.split(pre_split_pages, task_repair_and_parse_pdf, os.path.join( work_folder, '*.presplit.pdf'), extras=[log, context]) task_split_pages = main_pipeline.transform(task_func=split_page, input=task_pre_split_pages, filter=suffix('.presplit.pdf'), output='.page.pdf', output_dir=work_folder, extras=[log, context]) task_ocr_or_skip = main_pipeline.split( ocr_or_skip, task_split_pages, [ os.path.join(work_folder, '*.ocr.page.pdf'), os.path.join(work_folder, '*.skip.page.pdf') ], extras=[log, context]) # Rasterize preview task_rasterize_preview = main_pipeline.transform( task_func=rasterize_preview, input=task_ocr_or_skip, filter=suffix('.page.pdf'), output='.preview.jpg', output_dir=work_folder, extras=[log, context]) task_rasterize_preview.active_if(options.rotate_pages) # Orient task_orient_page = main_pipeline.collate( task_func=orient_page, input=[task_ocr_or_skip, task_rasterize_preview], filter=regex( r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"), output=os.path.join(work_folder, r'\1\2.oriented.pdf'), extras=[log, context]) # Rasterize actual task_rasterize_with_ghostscript = main_pipeline.transform( task_func=rasterize_with_ghostscript, input=task_orient_page, filter=suffix('.ocr.oriented.pdf'), output='.page.png', output_dir=work_folder, extras=[log, context]) # Preprocessing subpipeline task_preprocess_remove_background = main_pipeline.transform( task_func=preprocess_remove_background, input=task_rasterize_with_ghostscript, filter=suffix(".page.png"), output=".pp-background.png", extras=[log, context]) task_preprocess_deskew = main_pipeline.transform( task_func=preprocess_deskew, input=task_preprocess_remove_background, filter=suffix(".pp-background.png"), output=".pp-deskew.png", extras=[log, context]) task_preprocess_clean = main_pipeline.transform( task_func=preprocess_clean, input=task_preprocess_deskew, filter=suffix(".pp-deskew.png"), output=".pp-clean.png", extras=[log, context]) task_select_ocr_image = main_pipeline.collate( task_func=select_ocr_image, input=[task_preprocess_clean], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r"\1.ocr.png"), extras=[log, context]) # HOCR OCR task_ocr_tesseract_hocr = main_pipeline.transform( task_func=ocr_tesseract_hocr, input=task_select_ocr_image, filter=suffix(".ocr.png"), output=[".hocr", ".txt"], extras=[log, context]) task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"') task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr') task_select_visible_page_image = main_pipeline.collate( task_func=select_visible_page_image, input=[ task_rasterize_with_ghostscript, task_preprocess_remove_background, task_preprocess_deskew, task_preprocess_clean ], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r'\1.image'), extras=[log, context]) task_select_visible_page_image.graphviz(shape='diamond') task_select_image_layer = main_pipeline.collate( 
task_func=select_image_layer, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.image-layer.pdf'), extras=[log, context]) task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond') task_select_image_layer.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich') task_render_hocr_page = main_pipeline.transform( task_func=render_hocr_page, input=task_ocr_tesseract_hocr, filter=regex(r".*/(\d{6})(?:\.hocr)"), output=os.path.join(work_folder, r'\1.text.pdf'), extras=[log, context]) task_render_hocr_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_page.active_if(options.pdf_renderer == 'hocr') task_render_hocr_debug_page = main_pipeline.collate( task_func=render_hocr_debug_page, input=[task_select_visible_page_image, task_ocr_tesseract_hocr], filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"), output=os.path.join(work_folder, r'\1.debug.pdf'), extras=[log, context]) task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr') task_render_hocr_debug_page.active_if(options.debug_rendering) # Tesseract OCR + text only PDF task_ocr_tesseract_textonly_pdf = main_pipeline.collate( task_func=ocr_tesseract_textonly_pdf, input=[task_select_ocr_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"), output=[ os.path.join(work_folder, r'\1.text.pdf'), os.path.join(work_folder, r'\1.text.txt') ], extras=[log, context]) task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"') task_ocr_tesseract_textonly_pdf.active_if( options.pdf_renderer == 'sandwich') task_combine_layers = main_pipeline.collate( task_func=combine_layers, input=[ task_render_hocr_page, task_ocr_tesseract_textonly_pdf, task_select_image_layer ], filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[log, context]) task_combine_layers.graphviz(fillcolor='"#00cc66"') task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich') # Tesseract OCR+PDF task_ocr_tesseract_and_render_pdf = main_pipeline.collate( task_func=ocr_tesseract_and_render_pdf, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=[ os.path.join(work_folder, r'\1.rendered.pdf'), os.path.join(work_folder, r'\1.rendered.txt') ], extras=[log, context]) task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"') task_ocr_tesseract_and_render_pdf.active_if( options.pdf_renderer == 'tesseract') # PDF/A task_generate_postscript_stub = main_pipeline.transform( task_func=generate_postscript_stub, input=task_repair_and_parse_pdf, filter=formatter(r'\.repaired\.pdf'), output=os.path.join(work_folder, 'pdfa.ps'), extras=[log, context]) task_generate_postscript_stub.active_if( options.output_type.startswith('pdfa')) # Bypass valve task_skip_page = main_pipeline.transform( task_func=skip_page, input=task_orient_page, filter=suffix('.skip.oriented.pdf'), output='.done.pdf', output_dir=work_folder, extras=[log, context]) # Merge pages task_merge_pages_ghostscript = main_pipeline.merge( task_func=merge_pages_ghostscript, input=[ task_combine_layers, task_render_hocr_debug_page, task_skip_page, task_ocr_tesseract_and_render_pdf, task_generate_postscript_stub ], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) 
task_merge_pages_ghostscript.active_if( options.output_type.startswith('pdfa')) task_merge_pages_qpdf = main_pipeline.merge( task_func=merge_pages_qpdf, input=[ task_combine_layers, task_render_hocr_debug_page, task_skip_page, task_ocr_tesseract_and_render_pdf, task_repair_and_parse_pdf ], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) task_merge_pages_qpdf.active_if(options.output_type == 'pdf' and not fitz) task_merge_pages_mupdf = main_pipeline.merge( task_func=merge_pages_mupdf, input=[ task_combine_layers, task_render_hocr_debug_page, task_skip_page, task_ocr_tesseract_and_render_pdf, task_repair_and_parse_pdf ], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) task_merge_pages_mupdf.active_if(options.output_type == 'pdf' and fitz) task_merge_sidecars = main_pipeline.merge( task_func=merge_sidecars, input=[ task_ocr_tesseract_hocr, task_ocr_tesseract_and_render_pdf, task_ocr_tesseract_textonly_pdf ], output=options.sidecar, extras=[log, context]) task_merge_sidecars.active_if(options.sidecar) # Finalize main_pipeline.merge(task_func=copy_final, input=[ task_merge_pages_ghostscript, task_merge_pages_mupdf, task_merge_pages_qpdf ], output=options.output_file, extras=[log, context])
            (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file, layout_fun=layout_fun,
                with_pdfrw=False, outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)


@transform(
    input=options.input_file,
    filter=formatter('(?i)'),
    output=os.path.join(work_folder, '{basename[0]}.pdf'),
    extras=[_log])
def triage(
        input_file,
        output_file,
        log):
    try:
        with open(input_file, 'rb') as f:
            signature = f.read(4)
            if signature == b'%PDF':
                re_symlink(input_file, output_file)
                return
    except EnvironmentError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)
    cmd = config['CMD_ASCP'].format(
        log_dir=sra_outdir, url_path=sra_url_path, output_dir=sra_outdir)
    returncode = misc.execute(cmd, msg_id, flag_file, options.debug)
    if returncode != 0 or returncode is None:
        # try wget
        # cmd template looks like this:
        # wget ftp://ftp-trace.ncbi.nlm.nih.gov{url_path} -P {output_dir} -N
        cmd = config['CMD_WGET'].format(
            url_path=sra_url_path, output_dir=sra_outdir)
        misc.execute(cmd, msg_id, flag_file, options.debug)


@R.subdivide(
    download,
    R.formatter(r'{0}/(?P<RX>[SED]RX\d+)/(?P<RR>[SED]RR\d+)/(.*)\.sra'.format(PATH_RE)),
    ['{subpath[0][2]}/{RR[0]}_[12].fastq.gz',
     '{subpath[0][2]}/{RR[0]}.sra.sra2fastq.COMPLETE'])
def sra2fastq(inputs, outputs):
    """for meaning of [SED]RR, see
    http://www.ncbi.nlm.nih.gov/books/NBK56913/#search.the_entrez_sra_search_response_pa
    S = NCBI-SRA, E = EMBL-SRA, D = DDBJ-SRA

    SRR: SRA run accession
    ERR: ERA run accession
    DRR: DRA run accession
    """
    sra, _ = inputs          # ignore the flag file from previous task
    flag_file = outputs[-1]
    outdir = os.path.dirname(os.path.dirname(os.path.dirname(sra)))
    cmd = config['CMD_FASTQ_DUMP'].format(output_dir=outdir, accession=sra)
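# --- Added example (not part of the original source) -----------------------
# The CMD_* entries used above are plain str.format templates read from the
# pipeline's config, in the same spirit as the wget template quoted in the
# comment. A sketch of what the fastq-dump template might look like (the
# exact flags are an assumption, not the project's real config):
EXAMPLE_CONFIG = {
    'CMD_FASTQ_DUMP':
        'fastq-dump --split-3 --gzip --outdir {output_dir} {accession}',
}
# sra2fastq() would then expand it per .sra file, e.g.
# EXAMPLE_CONFIG['CMD_FASTQ_DUMP'].format(output_dir=outdir, accession=sra)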
        right that way.'''
        if s.endswith('.ps'):
            return 99999999
        key = int(os.path.basename(s)[0:6]) * 10
        if 'debug' in os.path.basename(s):
            key += 1
        return key

    pdf_pages = sorted(input_files, key=input_file_order)
    log.info(pdf_pages)
    ghostscript.generate_pdfa(pdf_pages, output_file, options.jobs or 1)


@transform(
    input=merge_pages,
    filter=formatter(),
    output=options.output_file,
    extras=[_log, _pdfinfo, _pdfinfo_lock])
def copy_final(
        input_file,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    shutil.copy(input_file, output_file)


def validate_pdfa(
        input_file,
        log):
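# --- Added illustration (not part of the original source) ------------------
# A standalone sketch of the ordering that the nested input_file_order() key
# above produces: pages sort by their six-digit page number, a page's debug
# render sorts immediately after the page itself, and the PostScript stub
# always sorts last. The helper below is a copy for demonstration only, and
# the file names are invented:
import os


def _example_page_order(s):
    if s.endswith('.ps'):
        return 99999999          # PostScript stub always sorts last
    key = int(os.path.basename(s)[0:6]) * 10
    if 'debug' in os.path.basename(s):
        key += 1                 # debug render follows its page
    return key


example = ['pdfa.ps', '000002.rendered.pdf',
           '000001.debug.pdf', '000001.rendered.pdf']
print(sorted(example, key=_example_page_order))
# ['000001.rendered.pdf', '000001.debug.pdf', '000002.rendered.pdf', 'pdfa.ps']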
def main(): # prepare the ruffus pipeline main_pipeline = ruffus.Pipeline.pipelines["main"] # catch jgi logon and password from cli parser = ruffus.cmdline.get_argparse(description='UV-B analysis pipeline.') parser.add_argument('--email', '-e', help='Logon email address for JGI', type=str, dest='jgi_logon') parser.add_argument('--password', '-p', help='JGI password', type=str, dest='jgi_password') options = parser.parse_args() jgi_logon = options.jgi_logon jgi_password = options.jgi_password # need a dictionary of species to genome URL and species to gff. # supply this in a text file fasta_urls = {} annotation_urls = {} with open('data/genomeUrls.txt') as tsv: genome_urls = csv.reader(tsv, delimiter='\t') next(genome_urls, None) for row in genome_urls: fasta_urls[row[0]] = row[1] annotation_urls[row[0]] = row[2] # iterate over fasta_urls keys to run jobs for species in fasta_urls.keys(): # call download script main_pipeline.originate( name=species + "_genome", task_func=download_genome, output="data/genome/" + species + "/METADATA.csv", extras=[species, fasta_urls[species], annotation_urls[species], jgi_logon, jgi_password]) # generate a star genome for each species main_pipeline.transform( name=species + "_index", task_func=generate_index, input=ruffus.output_from(species + "_genome"), filter=ruffus.regex(r"data/genome/(.*)/METADATA.csv"), output=r"output/\1/star-index/METADATA.csv", extras=[r"\1"]) # define the reads main_pipeline.originate(name=species + "_reads", task_func=define_reads, output="ruffus/" + species + "_reads", extras=[species]) # first mapping step main_pipeline.collate( name=species + "_mapped_reads", task_func=star, input=[[ruffus.output_from(species + "_reads"), ruffus.output_from(species + "_index")]], filter=ruffus.formatter(), output=["output/{subdir[1][1]}/star/METADATA.csv"], extras=["{subdir[1][1]}"]) # FOR LOOP ENDS # parse the mapping stats mapping_stats = main_pipeline.merge( task_func=parse_star_stats_R, input=ruffus.output_from( list(species + "_mapped_reads" for species in fasta_urls.keys())), output="output/mapping_stats/SessionInfo.txt") # generate plots for mapping mapping_plots = main_pipeline.transform( task_func=plot_reads_in_genes_R, input=mapping_stats, filter=ruffus.formatter(), output="{subpath[0][0]}/Figure S1.pdf") # use generator in the input field to collate the previous results deseq_results = main_pipeline.transform( task_func=deseq2_R, input=ruffus.output_from( list(species + "_mapped_reads" for species in fasta_urls.keys())), filter=ruffus.formatter(), output=[r"output/{subdir[0][1]}/deseq2/SessionInfo.txt"], extras=[r"{subdir[0][1]}"]) # combine the deseq results de_lists = main_pipeline.merge( task_func=list_de_genes_R, input=deseq_results, output="output/merged/deseq2/SessionInfo.de_genes.txt") # run clustering mfuzz_results = main_pipeline.transform( task_func=mfuzz_R, input=deseq_results, filter=ruffus.formatter(), output='output/{subdir[0][1]}/mfuzz/SessionInfo.mfuzz.txt', extras=['{subdir[0][1]}']) # combine mfuzz_results mfuzz_plot = main_pipeline.merge( task_func=combine_mfuzz_results_R, input=mfuzz_results, output='output/merged/mfuzz/SessionInfo.mfuzz.txt') # compare flavonoid synthesis genes flavonoid_genes = main_pipeline.transform( task_func=compare_saito_genes_R, input=de_lists, filter=ruffus.formatter(), output='{path[0]}/SessionInfo.flavonoid_synthesis.txt') # run the pipeline ruffus.cmdline.run(options, multithread=8) # print the flowchart ruffus.pipeline_printout_graph("ruffus/flowchart.pdf", "pdf", pipeline_name="UV-B 
analysis pipeline")
def build_pipeline(options, work_folder, log, context): main_pipeline = Pipeline.pipelines['main'] # Triage task_triage = main_pipeline.transform( task_func=triage, input=os.path.join(work_folder, 'origin'), filter=formatter('(?i)'), output=os.path.join(work_folder, 'origin.pdf'), extras=[log, context], ) task_repair_and_parse_pdf = main_pipeline.transform( task_func=repair_and_parse_pdf, input=task_triage, filter=suffix('.pdf'), output='.repaired.pdf', output_dir=work_folder, extras=[log, context], ) # Split (kwargs for split seems to be broken, so pass plain args) task_marker_pages = main_pipeline.split( marker_pages, task_repair_and_parse_pdf, os.path.join(work_folder, '*.marker.pdf'), extras=[log, context], ) task_ocr_or_skip = main_pipeline.split( ocr_or_skip, task_marker_pages, [ os.path.join(work_folder, '*.ocr.page.pdf'), os.path.join(work_folder, '*.skip.page.pdf'), ], extras=[log, context], ) # Rasterize preview task_rasterize_preview = main_pipeline.transform( task_func=rasterize_preview, input=task_ocr_or_skip, filter=suffix('.page.pdf'), output='.preview.jpg', output_dir=work_folder, extras=[log, context], ) task_rasterize_preview.active_if(options.rotate_pages) # Orient task_orient_page = main_pipeline.collate( task_func=orient_page, input=[task_ocr_or_skip, task_rasterize_preview], filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"), output=os.path.join(work_folder, r'\1\2.oriented.pdf'), extras=[log, context], ) # Rasterize actual task_rasterize_with_ghostscript = main_pipeline.transform( task_func=rasterize_with_ghostscript, input=task_orient_page, filter=suffix('.ocr.oriented.pdf'), output='.page.png', output_dir=work_folder, extras=[log, context], ) # Preprocessing subpipeline task_preprocess_remove_background = main_pipeline.transform( task_func=preprocess_remove_background, input=task_rasterize_with_ghostscript, filter=suffix(".page.png"), output=".pp-background.png", extras=[log, context], ) task_preprocess_deskew = main_pipeline.transform( task_func=preprocess_deskew, input=task_preprocess_remove_background, filter=suffix(".pp-background.png"), output=".pp-deskew.png", extras=[log, context], ) task_preprocess_clean = main_pipeline.transform( task_func=preprocess_clean, input=task_preprocess_deskew, filter=suffix(".pp-deskew.png"), output=".pp-clean.png", extras=[log, context], ) task_select_ocr_image = main_pipeline.collate( task_func=select_ocr_image, input=[task_preprocess_clean], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r"\1.ocr.png"), extras=[log, context], ) # HOCR OCR task_ocr_tesseract_hocr = main_pipeline.transform( task_func=ocr_tesseract_hocr, input=task_select_ocr_image, filter=suffix(".ocr.png"), output=[".hocr", ".txt"], extras=[log, context], ) task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"') task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr') task_select_visible_page_image = main_pipeline.collate( task_func=select_visible_page_image, input=[ task_rasterize_with_ghostscript, task_preprocess_remove_background, task_preprocess_deskew, task_preprocess_clean, ], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r'\1.image'), extras=[log, context], ) task_select_visible_page_image.graphviz(shape='diamond') task_select_image_layer = main_pipeline.collate( task_func=select_image_layer, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, 
r'\1.image-layer.pdf'), extras=[log, context], ) task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond') task_render_hocr_page = main_pipeline.transform( task_func=render_hocr_page, input=task_ocr_tesseract_hocr, filter=regex(r".*/(\d{6})(?:\.hocr)"), output=os.path.join(work_folder, r'\1.text.pdf'), extras=[log, context], ) task_render_hocr_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_page.active_if(options.pdf_renderer == 'hocr') # Tesseract OCR + text only PDF task_ocr_tesseract_textonly_pdf = main_pipeline.collate( task_func=ocr_tesseract_textonly_pdf, input=[task_select_ocr_image], filter=regex(r".*/(\d{6})(?:\.ocr.png)"), output=[ os.path.join(work_folder, r'\1.text.pdf'), os.path.join(work_folder, r'\1.text.txt'), ], extras=[log, context], ) task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"') task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich') task_weave_layers = main_pipeline.collate( task_func=weave_layers, input=[ task_repair_and_parse_pdf, task_render_hocr_page, task_ocr_tesseract_textonly_pdf, task_select_image_layer, ], filter=regex( r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))" ), output=os.path.join(work_folder, r'layers.rendered.pdf'), extras=[log, context], ) task_weave_layers.graphviz(fillcolor='"#00cc66"') # PDF/A pdfmark task_generate_postscript_stub = main_pipeline.transform( task_func=generate_postscript_stub, input=task_repair_and_parse_pdf, filter=formatter(r'\.repaired\.pdf'), output=os.path.join(work_folder, 'pdfa.ps'), extras=[log, context], ) task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa')) # PDF/A conversion task_convert_to_pdfa = main_pipeline.merge( task_func=convert_to_pdfa, input=[task_generate_postscript_stub, task_weave_layers], output=os.path.join(work_folder, 'pdfa.pdf'), extras=[log, context], ) task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa')) task_metadata_fixup = main_pipeline.merge( task_func=metadata_fixup, input=[task_repair_and_parse_pdf, task_weave_layers, task_convert_to_pdfa], output=os.path.join(work_folder, 'metafix.pdf'), extras=[log, context], ) task_merge_sidecars = main_pipeline.merge( task_func=merge_sidecars, input=[task_ocr_tesseract_hocr, task_ocr_tesseract_textonly_pdf], output=options.sidecar, extras=[log, context], ) task_merge_sidecars.active_if(options.sidecar) # Optimize task_optimize_pdf = main_pipeline.transform( task_func=optimize_pdf, input=task_metadata_fixup, filter=suffix('.pdf'), output='.optimized.pdf', output_dir=work_folder, extras=[log, context], ) # Finalize main_pipeline.merge( task_func=copy_final, input=[task_optimize_pdf], output=options.output_file, extras=[log, context], )
# ___________________________________________________________________________
#
#   generate_initial_files4
# ___________________________________________________________________________
@originate(tempdir + "i_name.tmp1")
def generate_initial_files4(on):
    with open(on, 'w') as outfile:
        pass


# ___________________________________________________________________________
#
#   test_task2
# ___________________________________________________________________________
@collate([generate_initial_files1, generate_initial_files2,
          generate_initial_files3, generate_initial_files4],
         formatter(),
         "{path[0]}/all.tmp2")
# @transform([generate_initial_files1, generate_initial_files2, generate_initial_files3,
#             generate_initial_files4],
#            formatter(),
#            "{path[0]}/{basename[0]}.tmp2")
def test_task2(infiles, outfile):
    with open(outfile, "w") as p:
        pass
    # print >>sys.stderr, "8" * 80, "\n", " task2 :%s %s " % (infiles, outfile)


# ___________________________________________________________________________
#
#   test_task3
# ___________________________________________________________________________
@transform(test_task2, suffix(".tmp2"), ".tmp3")
def build_pipeline(options, work_folder, log, context): main_pipeline = Pipeline.pipelines['main'] # Triage task_triage = main_pipeline.transform( task_func=triage, input=os.path.join(work_folder, 'origin'), filter=formatter('(?i)'), output=os.path.join(work_folder, 'origin.pdf'), extras=[log, context]) task_repair_pdf = main_pipeline.transform( task_func=repair_pdf, input=task_triage, filter=suffix('.pdf'), output='.repaired.pdf', output_dir=work_folder, extras=[log, context]) # Split (kwargs for split seems to be broken, so pass plain args) task_split_pages = main_pipeline.split( split_pages, task_repair_pdf, os.path.join(work_folder, '*.page.pdf'), extras=[log, context]) # Rasterize preview task_rasterize_preview = main_pipeline.transform( task_func=rasterize_preview, input=task_split_pages, filter=suffix('.page.pdf'), output='.preview.jpg', output_dir=work_folder, extras=[log, context]) task_rasterize_preview.active_if(options.rotate_pages) # Orient task_orient_page = main_pipeline.collate( task_func=orient_page, input=[task_split_pages, task_rasterize_preview], filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"), output=os.path.join(work_folder, r'\1\2.oriented.pdf'), extras=[log, context]) # Rasterize actual task_rasterize_with_ghostscript = main_pipeline.transform( task_func=rasterize_with_ghostscript, input=task_orient_page, filter=suffix('.ocr.oriented.pdf'), output='.page.png', output_dir=work_folder, extras=[log, context]) # Preprocessing subpipeline task_preprocess_remove_background = main_pipeline.transform( task_func=preprocess_remove_background, input=task_rasterize_with_ghostscript, filter=suffix(".page.png"), output=".pp-background.png", extras=[log, context]) task_preprocess_deskew = main_pipeline.transform( task_func=preprocess_deskew, input=task_preprocess_remove_background, filter=suffix(".pp-background.png"), output=".pp-deskew.png", extras=[log, context]) task_preprocess_clean = main_pipeline.transform( task_func=preprocess_clean, input=task_preprocess_deskew, filter=suffix(".pp-deskew.png"), output=".pp-clean.png", extras=[log, context]) task_select_ocr_image = main_pipeline.collate( task_func=select_ocr_image, input=[task_preprocess_clean], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r"\1.ocr.png"), extras=[log, context]) # HOCR OCR task_ocr_tesseract_hocr = main_pipeline.transform( task_func=ocr_tesseract_hocr, input=task_select_ocr_image, filter=suffix(".ocr.png"), output=[".hocr", ".txt"], extras=[log, context]) task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"') task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr') if tesseract.v4(): task_ocr_tesseract_hocr.jobs_limit(2) # Uses multi-core on its own task_select_visible_page_image = main_pipeline.collate( task_func=select_visible_page_image, input=[task_rasterize_with_ghostscript, task_preprocess_remove_background, task_preprocess_deskew, task_preprocess_clean], filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"), output=os.path.join(work_folder, r'\1.image'), extras=[log, context]) task_select_visible_page_image.graphviz(shape='diamond') task_select_image_layer = main_pipeline.collate( task_func=select_image_layer, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=os.path.join(work_folder, r'\1.image-layer.pdf'), extras=[log, context]) task_select_image_layer.graphviz( fillcolor='"#00cc66"', shape='diamond') task_select_image_layer.active_if( 
options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich') task_render_hocr_page = main_pipeline.transform( task_func=render_hocr_page, input=task_ocr_tesseract_hocr, filter=regex(r".*/(\d{6})(?:\.hocr)"), output=os.path.join(work_folder, r'\1.text.pdf'), extras=[log, context]) task_render_hocr_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_page.active_if(options.pdf_renderer == 'hocr') task_render_hocr_debug_page = main_pipeline.collate( task_func=render_hocr_debug_page, input=[task_select_visible_page_image, task_ocr_tesseract_hocr], filter=regex(r".*/(\d{6})(?:\.image|\.hocr)"), output=os.path.join(work_folder, r'\1.debug.pdf'), extras=[log, context]) task_render_hocr_debug_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_debug_page.active_if(options.pdf_renderer == 'hocr') task_render_hocr_debug_page.active_if(options.debug_rendering) # Tesseract OCR + text only PDF task_ocr_tesseract_textonly_pdf = main_pipeline.collate( task_func=ocr_tesseract_textonly_pdf, input=[task_select_ocr_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"), output=[os.path.join(work_folder, r'\1.text.pdf'), os.path.join(work_folder, r'\1.text.txt')], extras=[log, context]) task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"') task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich') if tesseract.v4(): task_ocr_tesseract_textonly_pdf.jobs_limit(2) task_combine_layers = main_pipeline.collate( task_func=combine_layers, input=[task_render_hocr_page, task_ocr_tesseract_textonly_pdf, task_select_image_layer], filter=regex(r".*/(\d{6})(?:\.text\.pdf|\.image-layer\.pdf)"), output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[log, context]) task_combine_layers.graphviz(fillcolor='"#00cc66"') task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich') # Tesseract OCR+PDF task_ocr_tesseract_and_render_pdf = main_pipeline.collate( task_func=ocr_tesseract_and_render_pdf, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), output=[os.path.join(work_folder, r'\1.rendered.pdf'), os.path.join(work_folder, r'\1.rendered.txt')], extras=[log, context]) task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"') task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract') if tesseract.v4(): task_ocr_tesseract_and_render_pdf.jobs_limit(2) # Uses multi-core # PDF/A task_generate_postscript_stub = main_pipeline.transform( task_func=generate_postscript_stub, input=task_repair_pdf, filter=formatter(r'\.repaired\.pdf'), output=os.path.join(work_folder, 'pdfa.ps'), extras=[log, context]) task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa')) # Bypass valve task_skip_page = main_pipeline.transform( task_func=skip_page, input=task_orient_page, filter=suffix('.skip.oriented.pdf'), output='.done.pdf', output_dir=work_folder, extras=[log, context]) # Merge pages task_merge_pages_ghostscript = main_pipeline.merge( task_func=merge_pages_ghostscript, input=[task_combine_layers, task_render_hocr_debug_page, task_skip_page, task_ocr_tesseract_and_render_pdf, task_generate_postscript_stub], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa')) task_merge_pages_qpdf = main_pipeline.merge( task_func=merge_pages_qpdf, input=[task_combine_layers, task_render_hocr_debug_page, task_skip_page, 
task_ocr_tesseract_and_render_pdf, task_repair_pdf], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) task_merge_pages_qpdf.active_if(options.output_type == 'pdf') task_merge_sidecars = main_pipeline.merge( task_func=merge_sidecars, input=[task_ocr_tesseract_hocr, task_ocr_tesseract_and_render_pdf, task_ocr_tesseract_textonly_pdf], output=options.sidecar, extras=[log, context]) task_merge_sidecars.active_if(options.sidecar) # Finalize main_pipeline.merge( task_func=copy_final, input=[task_merge_pages_ghostscript, task_merge_pages_qpdf], output=options.output_file, extras=[log, context])
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name).
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a
        # .decompose.normalize.vcf extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
output=['.sort.dedup.bam', '.metricsdup']) # Generate chromosome intervals using GATK pipeline.transform( task_func=stages.chrom_intervals_gatk, name='chrom_intervals_gatk', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.chr.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('chrom_intervals_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Call variants using GATK pipeline.transform( task_func=stages.call_variants_gatk, name='call_variants_gatk', input=output_from('print_reads_gatk'), filter=suffix('.sort.dedup.realn.recal.bam'), output='.raw.snps.indels.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_variants_gatk'), output='PCExomes.mergegvcf.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.mergegvcf.vcf'), output='.genotyped.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs( ['PCExomes.indel_recal', 'PCExomes.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']), output='.combined.vcf') .follows('apply_indel_recalibrate_gatk')) # Select variants using GATK pipeline.transform( 
task_func=stages.select_variants_gatk, name='select_variants_gatk', input=output_from('combine_variants_gatk'), filter=suffix('.combined.vcf'), output='.selected.vcf') return pipeline
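# The decompose_vcf stage above leans on formatter()'s named capture group to
# derive both the output path and the '{sample[0]}' extra. As a rough,
# standalone illustration of the same idea (not Ruffus internals; the input
# path below is made up), plain 're' shows what the pattern captures and how
# the replacement fields get filled in:
import os
import re

input_vcf = '/data/project/FAM01.vcf'  # hypothetical input path
match = re.search(r'.+/(?P<sample>[a-zA-Z0-9]+).vcf', input_vcf)
if match:
    sample = match.group('sample')         # what '{sample[0]}' expands to
    path = os.path.dirname(input_vcf)      # what '{path[0]}' expands to
    output_vcf = '{}/{}.decompose.normalize.vcf'.format(path, sample)
    print(sample, output_vcf)              # FAM01 /data/project/FAM01.decompose.normalize.vcf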
# ___________________________________________________________________________ @split(1, [tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"]) def generate_initial_files1(in_name, out_names): for on in out_names: with open(on, 'w') as outfile: pass # ___________________________________________________________________________ # # check_transform # ___________________________________________________________________________ @mkdir(tempdir + "/test1") @mkdir(tempdir + "/test2") @mkdir(generate_initial_files1, formatter(), ["{path[0]}/{basename[0]}.dir", 3, "{path[0]}/{basename[0]}.dir2"]) @transform(generate_initial_files1, formatter(), "{path[0]}/{basename[0]}.dir/{basename[0]}.tmp2") def check_transform(infiles, outfile): with open(outfile, "w") as p: pass @mkdir(tempdir + "/test3") @mkdir(generate_initial_files1, formatter(), "{path[0]}/{basename[0]}.dir2") def check_transform2(): print(" Loose cannon!", file=sys.stderr)
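# For a single hypothetical input file, the formatter-driven @mkdir above
# boils down to creating one '<basename>.dir' style directory per input before
# check_transform runs. A minimal imperative sketch of that effect (directory
# and file names are invented; Ruffus expands the '{path[0]}/{basename[0]}.dir'
# patterns itself):
import os

demo_dir = '/tmp/mkdir_demo'                          # hypothetical tempdir
infile = os.path.join(demo_dir, 'a_name.tmp1')        # hypothetical input
base = os.path.splitext(os.path.basename(infile))[0]  # 'a_name'
for dirname in (os.path.join(demo_dir, base + '.dir'),
                os.path.join(demo_dir, base + '.dir2')):
    os.makedirs(dirname, exist_ok=True)               # roughly what @mkdir arranges per input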
fa_flag = os.path.join(fa_dir, fa_flag_bn) fas.extend([fa_all, fa_mer]) fa_flags.append(fa_flag) return k_sizes, bfs, bf_flags, fas, fa_flags K_MER_SIZES, BFS, BF_FLAGS, FAS, FA_FLAGS = gen_vars(INPUT_FQS) for __ in FAS: print(__) for __ in FA_FLAGS: print(__) @R.mkdir(INPUT_FQS, R.formatter(PATH_RE), ['{prefix[0]}/kon/{chr[0]}/bf']) @R.collate(INPUT_FQS, R.formatter(), BFS + BF_FLAGS) def abyss_bloom(input_fqs, outputs): fq1, fq2 = input_fqs for k_mer_size, bf, bf_flag in zip(K_MER_SIZES, BFS, BF_FLAGS): cmd = CONFIG['abyss_bloom']['cmd'].format(**locals()) # cmd = ('abyss-bloom build -v -k {k_mer_size} -j 8 -b 3G -l 2 -q 15 - ' # '{fq1} {fq2} ' # '| gzip -c > {bf}'.format(**locals())) execute(cmd, flag=bf_flag) @R.follows(abyss_bloom) @R.mkdir(abyss_bloom, R.formatter(PATH_RE), ['{subpath[0][1]}/fafq']) @R.collate(abyss_bloom, R.formatter(), FAS + FA_FLAGS) def konnector(input_fqs, outputs):
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='complexo') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate( task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'), # Add one more input to the stage: # 1. The corresponding R2 FASTQ file add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{sample[0]}'], # The output file name is the sample name with a .bam extension. output='{path[0]}/{sample[0]}.bam') # Sort the BAM file using Picard pipeline.transform( task_func=stages.sort_bam_picard, name='sort_bam_picard', input=output_from('align_bwa'), filter=suffix('.bam'), output='.sort.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), # XXX should make metricsup an extra output? 
output=['.sort.dedup.bam', '.metricsdup']) # Generate chromosome intervals using GATK pipeline.transform( task_func=stages.chrom_intervals_gatk, name='chrom_intervals_gatk', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.chr.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('chrom_intervals_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Call variants using GATK pipeline.transform( task_func=stages.call_variants_gatk, name='call_variants_gatk', input=output_from('print_reads_gatk'), filter=suffix('.sort.dedup.realn.recal.bam'), output='.raw.snps.indels.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_variants_gatk'), output='PCExomes.mergegvcf.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.mergegvcf.vcf'), output='.genotyped.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']), output='.combined.vcf') .follows('apply_indel_recalibrate_gatk')) # Select variants using GATK pipeline.transform( 
task_func=stages.select_variants_gatk, name='select_variants_gatk', input=output_from('combine_variants_gatk'), filter=suffix('.combined.vcf'), output='.selected.vcf') return pipeline
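# Once make_pipeline() has wired the stages together, the returned Pipeline
# object still has to be driven by a caller. A minimal, hypothetical driver is
# sketched below (the function name and the jobs/verbosity values are
# arbitrary choices; only printout_graph() and run() are standard Ruffus
# Pipeline methods):
def run_complexo(state, jobs=4):
    pipeline = make_pipeline(state)
    # Render the flowchart first; handy for checking the extra .follows()
    # edges added around the realignment and recalibration stages.
    pipeline.printout_graph('complexo_flowchart.svg', 'svg')
    # Then run whatever is out of date.
    pipeline.run(multiprocess=jobs, verbose=1)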
def make_pipeline(state): """Build the pipeline by constructing stages and connecting them together""" # Build an empty pipeline pipeline = Pipeline(name="crpipe") # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option("fastqs") # Find the path to the reference genome # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name="original_fastqs", output=fastq_files) # Convert FASTQ file to FASTA using fastx toolkit # pipeline.transform( # task_func=stages.fastq_to_fasta, # name='fastq_to_fasta', # input=output_from('original_fastqs'), # filter=suffix('.fastq.gz'), # output='.fasta') # The original reference file # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. # pipeline.originate( # task_func=stages.original_reference, # name='original_reference', # output=reference_file) # Run fastQC on the FASTQ files pipeline.transform( task_func=stages.fastqc, name="fastqc", input=output_from("original_fastqs"), filter=suffix(".fastq.gz"), output="_fastqc", ) # Index the reference using BWA # pipeline.transform( # task_func=stages.index_reference_bwa, # name='index_reference_bwa', # input=output_from('original_reference'), # filter=suffix('.fa'), # output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt']) # Index the reference using samtools # pipeline.transform( # task_func=stages.index_reference_samtools, # name='index_reference_samtools', # input=output_from('original_reference'), # filter=suffix('.fa'), # output='.fa.fai') # Index the reference using bowtie 2 # pipeline.transform( # task_func=stages.index_reference_bowtie2, # name='index_reference_bowtie2', # input=output_from('original_reference'), # filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'), # output=['{path[0]}/{refname[0]}.1.bt2', # '{path[0]}/{refname[0]}.2.bt2', # '{path[0]}/{refname[0]}.3.bt2', # '{path[0]}/{refname[0]}.4.bt2', # '{path[0]}/{refname[0]}.rev.1.bt2', # '{path[0]}/{refname[0]}.rev.2.bt2'], # extras=['{path[0]}/{refname[0]}']) # # Create a FASTA sequence dictionary for the reference using picard # pipeline.transform( # task_func=stages.reference_dictionary_picard, # name='reference_dictionary_picard', # input=output_from('original_reference'), # filter=suffix('.fa'), # output='.dict') # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name="align_bwa", input=output_from("original_fastqs"), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz"), # Add one more input to the stage: # 1. The corresponding R2 FASTQ file add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=["{sample[0]}"], # The output file name is the sample name with a .bam extension. 
output="{path[0]}/{sample[0]}.bam", ) # Sort alignment with sambamba pipeline.transform( task_func=stages.sort_bam_sambamba, name="sort_alignment", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.sorted.bam", ) # Extract MMR genes from the sorted BAM file pipeline.transform( task_func=stages.extract_genes_bedtools, name="extract_genes_bedtools", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), output="{path[0]}/{sample[0]}.mmr.bam", ) # Extract selected chromosomes from the sorted BAM file pipeline.transform( task_func=stages.extract_chromosomes_samtools, name="extract_chromosomes_samtools", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), output="{path[0]}/{sample[0]}.chroms.bam", ) # Index the MMR genes bam file with samtools pipeline.transform( task_func=stages.index_bam, name="index_mmr_alignment", input=output_from("extract_genes_bedtools"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).mmr.bam"), output="{path[0]}/{sample[0]}.mmr.bam.bai", ) # Compute depth of coverage of the alignment with GATK DepthOfCoverage # pipeline.transform( # task_func=stages.alignment_coverage_gatk, # name='alignment_coverage_gatk', # input=output_from('sort_alignment'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'), # add_inputs=add_inputs([reference_file]), # output='{path[0]}/{sample[0]}.coverage_summary', # extras=['{path[0]}/{sample[0]}_coverage']) # Index the alignment with samtools pipeline.transform( task_func=stages.index_bam, name="index_alignment", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), output="{path[0]}/{sample[0]}.sorted.bam.bai", ) # Generate alignment stats with bamtools pipeline.transform( task_func=stages.bamtools_stats, name="bamtools_stats", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.stats.txt", ) # Extract the discordant paired-end alignments pipeline.transform( task_func=stages.extract_discordant_alignments, name="extract_discordant_alignments", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.discordants.unsorted.bam", ) # Extract split-read alignments pipeline.transform( task_func=stages.extract_split_read_alignments, name="extract_split_read_alignments", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.splitters.unsorted.bam", ) # Sort discordant reads. # Samtools annoyingly takes the prefix of the output bam name as its argument. # So we pass this as an extra argument. However Ruffus needs to know the full name # of the output bam file, so we pass that as the normal output parameter. 
pipeline.transform( task_func=stages.sort_bam, name="sort_discordants", input=output_from("extract_discordant_alignments"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam"), extras=["{path[0]}/{sample[0]}.discordants"], output="{path[0]}/{sample[0]}.discordants.bam", ) # Index the sorted discordant bam with samtools # pipeline.transform( # task_func=stages.index_bam, # name='index_discordants', # input=output_from('sort_discordants'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'), # output='{path[0]}/{sample[0]}.discordants.bam.bai') # Sort the split-read alignments # Samtools annoyingly takes the prefix of the output bam name as its argument. # So we pass this as an extra argument. However Ruffus needs to know the full name # of the output bam file, so we pass that as the normal output parameter. pipeline.transform( task_func=stages.sort_bam, name="sort_splitters", input=output_from("extract_split_read_alignments"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam"), extras=["{path[0]}/{sample[0]}.splitters"], output="{path[0]}/{sample[0]}.splitters.bam", ) # Index the sorted splitters bam with samtools # pipeline.transform( # task_func=stages.index_bam, # name='index_splitters', # input=output_from('sort_splitters'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'), # output='{path[0]}/{sample[0]}.splitters.bam.bai') # Call structural variants with lumpy ( pipeline.transform( task_func=stages.structural_variants_lumpy, name="structural_variants_lumpy", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), add_inputs=add_inputs(["{path[0]}/{sample[0]}.splitters.bam", "{path[0]}/{sample[0]}.discordants.bam"]), output="{path[0]}/{sample[0]}.lumpy.vcf", ) .follows("index_alignment") .follows("sort_splitters") .follows("sort_discordants") ) # Call genotypes on lumpy output using SVTyper # (pipeline.transform( # task_func=stages.genotype_svtyper, # name='genotype_svtyper', # input=output_from('structural_variants_lumpy'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'), # add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']), # output='{path[0]}/{sample[0]}.svtyper.vcf') # .follows('align_bwa') # .follows('sort_splitters') # .follows('index_alignment') # .follows('index_splitters') # .follows('index_discordants')) # Call SVs with Socrates ( pipeline.transform( task_func=stages.structural_variants_socrates, name="structural_variants_socrates", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), # output goes to {path[0]}/socrates/ output="{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt", extras=["{path[0]}"], ) ) # Call DELs with DELLY pipeline.merge( task_func=stages.deletions_delly, name="deletions_delly", input=output_from("sort_alignment"), output="delly.DEL.vcf", ) # Call DUPs with DELLY pipeline.merge( task_func=stages.duplications_delly, name="duplications_delly", input=output_from("sort_alignment"), output="delly.DUP.vcf", ) # Call INVs with DELLY pipeline.merge( task_func=stages.inversions_delly, name="inversions_delly", input=output_from("sort_alignment"), output="delly.INV.vcf", ) # Call TRAs with DELLY pipeline.merge( task_func=stages.translocations_delly, name="translocations_delly", input=output_from("sort_alignment"), output="delly.TRA.vcf", ) # Join both read pair files using gustaf_mate_joining # pipeline.transform( # 
task_func=stages.gustaf_mate_joining, # name='gustaf_mate_joining', # input=output_from('fastq_to_fasta'), # # Match the R1 (read 1) FASTA file and grab the path and sample name. # # This will be the first input to the stage. # # We assume the sample name may consist of only alphanumeric # # characters. # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'), # # Add one more input to the stage: # # 1. The corresponding R2 FASTA file # add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']), # output='{path[0]}/{sample[0]}.joined_mates.fasta') # Call structural variants with pindel # (pipeline.transform( # task_func=stages.structural_variants_pindel, # name='structural_variants_pindel', # input=output_from('sort_alignment'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'), # add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]), # output='{path[0]}/{sample[0]}.pindel') # .follows('index_reference_bwa') # .follows('index_reference_samtools')) return pipeline
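# The crpipe stages above mix per-sample transform() steps with merge() steps
# (the DELLY callers), and the difference is easy to miss: transform() makes
# one job per input file, merge() collapses every input into a single job. A
# tiny self-contained sketch of that distinction, assuming nothing from the
# crpipe stages module (all file and task names below are throwaway):
import os
import tempfile
from ruffus import Pipeline, suffix

demo_workdir = tempfile.mkdtemp()
demo_starts = [os.path.join(demo_workdir, name) for name in ('s1.txt', 's2.txt')]

def make_demo_start(output_file):
    open(output_file, 'w').close()

def per_sample_demo(input_file, output_file):
    open(output_file, 'w').close()        # runs once per .txt file

def combine_demo(input_files, output_file):
    with open(output_file, 'w') as out:   # runs once over all inputs
        out.write('\n'.join(input_files))

demo = Pipeline('transform_vs_merge_demo')
demo.originate(task_func=make_demo_start, output=demo_starts)
demo.transform(task_func=per_sample_demo, input=make_demo_start,
               filter=suffix('.txt'), output='.per_sample')
demo.merge(task_func=combine_demo, input=per_sample_demo,
           output=os.path.join(demo_workdir, 'all.merged'))
demo.run(multiprocess=2, verbose=0)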
os.unlink(f) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # # Create more files than the previous invocation # #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! n_to_produce = len(outfiles) + 1 for i in range(n_to_produce): f = '{}{}.split'.format(tempdir, i) open(f, 'a').close() @subdivide(split_start, formatter(), tempdir + '{basename[0]}_*.subdivided', tempdir + '{basename[0]}') def subdivide_start(infile, outfiles, infile_basename): # cleanup existing for f in outfiles: os.unlink(f) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # # Create more files than the previous invocation # #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! n_to_produce = len(outfiles) + 1 for i in range( n_to_produce): open('{}_{}.subdivided'.format(infile_basename, i), 'a').close()
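# Because subdivide_start declares its outputs with a '*.subdivided' glob, the
# set of output files is not fixed in advance: each run re-globs what is on
# disk, which is why the task removes stale outputs before writing one more
# file than last time. A small illustration of that re-globbing idea with the
# plain glob module (the directory and file names here are made up, and this
# is not Ruffus internals, just the same principle):
import glob
import os
import tempfile

glob_demo_dir = tempfile.mkdtemp()
for i in range(3):
    open(os.path.join(glob_demo_dir, '0_{}.subdivided'.format(i)), 'a').close()
# The pattern picks up however many .subdivided files currently exist.
print(sorted(glob.glob(os.path.join(glob_demo_dir, '0_*.subdivided'))))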
#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 # Tasks #88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 @mkdir(tempdir) @originate([os.path.join(tempdir, ff + ".tmp") for ff in "abcd"]) def generate_initial_files(out_name): with open(out_name, 'w') as outfile: pass @transform(input = generate_initial_files, filter=formatter(), output = "{path[0]}/{basename[0]}.task1.{whatever}", extras=['echo {dynamic_message} > {some_file}']) def transform_with_missing_formatter_args(input_file, output_files, output1): print ("input = %r, output = %r, extras = %r" % (input_file, output_files, output1)) class Test_ruffus(unittest.TestCase): #___________________________________________________________________________ # # setup and cleanup #___________________________________________________________________________ def setUp(self): import os try: shutil.rmtree(tempdir) except:
# # generate_initial_files3 #___________________________________________________________________________ @originate([tempdir + "/g_name.tmp1", tempdir + "/h_name.tmp1"]) def generate_initial_files3(out_name): with open(out_name, 'w') as outfile: pass #___________________________________________________________________________ # # test_product_task #___________________________________________________________________________ @follows(generate_initial_files1) @product( [tempdir + "/" + prefix + "_name.tmp1" for prefix in "abcd"], formatter(".*/(?P<FILE_PART>.+).tmp1$" ), generate_initial_files2, formatter(), "{path[0][0]}/{FILE_PART[0][0]}.{basename[1][0]}.{basename[2][0]}.tmp2", input3 = generate_initial_files3, filter3 = formatter(r"tmp1$" ), extras = [ "{basename[0][0][0]}{basename[1][0][0]}{basename[2][0][0]}", # extra: prefices only (abcd etc) "{subpath[0][0][0]}", # extra: path for 2nd input, 1st file "{subdir[0][0][0]}"]) def test_product_task( infiles, outfile, prefices, subpath, subdir): with open(outfile, "w") as p: p.write(prefices + ",")
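# In @product patterns the replacement fields are indexed twice: the first
# index picks the input set (0 = generate_initial_files1, 1 =
# generate_initial_files2, 2 = input3/generate_initial_files3) and the second
# picks the file within that set; a further index then simply indexes into the
# resulting string, which is how the "prefices only" extra pulls out single
# letters. A plain str.format() sketch of that extra, with invented basenames
# standing in for whatever Ruffus would supply:
demo_basenames = [['a_name'], ['e_name'], ['g_name']]  # hypothetical, one per input set
prefix_extra = '{basename[0][0][0]}{basename[1][0][0]}{basename[2][0][0]}'.format(
    basename=demo_basenames)
print(prefix_extra)  # 'aeg' -> first character of each basename, one per input set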
"appearances": len(character["films"]), "species_id": species_value, } data.append(row) df = pd.DataFrame(data=data) df = df.set_index("appearance") df.to_csv(output_file) clean_dir = os.path.join(DATADIR, "clean") clean_files = [os.path.join(clean_dir, "clean.csv")] @mkdir(clean_dir) @transform(get_character_data, formatter(r".*?\.csv"), os.path.join(clean_dir, "cleaned.csv")) def clean_data(input_file, output_file): """ Remove character rows with "unknown" height. Take the top ten characters, sorted by appearances, descending) """ df = pd.read_csv(input_file, index_col="appearances") # df = df.reset_index(drop=True) df = df.fillna("") remove_unknown_df = df[df['height'] != "unknown"].copy() df = remove_unknown_df.sort_index(ascending=False) df = df.head(10) df.to_csv(output_file)