def make_pipeline1(pipeline_name,   # Pipelines need to have a unique name
                   starting_file_names):
    test_pipeline = Pipeline(pipeline_name)

    # We can change the starting files later using
    #   set_input() for transform etc.
    #   or set_output() for originate
    # But it can be more convenient to just pass this to the function making the pipeline
    #
    test_pipeline.originate(task_originate, starting_file_names)\
        .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\
        .posttask(touch_file(tempdir + "/testdir/whatever.txt"))

    test_pipeline.transform(
        task_func=task_m_to_1,
        name="add_input",
        # Lookup Task from function name task_originate()
        #   So long as this is unique in the pipeline
        input=task_originate,
        # requires an anchor from 3.7 onwards, see
        # https://bugs.python.org/issue34982
        filter=regex(r"^(.*)"),
        add_inputs=add_inputs(tempdir + "/testdir/whatever.txt"),
        output=r"\1.22")

    test_pipeline.transform(
        task_func=task_1_to_1,
        name="22_to_33",
        # Lookup Task from Task name
        #   Function name is not unique in the pipeline
        input=output_from("add_input"),
        filter=suffix(".22"),
        output=".33")

    tail_task = test_pipeline.transform(
        task_func=task_1_to_1,
        name="33_to_44",
        # Ask Pipeline to lookup Task from Task name
        input=test_pipeline["22_to_33"],
        filter=suffix(".33"),
        output=".44")

    # Set the tail task so that users of my sub pipeline can use it as a dependency
    # without knowing the details of task names
    #
    # Use Task() object directly without having to lookup
    test_pipeline.set_tail_tasks([tail_task])

    # If we try to connect a Pipeline without tail tasks defined, we have to
    # specify the exact task within the Pipeline.
    # Otherwise Ruffus will not know which task we mean and throw an exception
    if DEBUG_do_not_define_tail_task:
        test_pipeline.set_tail_tasks([])

    # Set the head task so that users of my sub pipeline send input into it
    # without knowing the details of task names
    test_pipeline.set_head_tasks([test_pipeline[task_originate]])

    return test_pipeline
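# --- Hedged usage sketch (added; not part of the original source) -----------
# A minimal illustration of building and running the sub-pipeline factory
# above with Ruffus's object-oriented API. The starting file names and the
# multiprocess count are assumptions; Pipeline.run() and Pipeline.set_input()
# are standard Ruffus (>= 2.6) calls. Because head and tail tasks are set,
# a downstream Pipeline could simply do set_input(input=pipeline1a) without
# knowing this sub-pipeline's internal task names.
if __name__ == "__main__":
    pipeline1a = make_pipeline1(pipeline_name="pipeline1a",
                                starting_file_names=[tempdir + "/a.1",
                                                     tempdir + "/b.1"])
    pipeline1a.run(multiprocess=2)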
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='ovarian_cancer_pipeline')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    human_reference_genome_file = state.config.get_option('human_reference_genome')
    # Stages are dependent on the state
    stages = PipelineStages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_fastqs,
        name='original_fastqs',
        output=fastq_files)

    # The human reference genome in FASTA format
    pipeline.originate(
        task_func=stages.human_reference_genome,
        name='human_reference_genome',
        output=human_reference_genome_file)

    # Index the human reference genome with BWA, needed before we can map reads
    pipeline.transform(
        task_func=stages.index_ref_bwa,
        name='index_ref_bwa',
        input=output_from('human_reference_genome'),
        filter=suffix('.fa'),
        output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Align paired end reads in FASTQ to the reference producing a BAM file
    (pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[_a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding out
        # sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')
        .follows('index_ref_bwa'))

    return pipeline
def make_pipeline1(pipeline_name, # Pipelines need to have a unique name starting_file_names): test_pipeline = Pipeline(pipeline_name) # We can change the starting files later using # set_input() for transform etc. # or set_output() for originate # But it can be more convenient to just pass this to the function making the pipeline # test_pipeline.originate(task_originate, starting_file_names)\ .follows(mkdir(tempdir), mkdir(tempdir + "/testdir", tempdir + "/testdir2"))\ .posttask(touch_file(tempdir + "/testdir/whatever.txt")) test_pipeline.transform(task_func=task_m_to_1, name="add_input", # Lookup Task from function name task_originate() # So long as this is unique in the pipeline input=task_originate, # requires an anchor from 3.7 onwards, see # https://bugs.python.org/issue34982 filter=regex(r"^(.*)"), add_inputs=add_inputs( tempdir + "/testdir/whatever.txt"), output=r"\1.22") test_pipeline.transform(task_func=task_1_to_1, name="22_to_33", # Lookup Task from Task name # Function name is not unique in the pipeline input=output_from("add_input"), filter=suffix(".22"), output=".33") tail_task = test_pipeline.transform(task_func=task_1_to_1, name="33_to_44", # Ask Pipeline to lookup Task from Task name input=test_pipeline["22_to_33"], filter=suffix(".33"), output=".44") # Set the tail task so that users of my sub pipeline can use it as a dependency # without knowing the details of task names # # Use Task() object directly without having to lookup test_pipeline.set_tail_tasks([tail_task]) # If we try to connect a Pipeline without tail tasks defined, we have to # specify the exact task within the Pipeline. # Otherwise Ruffus will not know which task we mean and throw an exception if DEBUG_do_not_define_tail_task: test_pipeline.set_tail_tasks([]) # Set the head task so that users of my sub pipeline send input into it # without knowing the details of task names test_pipeline.set_head_tasks([test_pipeline[task_originate]]) return test_pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='fastq2bam')
    # Get a list of paths to all the FASTQ files
    input_files = state.config.get_option('files')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_files,
                       name='original_files',
                       output=input_files)

    pipeline.transform(
        task_func=stages.fastq2bam,
        name='fastq2bam',
        input=output_from('original_files'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        extras=['{sample[0]}'],
        output='{path[0]}/out/{sample[0]}.bam')

    pipeline.transform(
        task_func=stages.validate_prealigned_bam,
        name='validate_prealigned_bam',
        input=output_from('fastq2bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'),
        output='{path[0]}/{sample[0]}.validation')

    pipeline.transform(
        task_func=stages.align,
        name='align',
        input=output_from('validate_prealigned_bam'),
        filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'),
        output='{path[0]}/{sample[0]}.mapped.bam')

    return pipeline
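# --- Hedged usage sketch (added; not part of the original source) -----------
# The make_pipeline(state) factories in this collection all return a Ruffus
# Pipeline object. One minimal way to execute the returned pipeline is shown
# below. The State class, the config file name and the job count are
# assumptions (the real scripts wrap this in their own command-line front
# end); Pipeline.run() itself is standard Ruffus.
if __name__ == '__main__':
    state = State(config_file='pipeline.config')   # hypothetical constructor
    pipeline = make_pipeline(state)
    # Run the whole task graph with several worker processes.
    pipeline.run(multiprocess=8, verbose=1)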
            # +:RED, -:GREEN
            color = '255,0,0' if strand == '+' else '0,255,0'
            outfile.write('\t'.join(fields + [start, stop, color]) + '\n')


@transform(bed_color_strand, suffix(''), '.bigbed')
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed for viewing on UCSC browser"""
    cmd = 'bedToBigBed %s %s.chrom.sizes %s' % (in_bed, genome_path(), out_bigbed)
    sys_call(cmd)


@transform([bed_uniquefy, clip_and_sort_peaks] + mapping.all_mappers_output,
           regex('(.*mapped_reads).clipped.sorted(.unique|)'),
           #suffix('.mapped_reads'),
           add_inputs(bootstrap.get_chrom_sizes),
           r'\1\2.bedgraph')
           #r'.bedgraph')
def bed_to_bedgraph(in_files, out_bedgraph):
    'extend reads to the full fragment length and create a bedgraph from them'
    in_bed, in_chrom_sizes = in_files
    cmd = ('slopBed -i %s -s -r %s -l 0 -g %s | ' + \
           'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
               in_bed,
               cfg.getint('DEFAULT', 'fragment_size') - \
                   cfg.getint('DEFAULT', 'tag_size'),
               in_chrom_sizes,
               cfg.get('DEFAULT', 'genome'),
               genome_path(), out_bedgraph)
    sys_call(cmd)


@active_if(cfg.getboolean('visualization', 'split_strands'))
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='cellfree_seq') # Stages are dependent on the state stages = Stages(state) safe_make_dir('alignments') # The original FASTQ files fastq_files = glob.glob('fastqs/*') # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{sample[0]}'], # The output file name is the sample name with a .bam extension. output='alignments/{sample[0]}.sort.hq.bam') pipeline.transform(task_func=stages.run_connor, name='run_connor', input=output_from('align_bwa'), filter=suffix('.sort.hq.bam'), output='.sort.hq.connor.bam') safe_make_dir('metrics') safe_make_dir('metrics/summary') safe_make_dir('metrics/connor') pipeline.transform( task_func=stages.intersect_bed, name='intersect_bed_raw', input=output_from('align_bwa'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'), output='metrics/summary/{sample[0]}.intersectbed.bam') pipeline.transform(task_func=stages.coverage_bed, name='coverage_bed_raw', input=output_from('intersect_bed_raw'), filter=suffix('.intersectbed.bam'), output='.bedtools_hist_all.txt') pipeline.transform( task_func=stages.genome_reads, name='genome_reads_raw', input=output_from('align_bwa'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'), output='metrics/summary/{sample[0]}.mapped_to_genome.txt') pipeline.transform(task_func=stages.target_reads, name='target_reads_raw', input=output_from('intersect_bed_raw'), filter=suffix('.intersectbed.bam'), output='.mapped_to_target.txt') pipeline.transform( task_func=stages.total_reads, name='total_reads_raw', input=output_from('align_bwa'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.bam'), output='metrics/summary/{sample[0]}.total_raw_reads.txt') pipeline.collate( task_func=stages.generate_stats, name='generate_stats_raw', input=output_from('coverage_bed_raw', 'genome_reads_raw', 'target_reads_raw', 'total_reads_raw'), filter=regex( r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt' ), output=r'metrics/summary/all_sample.summary.\1.txt', extras=[r'\1', 'summary.txt']) pipeline.transform( task_func=stages.intersect_bed, name='intersect_bed_connor', input=output_from('run_connor'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'), output='metrics/connor/{sample[0]}.intersectbed.bam') pipeline.transform(task_func=stages.coverage_bed, name='coverage_bed_connor', input=output_from('intersect_bed_connor'), filter=suffix('.intersectbed.bam'), output='.bedtools_hist_all.txt') pipeline.transform( task_func=stages.genome_reads, name='genome_reads_connor', 
input=output_from('run_connor'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'), output='metrics/summary/{sample[0]}.mapped_to_genome.txt') pipeline.transform(task_func=stages.target_reads, name='target_reads_connor', input=output_from('intersect_bed_connor'), filter=suffix('.intersectbed.bam'), output='.mapped_to_target.txt') pipeline.transform( task_func=stages.total_reads, name='total_reads_connor', input=output_from('run_connor'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'), output='metrics/summary/{sample[0]}.total_raw_reads.txt') pipeline.collate( task_func=stages.generate_stats, name='generate_stats_connor', input=output_from('coverage_bed_connor', 'genome_reads_connor', 'target_reads_connor', 'total_reads_connor'), filter=regex( r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt' ), output=r'metrics/connor/all_sample.summary.\1.txt', extras=[r'\1', 'connor.summary.txt']) safe_make_dir('variants') safe_make_dir('variants/vardict') pipeline.transform( task_func=stages.run_vardict, name='run_vardict', input=output_from('run_connor'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sort.hq.connor.bam'), output='variants/vardict/{sample[0]}.vcf', extras=['{sample[0]}']) pipeline.transform( task_func=stages.sort_vcfs, name='sort_vcfs', input=output_from('run_vardict'), filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'), output='variants/vardict/{sample[0]}.sorted.vcf.gz') pipeline.transform(task_func=stages.index_vcfs, name='index_vcfs', input=output_from('sort_vcfs'), filter=suffix('.sorted.vcf.gz'), output='.sorted.vcf.gz.tbi') (pipeline.merge( task_func=stages.concatenate_vcfs, name='concatenate_vcfs', input=output_from('sort_vcfs'), output='variants/vardict/combined.vcf.gz').follows('index_vcfs')) pipeline.transform(task_func=stages.vt_decompose_normalise, name='vt_decompose_normalise', input=output_from('concatenate_vcfs'), filter=suffix('.vcf.gz'), output='.decomp.norm.vcf.gz') pipeline.transform(task_func=stages.index_vcfs, name='index_final_vcf', input=output_from('vt_decompose_normalise'), filter=suffix('.decomp.norm.vcf.gz'), output='.decomp.norm.vcf.gz.tbi') (pipeline.transform( task_func=stages.apply_vep, name='apply_vep', input=output_from('vt_decompose_normalise'), filter=suffix('.decomp.norm.vcf.gz'), output='.decomp.norm.vep.vcf').follows('index_final_vcf')) return pipeline
table = os.path.basename(PARAMS["annotations_interface_table_gene_info"]) select = dbh.execute( """SELECT DISTINCT gene_id FROM annotations.%(table)s WHERE gene_biotype = 'protein_coding'""" % locals() ) with IOTools.openFile(outfile, "w") as outf: outf.write("gene_id\n") outf.write("\n".join((x[0] for x in select)) + "\n") @transform( buildReferenceGeneSet, suffix("reference.gtf.gz"), add_inputs(identifyProteinCodingGenes), "refcoding.gtf.gz" ) def buildCodingGeneSet(infiles, outfile): """build a gene set with only protein coding transcripts. Retain the genes from the gene_tsv file in the outfile geneset. The gene set will contain all transcripts of protein coding genes, including processed transcripts. The gene set includes UTR and CDS. Parameters ---------- infiles : list infile: str Input filename in :term:`gtf` format
""" statement = """gunzip < %(infile)s | cgat gtf2gtf --method=merge-transcripts --log=%(outfile)s.log | cgat gtf2gff --method=tts --promotor-size=1 --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log | cgat gff2bed --is-gtf --set-name=gene_id --log=%(outfile)s.log | gzip > %(outfile)s""" P.run(statement, job_memory=PARAMS["job_memory"]) @transform(buildGeneRegions, regex('(.*)_.*.bed.gz'), add_inputs(buildContigSizes), r'\1_intergenic.bed.gz') def buildIntergenicRegions(infiles, outfile): """build a :term:`bed` file with regions not overlapping any genes. Arguments --------- infiles : list - Input filename with geneset in :term:`gtf` format. - Input filename with chromosome sizes in :term:`tsv` format. outfile : string Output filename with genomic regions in :term:`bed` format. """ infile, contigs = infiles statement = '''zcat %(infile)s
def refseq_genes_to_regions(in_genes, out_pattern): """make regions (promoter, downstream, 5UTR, etc) from refseq_genes""" args = shlex.split('''%s --promoter_size=%s --promoter_extend=%s --downstream_size=%s --downstream_extend=%s --with_gene_name''' % (in_genes, cfg.get('genes', 'promoter_size'), cfg.get('genes', 'promoter_extend'), cfg.get('genes', 'downstream_size'), cfg.get('genes', 'downstream_extend'))) makeGeneStructure.main(args) @follows(refseq_genes_to_bed, convert_gtf_genes_to_bed) @collate(call_peaks.all_peak_caller_functions + ['*.custom.peaks'], regex(r'(.+)\.treat\.(.+)\.peaks'), add_inputs('%s*_genes.all' % cfg.get('DEFAULT', 'genome')), r'\1.treat.\2.peaks.nearby.genes') #add_inputs('%s*_genes.tss' % cfg.get('DEFAULT', 'genome')), r'\1.treat.\2.peaks.nearby.genes') def find_nearby_genes(in_files, out_genes): """report which genes are within a certain distance of a peak""" in_peaks, in_genes = in_files[0] tmp_output = tempfile.NamedTemporaryFile(delete=False).name cmd = 'closestBed -a %s -b %s -t first -d > %s' % (in_peaks, in_genes, tmp_output) sys_call(cmd) with open(tmp_output) as infile: with open(out_genes, 'w') as outfile: for line in infile: if not line: continue fields = line.strip().split('\t')
    If there is no sequence quality then make a softlink. Picard tools has an
    issue when quality score information is missing'''
    if PARAMS["bam_sequence_stripped"] is True:
        bamstats.addPseudoSequenceQuality(infile, outfile)
    else:
        bamstats.copyBamFile(infile, outfile)


@follows(mkdir("Picard_stats.dir"))
@P.add_doc(bamstats.buildPicardAlignmentStats)
@transform(intBam,
           regex("BamFiles.dir/(.*).bam$"),
           add_inputs(os.path.join(PARAMS["genome_dir"],
                                   PARAMS["genome"] + ".fa")),
           r"Picard_stats.dir/\1.picard_stats")
def buildPicardStats(infiles, outfile):
    '''build Picard alignment stats'''
    infile, reffile = infiles

    # patch for mapping against transcriptome - switch genomic reference
    # to transcriptomic sequences
    if "transcriptome.dir" in infile:
        reffile = "refcoding.fa"

    bamstats.buildPicardAlignmentStats(infile, outfile, reffile,
                                       PICARD_MEMORY)
"""Convert text files with motifs into our pickled format""" transfac_str = open(in_transfac).read() m = sequence_motif.parseMotifsFromTransfac(transfac_str) pickle.dump(m, open(out_pickle, 'w')) @transform('*.known.motifs.dreme', suffix('.dreme'), '') def convert_dreme_motifs(in_dreme, out_pickle): """Convert text files with motifs into our pickled format""" dreme_str = open(in_dreme).read() m = sequence_motif.parseMotifsFromTransfac(dreme_str) pickle.dump(m, open(out_pickle, 'w')) @follows(convert_transfac_motifs) @transform([discover_meme_motifs, discover_nmica_motifs, '*.known.motifs'], suffix('.motifs'), add_inputs(sample_genome_short), '.with_mean_sd.motifs') def motif_mean_sd(in_files, out_motifs): """calculate the motifs' background score distributions, with mean and sd. """ in_motif, genome_samples = in_files # Convert the .fasta file to a list of sequences sequenceList = fastaToSequenceList(genome_samples) with open(in_motif) as infile: all_motifs = pickle.load(infile) # maybe need to create new empty mutable dictionary # and populate it with items in the for loop for motif in all_motifs.values():
def run_pipeline(global_config, results_directory, cli_options): wd = _make_append_work_dir(results_directory) # Pipeline starts here @rf.mkdir(results_directory) @rf.originate(wd("config.yaml"), global_config) def save_config(output_file, config): with open(output_file, "w") as f: yaml.dump(config, f) @rf.transform( save_config, rf.formatter(), wd("pipeline_data.pkl"), global_config, ) def process_data(input_file, output_file, config): assemble_data(output_file, config["ProcessData"]) @rf.transform( process_data, rf.formatter(), wd("posterior.hd5"), global_config, ) def run_mcmc(input_file, output_file, config): mcmc(input_file, output_file, config["Mcmc"]) @rf.transform( input=run_mcmc, filter=rf.formatter(), output=wd("thin_samples.pkl"), extras=[global_config], ) def thin_samples(input_file, output_file, config): thin_posterior(input_file, output_file, config["ThinPosterior"]) # Rt related steps rf.transform( input=[[process_data, thin_samples]], filter=rf.formatter(), output=wd("ngm.pkl"), )(next_generation_matrix) rf.transform( input=next_generation_matrix, filter=rf.formatter(), output=wd("national_rt.xlsx"), )(overall_rt) # In-sample prediction @rf.transform( input=[[process_data, thin_samples]], filter=rf.formatter(), output=wd("insample7.pkl"), ) def insample7(input_files, output_file): predict( data=input_files[0], posterior_samples=input_files[1], output_file=output_file, initial_step=-8, num_steps=28, ) @rf.transform( input=[[process_data, thin_samples]], filter=rf.formatter(), output=wd("insample14.pkl"), ) def insample14(input_files, output_file): return predict( data=input_files[0], posterior_samples=input_files[1], output_file=output_file, initial_step=-14, num_steps=28, ) # Medium-term prediction @rf.transform( input=[[process_data, thin_samples]], filter=rf.formatter(), output=wd("medium_term.pkl"), ) def medium_term(input_files, output_file): return predict( data=input_files[0], posterior_samples=input_files[1], output_file=output_file, initial_step=-1, num_steps=61, ) # Summarisation rf.transform( input=next_generation_matrix, filter=rf.formatter(), output=wd("rt_summary.csv"), )(summarize.rt) rf.transform( input=medium_term, filter=rf.formatter(), output=wd("infec_incidence_summary.csv"), )(summarize.infec_incidence) rf.transform( input=[[process_data, thin_samples, medium_term]], filter=rf.formatter(), output=wd("prevalence_summary.csv"), )(summarize.prevalence) rf.transform( input=[[process_data, thin_samples]], filter=rf.formatter(), output=wd("within_between_summary.csv"), )(within_between) @rf.transform( input=[[process_data, insample7, insample14]], filter=rf.formatter(), output=wd("exceedance_summary.csv"), ) def exceedance(input_files, output_file): exceed7 = case_exceedance((input_files[0], input_files[1]), 7) exceed14 = case_exceedance((input_files[0], input_files[2]), 14) df = pd.DataFrame( { "Pr(pred<obs)_7": exceed7, "Pr(pred<obs)_14": exceed14 }, index=exceed7.coords["location"], ) df.to_csv(output_file) # Plot in-sample @rf.transform( input=[insample7, insample14], filter=rf.formatter(".+/insample(?P<LAG>\d+).pkl"), add_inputs=rf.add_inputs(process_data), output="{path[0]}/insample_plots{LAG[0]}", extras=["{LAG[0]}"], ) def plot_insample_predictive_timeseries(input_files, output_dir, lag): insample_predictive_timeseries(input_files, output_dir, lag) # Geopackage rf.transform( [[ process_data, summarize.rt, summarize.infec_incidence, summarize.prevalence, within_between, exceedance, ]], rf.formatter(), wd("prediction.gpkg"), global_config["Geopackage"], 
    )(summary_geopackage)

    rf.cmdline.run(cli_options)

    # DSTL Summary
    rf.transform(
        [[process_data, insample14, medium_term, next_generation_matrix]],
        rf.formatter(),
        wd("summary_longformat.xlsx"),
    )(summary_longformat)

    rf.cmdline.run(cli_options)
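# --- Hedged usage sketch (added; not part of the original source) -----------
# run_pipeline() above expects Ruffus command-line options. One minimal way to
# build them is sketched here; the config file name, results directory and
# description string are assumptions, while rf.cmdline.get_argparse() and the
# options object consumed by rf.cmdline.run() are standard Ruffus.
import ruffus as rf
import yaml

if __name__ == "__main__":
    with open("config.yaml") as f:          # hypothetical config file
        global_config = yaml.safe_load(f)
    results_directory = "results"           # hypothetical output location
    # Standard Ruffus argument handling (--verbose, --target_tasks, etc.)
    parser = rf.cmdline.get_argparse(description="COVID-19 inference pipeline")
    cli_options = parser.parse_args()
    run_pipeline(global_config, results_directory, cli_options)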
color = "255,0,0" if strand == "+" else "0,255,0" outfile.write("\t".join(fields + [start, stop, color]) + "\n") @transform(bed_color_strand, suffix(""), ".bigbed") def bed_to_bigbed(in_bed, out_bigbed): """Convert a BED file to .bigbed for viewing on UCSC browser""" cmd = "bedToBigBed %s %s.chrom.sizes %s" % (in_bed, genome_path(), out_bigbed) sys_call(cmd) @transform( [bed_uniquefy, clip_and_sort_peaks] + mapping.all_mappers_output, regex("(.*mapped_reads).clipped.sorted(.unique|)"), # suffix('.mapped_reads'), add_inputs(bootstrap.get_chrom_sizes), r"\1\2.bedgraph", ) # r'.bedgraph') def bed_to_bedgraph(in_files, out_bedgraph): "extend reads to the full fragment length and create a bedgraph from them" in_bed, in_chrom_sizes = in_files cmd = ("slopBed -i %s -s -r %s -l 0 -g %s | " + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s") % ( in_bed, cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size"), in_chrom_sizes, cfg.get("DEFAULT", "genome"), genome_path(), out_bedgraph, ) sys_call(cmd)
def make_pipeline(state): """Build the pipeline by constructing stages and connecting them together""" # Build an empty pipeline pipeline = Pipeline(name="crpipe") # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option("fastqs") # Find the path to the reference genome # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name="original_fastqs", output=fastq_files) # Convert FASTQ file to FASTA using fastx toolkit # pipeline.transform( # task_func=stages.fastq_to_fasta, # name='fastq_to_fasta', # input=output_from('original_fastqs'), # filter=suffix('.fastq.gz'), # output='.fasta') # The original reference file # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. # pipeline.originate( # task_func=stages.original_reference, # name='original_reference', # output=reference_file) # Run fastQC on the FASTQ files pipeline.transform( task_func=stages.fastqc, name="fastqc", input=output_from("original_fastqs"), filter=suffix(".fastq.gz"), output="_fastqc", ) # Index the reference using BWA # pipeline.transform( # task_func=stages.index_reference_bwa, # name='index_reference_bwa', # input=output_from('original_reference'), # filter=suffix('.fa'), # output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt']) # Index the reference using samtools # pipeline.transform( # task_func=stages.index_reference_samtools, # name='index_reference_samtools', # input=output_from('original_reference'), # filter=suffix('.fa'), # output='.fa.fai') # Index the reference using bowtie 2 # pipeline.transform( # task_func=stages.index_reference_bowtie2, # name='index_reference_bowtie2', # input=output_from('original_reference'), # filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'), # output=['{path[0]}/{refname[0]}.1.bt2', # '{path[0]}/{refname[0]}.2.bt2', # '{path[0]}/{refname[0]}.3.bt2', # '{path[0]}/{refname[0]}.4.bt2', # '{path[0]}/{refname[0]}.rev.1.bt2', # '{path[0]}/{refname[0]}.rev.2.bt2'], # extras=['{path[0]}/{refname[0]}']) # # Create a FASTA sequence dictionary for the reference using picard # pipeline.transform( # task_func=stages.reference_dictionary_picard, # name='reference_dictionary_picard', # input=output_from('original_reference'), # filter=suffix('.fa'), # output='.dict') # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name="align_bwa", input=output_from("original_fastqs"), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz"), # Add two more inputs to the stage: # 1. The corresponding R2 FASTQ file add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=["{sample[0]}"], # The output file name is the sample name with a .bam extension. 
output="{path[0]}/{sample[0]}.bam", ) # Sort alignment with sambamba pipeline.transform( task_func=stages.sort_bam_sambamba, name="sort_alignment", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.sorted.bam", ) # Extract MMR genes from the sorted BAM file pipeline.transform( task_func=stages.extract_genes_bedtools, name="extract_genes_bedtools", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), output="{path[0]}/{sample[0]}.mmr.bam", ) # Extract selected chromosomes from the sorted BAM file pipeline.transform( task_func=stages.extract_chromosomes_samtools, name="extract_chromosomes_samtools", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), output="{path[0]}/{sample[0]}.chroms.bam", ) # Index the MMR genes bam file with samtools pipeline.transform( task_func=stages.index_bam, name="index_mmr_alignment", input=output_from("extract_genes_bedtools"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).mmr.bam"), output="{path[0]}/{sample[0]}.mmr.bam.bai", ) # Compute depth of coverage of the alignment with GATK DepthOfCoverage # pipeline.transform( # task_func=stages.alignment_coverage_gatk, # name='alignment_coverage_gatk', # input=output_from('sort_alignment'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'), # add_inputs=add_inputs([reference_file]), # output='{path[0]}/{sample[0]}.coverage_summary', # extras=['{path[0]}/{sample[0]}_coverage']) # Index the alignment with samtools pipeline.transform( task_func=stages.index_bam, name="index_alignment", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), output="{path[0]}/{sample[0]}.sorted.bam.bai", ) # Generate alignment stats with bamtools pipeline.transform( task_func=stages.bamtools_stats, name="bamtools_stats", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.stats.txt", ) # Extract the discordant paired-end alignments pipeline.transform( task_func=stages.extract_discordant_alignments, name="extract_discordant_alignments", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.discordants.unsorted.bam", ) # Extract split-read alignments pipeline.transform( task_func=stages.extract_split_read_alignments, name="extract_split_read_alignments", input=output_from("align_bwa"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).bam"), output="{path[0]}/{sample[0]}.splitters.unsorted.bam", ) # Sort discordant reads. # Samtools annoyingly takes the prefix of the output bam name as its argument. # So we pass this as an extra argument. However Ruffus needs to know the full name # of the output bam file, so we pass that as the normal output parameter. 
pipeline.transform( task_func=stages.sort_bam, name="sort_discordants", input=output_from("extract_discordant_alignments"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam"), extras=["{path[0]}/{sample[0]}.discordants"], output="{path[0]}/{sample[0]}.discordants.bam", ) # Index the sorted discordant bam with samtools # pipeline.transform( # task_func=stages.index_bam, # name='index_discordants', # input=output_from('sort_discordants'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'), # output='{path[0]}/{sample[0]}.discordants.bam.bai') # Sort discordant reads # Samtools annoyingly takes the prefix of the output bam name as its argument. # So we pass this as an extra argument. However Ruffus needs to know the full name # of the output bam file, so we pass that as the normal output parameter. pipeline.transform( task_func=stages.sort_bam, name="sort_splitters", input=output_from("extract_split_read_alignments"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam"), extras=["{path[0]}/{sample[0]}.splitters"], output="{path[0]}/{sample[0]}.splitters.bam", ) # Index the sorted splitters bam with samtools # pipeline.transform( # task_func=stages.index_bam, # name='index_splitters', # input=output_from('sort_splitters'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'), # output='{path[0]}/{sample[0]}.splitters.bam.bai') # Call structural variants with lumpy ( pipeline.transform( task_func=stages.structural_variants_lumpy, name="structural_variants_lumpy", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), add_inputs=add_inputs(["{path[0]}/{sample[0]}.splitters.bam", "{path[0]}/{sample[0]}.discordants.bam"]), output="{path[0]}/{sample[0]}.lumpy.vcf", ) .follows("index_alignment") .follows("sort_splitters") .follows("sort_discordants") ) # Call genotypes on lumpy output using SVTyper # (pipeline.transform( # task_func=stages.genotype_svtyper, # name='genotype_svtyper', # input=output_from('structural_variants_lumpy'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'), # add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam', '{path[0]}/{sample[0]}.splitters.bam']), # output='{path[0]}/{sample[0]}.svtyper.vcf') # .follows('align_bwa') # .follows('sort_splitters') # .follows('index_alignment') # .follows('index_splitters') # .follows('index_discordants')) # Call SVs with Socrates ( pipeline.transform( task_func=stages.structural_variants_socrates, name="structural_variants_socrates", input=output_from("sort_alignment"), filter=formatter(".+/(?P<sample>[a-zA-Z0-9]+).sorted.bam"), # output goes to {path[0]}/socrates/ output="{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt", extras=["{path[0]}"], ) ) # Call DELs with DELLY pipeline.merge( task_func=stages.deletions_delly, name="deletions_delly", input=output_from("sort_alignment"), output="delly.DEL.vcf", ) # Call DUPs with DELLY pipeline.merge( task_func=stages.duplications_delly, name="duplications_delly", input=output_from("sort_alignment"), output="delly.DUP.vcf", ) # Call INVs with DELLY pipeline.merge( task_func=stages.inversions_delly, name="inversions_delly", input=output_from("sort_alignment"), output="delly.INV.vcf", ) # Call TRAs with DELLY pipeline.merge( task_func=stages.translocations_delly, name="translocations_delly", input=output_from("sort_alignment"), output="delly.TRA.vcf", ) # Join both read pair files using gustaf_mate_joining # pipeline.transform( # 
task_func=stages.gustaf_mate_joining, # name='gustaf_mate_joining', # input=output_from('fastq_to_fasta'), # # Match the R1 (read 1) FASTA file and grab the path and sample name. # # This will be the first input to the stage. # # We assume the sample name may consist of only alphanumeric # # characters. # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'), # # Add one more input to the stage: # # 1. The corresponding R2 FASTA file # add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']), # output='{path[0]}/{sample[0]}.joined_mates.fasta') # Call structural variants with pindel # (pipeline.transform( # task_func=stages.structural_variants_pindel, # name='structural_variants_pindel', # input=output_from('sort_alignment'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'), # add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]), # output='{path[0]}/{sample[0]}.pindel') # .follows('index_reference_bwa') # .follows('index_reference_samtools')) return pipeline
import re

from ruffus import (transform, follows, collate, files, split, merge,
                    add_inputs, regex, suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg, get_chrom_sizes, genome_path
import hts_waterworks.mapping as mapping
import hts_waterworks.clip_seq as clip_seq
from hts_waterworks.utils.common import (bedCommentFilter, readBedLines,
                                         parse_ucsc_range)


@active_if(cfg.getboolean('peaks', 'run_macs'))
@collate(mapping.all_mappers_output,
         regex(r'(.+)\.treat(.*)\.mapped_reads'),
         add_inputs(r'\1.control\2.mapped_reads'),
         r'\1.treat\2.macs.peaks',
         cfg.getfloat('peaks', 'max_FDR'))
def run_macs(in_files, out_peaks, max_fdr):
    """Call peaks with MACS (v1.3).
    Apply a maximum FDR threshold and treat centers as peak summits
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    # convert to proper bedfile- ints for score and + for strand
dbh = connect() table = os.path.basename(PARAMS["annotations_interface_table_gene_info"]) select = dbh.execute("""SELECT DISTINCT gene_id FROM annotations.%(table)s WHERE gene_biotype = 'protein_coding'""" % locals()) with IOTools.openFile(outfile, "w") as outf: outf.write("gene_id\n") outf.write("\n".join((x[0] for x in select)) + "\n") @transform(buildReferenceGeneSet, suffix("reference.gtf.gz"), add_inputs(identifyProteinCodingGenes), "refcoding.gtf.gz") def buildCodingGeneSet(infiles, outfile): '''build a gene set with only protein coding transcripts. Retain the genes from the gene_tsv file in the outfile geneset. The gene set will contain all transcripts of protein coding genes, including processed transcripts. The gene set includes UTR and CDS. Parameters ---------- infiles : list infile: str Input filename in :term:`gtf` format
            sentences, score = json.loads(line)
            for sentence in sentences:
                dictionary.update(sentence)

    dictionary_list = list(
        sorted(w for w in dictionary if dictionary[w] >= word_frequency_cutoff))

    with open(output_file_name, 'w') as dictionary_file:
        for word in dictionary_list:
            dictionary_file.write("{} {}\n".format(word, dictionary[word]))


@ruffus.follows(build_word_dictionary)
@ruffus.transform(clean_data,
                  ruffus.suffix(".json.gz"),
                  ruffus.add_inputs("dictionary.sentences.clean.json"),
                  ".projected.json.gz")
def project_sentences(input_file_names, output_file_name):
    review_file_name, dictionary_file_name = input_file_names

    with open(dictionary_file_name) as dictionary_file:
        dictionary = set(json.load(dictionary_file))

    def project_sentence(s):
        return [w if w in dictionary else "UNKNOWN" for w in s]

    with gzip.open(review_file_name) as review_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in review_file:
                sentences, score = json.loads(line)
                projected_sentences = map(project_sentence, sentences)
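# --- Hedged usage sketch (added; not part of the original source) -----------
# The tasks above use Ruffus's decorator style (@ruffus.transform etc.), so the
# dependency graph can be executed with pipeline_run(). The chosen target task,
# worker count and verbosity here are assumptions for illustration;
# pipeline_printout() and pipeline_run() are standard Ruffus calls.
if __name__ == "__main__":
    import sys
    import ruffus
    # Dry run: report which tasks are up to date and which would be re-run.
    ruffus.pipeline_printout(sys.stdout, [project_sentences], verbose=3)
    # Execute everything up to the projection step.
    ruffus.pipeline_run([project_sentences], multiprocess=4)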
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='hiplexpipe') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq # new sample name = OHI031002-P02F04 filter=formatter( '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq' ), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq add_inputs=add_inputs( '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'], # The output file name is the sample name with a .bam extension. output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam' ) # Call variants using undr_rover pipeline.transform( task_func=stages.apply_undr_rover, name='apply_undr_rover', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. filter=formatter( '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq' ), add_inputs=add_inputs( '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'), # extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'], extras=['{sample[0]}', '{readid[0]}'], # The output file name is the sample name with a .bam extension. 
output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf') # Sort the BAM file using Picard pipeline.transform(task_func=stages.sort_bam_picard, name='sort_bam_picard', input=output_from('align_bwa'), filter=suffix('.bam'), output='.sort.bam') # High quality and primary alignments pipeline.transform(task_func=stages.primary_bam, name='primary_bam', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), output='.primary.bam') # index bam file pipeline.transform(task_func=stages.index_sort_bam_picard, name='index_bam', input=output_from('primary_bam'), filter=suffix('.primary.bam'), output='.primary.bam.bai') # Clip the primer_seq from BAM File (pipeline.transform( task_func=stages.clip_bam, name='clip_bam', input=output_from('primary_bam'), filter=suffix('.primary.bam'), output='.primary.primerclipped.bam').follows('index_bam')) ###### GATK VARIANT CALLING ###### # Call variants using GATK pipeline.transform( task_func=stages.call_haplotypecaller_gatk, name='call_haplotypecaller_gatk', input=output_from('clip_bam'), # filter=suffix('.merged.dedup.realn.bam'), filter=formatter( '.+/(?P<sample>[a-zA-Z0-9-_]+).primary.primerclipped.bam'), output='variants/gatk/{sample[0]}.g.vcf') # .follows('index_sort_bam_picard')) # Combine G.VCF files for all samples using GATK pipeline.merge(task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_haplotypecaller_gatk'), output='variants/gatk/ALL.combined.vcf') # Genotype G.VCF files using GATK pipeline.transform(task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.combined.vcf'), output='.raw.vcf') # Annotate VCF file using GATK pipeline.transform(task_func=stages.variant_annotator_gatk, name='variant_annotator_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output='.raw.annotate.vcf') # Apply VariantFiltration using GATK pipeline.transform(task_func=stages.apply_variant_filtration_gatk, name='apply_variant_filtration_gatk', input=output_from('variant_annotator_gatk'), filter=suffix('.raw.annotate.vcf'), output='.raw.annotate.filtered.vcf') # Apply NORM (pipeline.transform( task_func=stages.apply_vt, name='apply_vt', input=output_from('apply_variant_filtration_gatk'), filter=suffix('.raw.annotate.filtered.vcf'), # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']), output='.raw.annotate.filtered.norm.vcf').follows( 'apply_variant_filtration_gatk')) # Apply VEP (pipeline.transform( task_func=stages.apply_vep, name='apply_vep', input=output_from('apply_vt'), filter=suffix('.raw.annotate.filtered.norm.vcf'), # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']), output='.raw.annotate.filtered.norm.vep.vcf').follows('apply_vt')) # Apply SnpEff (pipeline.transform( task_func=stages.apply_snpeff, name='apply_snpeff', input=output_from('apply_vep'), filter=suffix('.raw.annotate.filtered.norm.vep.vcf'), # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']), output='.raw.annotate.filtered.norm.vep.snpeff.vcf').follows( 'apply_vep')) # Apply vcfanno (pipeline.transform( task_func=stages.apply_vcfanno, name='apply_vcfanno', input=output_from('apply_snpeff'), filter=suffix('.raw.annotate.filtered.norm.vep.snpeff.vcf'), # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']), output='.annotated.vcf').follows('apply_snpeff')) # Concatenate undr_rover vcf files pipeline.merge(task_func=stages.apply_cat_vcf, 
name='apply_cat_vcf', input=output_from('apply_undr_rover'), output='variants/undr_rover/ur.vcf.gz') # # Apple VEP on concatenated undr_rover vcf file # (pipeline.transform( # task_func=stages.apply_vep, # name='apply_vep_ur', # input=output_from('apply_cat_vcf'), # filter=suffix('.vcf.gz'), # output='.vep.vcf') # .follows('apply_cat_vcf')) # # # Apply vcfanno on concatenated/vep undr_rover vcf file # (pipeline.transform( # task_func=stages.apply_vcfanno, # name='apply_vcfanno_ur', # input=output_from('apply_vep_ur'), # filter=suffix('.vep.vcf'), # output='.vep.anno.vcf') # .follows('apply_vep_ur')) # # # Apply snpeff # (pipeline.transform( # task_func=stages.apply_snpeff, # name='apply_snpeff_ur', # input=output_from('apply_vcfanno_ur'), # filter=suffix('.vep.anno.vcf'), # output='.vep.anno.snpeff.vcf.gz') # .follows('apply_vcfanno_ur')) # # Apply tabix pipeline.transform(task_func=stages.apply_tabix, name='apply_tabix', input=output_from('apply_cat_vcf'), filter=suffix('.vcf.gz'), output='.vcf.gz.tbi') # # Apply HomopolymerRun # (pipeline.transform( # task_func=stages.apply_homopolymer_ann, # name='apply_homopolymer_ann', # input=output_from('apply_snpeff_ur'), # filter=suffix('.vep.anno.snpeff.vcf.gz'), # output='.annotated.vcf') # .follows('apply_tabix')) # # Apply summarize multi coverage # (pipeline.merge( # task_func=stages.apply_multicov, # name='apply_multicov', # input=output_from('primary_bam'), # # filter=suffix('.primary.bam'), # output='coverage/all.multicov.txt') # .follows('index_bam')) # Apply summarize picard coverage # (pipeline.merge( # task_func=stages.apply_summarize_picard, # name='apply_summarize_picard', # input=output_from('target_coverage'), # output='coverage/all.hsmetrics.txt') # .follows('target_coverage')) # # Apply summarize multicov coverage plots # (pipeline.merge( # task_func=stages.apply_multicov_plots, # name='apply_multicov_plots', # input=output_from('apply_multicov'), # output='coverage/coverage_analysis_main.html') # .follows('apply_multicov')) return pipeline
sentences, score = json.loads(line) for sentence in sentences: dictionary.update(sentence) dictionary_list = list(sorted(w for w in dictionary if dictionary[w] >= word_frequency_cutoff)) with open(output_file_name, 'w') as dictionary_file: for word in dictionary_list: dictionary_file.write("{} {}\n".format(word, dictionary[word])) @ruffus.follows(build_word_dictionary) @ruffus.transform( clean_data, ruffus.suffix(".json.gz"), ruffus.add_inputs("dictionary.sentences.clean.json"), ".projected.json.gz") def project_sentences(input_file_names, output_file_name): review_file_name, dictionary_file_name = input_file_names with open(dictionary_file_name) as dictionary_file: dictionary = set(json.load(dictionary_file)) def project_sentence(s): return [w if w in dictionary else "UNKNOWN" for w in s] with gzip.open(review_file_name) as review_file: with gzip.open(output_file_name, 'w') as output_file: for line in review_file: sentences, score = json.loads(line) projected_sentences = map(project_sentence, sentences)
statement = """gunzip < %(infile)s | cgat gtf2gtf --method=merge-transcripts --log=%(outfile)s.log | cgat gtf2gff --method=tts --promotor-size=1 --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log | cgat gff2bed --is-gtf --set-name=gene_id --log=%(outfile)s.log | gzip > %(outfile)s""" P.run() @transform(buildGeneRegions, regex('(.*)_.*.bed.gz'), add_inputs(buildContigSizes), r'\1_intergenic.bed.gz') def buildIntergenicRegions(infiles, outfile): """build a :term:`bed` file with regions not overlapping any genes. Arguments --------- infiles : list - Input filename with geneset in :term:`gtf` format. - Input filename with chromosome sizes in :term:`tsv` format. outfile : string Output filename with genomic regions in :term:`bed` format. """ infile, contigs = infiles
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='vcf_annotation')
    # Get a list of paths to all the VCF files
    vcf_files = state.config.get_option('vcfs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original VCF files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(
        task_func=stages.original_vcf,
        name='original_vcf',
        output=vcf_files)

    # Decompose VCF using Vt
    pipeline.transform(
        task_func=stages.decompose_vcf,
        name='decompose_vcf',
        input=output_from('original_vcf'),
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).vcf'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the VCF file name (e.g. study/family name).
        # This is needed within the stage for finding out sample specific
        # configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a
        # .decompose.normalize.vcf extension.
        output='{path[0]}/{sample[0]}.decompose.normalize.vcf')

    # FILTER COMMON VARIANTS
    # ADD FILTER COMMON VARIANTS USING VEP

    # Annotate using VEP
    pipeline.transform(
        task_func=stages.annotate_vep,
        name='annotate_vep',
        input=output_from('decompose_vcf'),
        filter=suffix('.vcf'),
        output='.vep.vcf')

    # Annotate using SnpEff
    pipeline.transform(
        task_func=stages.annotate_snpeff,
        name='annotate_snpeff',
        input=output_from('annotate_vep'),
        filter=suffix('.vcf'),
        output='.snpeff.vcf')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make metricsup an extra output?
output=['.sort.dedup.bam', '.metricsdup']) # Generate chromosome intervals using GATK pipeline.transform( task_func=stages.chrom_intervals_gatk, name='chrom_intervals_gatk', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.chr.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('chrom_intervals_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Call variants using GATK pipeline.transform( task_func=stages.call_variants_gatk, name='call_variants_gatk', input=output_from('print_reads_gatk'), filter=suffix('.sort.dedup.realn.recal.bam'), output='.raw.snps.indels.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_variants_gatk'), output='PCExomes.mergegvcf.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.mergegvcf.vcf'), output='.genotyped.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs( ['PCExomes.indel_recal', 'PCExomes.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']), output='.combined.vcf') .follows('apply_indel_recalibrate_gatk')) # Select variants using GATK pipeline.transform( 
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')

    return pipeline
with open(out_exons, 'w') as outfile: for line in open(in_refseq): (name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exons, name2, noncoding) = parse_gene_line(line) # remove the last exon if strand == '+': exons = exons[:-1] else: exons = exons[1:] for ex_start, ex_end in exons: outfile.write('\t'.join(map(str, [chrom, ex_start, ex_end])) + '\n') @follows(make_middle_exons) @transform(remove_internal_priming_again, regex(r'(.*)\.(.*$)'), add_inputs(make_middle_exons), r'\1.no_exons.\2') def remove_terminal_exon(in_files, out_bed): """Remove all exons but the last one using intersectBed""" in_bed, exon_file = in_files cmd = 'intersectBed -v -a %s -b %s > %s' % (in_bed, exon_file, out_bed) sys_call(cmd, file_log=False) @active_if(cfg.getboolean('visualization', 'normalize_per_million')) @transform(remove_terminal_exon, regex(r'(.*)\.(pileup_reads$)'), r'\1.norm_mil.\2') def pileup_normalize_per_million(in_pileup, out_pileup): """Normalize pileup reads to tags per million mapping""" total_reads = sum(float(l.strip().split('\t')[4]) for l in open(in_pileup)) with open(in_pileup) as infile:
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='fastq2bam') # Get a list of paths to all the FASTQ files input_files = state.config.get_option('files') # Stages are dependent on the state stages = Stages(state) # The original files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_files, name='original_files', output=input_files) # # performs fastqc on fastq inputs # pipeline.transform( task_func=stages.fastqc, name='fastqc', input=output_from('original_files'), filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'), output='{path[0]}/{filename[0]}_fastqc') # # converts the fastq inputs to pre-aligned bams # pipeline.transform( task_func=stages.fastq2bam, name='fastq2bam', input=output_from('original_files'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'), add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'), extras=['{sample[0]}'], output='{path[0]}/{sample[0]}.bam') # # validates pre-aligned bams x.bam -> x.validation # pipeline.transform( task_func=stages.validate_prealigned_bam, name='validate_prealigned_bam', input=output_from('fastq2bam'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).bam'), output='{path[0]}/{sample[0]}.validation') # aligns pre-aligned bam x.bam -> x.mapped.bam pipeline.transform( task_func=stages.align, name='align', input=output_from('validate_prealigned_bam'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).validation'), add_inputs=add_inputs('{path[0]}/{sample[0]}.bam'), output='{path[0]}/{sample[0]}.mapped.bam') # generates stats about an aligned bam pipeline.transform( task_func=stages.align_stats_bedtools, name='align_stats_bedtools', input=output_from('align'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.genomecov.stats') # generates stats about an aligned bam pipeline.transform( task_func=stages.align_stats_picard, name='align_stats_picard', input=output_from('align'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.picard.stats') # # runs the Sanger variant calling pipeline # #pipeline.transform( # task_func=stages.analyse_wgs, # name='analyse_wgs', # input=output_from('align'), # filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), # output='{path[0]}/{sample[0]}.wgs/manifest') # runs the components of the Sanger variant calling pipeline pipeline.transform( task_func=stages.analyse_wgs_prepare, name='analyse_wgs_prepare', input=output_from('align'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.prepare') pipeline.transform( task_func=stages.analyse_wgs_reference_files, name='analyse_wgs_reference_files', input=[output_from('align'), output_from('analyse_wgs_prepare')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.reference_files') pipeline.transform( task_func=stages.analyse_wgs_init, name='analyse_wgs_init', input=[ output_from('align'), output_from('analyse_wgs_reference_files') ], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.init') # block 1 pipeline.transform( task_func=stages.analyse_wgs_verify_WT, name='analyse_wgs_verify_WT', 
input=[output_from('align'), output_from('analyse_wgs_init')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.verify_WT') pipeline.transform( task_func=stages.analyse_wgs_cgpPindel_input, name='analyse_wgs_cgpPindel_input', input=[output_from('align'), output_from('analyse_wgs_init')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_input') pipeline.transform( task_func=stages.analyse_wgs_alleleCount, name='analyse_wgs_alleleCount', input=[output_from('align'), output_from('analyse_wgs_init')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.alleleCount') # block 2 pipeline.transform( task_func=stages.analyse_wgs_ascat, name='analyse_wgs_ascat', input=[output_from('align'), output_from('analyse_wgs_init')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.ascat') pipeline.transform( task_func=stages.analyse_wgs_cgpPindel, name='analyse_wgs_cgpPindel', input=[ output_from('align'), output_from('analyse_wgs_cgpPindel_input') ], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel') pipeline.transform( task_func=stages.analyse_wgs_BRASS_input, name='analyse_wgs_BRASS_input', input=[output_from('align'), output_from('analyse_wgs_init')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.BRASS_input') pipeline.transform( task_func=stages.analyse_wgs_BRASS_cover, name='analyse_wgs_BRASS_cover', input=[output_from('align'), output_from('analyse_wgs_init')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.BRASS_cover') pipeline.transform( task_func=stages.analyse_wgs_CaVEMan_split, name='analyse_wgs_CaVEMan_split', input=[output_from('align'), output_from('analyse_wgs_init')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_split') # after block 2 pipeline.transform( task_func=stages.analyse_wgs_ascat_prep, name='analyse_wgs_ascat_prep', input=[output_from('align'), output_from('analyse_wgs_ascat')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.ascat_prep') pipeline.transform( task_func=stages.analyse_wgs_pindel_prep, name='analyse_wgs_pindel_prep', input=[output_from('align'), output_from('analyse_wgs_cgpPindel')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.pindel_prep') # parallel block 3 pipeline.transform( task_func=stages.analyse_wgs_verify_MT, name='analyse_wgs_verify_MT', input=[ output_from('align'), output_from('analyse_wgs_verify_WT'), output_from('analyse_wgs_ascat_prep') ], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.verify_MT') pipeline.transform( task_func=stages.analyse_wgs_CaVEMan, name='analyse_wgs_CaVEMan', input=[ output_from('align'), output_from('analyse_wgs_CaVEMan_split'), output_from('analyse_wgs_ascat_prep'), output_from('analyse_wgs_cgpPindel') ], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan') pipeline.transform( task_func=stages.analyse_wgs_BRASS, 
name='analyse_wgs_BRASS', input=[ output_from('align'), output_from('analyse_wgs_BRASS_cover'), output_from('analyse_wgs_BRASS_input'), output_from('analyse_wgs_ascat_prep') ], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.BRASS') pipeline.transform( task_func=stages.analyse_wgs_cgpPindel_annot, name='analyse_wgs_cgpPindel_annot', input=[output_from('align'), output_from('analyse_wgs_pindel_prep')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.cgpPindel_annot') # pre block 4 pipeline.transform( task_func=stages.analyse_wgs_caveman_prep, name='analyse_wgs_caveman_prep', input=[output_from('align'), output_from('analyse_wgs_CaVEMan')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.caveman_prep') # block 4 pipeline.transform( task_func=stages.analyse_wgs_CaVEMan_annot, name='analyse_wgs_CaVEMan_annot', input=[output_from('align'), output_from('analyse_wgs_caveman_prep')], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.CaVEMan_annot') # done pipeline.transform( task_func=stages.analyse_wgs_finish, name='analyse_wgs_finish', input=[ output_from('align'), output_from('analyse_wgs_CaVEMan_annot'), output_from('analyse_wgs_BRASS'), output_from('analyse_wgs_cgpPindel_annot'), output_from('analyse_wgs_alleleCount'), output_from('analyse_wgs_verify_MT') ], filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.wgs/completed.finish') # # runs the delly singularity container # pipeline.transform( task_func=stages.delly, name='delly', input=output_from('align'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.delly.completed') pipeline.transform( task_func=stages.gridss, name='gridss', input=output_from('align'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.gridss.completed') pipeline.transform( task_func=stages.muse, name='muse', input=output_from('align'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.muse.completed') pipeline.transform( task_func=stages.mutect2, name='mutect2', input=output_from('align'), filter=formatter('(?P<path>.+)/(?P<sample>[a-zA-Z0-9]+).mapped.bam'), output='{path[0]}/{sample[0]}.mutect2.completed') return pipeline
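# The analyse_wgs_* tasks above all track a "completed.<step>" sentinel file
# under the per-sample .wgs directory; listing earlier sentinels as inputs is
# what orders the blocks. A minimal sketch of such a checkpoint stage follows
# (assumed shape only: in the real code this is a method on the Stages class
# and drives the Sanger cgpwgs tooling, neither of which is shown here).
import os
import subprocess


def analyse_wgs_prepare(inputs, output):
    '''Run one step of the Sanger WGS workflow, then touch its sentinel file.'''
    bam = inputs[0] if isinstance(inputs, (list, tuple)) else inputs
    os.makedirs(os.path.dirname(output), exist_ok=True)
    # placeholder command; the real stage invokes the Sanger pipeline step here
    subprocess.check_call(['echo', 'prepare step for', bam])
    # touch the sentinel that Ruffus tracks as this task's output
    open(output, 'w').close()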
def make_pipeline_map(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='hiplexpipe') # Stages are dependent on the state stages = Stages(state) safe_make_dir('alignments') safe_make_dir('metrics') safe_make_dir('metrics/amplicon') safe_make_dir('metrics/summary') # The original FASTQ files fastq_files = glob.glob('fastqs/*') # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. filter=formatter( '.+/(?P<sample>[a-zA-Z0-9_-]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq.gz' ), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file add_inputs=add_inputs('{path[0]}/{sample[0]}_R2_{lib[0]}.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{sample[0]}', '{lib[0]}'], # The output file name is the sample name with a .bam extension. output='alignments/{sample[0]}.clipped.sort.hq.bam') # generate mapping metrics. pipeline.transform( task_func=stages.generate_amplicon_metrics, name='generate_amplicon_metrics', input=output_from('align_bwa'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'), output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt', extras=['{sample[0]}']) pipeline.transform( task_func=stages.intersect_bed, name='intersect_bed', input=output_from('align_bwa'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'), output='metrics/summary/{sample[0]}.intersectbed.bam') pipeline.transform(task_func=stages.coverage_bed, name='coverage_bed', input=output_from('intersect_bed'), filter=suffix('.intersectbed.bam'), output='.bedtools_hist_all.txt') pipeline.transform( task_func=stages.genome_reads, name='genome_reads', input=output_from('align_bwa'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'), output='metrics/summary/{sample[0]}.mapped_to_genome.txt') pipeline.transform(task_func=stages.target_reads, name='target_reads', input=output_from('intersect_bed'), filter=suffix('.intersectbed.bam'), output='.mapped_to_target.txt') pipeline.transform( task_func=stages.total_reads, name='total_reads', input=output_from('align_bwa'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).clipped.sort.hq.bam'), output='metrics/summary/{sample[0]}.total_raw_reads.txt') pipeline.collate( task_func=stages.generate_stats, name='generate_stats', input=output_from('coverage_bed', 'genome_reads', 'target_reads', 'total_reads'), #filter=regex(r'.+/(.+BS\d{4,6}.+)\..+\.txt'), filter=regex( r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt' ), output=r'metrics/summary/all_sample.summary.\1.txt', extras=[r'\1', 'all_sample.summary.txt']) summary_file = 'all_sample.summary.txt' (pipeline.originate(task_func=stages.grab_summary_file, name='grab_summary_file', output=summary_file).follows('generate_stats')) pipeline.transform(task_func=stages.filter_stats, name='filter_stats', input=output_from('grab_summary_file'), 
                       filter=suffix('.summary.txt'),
                       output='.passed.summary.txt',
                       extras=['all_sample.failed.summary.txt'])
    return pipeline
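# safe_make_dir() is called repeatedly above but is not defined in this excerpt.
# A minimal sketch of the helper, assuming it simply creates the directory and
# ignores the case where it already exists:
import errno
import os


def safe_make_dir(path):
    '''Create a directory if it does not already exist.'''
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise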
def make_pipeline_map(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='haloplexpipe') # Get a list of paths to all the FASTQ files #fastq_files = state.config.get_option('fastqs') fastq_files = glob.glob("fastqs/*.gz") # Stages are dependent on the state stages = Stages(state) safe_make_dir('alignments') safe_make_dir('processed_fastqs') safe_make_dir('metrics') safe_make_dir('metrics/amplicon') safe_make_dir('metrics/summary') safe_make_dir('metrics/pass_samples') safe_make_dir('variants') safe_make_dir('variants/gatk') safe_make_dir('variants/vardict') # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) pipeline.transform( task_func=stages.run_surecalltrimmer, name='run_surecalltrimmer', input=output_from('original_fastqs'), filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.fastq.gz'), add_inputs=add_inputs('fastqs/{sample[0]}_R2.fastq.gz'), #filter=formatter('fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.fastq.gz'), #add_inputs=add_inputs('fastqs/{sample[0]}_R3_001.fastq.gz'), extras=['{sample[0]}'], # output only needs to know about one file to track progress of the pipeline, but the second certainly exists after this step. output='processed_fastqs/{sample[0]}_R1.processed.fastq.gz') #output='processed_fastqs/{sample[0]}_R1_001.processed.fastq.gz') # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('run_surecalltrimmer'), filter=formatter( 'processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1.processed.fastq.gz' ), add_inputs=add_inputs( 'processed_fastqs/{sample[0]}_R2.processed.fastq.gz'), #filter=formatter('processed_fastqs/(?P<sample>[a-zA-Z0-9_-]+)_R1_001.processed.fastq.gz'), #add_inputs=add_inputs('processed_fastqs/{sample[0]}_R3_001.processed.fastq.gz'), extras=['{sample[0]}'], output='alignments/{sample[0]}.bam') # Run locatit from agilent. 
this should produce sorted bam files, so no sorting needed at the next step pipeline.collate(task_func=stages.run_locatit, name='run_locatit', input=output_from('align_bwa', 'original_fastqs'), filter=regex(r'.+/(.+_L\d\d\d).+'), output=r'alignments/\1.locatit.bam') pipeline.transform(task_func=stages.sort_bam, name='sort_bam', input=output_from('run_locatit'), filter=suffix('.locatit.bam'), output='.sorted.locatit.bam') # # # # # Metrics stages # # # # # # generate mapping metrics (post locatit) pipeline.transform( task_func=stages.generate_amplicon_metrics, name='generate_amplicon_metrics', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/amplicon/{sample[0]}.amplicon-metrics.txt', extras=['{sample[0]}']) # Intersect the bam file with the region of interest pipeline.transform( task_func=stages.intersect_bed, name='intersect_bed', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/summary/{sample[0]}.intersectbed.bam') # Calculate coverage metrics from the intersected bam file pipeline.transform(task_func=stages.coverage_bed, name='coverage_bed', input=output_from('intersect_bed'), filter=suffix('.intersectbed.bam'), output='.bedtools_hist_all.txt') # Count the number of mapped reads pipeline.transform( task_func=stages.genome_reads, name='genome_reads', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/summary/{sample[0]}.mapped_to_genome.txt') # Count the number of on-target reads pipeline.transform(task_func=stages.target_reads, name='target_reads', input=output_from('intersect_bed'), filter=suffix('.intersectbed.bam'), output='.mapped_to_target.txt') # Count the number of total reads pipeline.transform( task_func=stages.total_reads, name='total_reads', input=output_from('sort_bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9_-]+).sorted.locatit.bam'), output='metrics/summary/{sample[0]}.total_raw_reads.txt') # Generate summary metrics from the stats files produces pipeline.collate( task_func=stages.generate_stats, name='generate_stats', input=output_from('coverage_bed', 'genome_reads', 'target_reads', 'total_reads'), #filter=regex(r'.+/(.+BS\d{4,6}.+S\d+)\..+\.txt'), filter=regex( r'.+/(.+)\.(bedtools_hist_all|mapped_to_genome|mapped_to_target|total_raw_reads)\.txt' ), output=r'metrics/summary/all_sample.summary.\1.txt', extras=[r'\1', 'all_sample.summary.txt']) # # # # # Metrics stages end # # # # # # # # # # Checking metrics and calling # # # # # # Originate to set the location of the metrics summary file (pipeline.originate( task_func=stages.grab_summary_file, name='grab_summary_file', output='all_sample.summary.txt').follows('generate_stats')) # Awk command to produce a list of bam files passing filters pipeline.transform(task_func=stages.filter_stats, name='filter_stats', input=output_from('grab_summary_file'), filter=suffix('.summary.txt'), output='.passed.summary.txt') # Touch passed bams to the pass_samples folder and pass the glob of that folder to HaplotypeCaller pipeline.subdivide(name='passed_filter_files', task_func=stages.read_samples, input=output_from('filter_stats'), filter=formatter(), output="metrics/pass_samples/*.bam") # Call variants using GATK (pipeline.transform( task_func=stages.call_haplotypecaller_gatk, name='call_haplotypecaller_gatk', input=output_from('passed_filter_files'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'), 
output='variants/gatk/{sample[0]}.g.vcf').follows('sort_bam')) # Call variants with vardict (pipeline.transform( task_func=stages.run_vardict, name='run_vardict', input=output_from('passed_filter_files'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-_]+).sorted.locatit.bam'), output='variants/vardict/{sample[0]}.vcf', extras=['{sample[0]}']).follows('sort_bam')) pipeline.transform( task_func=stages.sort_vcfs, name='sort_vcfs', input=output_from('run_vardict'), filter=formatter('variants/vardict/(?P<sample>[a-zA-Z0-9_-]+).vcf'), output='variants/vardict/{sample[0]}.sorted.vcf.gz') pipeline.transform(task_func=stages.index_vcfs, name='index_vcfs', input=output_from('sort_vcfs'), filter=suffix('.sorted.vcf.gz'), output='.sorted.vcf.gz.tbi') return (pipeline)
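# The parenthesised (pipeline.transform(...)).follows('sort_bam') calls above add
# an explicit dependency on sort_bam because the variant-calling tasks take their
# input from passed_filter_files (a glob of copied bams), which by itself tells
# Ruffus nothing about where those bams come from. A small, self-contained
# illustration of the same pattern with toy task names (not from this pipeline):
from ruffus import Pipeline, suffix


def _touch(output_file):
    open(output_file, 'w').close()


def _copy(input_file, output_file):
    with open(input_file) as src, open(output_file, 'w') as dst:
        dst.write(src.read())


def follows_example():
    toy = Pipeline(name='follows_example')
    toy.originate(task_func=_touch, name='make_seed', output=['seed.txt'])
    # transform() returns a Task object; .follows() chains an extra dependency
    # that is not implied by the input file name alone
    (toy.transform(task_func=_copy, name='use_seed', input='seed.txt',
                   filter=suffix('.txt'), output='.copy')
        .follows('make_seed'))
    return toy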
def make_pipeline_process(state): # Define empty pipeline pipeline = Pipeline(name='hiplexpipe') # Get a list of paths to all the directories to be combined for variant calling run_directories = state.config.get_option('runs') #grab files from each of the processed directories in "runs" gatk_files = [] undr_rover_files = [] for directory in run_directories: gatk_files.extend(glob.glob(directory + '/variants/gatk/*.g.vcf')) undr_rover_files.extend( glob.glob(directory + '/variants/undr_rover/*sorted.vcf.gz')) # Stages are dependent on the state stages = Stages(state) # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.glob_gatk, name='glob_gatk', output=gatk_files) #Dummy stage to grab the undr rover files pipeline.originate(task_func=stages.glob_undr_rover, name='glob_undr_rover', output=undr_rover_files) safe_make_dir('variants') safe_make_dir('variants/gatk') safe_make_dir('variants/undr_rover') pipeline.merge(task_func=stages.concatenate_vcfs, name='concatenate_vcfs', input=output_from('glob_undr_rover'), output='variants/undr_rover/combined_undr_rover.vcf.gz') pipeline.transform(task_func=stages.index_final_vcf, name='index_final_vcf', input=output_from('concatenate_vcfs'), filter=suffix('.vcf.gz'), output='.vcf.gz.tbi') # Combine G.VCF files for all samples using GATK pipeline.merge(task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('glob_gatk'), output='ALL.combined.vcf') # Genotype G.VCF files using GATK pipeline.transform(task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.combined.vcf'), output='.raw.vcf') # Apply GT filters to genotyped vcf pipeline.transform(task_func=stages.genotype_filter_gatk, name='genotype_filter_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output='.raw.gt-filter.vcf') # Decompose and normalise multiallelic sites pipeline.transform(task_func=stages.vt_decompose_normalise, name='vt_decompose_normalise', input=output_from('genotype_filter_gatk'), filter=suffix('.raw.gt-filter.vcf'), output='.raw.gt-filter.decomp.norm.vcf') # Annotate VCF file using GATK pipeline.transform(task_func=stages.variant_annotator_gatk, name='variant_annotator_gatk', input=output_from('vt_decompose_normalise'), filter=suffix('.raw.gt-filter.decomp.norm.vcf'), output='.raw.gt-filter.decomp.norm.annotate.vcf') # Filter vcf pipeline.transform( task_func=stages.gatk_filter, name='gatk_filter', input=output_from('variant_annotator_gatk'), filter=suffix('.raw.gt-filter.decomp.norm.annotate.vcf'), output='.raw.gt-filter.decomp.norm.annotate.filter.vcf') #Apply VEP (pipeline.transform( task_func=stages.apply_vep, name='apply_vep', input=output_from('gatk_filter'), filter=suffix('.raw.gt-filter.decomp.norm.annotate.filter.vcf'), add_inputs=add_inputs( ['variants/undr_rover/combined_undr_rover.vcf.gz']), output='.raw.gt-filter.decomp.norm.annotate.filter.vep.vcf').follows( 'index_final_vcf')) return pipeline
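# The glob_gatk / glob_undr_rover originate tasks above only declare files that
# earlier per-run pipelines already produced, so the underlying stage methods
# can be no-ops: Ruffus only calls an originate task when a declared output is
# missing. A minimal sketch of the assumed shape (in the real code these are
# methods on the Stages class, which is not shown in this excerpt):
def glob_gatk(output):
    '''Per-run GATK g.vcf files already exist on disk; nothing to do.'''
    pass


def glob_undr_rover(output):
    '''Per-run undr_rover VCFs already exist on disk; nothing to do.'''
    pass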
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='methylation_pipeline') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = PipelineStages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate(task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Run bismark genome preparation on the reference genome pipeline.originate(task_func=stages.bismark_genome_prepare, name='bismark_genome_prepare', output='reference/Bisulfite_Genome') # Run FASTQC on the input fastq files pipeline.transform( task_func=stages.fastqc, name='fastqc', input=output_from('original_fastqs'), filter=formatter('(?P<path>.+)/(?P<filename>.+).fastq.gz'), output='{path[0]}/{filename[0]}_fastqc') # Run bismark on the input fastq files (pipeline.transform( task_func=stages.bismark, name='bismark', input=output_from('original_fastqs'), filter=formatter( '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+).fastq.gz'), add_inputs=add_inputs('{path[0]}/{filename[0]}_R2_{num[0]}.fastq.gz'), extras=['{path[0]}/bismark_output/'], output= '{path[0]}/bismark_output/{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz' )).follows('bismark_genome_prepare') # Run bismark methylation extractor on the bismark output pipeline.transform( task_func=stages.bismark_methylation_extractor, name='bismark_methylation_extractor', input=output_from('bismark'), filter=formatter( '(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+)_bismark_bt2_pe.sam.gz' ), extras=['{path[0]}'], output= '{path[0]}/CpG_context_{filename[0]}_R1_{num[0]}_bismark_bt2_pe.sam.gz.txt' ) # Run methpt on the bismark methylation extractor output pipeline.transform( task_func=stages.methpat, name='methpat', input=output_from('bismark_methylation_extractor'), filter=formatter('(?P<path>.+)/CpG_context_(?P<filename>.+)'), extras=['{path[0]}', '{filename[0]}'], output='{path[0]}/CpG_context_{filename[0]}.methpat.html') return pipeline
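# Worked example of the formatter() pattern used in the bismark task above.
# formatter() is regex-based, so plain re shows the same captures; the input
# filename here is hypothetical, chosen only to illustrate the expansion.
import re

example = 'data/sampleA_R1_001.fastq.gz'
match = re.match(r'(?P<path>.+)/(?P<filename>.+)_R1_(?P<num>.+).fastq.gz',
                 example)
r2_mate = '{path}/{filename}_R2_{num}.fastq.gz'.format(**match.groupdict())
output = ('{path}/bismark_output/{filename}_R1_{num}_bismark_bt2_pe.sam.gz'
          .format(**match.groupdict()))
# r2_mate -> 'data/sampleA_R2_001.fastq.gz'   (the add_inputs R2 mate)
# output  -> 'data/bismark_output/sampleA_R1_001_bismark_bt2_pe.sam.gz'
# (in the pipeline itself the captures are referenced as {path[0]}, {filename[0]},
#  {num[0]} because formatter() can index over multiple inputs)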
def main(): ######### # SETUP # ######### # catch jgi logon and password from cli parser = ruffus.cmdline.get_argparse( description='5 accessions variant calling pipeline.') parser.add_argument('--email', '-e', help='Logon email address for JGI', type=str, dest='jgi_logon') parser.add_argument('--password', '-p', help='JGI password', type=str, dest='jgi_password') options = parser.parse_args() jgi_logon = options.jgi_logon jgi_password = options.jgi_password ################## # PIPELINE STEPS # ################## # test function for checking input/output passed to job_script and parsing # by io_parser test_job_function = functions.generate_job_function( job_script='src/sh/io_parser', job_name='test') # initialise pipeline main_pipeline = ruffus.Pipeline.pipelines["main"] # bamfiles raw_files = [x.path for x in os.scandir('data/bam') if x.name.endswith('.bam') and x.is_file] # subset the files while the pipeline is in development. Make this equal # to the raw_files to run the whole pipline. # active_raw_files = [x for x in raw_files if # 'G1' in x or 'G4' in x or 'J1' in x or 'J4' in x] active_raw_files = raw_files # species short names for vcf splitting species_short_names = list(set( [os.path.basename(x)[0] for x in active_raw_files])) # check that the files exist mapped_raw = main_pipeline.originate( name='mapped_raw', task_func=os.path.isfile, output=active_raw_files) # genome fasta ref_fa = main_pipeline.originate( name='ref_fa', task_func=functions.generate_job_function( job_script='src/sh/download_genome', job_name='ref_fa', job_type='download'), output='data/genome/Osativa_323_v7.0.fa', extras=[jgi_logon, jgi_password]) # indexes fa_idx = main_pipeline.transform( name='fa_idx', task_func=functions.generate_job_function( job_script='src/sh/fa_idx', job_name='fa_idx', job_type='transform', cpus_per_task=6), input=ref_fa, filter=ruffus.suffix(".fa"), output=['.dict', '.fa.fai']) # annotation annot = main_pipeline.originate( name='annot', task_func=functions.generate_job_function( job_script='src/sh/download_genome', job_name='annot', job_type='download'), output=('data/genome/' 'Osativa_323_v7.0.gene_exons.gffread.rRNAremoved.gtf'), extras=[jgi_logon, jgi_password]) # convert annotation to .bed annot_bed = main_pipeline.transform( name='annot_bed', task_func=functions.generate_job_function( job_script='src/sh/annot_bed', job_name='annot_bed', job_type='transform', cpus_per_task=7), input=annot, filter=ruffus.suffix('.gtf'), output='.bed') # mark duplicates with picard deduped = main_pipeline.transform( name='dedupe', task_func=functions.generate_job_function( job_script='src/sh/mark_duplicates_and_sort', job_name='dedupe', job_type='transform', cpus_per_task=2), input=mapped_raw, filter=ruffus.regex(r"data/bam/(.*).Aligned.out.bam"), output=(r"output/mark_duplicates_and_sort/\1.deduped.bam")) # Split'N'Trim and reassign mapping qualities split_and_trimmed = main_pipeline.transform( name='split_trim', task_func=functions.generate_job_function( job_script='src/sh/split_trim', job_name='split_trim', job_type='transform', cpus_per_task=2), input=deduped, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.formatter( "output/mark_duplicates_and_sort/(?P<LIB>.+).deduped.bam"), output=["{subdir[0][1]}/split_trim/{LIB[0]}.split.bam"])\ .follows(fa_idx) # we're going to recycle call_variants, merge_variants, filter_variants # and analyze_covar so we'll get the functions in advance call_variants = functions.generate_queue_job_function( job_script='src/sh/call_variants', 
job_name='call_variants') merge_variants = functions.generate_job_function( job_script='src/sh/merge_variants', job_name='merge_variants', job_type='transform', cpus_per_task=8) filter_variants = functions.generate_job_function( job_script='src/sh/filter_variants', job_name='filter_variants', job_type='transform', cpus_per_task=1) analyze_covar = functions.generate_queue_job_function( job_script='src/sh/analyze_covar', job_name='analyze_covar') # call variants without recalibration tables uncalibrated_variants = main_pipeline.transform( name='uncalibrated_variants', task_func=call_variants, input=split_and_trimmed, add_inputs=ruffus.add_inputs([ref_fa, annot_bed]), filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'), output='{subdir[0][1]}/variants_uncalibrated/{LIB[0]}.g.vcf.gz') # merge gVCF variants uncalibrated_variants_merged = main_pipeline.merge( name='uncalibrated_variants_merged', task_func=merge_variants, input=[uncalibrated_variants, ref_fa], output='output/variants_uncalibrated/variants_uncalibrated.vcf.gz') # filter variants on un-corrected bamfiles uncalibrated_variants_filtered = main_pipeline.transform( name='uncalibrated_variants_filtered', task_func=filter_variants, input=uncalibrated_variants_merged, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.suffix('_uncalibrated.vcf.gz'), output='_uncalibrated_filtered.vcf.gz') # select variant (only recalibrate using passed SNPs) uncalibrated_variants_selected = main_pipeline.transform( name='uncalibrated_variants_selected', task_func=functions.generate_job_function( job_script='src/sh/select_variants', job_name='select_variants', job_type='transform'), input=uncalibrated_variants_filtered, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.suffix('_uncalibrated_filtered.vcf.gz'), output='_uncalibrated_selected.vcf.gz') # create recalibration report with filtered variants covar_report = main_pipeline.merge( name='covar_report', task_func=analyze_covar, input=[split_and_trimmed, ref_fa, annot_bed, uncalibrated_variants_selected], output="output/covar_analysis/recal_data.table") # second pass to analyze covariation remaining after recalibration second_pass_covar_report = main_pipeline.merge( name='second_pass_covar_report', task_func=analyze_covar, input=[split_and_trimmed, ref_fa, annot_bed, uncalibrated_variants_filtered, covar_report], output="output/covar_analysis/post_recal_data.table") # plot effect of base recalibration recal_plot = main_pipeline.transform( name='recal_plot', task_func=functions.generate_job_function( job_script='src/R/recal_plot.R', job_name='recal_plot', job_type='transform', cpus_per_task=1), input=second_pass_covar_report, filter=ruffus.suffix('post_recal_data.table'), add_inputs=ruffus.add_inputs(covar_report), output='recalibration_plots.pdf') # recalibrate bases using recalibration report recalibrated = main_pipeline.transform( name='recalibrate', task_func=functions.generate_job_function( job_script='src/sh/recalibrate', job_name='recalibrate', job_type='transform', cpus_per_task=2), input=split_and_trimmed, add_inputs=ruffus.add_inputs([ref_fa, covar_report]), filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'), output='{subdir[0][1]}/recal/{LIB[0]}.recal.bam') # final variant calling variants = main_pipeline.transform( name='variants', task_func=call_variants, input=recalibrated, add_inputs=ruffus.add_inputs(ref_fa, annot_bed), filter=ruffus.formatter('output/recal/(?P<LIB>.+).recal.bam'), output='{subdir[0][1]}/variants/{LIB[0]}.g.vcf.gz') # merge gVCF variants 
variants_merged = main_pipeline.merge( name='variants_merged', task_func=merge_variants, input=[variants, ref_fa], output='output/variants/variants.vcf.gz') # variant filtering variants_filtered = main_pipeline.transform( name='variants_filtered', task_func=filter_variants, input=variants_merged, add_inputs=ruffus.add_inputs(ref_fa), filter=ruffus.suffix('.vcf.gz'), output='_filtered.vcf.gz') # variants by species split_variants = main_pipeline.subdivide( name='split_variants', task_func=functions.generate_job_function( job_script='src/sh/split_variants', job_name='split_variants', job_type='transform', cpus_per_task=1, ntasks=len(species_short_names)), input=variants_filtered, filter=ruffus.formatter(), add_inputs=ruffus.add_inputs(ref_fa), output=[('output/split_variants/' + x + '.variants_filtered.vcf.gz') for x in species_short_names]) # count variants per gene per species cds_variants = main_pipeline.transform( name='cds_variants', task_func=functions.generate_job_function( job_script='src/R/cds_variants.R', job_name='cds_variants', job_type='transform'), input=split_variants, add_inputs=ruffus.add_inputs([ref_fa, annot]), filter=ruffus.formatter( 'output/split_variants/(?P<LIB>.+).variants_filtered.vcf.gz'), output='{subdir[0][1]}/cds_variants/{LIB[0]}.cds_variants.Rds') # merge counted variants variants_per_gene = main_pipeline.merge( name='cds_merge', task_func=functions.generate_job_function( job_script='src/R/cds_merge.R', job_name='cds_merge', job_type='transform'), input=cds_variants, output='output/cds_variants/cds_variants.Rds') ################### # RUFFUS COMMANDS # ################### # print the flowchart ruffus.pipeline_printout_graph( "ruffus/flowchart.pdf", "pdf", pipeline_name="5 accessions variant calling pipeline") # run the pipeline ruffus.cmdline.run(options, multithread=8)
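# functions.generate_job_function() is used throughout main() above but is not
# shown in this excerpt. A minimal sketch of what such a factory could look
# like: it returns a named Python callable that Ruffus can register as a task
# and that hands the task's input/output files to a shell script. How jobs are
# actually submitted (SLURM, ntasks, cpus_per_task) is an assumption here, not
# the project's real implementation.
import subprocess


def generate_job_function(job_script, job_name, job_type='transform',
                          cpus_per_task=1, ntasks=1):
    '''Return a named callable that forwards a Ruffus task's files to a script.'''
    # job_type, cpus_per_task and ntasks only mirror the call sites above; the
    # real factory presumably uses them to shape cluster submission.
    def job_function(*args):
        # Ruffus passes (input, output, *extras) to transform-style tasks and
        # (output, *extras) to originate-style tasks; flatten and forward them.
        flat_args = [str(a) for arg in args
                     for a in (arg if isinstance(arg, (list, tuple)) else [arg])]
        subprocess.check_call([job_script] + flat_args)
    # Ruffus reports tasks by function name when no explicit name is given
    job_function.__name__ = job_name
    return job_function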
def main(): # parse CLI parser = ruffus.cmdline.get_argparse( description='Vv mtDNA assembly pipeline') parser.add_argument('--email', '-e', help='Email address, reported to NCBI', type=str, dest='email') options = parser.parse_args() # store the email variable for logon if options.email: os.environ['NCBI_EMAIL'] = options.email # initialise pipeline main_pipeline = ruffus.Pipeline.pipelines['main'] # TEST FUNCTION test_job_function = tompltools.generate_job_function( job_script='src/sh/io_parser', job_name='test', verbose=True) # download COI seed file download_coi_fasta = main_pipeline.originate( name='download_coi_fasta.py', task_func=tompltools.generate_job_function( job_type='originate', job_script='src/py/download_coi_fasta.py', job_name='download_coi_fasta.py'), output='data/GU207861.1.fasta') # define files sample_list = 'data/samples.txt' with open(sample_list, 'r') as f: csvreader = csv.reader(f) next(csvreader) file_list = {x[0]: [x[1], x[2]] for x in csvreader} pe_filenames = file_list['pe'] mp_filenames = file_list['mp'] pe_files = find_all(pe_filenames, 'data') # filter out weird hidden directories, what are these anyway? pe_files_filtered = [x for x in pe_files if '/.' not in x] # load files into ruffus raw_fq_files = main_pipeline.originate(name='raw_fq_files', task_func=os.path.isfile, output=pe_files_filtered) # trim adaptors trim_bbduk = main_pipeline.merge( name='trim_bbduk', task_func=tompltools.generate_job_function( job_script='src/sh/trim_bbduk', job_name='trim_bbduk', cpus_per_task=8), input=raw_fq_files, output='output/trim_bbduk/pe_trimmed.fastq.gz') # subsample # something like ['bof' + str(i) for i in range(1,4)] number_of_repeats = 5 subsample_reads = main_pipeline.subdivide( name='subsample_reads', task_func=tompltools.generate_job_function( job_script='src/sh/subsample_reads', job_name='subsample_reads', ntasks=number_of_repeats), input=trim_bbduk, filter=ruffus.formatter(), output=([ 'output/subsample_reads/pe_trimmed_subsampled_' + str(i) + '.fastq.gz' for i in range(1, number_of_repeats + 1) ])) # run mitobim mitobim_quick = main_pipeline.transform( name='run_mitobim', task_func=tompltools.generate_job_function( job_script='src/sh/run_mitobim', job_name='run_mitobim'), input=subsample_reads, add_inputs=ruffus.add_inputs(download_coi_fasta), filter=ruffus.formatter( r'output/subsample_reads/pe_trimmed_subsampled_' '(?P<RN>\d).fastq.gz'), output='output/mitobim_quick_{RN[0]}/mitobim.log.txt') # re-fish with longest assembly find_longest_assembly = main_pipeline.originate( name='find_longest_assembly', task_func=tompltools.generate_job_function( job_type='originate', job_script='src/py/find_longest_assembly.py', job_name='find_longest_assembly'), output='output/longest_quick_scaffold.fasta')\ .follows(mitobim_quick) mitobim_full = main_pipeline.transform( name='mitobim_full', task_func=tompltools.generate_job_function( job_script='src/sh/run_mitobim', job_name='run_mitobim'), input=trim_bbduk, add_inputs=ruffus.add_inputs(find_longest_assembly), filter=ruffus.formatter(), output='output/mitobim_full/mitobim.log.txt') ################### # RUFFUS COMMANDS # ################### # print the flowchart ruffus.pipeline_printout_graph("ruffus/flowchart.pdf", "pdf", pipeline_name="Vv mtDNA assembly pipeline") # run the pipeline ruffus.cmdline.run(options, multithread=32)
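# find_all() is used above to locate the read files listed in samples.txt but is
# not defined in this excerpt. A plausible sketch, assuming it walks a directory
# tree and returns every path whose basename matches one of the requested names:
import os


def find_all(filenames, search_dir):
    '''Return paths under search_dir whose basename is in filenames.'''
    wanted = set(filenames)
    hits = []
    for root, dirs, files in os.walk(search_dir):
        hits.extend(os.path.join(root, f) for f in files if f in wanted)
    return hits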
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='thepipeline') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate( task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. # filter=formatter('(?P<path>.+)/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+)_1.fastq.gz'), # 1_HFYLVCCXX:2:TCCGCGAA_2_GE0343_1.fastq.gz # 1_HCJWFBCXX:GGACTCCT_L001_9071584415739518822-AGRF-023_R2.fastq.gz filter=formatter( '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)_R1.fastq.gz'), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file # e.g. C2WPF.5_Solexa-201237_5_X4311_1.fastq.gz add_inputs=add_inputs( '{path[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}_R2.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{readid[0]}', '{lib[0]}', '{lane[0]}', '{sample[0]}'], # extras=['{sample[0]}'], # The output file name is the sample name with a .bam extension. output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.bam') # Sort the BAM file using Picard pipeline.transform( task_func=stages.sort_bam_picard, name='sort_bam_picard', input=output_from('align_bwa'), filter=suffix('.bam'), output='.sort.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), # XXX should make metricsup an extra output? 
output=['.sort.dedup.bam', '.metricsdup']) # Local realignment using GATK # Generate RealignerTargetCreator using GATK pipeline.transform( task_func=stages.realigner_target_creator, name='realigner_target_creator', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('realigner_target_creator'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), filter=formatter( '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).intervals'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.bam'), output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), # filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'), # '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).recal_data.csv'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'), # output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') output='alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Merge lane bams to sample bams pipeline.collate( task_func=stages.merge_sample_bams, name='merge_sample_bams', filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).sort.dedup.realn.recal.bam'), '.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+).sort.dedup.realn.recal.bam'), # inputs=add_inputs('alignments/{sample[0]}/{readid[0]}_{lib[0]}_{lane[0]}_{sample[0]}.sort.dedup.realn.bam'), input=output_from('print_reads_gatk'), output='alignments/{sample[0]}/{sample[0]}.merged.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard2', input=output_from('merge_sample_bams'), # filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).merged.bam'), filter=suffix('.merged.bam'), # XXX should make metricsup an extra output? 
output=['.merged.dedup.bam', '.metricsdup']) # Local realignment2 using GATK # Generate RealignerTargetCreator using GATK pipeline.transform( task_func=stages.realigner_target_creator, name='realigner_target_creator2', input=output_from('mark_duplicates_picard2'), filter=suffix('.dedup.bam'), output='.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk2', input=output_from('realigner_target_creator2'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.intervals'), # filter=formatter( # '.+/(?P<readid>[a-zA-Z0-9-\.]+)_(?P<lib>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9]+).intervals'), # add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), add_inputs=add_inputs( 'alignments/{sample[0]}/{sample[0]}.merged.dedup.bam'), output='alignments/{sample[0]}/{sample[0]}.merged.dedup.realn.bam') .follows('mark_duplicates_picard2')) # Call variants using GATK pipeline.transform( task_func=stages.call_haplotypecaller_gatk, name='call_haplotypecaller_gatk', input=output_from('local_realignment_gatk2'), # filter=suffix('.merged.dedup.realn.bam'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9-]+).merged.dedup.realn.bam'), output='variants/{sample[0]}.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_haplotypecaller_gatk'), output='variants/ALL.combined.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.combined.vcf'), output='.raw.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), add_inputs=add_inputs(['ALL.snp_recal', 'ALL.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.raw.vcf'), add_inputs=add_inputs( ['ALL.indel_recal', 'ALL.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['ALL.recal_INDEL.vcf']), # output='.combined.vcf') output='ALL.raw.vqsr.vcf') .follows('apply_indel_recalibrate_gatk')) # # # Select variants using GATK # pipeline.transform( # task_func=stages.select_variants_gatk, # name='select_variants_gatk', # input=output_from('combine_variants_gatk'), # filter=suffix('.combined.vcf'), # output='.selected.vcf') return pipeline
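# The merge_sample_bams collate above groups lane-level BAMs by the named
# 'sample' capture: every input whose pattern expansion yields the same output
# path is handed to a single merge job. A toy illustration of that grouping with
# two hypothetical lane files for sample X1234 (regex mechanics only, not real
# data; the pipeline itself refers to the captures as {sample[0]} etc.):
import re

pattern = (r'.+/(?P<readid>[a-zA-Z0-9-]+)_(?P<lib>[a-zA-Z0-9-:]+)'
           r'_(?P<lane>[a-zA-Z0-9]+)_(?P<sample>[a-zA-Z0-9-]+)'
           r'.sort.dedup.realn.recal.bam')
lanes = ['alignments/X1234/C2WPF_LIB1_L001_X1234.sort.dedup.realn.recal.bam',
         'alignments/X1234/C2WPF_LIB1_L002_X1234.sort.dedup.realn.recal.bam']
outputs = {'alignments/{sample}/{sample}.merged.bam'
           .format(**re.match(pattern, lane).groupdict()) for lane in lanes}
# outputs == {'alignments/X1234/X1234.merged.bam'}  -> one merge job for X1234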
def make_pipeline(state): '''Build the pipeline by constructing stages and connecting them together''' # Build an empty pipeline pipeline = Pipeline(name='complexo') # Get a list of paths to all the FASTQ files fastq_files = state.config.get_option('fastqs') # Stages are dependent on the state stages = Stages(state) # The original FASTQ files # This is a dummy stage. It is useful because it makes a node in the # pipeline graph, and gives the pipeline an obvious starting point. pipeline.originate( task_func=stages.original_fastqs, name='original_fastqs', output=fastq_files) # Align paired end reads in FASTQ to the reference producing a BAM file pipeline.transform( task_func=stages.align_bwa, name='align_bwa', input=output_from('original_fastqs'), # Match the R1 (read 1) FASTQ file and grab the path and sample name. # This will be the first input to the stage. # We assume the sample name may consist of only alphanumeric # characters. filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'), # Add one more inputs to the stage: # 1. The corresponding R2 FASTQ file add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'), # Add an "extra" argument to the state (beyond the inputs and outputs) # which is the sample name. This is needed within the stage for finding out # sample specific configuration options extras=['{sample[0]}'], # The output file name is the sample name with a .bam extension. output='{path[0]}/{sample[0]}.bam') # Sort the BAM file using Picard pipeline.transform( task_func=stages.sort_bam_picard, name='sort_bam_picard', input=output_from('align_bwa'), filter=suffix('.bam'), output='.sort.bam') # Mark duplicates in the BAM file using Picard pipeline.transform( task_func=stages.mark_duplicates_picard, name='mark_duplicates_picard', input=output_from('sort_bam_picard'), filter=suffix('.sort.bam'), # XXX should make metricsup an extra output? 
output=['.sort.dedup.bam', '.metricsdup']) # Generate chromosome intervals using GATK pipeline.transform( task_func=stages.chrom_intervals_gatk, name='chrom_intervals_gatk', input=output_from('mark_duplicates_picard'), filter=suffix('.sort.dedup.bam'), output='.chr.intervals') # Local realignment using GATK (pipeline.transform( task_func=stages.local_realignment_gatk, name='local_realignment_gatk', input=output_from('chrom_intervals_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).chr.intervals'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.bam') .follows('mark_duplicates_picard')) # Base recalibration using GATK pipeline.transform( task_func=stages.base_recalibration_gatk, name='base_recalibration_gatk', input=output_from('local_realignment_gatk'), filter=suffix('.sort.dedup.realn.bam'), output=['.recal_data.csv', '.count_cov.log']) # Print reads using GATK (pipeline.transform( task_func=stages.print_reads_gatk, name='print_reads_gatk', input=output_from('base_recalibration_gatk'), filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).recal_data.csv'), add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'), output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam') .follows('local_realignment_gatk')) # Call variants using GATK pipeline.transform( task_func=stages.call_variants_gatk, name='call_variants_gatk', input=output_from('print_reads_gatk'), filter=suffix('.sort.dedup.realn.recal.bam'), output='.raw.snps.indels.g.vcf') # Combine G.VCF files for all samples using GATK pipeline.merge( task_func=stages.combine_gvcf_gatk, name='combine_gvcf_gatk', input=output_from('call_variants_gatk'), output='PCExomes.mergegvcf.vcf') # Genotype G.VCF files using GATK pipeline.transform( task_func=stages.genotype_gvcf_gatk, name='genotype_gvcf_gatk', input=output_from('combine_gvcf_gatk'), filter=suffix('.mergegvcf.vcf'), output='.genotyped.vcf') # SNP recalibration using GATK pipeline.transform( task_func=stages.snp_recalibrate_gatk, name='snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.snp_recal', '.snp_tranches', '.snp_plots.R']) # INDEL recalibration using GATK pipeline.transform( task_func=stages.indel_recalibrate_gatk, name='indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), output=['.indel_recal', '.indel_tranches', '.indel_plots.R']) # Apply SNP recalibration using GATK (pipeline.transform( task_func=stages.apply_snp_recalibrate_gatk, name='apply_snp_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.snp_recal', 'PCExomes.snp_tranches']), output='.recal_SNP.vcf') .follows('snp_recalibrate_gatk')) # Apply INDEL recalibration using GATK (pipeline.transform( task_func=stages.apply_indel_recalibrate_gatk, name='apply_indel_recalibrate_gatk', input=output_from('genotype_gvcf_gatk'), filter=suffix('.genotyped.vcf'), add_inputs=add_inputs(['PCExomes.indel_recal', 'PCExomes.indel_tranches']), output='.recal_INDEL.vcf') .follows('indel_recalibrate_gatk')) # Combine variants using GATK (pipeline.transform( task_func=stages.combine_variants_gatk, name='combine_variants_gatk', input=output_from('apply_snp_recalibrate_gatk'), filter=suffix('.recal_SNP.vcf'), add_inputs=add_inputs(['PCExomes.recal_INDEL.vcf']), output='.combined.vcf') .follows('apply_indel_recalibrate_gatk')) # Select variants using GATK pipeline.transform( 
        task_func=stages.select_variants_gatk,
        name='select_variants_gatk',
        input=output_from('combine_variants_gatk'),
        filter=suffix('.combined.vcf'),
        output='.selected.vcf')
    return pipeline
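# A sketch of how a driver script might build and run the pipeline factory
# above. The `state` object is assumed to have been constructed from the config
# file elsewhere (the real project wraps this in its own main()); pipeline.run()
# is Ruffus's entry point for the object-oriented interface.
def run_pipeline(state, jobs=4):
    pipeline = make_pipeline(state)
    # run any out-of-date tasks, with up to `jobs` concurrent jobs
    pipeline.run(multiprocess=jobs)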
    '''
    statement = '''
    cgat cgat_fasta2cDNA
    --log=%(outfile)s.log
    %(infile)s
    > %(outfile)s
    '''
    P.run(statement, job_memory="16G")


@follows(makeSplicedCatalog)
@transform(makeSplicedCatalog,
           regex("transcripts.dir/(.+).spliced.fa"),
           add_inputs("%s" % PARAMS['ercc_fasta']),
           r"transcripts.dir/\1.ercc.fa")
def addSpikeIn(infiles, outfile):
    '''
    add ERCC-92 spike in fasta sequences
    '''
    infile = " ".join(infiles)
    statement = '''
    cat %(infile)s > %(outfile)s
    '''
    P.run(statement)
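# The P.run() calls above follow the cgatcore convention: placeholders such as
# %(infile)s and %(outfile)s in the statement are filled in from the calling
# function's local variables (and PARAMS) before execution, and options like
# job_memory shape cluster submission. A minimal task in the same style (toy
# statement, not part of this pipeline):
from ruffus import transform, suffix
from cgatcore import pipeline as P


@transform("*.fa", suffix(".fa"), ".fa.gz")
def compressFasta(infile, outfile):
    '''gzip a fasta file, keeping the original.'''
    statement = '''gzip -c %(infile)s > %(outfile)s'''
    P.run(statement, job_memory="1G")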
def refseq_genes_to_regions(in_genes, out_pattern):
    """make regions (promoter, downstream, 5UTR, etc) from refseq_genes"""
    args = shlex.split('''%s --promoter_size=%s --promoter_extend=%s
                          --downstream_size=%s --downstream_extend=%s
                          --with_gene_name''' % (
        in_genes,
        cfg.get('genes', 'promoter_size'),
        cfg.get('genes', 'promoter_extend'),
        cfg.get('genes', 'downstream_size'),
        cfg.get('genes', 'downstream_extend')))
    makeGeneStructure.main(args)


@follows(refseq_genes_to_bed, convert_gtf_genes_to_bed)
@collate(call_peaks.all_peak_caller_functions + ['*.custom.peaks'],
         regex(r'(.+)\.treat\.(.+)\.peaks'),
         add_inputs('%s*_genes.all' % cfg.get('DEFAULT', 'genome')),
         r'\1.treat.\2.peaks.nearby.genes')
# add_inputs('%s*_genes.tss' % cfg.get('DEFAULT', 'genome')), r'\1.treat.\2.peaks.nearby.genes')
def find_nearby_genes(in_files, out_genes):
    """report which genes are within a certain distance of a peak"""
    in_peaks, in_genes = in_files[0]
    tmp_output = tempfile.NamedTemporaryFile(delete=False).name
    cmd = 'closestBed -a %s -b %s -t first -d > %s' % (in_peaks, in_genes,
                                                       tmp_output)
    sys_call(cmd)
    with open(tmp_output) as infile:
        with open(out_genes, 'w') as outfile:
            for line in infile:
                if not line:
                    continue
                fields = line.strip().split('\t')
                dist = int(fields[-1])
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name="rnapipe")

    # Get the details of the experiment (samples, config, inputs, ...)
    experiment = Experiment(state)

    # Get reference file locations
    reference_genome = state.config.get_options("reference_genome")
    gene_ref = state.config.get_options("gene_ref")

    # Print out samples
    sample_text = [s.info() for s in experiment.sample_list]
    logging.info("Analysis samples:\n{}".format("\n".join(sample_text)))

    # Stages are dependent on the state. Experiment object is also passed so
    # we can access metadata later.
    stages = PipelineStages(state, experiment=experiment)

    # Make directories
    output_dir = get_output_paths(
        results_dir=state.config.get_options("results_dir"),
        default_paths=OUTPUT_PATHS)
    make_output_dirs(output_dir)
    logging.debug(output_dir)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.do_nothing,
                       name="original_fastqs",
                       output=experiment.R1_files)

    # Create reference index for alignment
    if not experiment.index_provided:
        pipeline.originate(task_func=stages.do_nothing,
                           name="reference_genome",
                           output=reference_genome)

        if experiment.alignment_method == "star":
            # Create reference index for STAR
            pipeline.transform(
                task_func=stages.create_star_index,
                name="create_star_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(
                    output_dir["star_index"],
                    ["SA", "Genome", "genomeParameters.txt"]),
                extras=[output_dir["star_index"]])

        elif experiment.alignment_method == "hisat2":
            # Create reference index for HISAT2
            hisat_basename = path.join(output_dir["hisat_index"], "genome")
            pipeline.transform(
                task_func=stages.create_hisat_index,
                name="create_hisat_index",
                input=output_from("reference_genome"),
                filter=formatter(".*"),
                add_inputs=add_inputs(gene_ref),
                output=path_list_join(output_dir["hisat_index"],
                                      ["genome.1.ht2", "genome.2.ht2"]),
                extras=[hisat_basename])
    else:
        # Don't create index if index is supplied
        if experiment.alignment_method == "star":
            output_dir["star_index"] = state.config.get_options("star_index")
            pipeline.originate(
                task_func=stages.do_nothing,
                name="create_star_index",
                output=path_list_join(
                    output_dir["star_index"],
                    ["SA", "Genome", "genomeParameters.txt"]))
        elif experiment.alignment_method == "hisat2":
            hisat_basename = state.config.get_options("hisat_index")
            output_dir["hisat_index"] = path.dirname(hisat_basename)
            prefix = path.basename(hisat_basename)
            pipeline.originate(
                task_func=stages.do_nothing,
                name="create_hisat_index",
                output=path_list_join(
                    output_dir["hisat_index"],
                    ["{prefix}.1.ht2".format(prefix=prefix),
                     "{prefix}.2.ht2".format(prefix=prefix)]))

    # Pre-trim FastQC
    if experiment.paired_end:
        pipeline.transform(
            task_func=stages.fastqc,
            name="fastqc",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(
                output_dir["fastqc"],
                ["{sample[0]}_R1_fastqc.zip", "{sample[0]}_R2_fastqc.zip"]),
            extras=[output_dir["fastqc"]])
    else:
        pipeline.transform(task_func=stages.fastqc,
                           name="fastqc",
                           input=output_from("original_fastqs"),
                           filter=suffix(".fastq.gz"),
                           output="_fastqc.zip",
                           output_dir=output_dir["fastqc"],
                           extras=[output_dir["fastqc"]])

    # Trimmomatic
    if experiment.trim_reads and experiment.paired_end:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            # Get R1 file and the corresponding R2 file
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            add_inputs=add_inputs("{path[0]}/{sample[0]}_R2.fastq.gz"),
            output=path_list_join(
                output_dir["seq"],
                ["{sample[0]}_R1.trimmed.fastq.gz",
                 "{sample[0]}_R2.trimmed.fastq.gz"]),
            extras=path_list_join(
                output_dir["seq"],
                ["{sample[0]}_R1.unpaired.fastq.gz",
                 "{sample[0]}_R2.unpaired.fastq.gz"]))
    elif experiment.trim_reads:
        pipeline.transform(
            task_func=stages.trim_reads,
            name="trim_reads",
            input=output_from("original_fastqs"),
            filter=formatter(".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.fastq.gz"),
            output=path.join(output_dir["seq"],
                             "{sample[0]}_R1.trimmed.fastq.gz"))

    # Post-trim FastQC
    if experiment.paired_end and experiment.trim_reads:
        pipeline.transform(
            task_func=stages.fastqc,
            name="post_trim_fastqc",
            input=output_from("trim_reads"),
            filter=formatter(
                ".+/(?P<sample>[a-zA-Z0-9-_]+)_R1.trimmed.fastq.gz"),
            output=path_list_join(
                output_dir["post_trim_fastqc"],
                ["{sample[0]}_R1.trimmed_fastqc.gz",
                 "{sample[0]}_R2.trimmed_fastqc.gz"]),
            extras=["results/qc/post_trim_fastqc/"])
    elif experiment.trim_reads:
        pipeline.transform(task_func=stages.fastqc,
                           name="post_trim_fastqc",
                           input=output_from("trim_reads"),
                           filter=suffix(".trimmed.fastq.gz"),
                           output=".trimmed_fastqc.gz",
                           output_dir=output_dir["post_trim_fastqc"],
                           extras=[output_dir["post_trim_fastqc"]])

    # If there are technical replicates, each is mapped independently.
    # This is so each technical replicate maintains a separate read group.
    if experiment.alignment_method == "star":
        align_task_name = "star_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("trim_reads"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam"
                       % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
             ).follows("create_star_index")
        else:
            (pipeline.transform(
                task_func=stages.star_align,
                name=align_task_name,
                input=output_from("original_fastqs"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.star.Aligned.out.bam"
                       % output_dir["alignments"],
                extras=[output_dir["star_index"], "{sample[0]}"])
             ).follows("create_star_index")

    if experiment.alignment_method == "hisat2":
        align_task_name = "hisat_align"
        if experiment.trim_reads:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("trim_reads"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam"
                       % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
             ).follows("create_hisat_index")
        else:
            (pipeline.transform(
                task_func=stages.hisat_align,
                name="hisat_align",
                input=output_from("original_fastqs"),
                filter=formatter(
                    ".+/(?P<sample>[a-zA-Z0-9-_]+)_R[12](.trimmed)?.fastq.gz"),
                output="%s/{sample[0]}/{sample[0]}.hisat2.bam"
                       % output_dir["alignments"],
                extras=[hisat_basename, "{sample[0]}"])
             ).follows("create_hisat_index")

    # Sort BAM by coordinates
    pipeline.transform(
        task_func=stages.sort_bam_by_coordinate,
        name="sort_bam_by_coordinate",
        input=output_from(align_task_name),
        filter=formatter(
            ".+/(?P<sample>[a-zA-Z0-9-_]+)\.(?P<method>(star|hisat2))\..*bam"),
        output=["{path[0]}/{sample[0]}.{method[0]}.sorted.bam",
                "{path[0]}/{sample[0]}.{method[0]}.sorted.bam.bai"])

    # Merge files with the same sample name
    if experiment.multiple_technical_replicates:
        pipeline.collate(
            task_func=stages.merge_bams,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))
    else:
        pipeline.transform(
            task_func=stages.create_symlinks,
            name="merge_bams",
            input=output_from("sort_bam_by_coordinate"),
            filter=formatter(
                ".+/(SM_)?(?P<sm>[a-zA-Z0-9-]+)[^.]*\.(?P<method>(star|hisat2)).sorted.bam"),
            output=path_list_join(
                output_dir["alignments"],
                ["{sm[0]}.{method[0]}.bam", "{sm[0]}.{method[0]}.bam.bai"]))

    # Sort BAM by name for counting features
    pipeline.transform(task_func=stages.sort_bam_by_name,
                       name="sort_bam_by_name",
                       input=output_from("merge_bams"),
                       filter=suffix(".bam"),
                       output=".nameSorted.bam")

    # Count features with HTSeq-count
    pipeline.transform(task_func=stages.htseq_count,
                       name="htseq_count",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".htseq.txt")

    # Count features with featureCounts
    pipeline.transform(task_func=stages.featurecounts,
                       name="featurecounts",
                       input=output_from("sort_bam_by_name"),
                       filter=suffix(".nameSorted.bam"),
                       output_dir=output_dir["counts"],
                       output=".featureCounts.txt")

    # TODO: add multiqc step

    # # Stringtie assembly
    # pipeline.transform(
    #     task_func=stages.stringtie_assembly,
    #     name="stringtie_assembly",
    #     input=output_from("merge_bams"),
    #     filter=suffix(".bam"),
    #     output_dir=output_dir["stringtie_assembly"],
    #     output=".gtf")

    # Stringtie estimates
    pipeline.transform(
        task_func=stages.stringtie_estimates,
        name="stringtie_estimates",
        input=output_from("merge_bams"),
        filter=formatter(
            ".+/(?P<sm>[a-zA-Z0-9-]+)\.(?P<method>(star|hisat2)).bam"),
        output=path_list_join(
            output_dir["stringtie_estimates"],
            ["{sm[0]}/{sm[0]}.gtf", "{sm[0]}/e_data.ctab"]))

    # Stringtie counts
    pipeline.collate(
        task_func=stages.stringtie_prepDE,
        name="stringtie_prepDE",
        input=output_from("stringtie_estimates"),
        filter=formatter(".+\.gtf"),
        output=path_list_join(
            output_dir["stringtie_estimates"],
            ["gene_count_matrix.csv", "transcript_count_matrix.csv"]))

    return pipeline
    if filetype == "bam":
        preamble += "samtools index %(tmpfile)s && "
        postamble += " && rm %(tmpfile)s.bai "
    elif filetype == "bed.gz":
        tmp2 = P.get_temp_filename(shared=False)
        preamble += '''
            zcat %(tmpfile)s
            | sort -k1,1 -k2,2n
            | bgzip > %(tmp2)s &&
            mv %(tmp2)s %(tmpfile)s &&
            tabix -p bed %(tmpfile)s && '''
        postamble += "&& rm %(tmpfile)s.tbi"

    return preamble % locals(), postamble % locals(), tmpfile, filetype


# ------------------------------------------------------------------------------
@subdivide("*.categories.tsv",
           regex("(.+).categories.tsv"),
           add_inputs(PARAMS["geneset"]),
           r"\1_*.gtf.gz",
           r"\1")
def split_gtf_by_category(infiles, outfiles, catname):

    catfile, gtffile = infiles
    categories = pd.read_csv(catfile, index_col=0, squeeze=True, sep="\t")

    # create output filepool
    outpool = iotools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = iotools.open_file(gtffile)

    for gtfline in gtf.iterator(gtffile):
        try:
            transcript_id = gtfline.transcript_id
        except AttributeError:
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # Add "extra" arguments to the state (beyond the inputs and outputs)
        # which include the sample name. This is needed within the stage for
        # finding out sample specific configuration options
        extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}_{readid[0]}/{sample[0]}_{readid[0]}.bam')

    # Call variants using undr_rover
    pipeline.transform(
        task_func=stages.apply_undr_rover,
        name='apply_undr_rover',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # extras=['{sample[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        extras=['{sample[0]}', '{readid[0]}'],
        # The output file name is the sample name with a .vcf extension.
        output='variants/undr_rover/{sample[0]}_{readid[0]}.vcf')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # Index bam file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer_seq from BAM file
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
        .follows('index_bam'))

    ###### GATK VARIANT CALLING ######

    # Call variants using GATK
    pipeline.transform(
        task_func=stages.call_haplotypecaller_gatk,
        name='call_haplotypecaller_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-_]+).primary.primerclipped.bam'),
        output='variants/gatk/{sample[0]}.g.vcf')
        # .follows('index_sort_bam_picard'))

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_haplotypecaller_gatk'),
                   output='variants/gatk/ALL.combined.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.raw.vcf')

    # Annotate VCF file using GATK
    pipeline.transform(task_func=stages.variant_annotator_gatk,
                       name='variant_annotator_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.raw.vcf'),
                       output='.raw.annotate.vcf')

    # Apply VariantFiltration using GATK
    pipeline.transform(task_func=stages.apply_variant_filtration_gatk_lenient,
                       name='apply_variant_filtration_gatk_lenient',
                       input=output_from('variant_annotator_gatk'),
                       filter=suffix('.raw.annotate.vcf'),
                       output='.raw.annotate.filtered_lenient.vcf')

    return pipeline
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='complexo')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # Mark duplicates in the BAM file using Picard
    pipeline.transform(
        task_func=stages.mark_duplicates_picard,
        name='mark_duplicates_picard',
        input=output_from('sort_bam_picard'),
        filter=suffix('.sort.bam'),
        # XXX should make .metricsdup an extra output?
        output=['.sort.dedup.bam', '.metricsdup'])

    # Generate chromosome intervals using GATK
    pipeline.transform(task_func=stages.chrom_intervals_gatk,
                       name='chrom_intervals_gatk',
                       input=output_from('mark_duplicates_picard'),
                       filter=suffix('.sort.dedup.bam'),
                       output='.chr.intervals')

    # Local realignment using GATK
    (pipeline.transform(
        task_func=stages.local_realignment_gatk,
        name='local_realignment_gatk',
        input=output_from('chrom_intervals_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).chr.intervals'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.bam')
        .follows('mark_duplicates_picard'))

    # Base recalibration using GATK
    pipeline.transform(task_func=stages.base_recalibration_gatk,
                       name='base_recalibration_gatk',
                       input=output_from('local_realignment_gatk'),
                       filter=suffix('.sort.dedup.realn.bam'),
                       output=['.recal_data.csv', '.count_cov.log'])

    # Print reads using GATK
    (pipeline.transform(
        task_func=stages.print_reads_gatk,
        name='print_reads_gatk',
        input=output_from('base_recalibration_gatk'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9_]+).recal_data.csv'),
        add_inputs=add_inputs('{path[0]}/{sample[0]}.sort.dedup.realn.bam'),
        output='{path[0]}/{sample[0]}.sort.dedup.realn.recal.bam')
        .follows('local_realignment_gatk'))

    # Call variants using GATK
    pipeline.transform(task_func=stages.call_variants_gatk,
                       name='call_variants_gatk',
                       input=output_from('print_reads_gatk'),
                       filter=suffix('.sort.dedup.realn.recal.bam'),
                       output='.raw.snps.indels.g.vcf')

    # Combine G.VCF files for all samples using GATK
    pipeline.merge(task_func=stages.combine_gvcf_gatk,
                   name='combine_gvcf_gatk',
                   input=output_from('call_variants_gatk'),
                   output='COMPLEXO.mergedgvcf.vcf')

    # Genotype G.VCF files using GATK
    pipeline.transform(task_func=stages.genotype_gvcf_gatk,
                       name='genotype_gvcf_gatk',
                       input=output_from('combine_gvcf_gatk'),
                       filter=suffix('.mergedgvcf.vcf'),
                       output='.genotyped.vcf')

    # SNP recalibration using GATK
    pipeline.transform(task_func=stages.snp_recalibrate_gatk,
                       name='snp_recalibrate_gatk',
                       input=output_from('genotype_gvcf_gatk'),
                       filter=suffix('.genotyped.vcf'),
                       output=['.snp_recal', '.snp_tranches', '.snp_plots.R'])

    # INDEL recalibration using GATK
    pipeline.transform(
        task_func=stages.indel_recalibrate_gatk,
        name='indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        output=['.indel_recal', '.indel_tranches', '.indel_plots.R'])

    # Apply SNP recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_snp_recalibrate_gatk,
        name='apply_snp_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(['COMPLEXO.snp_recal', 'COMPLEXO.snp_tranches']),
        output='.recal_SNP.vcf')
        .follows('snp_recalibrate_gatk'))

    # Apply INDEL recalibration using GATK
    (pipeline.transform(
        task_func=stages.apply_indel_recalibrate_gatk,
        name='apply_indel_recalibrate_gatk',
        input=output_from('genotype_gvcf_gatk'),
        filter=suffix('.genotyped.vcf'),
        add_inputs=add_inputs(
            ['COMPLEXO.indel_recal', 'COMPLEXO.indel_tranches']),
        output='.recal_INDEL.vcf')
        .follows('indel_recalibrate_gatk'))

    # Combine variants using GATK
    (pipeline.transform(
        task_func=stages.combine_variants_gatk,
        name='combine_variants_gatk',
        input=output_from('apply_snp_recalibrate_gatk'),
        filter=suffix('.recal_SNP.vcf'),
        add_inputs=add_inputs(['COMPLEXO.recal_INDEL.vcf']),
        output='.combined.vcf')
        .follows('apply_indel_recalibrate_gatk'))

    # Select variants using GATK
    pipeline.transform(task_func=stages.select_variants_gatk,
                       name='select_variants_gatk',
                       input=output_from('combine_variants_gatk'),
                       filter=suffix('.combined.vcf'),
                       output='.selected.vcf')

    return pipeline
    with open(input_file_name) as input_file:
        for sentences, label in json.loads(input_file.read()):
            for words in sentences:
                dictionary.update(words)

    dictionary = list(sorted(
        w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        json.dump(dictionary, output_file)
        output_file.write("\n")


@ruffus.follows(build_word_dictionary)
@ruffus.transform(clean_data,
                  ruffus.suffix(".json"),
                  ruffus.add_inputs(r"\1.dictionary.json"),
                  ".projected.json")
def project_sentences(input_file_names, output_file_name):
    review_file_name, dictionary_file_name = input_file_names

    with open(review_file_name) as review_file:
        reviews = json.load(review_file)

    dictionary_file_name = dictionary_file_name.replace('test', 'train')
    dictionary_file_name = dictionary_file_name.replace('unsup', 'train')

    with open(dictionary_file_name) as dictionary_file:
        dictionary = json.load(dictionary_file)

    def project_sentence(s):
        return [w if w in dictionary else "UNKNOWN" for w in s]
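A hedged sketch of how decorator-style tasks such as build_word_dictionary and project_sentences above are usually launched; the target task and process count below are illustrative, not taken from the original script.

if __name__ == "__main__":
    # pipeline_run() resolves the dependencies declared by the decorators and
    # runs every task needed to bring the listed targets up to date.
    ruffus.pipeline_run([project_sentences], multiprocess=2, verbose=1)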
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='hiplexpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R1_001.fastq
        # new sample name = OHI031002-P02F04
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)-(?P<tumor>[TN]+)_(?P<readid>[a-zA-Z0-9-]+)_(?P<lane>[a-zA-Z0-9]+)_R1_(?P<lib>[a-zA-Z0-9-:]+).fastq'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        # Hi-Plex example: OHI031002-P02F04_S318_L001_R2_001.fastq
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}-{tumor[0]}_{readid[0]}_{lane[0]}_R2_{lib[0]}.fastq'),
        # Add "extra" arguments to the state (beyond the inputs and outputs)
        # which include the sample name. This is needed within the stage for
        # finding out sample specific configuration options
        extras=['{sample[0]}', '{tumor[0]}', '{readid[0]}', '{lane[0]}', '{lib[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='alignments/{sample[0]}/{sample[0]}_{tumor[0]}.bam')

    # Sort the BAM file using Picard
    pipeline.transform(task_func=stages.sort_bam_picard,
                       name='sort_bam_picard',
                       input=output_from('align_bwa'),
                       filter=suffix('.bam'),
                       output='.sort.bam')

    # High quality and primary alignments
    pipeline.transform(task_func=stages.primary_bam,
                       name='primary_bam',
                       input=output_from('sort_bam_picard'),
                       filter=suffix('.sort.bam'),
                       output='.primary.bam')

    # Index bam file
    pipeline.transform(task_func=stages.index_sort_bam_picard,
                       name='index_bam',
                       input=output_from('primary_bam'),
                       filter=suffix('.primary.bam'),
                       output='.primary.bam.bai')

    # Clip the primer_seq from BAM file
    (pipeline.transform(
        task_func=stages.clip_bam,
        name='clip_bam',
        input=output_from('primary_bam'),
        filter=suffix('.primary.bam'),
        output='.primary.primerclipped.bam')
        .follows('index_bam'))

    ###### GATK VARIANT CALLING - MuTect2 ######

    # Call somatic variants using MuTect2
    pipeline.transform(
        task_func=stages.call_mutect2_gatk,
        name='call_mutect2_gatk',
        input=output_from('clip_bam'),
        # filter=suffix('.merged.dedup.realn.bam'),
        filter=formatter(
            '.+/(?P<sample>[a-zA-Z0-9-]+)_T.primary.primerclipped.bam'),
        add_inputs=add_inputs(
            '{path[0]}/{sample[0]}_N.primary.primerclipped.bam'),
        # extras=['{sample[0]}'],
        output='variants/mutect2/{sample[0]}.mutect2.vcf')
        # .follows('clip_bam')

    ###### GATK VARIANT CALLING - MuTect2 ######

    # -------- VEP ----------

    # Apply NORM
    (pipeline.transform(
        task_func=stages.apply_vt,
        name='apply_vt',
        input=output_from('call_mutect2_gatk'),
        filter=suffix('.mutect2.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vcf')
        .follows('call_mutect2_gatk'))

    # Apply VEP
    (pipeline.transform(
        task_func=stages.apply_vep,
        name='apply_vep',
        input=output_from('apply_vt'),
        filter=suffix('.mutect2.vt.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.vt.vep.vcf')
        .follows('apply_vt'))

    # Apply vcfanno
    (pipeline.transform(
        task_func=stages.apply_vcfanno,
        name='apply_vcfanno',
        input=output_from('apply_vep'),
        filter=suffix('.mutect2.vt.vep.vcf'),
        # add_inputs=add_inputs(['variants/ALL.indel_recal', 'variants/ALL.indel_tranches']),
        output='.mutect2.annotated.vcf')
        .follows('apply_vep'))

    return pipeline
    cmd = 'MosaikJump -ia %s -out %s -hs %s' % (
        in_dat, out_jump_base, cfg.getint('mapping', 'mosaik_hash_size'))
    sys_call(cmd)


@active_if(cfg.getboolean('mapping', 'run_mosaik'))
@transform(preprocessing.final_output, suffix(''), '.mosaik_reads_dat')
def run_mosaik_build_reads(in_fastq, out_dat):
    'convert reads to mosaik binary'
    cmd = 'MosaikBuild -q %s -out %s -st illumina' % (in_fastq, out_dat)
    sys_call(cmd)


@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(run_mosaik_build_reads, suffix('.mosaik_reads_dat'),
           add_inputs(run_mosaik_build_reference, run_mosiak_jump_reference),
           '.mosaik_align_dat')
def run_mosaik_align(in_files, out_align):
    'align reads to reference using MosaikAligner'
    # Example:
    # MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat
    #   -out sequence_archives/c_elegans_chr2_test_aligned.dat
    #   -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    in_reads, in_genome_dat, in_genome_jump, _, _ = in_files
    in_genome_jump = in_genome_jump.replace('_keys.jmp', '')
    cmd = 'MosaikAligner -in %s -ia %s -j %s -out %s -hs %s %s'
    cmd = cmd % (in_reads, in_genome_dat, in_genome_jump, out_align,
                 cfg.getint('mapping', 'mosaik_hash_size'),
                 cfg.get('mapping', 'mosaik_params'))
    sys_call(cmd)


@transform(run_mosaik_align, suffix('.mosaik_align_dat'), '.mosaik_align_sam')
def mosaik_to_sam(in_dat, out_sam):
def make_pipeline(state):
    '''Build the pipeline by constructing stages and connecting them together'''
    # Build an empty pipeline
    pipeline = Pipeline(name='crpipe')
    # Get a list of paths to all the FASTQ files
    fastq_files = state.config.get_option('fastqs')
    # Find the path to the reference genome
    # Stages are dependent on the state
    stages = Stages(state)

    # The original FASTQ files
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    pipeline.originate(task_func=stages.original_fastqs,
                       name='original_fastqs',
                       output=fastq_files)

    # Convert FASTQ file to FASTA using fastx toolkit
    # pipeline.transform(
    #     task_func=stages.fastq_to_fasta,
    #     name='fastq_to_fasta',
    #     input=output_from('original_fastqs'),
    #     filter=suffix('.fastq.gz'),
    #     output='.fasta')

    # The original reference file
    # This is a dummy stage. It is useful because it makes a node in the
    # pipeline graph, and gives the pipeline an obvious starting point.
    # pipeline.originate(
    #     task_func=stages.original_reference,
    #     name='original_reference',
    #     output=reference_file)

    # Run fastQC on the FASTQ files
    pipeline.transform(task_func=stages.fastqc,
                       name='fastqc',
                       input=output_from('original_fastqs'),
                       filter=suffix('.fastq.gz'),
                       output='_fastqc')

    # Index the reference using BWA
    # pipeline.transform(
    #     task_func=stages.index_reference_bwa,
    #     name='index_reference_bwa',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output=['.fa.amb', '.fa.ann', '.fa.pac', '.fa.sa', '.fa.bwt'])

    # Index the reference using samtools
    # pipeline.transform(
    #     task_func=stages.index_reference_samtools,
    #     name='index_reference_samtools',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.fa.fai')

    # Index the reference using bowtie 2
    # pipeline.transform(
    #     task_func=stages.index_reference_bowtie2,
    #     name='index_reference_bowtie2',
    #     input=output_from('original_reference'),
    #     filter=formatter('.+/(?P<refname>[a-zA-Z0-9]+\.fa)'),
    #     output=['{path[0]}/{refname[0]}.1.bt2',
    #             '{path[0]}/{refname[0]}.2.bt2',
    #             '{path[0]}/{refname[0]}.3.bt2',
    #             '{path[0]}/{refname[0]}.4.bt2',
    #             '{path[0]}/{refname[0]}.rev.1.bt2',
    #             '{path[0]}/{refname[0]}.rev.2.bt2'],
    #     extras=['{path[0]}/{refname[0]}'])

    # Create a FASTA sequence dictionary for the reference using picard
    # pipeline.transform(
    #     task_func=stages.reference_dictionary_picard,
    #     name='reference_dictionary_picard',
    #     input=output_from('original_reference'),
    #     filter=suffix('.fa'),
    #     output='.dict')

    # Align paired end reads in FASTQ to the reference producing a BAM file
    pipeline.transform(
        task_func=stages.align_bwa,
        name='align_bwa',
        input=output_from('original_fastqs'),
        # Match the R1 (read 1) FASTQ file and grab the path and sample name.
        # This will be the first input to the stage.
        # We assume the sample name may consist of only alphanumeric
        # characters.
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fastq.gz'),
        # Add one more input to the stage:
        # 1. The corresponding R2 FASTQ file
        add_inputs=add_inputs('{path[0]}/{sample[0]}_R2.fastq.gz'),
        # Add an "extra" argument to the state (beyond the inputs and outputs)
        # which is the sample name. This is needed within the stage for finding
        # out sample specific configuration options
        extras=['{sample[0]}'],
        # The output file name is the sample name with a .bam extension.
        output='{path[0]}/{sample[0]}.bam')

    # Sort alignment with sambamba
    pipeline.transform(task_func=stages.sort_bam_sambamba,
                       name='sort_alignment',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.sorted.bam')

    # Extract MMR genes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_genes_bedtools,
        name='extract_genes_bedtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.mmr.bam')

    # Extract selected chromosomes from the sorted BAM file
    pipeline.transform(
        task_func=stages.extract_chromosomes_samtools,
        name='extract_chromosomes_samtools',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.chroms.bam')

    # Index the MMR genes bam file with samtools
    pipeline.transform(task_func=stages.index_bam,
                       name='index_mmr_alignment',
                       input=output_from('extract_genes_bedtools'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).mmr.bam'),
                       output='{path[0]}/{sample[0]}.mmr.bam.bai')

    # Compute depth of coverage of the alignment with GATK DepthOfCoverage
    # pipeline.transform(
    #     task_func=stages.alignment_coverage_gatk,
    #     name='alignment_coverage_gatk',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs([reference_file]),
    #     output='{path[0]}/{sample[0]}.coverage_summary',
    #     extras=['{path[0]}/{sample[0]}_coverage'])

    # Index the alignment with samtools
    pipeline.transform(
        task_func=stages.index_bam,
        name='index_alignment',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        output='{path[0]}/{sample[0]}.sorted.bam.bai')

    # Generate alignment stats with bamtools
    pipeline.transform(task_func=stages.bamtools_stats,
                       name='bamtools_stats',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.stats.txt')

    # Extract the discordant paired-end alignments
    pipeline.transform(task_func=stages.extract_discordant_alignments,
                       name='extract_discordant_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.discordants.unsorted.bam')

    # Extract split-read alignments
    pipeline.transform(task_func=stages.extract_split_read_alignments,
                       name='extract_split_read_alignments',
                       input=output_from('align_bwa'),
                       filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).bam'),
                       output='{path[0]}/{sample[0]}.splitters.unsorted.bam')

    # Sort discordant reads.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_discordants',
        input=output_from('extract_discordant_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.discordants'],
        output='{path[0]}/{sample[0]}.discordants.bam')

    # Index the sorted discordant bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_discordants',
    #     input=output_from('sort_discordants'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).discordants.bam'),
    #     output='{path[0]}/{sample[0]}.discordants.bam.bai')

    # Sort split-read alignments.
    # Samtools annoyingly takes the prefix of the output bam name as its argument.
    # So we pass this as an extra argument. However Ruffus needs to know the full name
    # of the output bam file, so we pass that as the normal output parameter.
    pipeline.transform(
        task_func=stages.sort_bam,
        name='sort_splitters',
        input=output_from('extract_split_read_alignments'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.unsorted.bam'),
        extras=['{path[0]}/{sample[0]}.splitters'],
        output='{path[0]}/{sample[0]}.splitters.bam')

    # Index the sorted splitters bam with samtools
    # pipeline.transform(
    #     task_func=stages.index_bam,
    #     name='index_splitters',
    #     input=output_from('sort_splitters'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).splitters.bam'),
    #     output='{path[0]}/{sample[0]}.splitters.bam.bai')

    # Call structural variants with lumpy
    (pipeline.transform(
        task_func=stages.structural_variants_lumpy,
        name='structural_variants_lumpy',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        add_inputs=add_inputs(['{path[0]}/{sample[0]}.splitters.bam',
                               '{path[0]}/{sample[0]}.discordants.bam']),
        output='{path[0]}/{sample[0]}.lumpy.vcf')
        .follows('index_alignment')
        .follows('sort_splitters')
        .follows('sort_discordants'))

    # Call genotypes on lumpy output using SVTyper
    # (pipeline.transform(
    #     task_func=stages.genotype_svtyper,
    #     name='genotype_svtyper',
    #     input=output_from('structural_variants_lumpy'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).lumpy.vcf'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.sorted.bam',
    #                            '{path[0]}/{sample[0]}.splitters.bam']),
    #     output='{path[0]}/{sample[0]}.svtyper.vcf')
    #     .follows('align_bwa')
    #     .follows('sort_splitters')
    #     .follows('index_alignment')
    #     .follows('index_splitters')
    #     .follows('index_discordants'))

    # Call SVs with Socrates
    (pipeline.transform(
        task_func=stages.structural_variants_socrates,
        name='structural_variants_socrates',
        input=output_from('sort_alignment'),
        filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
        # output goes to {path[0]}/socrates/
        output='{path[0]}/socrates/results_Socrates_paired_{sample[0]}.sorted_long_sc_l25_q5_m5_i95.txt',
        extras=['{path[0]}']))

    # Call DELs with DELLY
    pipeline.merge(task_func=stages.deletions_delly,
                   name='deletions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DEL.vcf')

    # Call DUPs with DELLY
    pipeline.merge(task_func=stages.duplications_delly,
                   name='duplications_delly',
                   input=output_from('sort_alignment'),
                   output='delly.DUP.vcf')

    # Call INVs with DELLY
    pipeline.merge(task_func=stages.inversions_delly,
                   name='inversions_delly',
                   input=output_from('sort_alignment'),
                   output='delly.INV.vcf')

    # Call TRAs with DELLY
    pipeline.merge(task_func=stages.translocations_delly,
                   name='translocations_delly',
                   input=output_from('sort_alignment'),
                   output='delly.TRA.vcf')

    # Join both read pair files using gustaf_mate_joining
    # pipeline.transform(
    #     task_func=stages.gustaf_mate_joining,
    #     name='gustaf_mate_joining',
    #     input=output_from('fastq_to_fasta'),
    #     # Match the R1 (read 1) FASTA file and grab the path and sample name.
    #     # This will be the first input to the stage.
    #     # We assume the sample name may consist of only alphanumeric
    #     # characters.
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+)_R1.fasta'),
    #     # Add one more input to the stage:
    #     # 1. The corresponding R2 FASTA file
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}_R2.fasta']),
    #     output='{path[0]}/{sample[0]}.joined_mates.fasta')

    # Call structural variants with pindel
    # (pipeline.transform(
    #     task_func=stages.structural_variants_pindel,
    #     name='structural_variants_pindel',
    #     input=output_from('sort_alignment'),
    #     filter=formatter('.+/(?P<sample>[a-zA-Z0-9]+).sorted.bam'),
    #     add_inputs=add_inputs(['{path[0]}/{sample[0]}.pindel_config.txt', reference_file]),
    #     output='{path[0]}/{sample[0]}.pindel')
    #     .follows('index_reference_bwa')
    #     .follows('index_reference_samtools'))

    return pipeline
import re
import os

from ruffus import transform, regex, suffix, follows, formatter, add_inputs, merge

from cgatcore import pipeline as P
from cgatcore import iotools

# load options from the config file
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
    "../pipeline.yml",
    "pipeline.yml"])


# -----------------------------------------------------------------------------
@transform("*.bam",
           formatter(),
           add_inputs(PARAMS["regions_of_high_mappability"],
                      PARAMS["regions_of_low_mappability"],
                      PARAMS["regions_of_interest"]),
           r"filtered_bams.dir/{basename[0]}.bam")
def filter_bamfiles(infiles, outfile):

    inreads, high_mapability, low_mapability, roi = infiles

    # remove reads that fall in regions of low or high mappability
    statement = '''
        bedtools intersect -abam %(inreads)s -b %(high_mapability)s -v
        | bedtools intersect -abam - -b %(low_mapability)s -v '''

    # keep only reads in regions of interest, if specified
    if roi:
        statement += ''' | bedtools intersect -u -abam - -b %(roi)s'''
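The fragment above is truncated. As a hedged completion, CGAT-style pipelines normally redirect a statement assembled like this to the task's output file and dispatch it with cgatcore's P.run(), which interpolates the %(...)s placeholders from the caller's local variables; the redirection target below is illustrative rather than taken from the original file.

    # Hedged completion (not in the original, truncated snippet): write the
    # filtered reads to the task's outfile and execute the shell statement.
    statement += ''' > %(outfile)s '''
    P.run(statement)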