def delete_fastqs(self):
    """
    Delete fastqs when all callers' jobs are finished.

    One delete job is created per sample; its input_files list every
    caller's final result file so the job is only scheduled once all
    callers have finished for that sample. A single delete_bams job,
    shared across all samples, is appended at the end.
    """
    jobs = []
    for sample in self.samples:
        # Final result file produced by each fusion caller for this sample
        defuse_result = os.path.join("fusions", "defuse", sample.name,
                                     "results.filtered.tsv")
        fusionmap_result = os.path.join("fusions", "fusionmap", sample.name,
                                        "02_RNA.FusionReport.txt")
        ericscript_result = os.path.join("fusions", "ericscript", sample.name,
                                         "fusion.results.filtered.tsv")
        integrate_result = os.path.join("fusions", "integrate", sample.name,
                                        "breakpoints.cov.tsv")
        star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
                                        "out_STAR-SEQR_candidates.txt")
        arriba_result = os.path.join("fusions", "arriba", sample.name,
                                     "fusions.tsv")
        star_fusion_result = os.path.join(
            "fusions", "star_fusion", sample.name,
            "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
        cicero_result = os.path.join("fusions", "cicero", sample.name,
                                     "final_fusions.txt")
        # result_file_list = [defuse_result, fusionmap_result, ericscript_result, integrate_result,
        #                     star_seqr_result, arriba_result, star_fusion_result]
        # NOTE(review): only the defuse and fusionmap results are passed to
        # the delete helper; the commented-out list above suggests more tools
        # were once included — confirm this subset is intentional.
        result_file_list = [defuse_result, fusionmap_result]
        del_job = delete_fastqs.delete_fastqs(sample.name, result_file_list)
        job = concat_jobs([Job(command="mkdir -p delete_fastqs"), del_job],
                          name="delete_fastqs." + sample.name)
        # job = concat_jobs([
        #     Job(command="mkdir -p delete_fastqs")
        # ], name="delete_fastqs." + sample.name)
        # Gate deletion on ALL callers' outputs, not just the two passed to
        # the delete helper above.
        job.input_files = [
            defuse_result, fusionmap_result, ericscript_result,
            integrate_result, star_seqr_result, arriba_result,
            star_fusion_result, cicero_result
        ]
        jobs.append(job)
    # DELETE BAMS JOB (one across all samples)
    # NOTE(review): result_file_list leaks out of the loop, so delete_bams
    # depends on the LAST sample's defuse/fusionmap results only (and this
    # raises NameError if self.samples is empty) — confirm this is intended.
    del_bams_job = concat_jobs(
        [delete_fastqs.delete_bams(result_file_list, self._output_dir)],
        name="delete_bams")
    jobs.append(del_bams_job)
    return jobs
def gunzip_fastq(self):
    """
    Gunzip .fastq.gz files or symlink if already uncompressed.

    For each readset, candidate FASTQ pairs are collected in priority
    order — explicit FASTQs first, then BAM-derived and CRAM-derived
    picard_sam_to_fastq outputs — and the first available pair is
    decompressed into fusions/gunzip_fastq/<sample>.
    """
    jobs = []
    for readset in self.readsets:
        out_dir = os.path.join("fusions", "gunzip_fastq",
                               readset.sample.name)
        # Only paired-end readsets are supported by this pipeline
        if readset.run_type != "PAIRED_END":
            raise Exception("Error: run type \"" + readset.run_type +
                            "\" is invalid for readset \"" + readset.name +
                            "\" (should be PAIRED_END)!")
        candidates = []
        if readset.fastq1 and readset.fastq2:
            candidates.append([readset.fastq1, readset.fastq2])
        if readset.bam:
            picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                      readset.sample.name)
            candidates.append([
                os.path.join(
                    picard_dir,
                    os.path.basename(
                        re.sub(r"\.bam$", ".pair1.fastq.gz", readset.bam))),
                os.path.join(
                    picard_dir,
                    os.path.basename(
                        re.sub(r"\.bam$", ".pair2.fastq.gz", readset.bam)))
            ])
        if readset.cram:
            picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                      readset.sample.name)
            candidates.append([
                os.path.join(picard_dir,
                             os.path.basename(readset.cram) +
                             ".pair1.fastq.gz"),
                os.path.join(picard_dir,
                             os.path.basename(readset.cram) +
                             ".pair2.fastq.gz")
            ])
        fastq1, fastq2 = self.select_input_files(candidates)
        unzip_r1 = gunzip.gunzip_fastq(fastq1, out_dir)
        unzip_r2 = gunzip.gunzip_fastq(fastq2, out_dir)
        jobs.append(
            concat_jobs(
                [Job(command="mkdir -p " + out_dir), unzip_r1, unzip_r2],
                name="gunzip_fastq." + readset.sample.name + "." +
                readset.name))
    return jobs
def merge_fastq(self):
    """
    Merge paired-end FASTQs of the same sample.

    Only samples with more than one readset get a merge job. Inputs are
    the per-readset FASTQs under fusions/gunzip_fastq/<sample> (named
    after the BAM for BAM-derived readsets, or after the original FASTQ
    with any .gz suffix stripped), and the merged pair is written back
    to the same directory.

    Raises:
        Exception: if a readset provides neither a BAM nor FASTQ input.
    """
    jobs = []
    for sample in self.samples:
        if len(sample.readsets) <= 1:
            # Nothing to merge for single-readset samples
            continue
        input_dir = os.path.join("fusions", "gunzip_fastq", sample.name)
        fastq1_list = []
        fastq2_list = []
        for readset in sample.readsets:
            if readset.bam:
                # BAM-derived readsets use picard_sam_to_fastq naming
                fastq1 = os.path.join(
                    input_dir,
                    os.path.basename(
                        re.sub(r"\.bam$", ".pair1.fastq", readset.bam)))
                fastq2 = os.path.join(
                    input_dir,
                    os.path.basename(
                        re.sub(r"\.bam$", ".pair2.fastq", readset.bam)))
            elif readset.fastq1:
                if readset.fastq1.endswith(".gz"):
                    # gzipped inputs were uncompressed by gunzip_fastq
                    fastq1 = os.path.join(
                        input_dir,
                        os.path.basename(re.sub(r"\.gz$", "",
                                                readset.fastq1)))
                    fastq2 = os.path.join(
                        input_dir,
                        os.path.basename(re.sub(r"\.gz$", "",
                                                readset.fastq2)))
                else:
                    # input files are plain fastqs
                    fastq1 = os.path.join(input_dir,
                                          os.path.basename(readset.fastq1))
                    fastq2 = os.path.join(input_dir,
                                          os.path.basename(readset.fastq2))
            else:
                # BUGFIX: previously a bare "raise Exception" with no message
                raise Exception(
                    "Error: no BAM or FASTQ input found for readset \"" +
                    readset.name + "\"!")
            fastq1_list.append(fastq1)
            fastq2_list.append(fastq2)
        merge_fastq_job = merge_fastq.merge_fastq(fastq1_list, fastq2_list,
                                                  input_dir)
        jobs.append(
            concat_jobs([merge_fastq_job],
                        name="merge_fastq." + sample.name))
    return jobs
def run_star_seqr(self):
    """
    RNA Fusion Detection and Quantification using STAR
    https://github.com/ExpressionAnalysis/STAR-SEQR

    One STAR-SEQR job per sample. FASTQ inputs come either from the
    picard_sam_to_fastq output (BAM-based readsets) or directly from the
    readset's gzipped FASTQ files.

    Raises:
        Exception: if a sample has more than one readset, or its input
            is neither a BAM nor gzipped FASTQ files.
    """
    jobs = []
    for sample in self.samples:
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input: FASTQs were generated by picard_sam_to_fastq
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                     sample.name)
            bam = sample.readsets[0].bam
            left_fastq = os.path.join(
                fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            # BUGFIX: previously reused ".pair1.fastq.gz" here, so both
            # mates pointed at the same (R1) file.
            right_fastq = os.path.join(
                fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(
                ".")[-1] == "gz":
            # gzipped FASTQ input
            left_fastq = sample.readsets[0].fastq1
            right_fastq = sample.readsets[0].fastq2
        else:
            raise Exception(
                "Error: only .bam and .fastq.gz inputs allowed")
        output_dir = os.path.join("fusions", "star_seqr", sample.name)
        job = concat_jobs([
            Job(command="mkdir -p " + output_dir),
            star_seqr.run(left_fastq,
                          right_fastq,
                          output_dir,
                          sample.name,
                          keep_bam=self.args.keep_bams)
        ],
                          name="run_star_seqr." + sample.name)
        job.samples = [sample]
        jobs.append(job)
    return jobs
def integrate_make_result_file(self):
    """
    Merge information from breakpoints.tsv and reads.txt.

    Produces one result-merging job per sample over the INTEGRATE
    output directory.
    """
    jobs = []
    for sample in self.samples:
        sample_dir = os.path.join("fusions", "integrate", sample.name)
        jobs.append(
            concat_jobs([integrate.make_result_file(sample_dir)],
                        name="integrate_make_result." + sample.name))
    return jobs
def MetaFusion_IsoHunter(self):
    """
    Run MetaFusion.IsoHunter.

    Launches the IsoHunter singularity wrapper over the pipeline output
    directory, writing under fusions/metafusion_isohunter.
    """
    isohunter_outdir = os.path.join("fusions", "metafusion_isohunter")
    runner = metafusion_isohunter.run_isohunter_singularity(
        self._output_dir)
    job = concat_jobs(
        [Job(command="mkdir -p " + isohunter_outdir), runner],
        name="MetaFusion.IsoHunter")
    return [job]
def MetaFusion_clinical(self):
    """
    Run MetaFusion.IsoHunter.clinical.

    Launches the clinical MetaFusion singularity wrapper over the
    pipeline output directory with the configured database, writing
    under fusions/metafusion_clinical.
    """
    clinical_outdir = os.path.join("fusions", "metafusion_clinical")
    runner = metafusion_clinical.run_metafusion_clinical(
        self._output_dir, self.args.database)
    job = concat_jobs(
        [Job(command="mkdir -p " + clinical_outdir), runner],
        name="MetaFusion.clinical")
    return [job]
def tophat2(self):
    """
    Run Tophat2 for Integrate.

    Determines accepted hits and unmapped reads, and outputs
    corresponding .bam files required as input files for the integrate
    step. One job per sample.
    """
    jobs = []
    for sample in self.samples:
        reads_1, reads_2 = self.select_input_fastq(sample)
        align_dir = os.path.join(self.output_dir, "fusions", "tophat2",
                                 sample.name)
        jobs.append(
            concat_jobs([
                Job(command="mkdir -p " + align_dir),
                tophat2.tophat2(reads_1, reads_2, align_dir)
            ],
                        name="tophat2." + sample.name))
    return jobs
def MetaFusion(self):
    """
    Run MetaFusion.

    Launches the MetaFusion singularity wrapper, writing results under
    fusions/metafusion (absolute path, rooted at the pipeline output
    directory).
    """
    # Removed unused local cff_dir_abspath (fusions/cff path was computed
    # but never referenced in this step).
    out_dir_abspath = os.path.join(self._output_dir, "fusions",
                                   "metafusion")
    metafusion_job = metafusion.run_metafusion_singularity(out_dir_abspath)
    job = concat_jobs(
        [Job(command="mkdir -p " + out_dir_abspath), metafusion_job],
        name="MetaFusion")
    return [job]
def run_arriba(self):
    """
    Run Arriba fusion calling on each sample.

    The job cd's into the per-sample output directory before running
    arriba and returns to the pipeline output directory afterwards.
    FASTQ inputs come either from the picard_sam_to_fastq output
    (BAM-based readsets) or directly from gzipped readset FASTQs.

    Raises:
        Exception: if a sample has more than one readset, or its input
            is neither a BAM nor gzipped FASTQ files.
    """
    jobs = []
    for sample in self.samples:
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input: FASTQs were generated by picard_sam_to_fastq
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                     sample.name)
            bam = sample.readsets[0].bam
            left_fastq = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            # BUGFIX: previously reused ".pair1.fastq.gz" here, so both
            # mates pointed at the same (R1) file.
            right_fastq = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(
                ".")[-1] == "gz":
            left_fastq = sample.readsets[0].fastq1
            right_fastq = sample.readsets[0].fastq2
        else:
            raise Exception(
                "Error: only .bam and .fastq.gz inputs allowed")
        output_dir = os.path.join("fusions", "arriba", sample.name)
        # JOBS
        chgdir_job = Job(command="cd " + output_dir)
        back_to_outdir_job = Job(command="cd " + self._output_dir)
        # CONCAT
        job = concat_jobs([
            Job(command="mkdir -p " + output_dir), chgdir_job,
            arriba.run(left_fastq,
                       right_fastq,
                       self._output_dir,
                       output_dir,
                       keep_bam=self.args.keep_bams), back_to_outdir_job
        ],
                          name="run_arriba." + sample.name)
        job.samples = [sample]
        jobs.append(job)
    return jobs
def defuse(self):
    """
    Run Defuse to call gene fusions.

    One job per sample: creates the output directory, then runs deFuse
    on the sample's selected FASTQ pair.
    """
    jobs = []
    for sample in self.samples:
        reads_1, reads_2 = self.select_input_fastq(sample)
        result_dir = os.path.join("fusions", "defuse", sample.name)
        caller_job = defuse.defuse(reads_1,
                                   reads_2,
                                   result_dir,
                                   keep_bam=self.args.keep_bams)
        jobs.append(
            concat_jobs([Job(command="mkdir -p " + result_dir), caller_job],
                        name="defuse." + sample.name))
    return jobs
def chimerascan(self):
    """
    Run chimerascan to call gene fusions.

    The job creates the parent path and then removes the per-sample
    output directory (clearing any previous results — presumably the
    tool recreates it) before running chimerascan.
    """
    jobs = []
    for sample in self.samples:
        reads_1, reads_2 = self.select_input_fastq(sample)
        result_dir = os.path.join("fusions", "chimerascan", sample.name)
        steps = [
            Job(command="mkdir -p " + result_dir),
            Job(command="rm -r " + result_dir),
            chimerascan.run(reads_1, reads_2, result_dir)
        ]
        jobs.append(concat_jobs(steps, name="chimerascan." + sample.name))
    return jobs
def ericscript(self):
    """
    Run EricScript to call gene fusions.

    The job creates the parent path and then removes the per-sample
    output directory (clearing any previous results — presumably the
    tool recreates it) before running EricScript.
    """
    jobs = []
    for sample in self.samples:
        reads_1, reads_2 = self.select_input_fastq(sample)
        result_dir = os.path.join("fusions", "ericscript", sample.name)
        caller_job = ericscript.ericscript(reads_1,
                                           reads_2,
                                           result_dir,
                                           keep_bam=self.args.keep_bams)
        steps = [
            Job(command="mkdir -p " + result_dir),
            Job(command="rm -r " + result_dir),
            caller_job
        ]
        jobs.append(concat_jobs(steps, name="ericscript." + sample.name))
    return jobs
def integrate(self):
    """
    Run Integrate to call gene fusions.

    Uses accepted_hits.bam and unmapped.bam from the tophat2 step. The
    job runs inside the per-sample output directory (cd in, cd back).
    """
    jobs = []
    for sample in self.samples:
        tophat_dir = os.path.join("fusions", "tophat2", sample.name)
        accepted = os.path.join(self.output_dir, tophat_dir,
                                "accepted_hits.bam")
        unmapped = os.path.join(self.output_dir, tophat_dir,
                                "unmapped.bam")
        result_dir = os.path.join("fusions", "integrate", sample.name)
        steps = [
            Job(command="mkdir -p " + result_dir),
            Job(command="cd " + result_dir),
            integrate.integrate(accepted, unmapped, result_dir),
            Job(command="cd -")
        ]
        jobs.append(concat_jobs(steps, name="integrate." + sample.name))
    return jobs
def fusionmap(self):
    """
    Run FusionMap to call gene fusions.

    The pipeline's top output directory (self._output_dir, assigned from
    command line args in pipeline.py) is passed through to the bfx
    fusionmap wrapper. A trailing `ls` of the 02_RNA* files follows the
    caller (presumably to surface missing outputs as a job failure).
    """
    jobs = []
    for sample in self.samples:
        reads_1, reads_2 = self.select_input_fastq(sample)
        result_dir = os.path.join("fusions", "fusionmap", sample.name)
        caller_job = fusionmap.fusionmap(reads_1, reads_2, result_dir,
                                         self._output_dir)
        steps = [
            Job(command="mkdir -p " + result_dir),
            caller_job,
            Job(command="ls " + result_dir + "/02_RNA*")
        ]
        jobs.append(concat_jobs(steps, name="fusionmap." + sample.name))
    return jobs
def star_fusion(self):
    """
    Run STAR-Fusion to call gene fusions
    """
    jobs = []
    # NOTE(review): hard-coded, cluster-specific CTAT genome library path;
    # consider moving this into the pipeline configuration.
    CTAT_resource_lib = "/hpf/largeprojects/ccmbio/mapostolides/validate_fusion/test_star_star-fusion/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play/ctat_genome_lib_build_dir"
    for sample in self.samples:
        # Pick the best available FASTQ pair for this sample
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "star_fusion", sample.name)
        # star_fusion_job = star_fusion.star_fusion(fastq1, fastq2, out_dir, CTAT_resource_lib)
        star_fusion_job = star_fusion.star_fusion(
            fastq1,
            fastq2,
            CTAT_resource_lib,
            out_dir,
            keep_bam=self.args.keep_bams)
        job = concat_jobs(
            [Job(command="mkdir -p " + out_dir), star_fusion_job],
            name="star_fusion." + sample.name)
        jobs.append(job)
    return jobs
def fusion_stats(self):
    """
    Outputs count files and plots about the detected gene fusions.

    Builds a single job that creates the stats directory, computes the
    fusion statistics from the cff directory, generates the category
    count table, and renders the categories barplot.
    """
    cff_dir = os.path.join("fusions", "cff")
    stats_dir = os.path.join("fusions", "fusion_stats")
    sampleinfo = os.path.relpath(self.args.sampleinfo.name,
                                 self.output_dir)
    steps = [
        Job(command="mkdir -p " + stats_dir),
        fusion_stats.fusion_stats(cff_dir, stats_dir, sampleinfo),
        fusion_stats.generate_category_count_table(cff_dir, stats_dir),
        fusion_stats.generate_categories_barplot(fusion_stats_dir=stats_dir)
    ]
    return [concat_jobs(steps, name="fusion_stats")]
def merge_cff_fusion(self):
    """
    Merge all cff files into one single file.

    Iterates self.tool_list in order, so the defuse cff files come last:
    inverted defuse calls must always be "fusion2" in the
    generate_common_fusion_stats_by_breakpoints function of pygeneann.py,
    since defuse is the only caller that makes "flipped/inverted" calls;
    any other position can cause errors when defuse makes a flipped call.
    """
    cff_dir = os.path.join("fusions", "cff")
    out_dir = os.path.join("fusions", "cff")
    cff_files = []
    for tool in self.tool_list:
        for sample in self.samples:
            cff_files.append(
                os.path.join(cff_dir, sample.name + "." + tool + ".cff"))
    merge_job = merge_and_reannotate_cff_fusion.merge_cff_fusion(
        cff_files, out_dir)
    return [concat_jobs([merge_job], name="merge_cff_fusion")]
def convert_fusion_results_to_cff(self):
    """
    Convert fusion results of the gene fusion callers to cff format.

    For every sample, builds one cff_convert job per tool present in
    self.tool_list; all per-(sample, tool) jobs are concatenated into a
    single "cff_conversion" job.
    """
    jobs = []
    out_dir = os.path.join("fusions", "cff")
    job_list = [Job(command="mkdir -p " + out_dir)]
    # sampleinfo path is made relative to the pipeline output directory
    sampleinfo_file = os.path.relpath(self.args.sampleinfo.name,
                                      self.output_dir)
    for sample in self.samples:
        # Define result files
        # output_file = os.path.join(output_dir, prefix + "_STAR-SEQR", prefix + "_STAR-SEQR_candidates.txt")
        # star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
        #                                 "out_STAR-SEQR", "out_STAR-SEQR_candidates.txt")
        star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
                                        "out_STAR-SEQR_candidates.txt")
        # print >> sys.stderr, star_seqr_result
        arriba_result = os.path.join("fusions", "arriba", sample.name,
                                     "fusions.tsv")
        # star_fusion_result = os.path.join("fusions", "star_fusion",
        #                                   sample.name, "star-fusion.fusion_predictions.abridged.tsv")
        star_fusion_result = os.path.join(
            "fusions", "star_fusion", sample.name,
            "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
        defuse_result = os.path.join("fusions", "defuse", sample.name,
                                     "results.filtered.tsv")
        fusionmap_result = os.path.join("fusions", "fusionmap", sample.name,
                                        "02_RNA.FusionReport.txt")
        ericscript_result = os.path.join("fusions", "ericscript", sample.name,
                                         "fusion.results.filtered.tsv")
        integrate_result = os.path.join("fusions", "integrate", sample.name,
                                        "breakpoints.cov.tsv")
        cicero_result = os.path.join("fusions", "cicero", sample.name,
                                     "final_fusions.txt")
        # Build tool_results list based on self.tool_list
        result_file_dict = {
            "star_seqr": star_seqr_result,
            "arriba": arriba_result,
            "star_fusion": star_fusion_result,
            "defuse": defuse_result,
            "fusionmap": fusionmap_result,
            "ericscript": ericscript_result,
            "integrate": integrate_result,
            "cicero": cicero_result
        }
        # NOTE(review): iteration order here follows result_file_dict, not
        # self.tool_list — confirm downstream steps do not rely on tool order
        # of the conversion jobs.
        tool_results = [(key, result_file_dict[key])
                        for key in result_file_dict.keys()
                        if key in self.tool_list]
        # tool_results = [("star_seqr",star_seqr_result), ("arriba", arriba_result),
        #                 ("star_fusion", star_fusion_result), ("defuse", defuse_result),
        #                 ("fusionmap", fusionmap_result), ("ericscript", ericscript_result),
        #                 ("integrate", integrate_result)]
        # tool_results = [("arriba", arriba_result), ("star_fusion", star_fusion_result),
        #                 ("defuse", defuse_result), ("fusionmap", fusionmap_result),
        #                 ("ericscript", ericscript_result), ("integrate", integrate_result)]
        # determine sample_type (disabled)
        """
        sample_type = ""
        for contrast in self.contrasts:
            if sample in contrast.controls:
                sample_type = "Normal"
            elif sample in contrast.treatments:
                sample_type = "Tumor"
            if sample_type:
                disease_name = contrast.name
                break
        if not sample_type:
            raise Exception("Error: sample " + sample.name +
                            " not found in design file " + self.args.design.name)
        """
        # convert caller output files to common fusion format(cff)
        for tool, result_file in tool_results:
            job = cff_conversion.cff_convert(sample.name, result_file,
                                             sampleinfo_file, tool, out_dir)
            # strip trailing whitespace/newlines from the generated command
            job.command = job.command.strip()
            job_list.append(job)
    job = concat_jobs(job_list, name="cff_conversion")
    jobs.append(job)
    return jobs
def picard_sam_to_fastq(self):
    """
    Convert SAM/BAM files from the input readset file into FASTQ format
    if FASTQ files are not already specified in the readset file.
    Do nothing otherwise.
    rewritten from common.Illumina.picard_sam_to_fastq, make directory for
    this step under result folder in case the original bam file directory
    is not writable
    """
    jobs = []
    for readset in self.readsets:
        # If readset FASTQ files are available, skip this step
        if not readset.fastq1:
            if readset.cram:
                # convert cram to bam then to fastq. fastq and bam are
                # saved on localhd
                out_bam = os.path.join(
                    "$TMPDIR", os.path.basename(readset.cram) + ".bam")
                cram2bam_job = samtools_1_1.view(readset.cram, out_bam)
                if readset.run_type == "PAIRED_END":
                    out_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                           readset.sample.name)
                    fastq1 = os.path.join(
                        out_dir,
                        os.path.basename(
                            re.sub(r"\.bam$", ".pair1.fastq.gz", out_bam)))
                    fastq2 = os.path.join(
                        out_dir,
                        os.path.basename(
                            re.sub(r"\.bam$", ".pair2.fastq.gz", out_bam)))
                else:
                    # NOTE(review): message mentions SINGLE_END but only
                    # PAIRED_END is actually handled by this step
                    raise Exception(
                        "Error: run type \"" + readset.run_type +
                        "\" is invalid for readset \"" + readset.name +
                        "\" (should be PAIRED_END or SINGLE_END)!")
                picard_job = picard.sam_to_fastq(out_bam, fastq1, fastq2)
                job = concat_jobs([
                    Job(command="mkdir -p " + out_dir), cram2bam_job,
                    picard_job
                ],
                                  name="picard_sam_to_fastq." + readset.name)
                jobs.append(job)
            elif readset.bam:
                if readset.run_type == "PAIRED_END":
                    out_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                           readset.sample.name)
                    fastq1 = os.path.join(
                        out_dir,
                        os.path.basename(
                            re.sub(r"\.bam$", ".pair1.fastq.gz",
                                   readset.bam)))
                    fastq2 = os.path.join(
                        out_dir,
                        os.path.basename(
                            re.sub(r"\.bam$", ".pair2.fastq.gz",
                                   readset.bam)))
                else:
                    # NOTE(review): message mentions SINGLE_END but only
                    # PAIRED_END is actually handled by this step
                    raise Exception(
                        "Error: run type \"" + readset.run_type +
                        "\" is invalid for readset \"" + readset.name +
                        "\" (should be PAIRED_END or SINGLE_END)!")
                picard_job = picard.sam_to_fastq(readset.bam, fastq1, fastq2)
                job = concat_jobs(
                    [Job(command="mkdir -p " + out_dir), picard_job],
                    name="picard_sam_to_fastq." + readset.name)
                jobs.append(job)
            else:
                raise Exception(
                    "Error: BAM file not available for readset \"" +
                    readset.name + "\"!")
    return jobs
def run_cicero(self):
    """
    Fusion detection specializing in internal tandem duplication (ITD)
    https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02043-x
    https://github.com/stjude/Cicero

    This software runs as a docker application. However, this can also be
    installed manually. As of May 2021, versions 0.2.0, 0.3.0 and 1.4.2
    are available as modules on the HPF.

    Also runs RNApeg, a complementary tool to generate the junctions file
    for use by CICERO. Available on the HPF via RNApeg/20210226 and runs
    as a singularity container.

    Per sample: trim (trimmomatic) -> align (STAR) -> index -> dedup
    (picard) -> RNApeg -> CICERO, with intermediates on node-local
    scratch and only the final results moved to fusions/cicero/<sample>.
    """
    jobs = []
    for sample in self.samples:
        # Get fastq files
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input: FASTQs come from the picard_sam_to_fastq step
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                     sample.name)
            bam = sample.readsets[0].bam
            fq1 = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            # BUGFIX: previously reused ".pair1.fastq.gz" here, so both
            # mates pointed at the same (R1) file.
            fq2 = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(
                ".")[-1] == "gz":
            fq1 = sample.readsets[0].fastq1
            fq2 = sample.readsets[0].fastq2
        else:
            raise Exception(
                "Error: only .bam and .fastq.gz inputs allowed")
        # Directories
        # The variable should be unevaluated in the qsub script
        tmp_dir = "/localhd/${PBS_JOBID}"
        trim_dir = os.path.join(tmp_dir, "trimmomatic")
        align_dir = os.path.join(tmp_dir, "star")
        cicero_dir = os.path.join(tmp_dir, "cicero")
        rnapeg_dir = os.path.join(tmp_dir, "rnapeg")
        output_dir = os.path.join("fusions", "cicero", sample.name)
        # Files
        fq1_trimmed = os.path.join(
            trim_dir, "".join([sample.name, ".trimmed.R1.fq.gz"]))
        fq2_trimmed = os.path.join(
            trim_dir, "".join([sample.name, ".trimmed.R2.fq.gz"]))
        fq1_dropped = os.path.join(
            trim_dir, "".join([sample.name, ".filtered.R1.fq.gz"]))
        fq2_dropped = os.path.join(
            trim_dir, "".join([sample.name, ".filtered.R2.fq.gz"]))
        trim_log = os.path.join(trim_dir,
                                "".join([sample.name, ".trim.log"]))
        star_bam = os.path.join(align_dir, "Aligned.sortedByCoord.out.bam")
        dedup_bam = os.path.join(align_dir,
                                 "Aligned.sortedByCoord.dedup.bam")
        dedup_metrics = os.path.join(align_dir,
                                     "Aligned.sortedByCoord.dedup.metrics")
        symlink_bam = os.path.join(cicero_dir, sample.name + ".bam")
        junction_file = os.path.join(
            rnapeg_dir, sample.name + ".bam.junctions.tab.shifted.tab")
        # Index path for the deduplicated BAM; computed once and reused
        # (previously assigned but never read, with the value recomputed
        # inline below).
        idx_file = re.sub(r"\.bam$", ".bai", dedup_bam)
        # Jobs
        trim = trimmomatic.trimmomatic(
            fq1, fq2, fq1_trimmed, fq1_dropped, fq2_trimmed, fq2_dropped,
            None, None,
            config.param("trimmomatic", "adapter_fasta", required=False),
            trim_log)
        align = star.align(fq1_trimmed,
                           fq2_trimmed,
                           align_dir,
                           config.param("run_cicero", "genome_build"),
                           rg_id=sample.name,
                           rg_library=sample.name,
                           rg_sample=sample.name,
                           rg_platform="ILLUMINA",
                           sort_bam=True)
        index = samtools.index(star_bam)
        dedup = picard.mark_duplicates([star_bam], dedup_bam,
                                       dedup_metrics)  # Also indexes for us!
        # RNApeg: symlink the dedup BAM + index into the CICERO dir, then
        # generate the shifted junctions file inside the container.
        rna_peg = Job(
            input_files=[dedup_bam],
            output_files=[junction_file],
            module_entries=[("run_cicero", "module_rnapeg")],
            name="RNApeg",
            command="""ln -s \\\n{idx_file} \\\n{new_idx_file} && \\
ln -s {bamfile} \\\n{new_bamfile} && \\
singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd -B {outpath}:/results \\
$(which rnapeg.sif) RNApeg.sh -b {new_bamfile} \\\n -f {ref} \\\n -r {reflat}"""
            .format(bamfile=dedup_bam,
                    ref=config.param("run_cicero", "reference",
                                     required=True),
                    reflat=config.param("run_cicero", "reflat",
                                        required=True),
                    outpath=rnapeg_dir,
                    idx_file=idx_file,
                    new_bamfile=symlink_bam,
                    new_idx_file=symlink_bam + ".bai"))
        # Cicero
        cicero = Job(
            input_files=[dedup_bam, junction_file],
            output_files=[
                os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                             "final_fusions.txt")
            ],
            module_entries=[("run_cicero", "module_cicero")],
            name="run_cicero" + sample.name,
            command=
            """singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd \\
$CICERO_PATH/CICERO_1.4.2.sif \\
Cicero.sh -n {threads} -b {bamfile} \\\n -g {genome} \\\n -r {reference} \\\n -j {junction} -o {out_dir}"""
            .format(threads=config.param("run_cicero", "threads",
                                         required=True),
                    bamfile=symlink_bam,
                    genome=config.param("run_cicero", "genome",
                                        required=True),
                    reference=config.param("run_cicero", "cicero_data",
                                           required=True),
                    junction=junction_file,
                    out_dir=cicero_dir))
        # Move the results worth keeping from local scratch to the
        # pipeline output directory.
        save_out = Job(
            input_files=[
                os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                             "final_fusions.txt")
            ],
            output_files=[os.path.join(output_dir, "final_fusions.txt")],
            name="save_cicero_results" + sample.name,
            command="""mv {files_to_keep} {target_dir}""".format(
                files_to_keep=" ".join([
                    junction_file,
                    os.path.join(cicero_dir, "0*.{err,log}"),  # Logs
                    os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                 "*.{txt,frame.tab,html}")  # Result files
                ]),
                target_dir=output_dir))
        # the files in /localhd/ should be removed automatically upon job end
        job_mkdir = Job(
            command="mkdir -p {trim} {align} {cicero} {output} {rnapeg}".
            format(trim=trim_dir,
                   align=align_dir,
                   cicero=cicero_dir,
                   output=output_dir,
                   rnapeg=rnapeg_dir))
        combined_job = concat_jobs(
            [job_mkdir, trim, align, index, dedup, rna_peg, cicero,
             save_out],
            name="run_cicero." + sample.name)
        # Replace input and output specification
        combined_job._output_files = [
            os.path.join(output_dir, "final_fusions.txt")
        ]
        combined_job.input_files = [fq1, fq2]
        jobs.append(combined_job)
    return jobs