def test_bs_seeker_filter_02():
    """
    Run the BS-Seeker read filter on the second mouse test FASTQ file and
    check that a non-empty filtered FASTQ file is left on disk.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data/")
    user_home = os.path.expanduser('~')

    raw_fastq = data_dir + "bsSeeker.Mouse.SRR892982_2.fastq"

    input_files = {"fastq": raw_fastq}
    output_files = {
        "fastq_filtered":
            data_dir + "bsSeeker.Mouse.SRR892982_2.filtered.fastq"
    }
    metadata = {
        "fastq": Metadata(
            "data_wgbs", "fastq", raw_fastq, None, {'assembly': 'test'})
    }

    # The aligner and BS-Seeker2 installs are looked up under ~/lib
    config_param = {
        "aligner": "bowtie2",
        "aligner_path": user_home + "/lib/bowtie2-2.3.4-linux-x86_64",
        "bss_path": user_home + "/lib/BSseeker2"
    }

    filter_tool = bs_seeker_filter.filterReadsTool(config_param)
    filter_tool.run(input_files, metadata, output_files)

    filtered_fastq = output_files["fastq_filtered"]
    assert os.path.isfile(filtered_fastq)
    assert os.path.getsize(filtered_fastq) > 0
def work(self):
    """
    Worker function that filters a single FASTQ file, aligns the filtered
    reads with the BS-Seeker aligner and sorts the resulting bam file.

    The method takes no arguments; everything is read from the instance:

    fastq_file : str
        Location of the FASTQ file
    fastq_filtered : str
        Location the filtered FASTQ file is written to and aligned from
    aligner : str
        Passed through to the aligner tool
    aligner_path : str
        Passed through to the aligner tool
    bss_path : str
        Passed to both the filter and the aligner tool
    genome_fa : str
        Location of the FASTA file of the genome to align the reads to
    genome_idx : str
        Location of the genome index files
    output_bam : str
        Location of the aligned reads in bam format
    """
    read_filter = filterReadsTool()
    read_filter.bss_seeker_filter(
        self.fastq_file, self.fastq_filtered, self.bss_path)

    aligner_tool = bssAlignerTool({"no-untar": True})
    extra_params = []  # no additional aligner parameters are supplied
    aligner_tool.bs_seeker_aligner_single(
        self.fastq_filtered,
        self.aligner,
        self.aligner_path,
        self.bss_path,
        extra_params,
        self.genome_fa,
        self.genome_idx,
        self.output_bam)

    bamUtils().bam_sort(self.output_bam)
def run(self, input_files, metadata, output_files):
    """
    This pipeline processes single or paired-end FASTQ files to identify
    methylated regions within the genome. It indexes the genome, filters
    the reads, aligns them and then calls the methylation peaks.

    Parameters
    ----------
    input_files : dict
        Locations of the input files. These should include:

        genome : str
            Genome assembly in FASTA

        fastq1 : str
            Location for the first FASTQ file for single or paired end
            reads

        fastq2 : str
            [OPTIONAL] Location for the second FASTQ file if paired end
            reads

    metadata : dict
        Input file meta data associated with their roles

        genome : str

        fastq1 : str

        fastq2 : str
            [OPTIONAL]

    output_files : dict
        Locations for the output files

        index : str

        fastq1_filtered : str

        fastq2_filtered : str
            [OPTIONAL] Only read when fastq2 is present in input_files

        bam : str

        bai : str

        wig_file : str

        cgmap_file : str

        atcgmap_file : str

    Returns
    -------
    output_results_files : dict
        index : str
            Location of the genome index
        fastq1_filtered|fastq2_filtered : str
            Locations of the filtered FASTQ files from which alignments
            were made
        bam|bai : str
            Location of the alignment bam file and the associated index
        wig_file : str
            Location of the wig file containing the methylation peak calls
        cgmap_file : str
            Location of the CGmap file generated by BS-Seeker2
        atcgmap_file : str
            Location of the ATCGmap file generated by BS-Seeker2
    output_metadata : dict
        Metadata objects for the files above, keyed by the same roles

    If a stage does not return the entries expected of it, the failure is
    logged and a pair of empty dicts is returned instead.
    """
    output_results_files = {}
    output_metadata = {}

    def _claim(role, stage_files, stage_meta, stage_role):
        """Record one stage output (location and metadata) under role."""
        output_results_files[role] = stage_files[stage_role]
        output_metadata[role] = stage_meta[stage_role]

    def _relabel(role):
        """
        Keep the producing tool's name as tool_description and report
        process_wgbs as the tool for the recorded metadata of role.
        """
        meta_data = output_metadata[role].meta_data
        meta_data["tool_description"] = meta_data["tool"]
        meta_data["tool"] = "process_wgbs"

    # Decide once whether a second FASTQ file has to be carried through
    # every stage, so the later stages cannot disagree with the filter.
    paired = "fastq2" in input_files

    logger.info("WGBS - BS-Seeker2 Index")

    # Build the matching WGBS genome index
    builder = bssIndexerTool(self.configuration)
    genome_idx, gidx_meta = builder.run(
        remap(input_files, "genome"),
        remap(metadata, "genome"),
        remap(output_files, "index"))

    # Guarded like every other stage: a missing index entry is reported
    # as a failed run rather than escaping as a bare KeyError.
    try:
        _claim("index", genome_idx, gidx_meta, "index")
    except KeyError:
        logger.fatal("WGBS - BS-Seeker2 Index failed")
        return {}, {}

    # Filter the FASTQ reads to remove duplicates
    logger.info("WGBS - Filter")
    frt = filterReadsTool(self.configuration)
    fastq1f, filter1_meta = frt.run(
        {"fastq": input_files["fastq1"]},
        {"fastq": metadata["fastq1"]},
        {"fastq_filtered": output_files["fastq1_filtered"]})

    try:
        _claim("fastq1_filtered", fastq1f, filter1_meta, "fastq_filtered")
        _relabel("fastq1_filtered")
    except KeyError:
        logger.fatal("WGBS - FILTER: Error while filtering")
        return {}, {}

    if paired:
        logger.info("WGBS - Filter background")
        fastq2f, filter2_meta = frt.run(
            {"fastq": input_files["fastq2"]},
            {"fastq": metadata["fastq2"]},
            {"fastq_filtered": output_files["fastq2_filtered"]})

        try:
            _claim(
                "fastq2_filtered", fastq2f, filter2_meta, "fastq_filtered")
            _relabel("fastq2_filtered")
        except KeyError:
            logger.fatal(
                "WGBS - FILTER (background): Error while filtering")
            return {}, {}

    logger.info("WGBS - BS-Seeker2 Aligner")
    bss_aligner = bssAlignerTool(self.configuration)
    aligner_input_files = {
        "genome": input_files["genome"],
        "fastq1": fastq1f["fastq_filtered"],
        "index": genome_idx["index"]
    }
    aligner_meta = {
        "genome": metadata["genome"],
        "fastq1": filter1_meta["fastq_filtered"],
        "index": output_metadata["index"]
    }
    if paired:
        aligner_input_files["fastq2"] = fastq2f["fastq_filtered"]
        aligner_meta["fastq2"] = filter2_meta["fastq_filtered"]

    bam, bam_meta = bss_aligner.run(
        aligner_input_files,
        aligner_meta,
        remap(output_files, "bam", "bai"))

    try:
        _claim("bam", bam, bam_meta, "bam")
        _claim("bai", bam, bam_meta, "bai")
        _relabel("bam")
        _relabel("bai")
    except KeyError:
        logger.fatal("WGBS - Aligner failed")
        return {}, {}

    # Methylation peak caller
    peak_caller_handle = bssMethylationCallerTool(self.configuration)
    mct_input_files = {
        "genome": input_files["genome"],
        "index": genome_idx["index"],
        "fastq1": fastq1f["fastq_filtered"],
        "bam": bam["bam"],
        "bai": bam["bai"]
    }
    mct_meta = {
        "genome": metadata["genome"],
        "index": gidx_meta["index"],
        "fastq1": filter1_meta["fastq_filtered"],
        "bam": output_metadata["bam"],
        "bai": bam_meta["bai"]
    }
    if paired:
        mct_input_files["fastq2"] = fastq2f["fastq_filtered"]
        mct_meta["fastq2"] = filter2_meta["fastq_filtered"]

    peak_files, peak_meta = peak_caller_handle.run(
        mct_input_files,
        mct_meta,
        remap(output_files, "wig_file", "cgmap_file", "atcgmap_file"))

    peak_roles = ("wig_file", "cgmap_file", "atcgmap_file")
    try:
        # Record all three outputs before relabelling any of them
        for role in peak_roles:
            _claim(role, peak_files, peak_meta, role)
        for role in peak_roles:
            _relabel(role)
    except KeyError:
        logger.fatal("WGBS - Peak caller failed")
        return {}, {}

    return (output_results_files, output_metadata)
def run(self, input_files, metadata, output_files):
    """
    This pipeline filters the reads of one or two FASTQ files.

    Parameters
    ----------
    input_files : dict
        Locations of the input files. These should include:

        fastq1 : str
            Location for the first FASTQ file for single or paired end
            reads

        fastq2 : str
            [OPTIONAL] Location for the second FASTQ file if paired end
            reads

    metadata : dict
        Input file meta data associated with their roles

        fastq1 : str

        fastq2 : str
            [OPTIONAL]

    output_files : dict
        Locations for the filtered files

        fastq1_filtered : str

        fastq2_filtered : str
            [OPTIONAL] Only read when fastq2 is present in input_files

    Returns
    -------
    output_results_files : dict
        fastq1_filtered : str
            Location of the filtered first FASTQ file
        fastq2_filtered : str
            Location of the filtered second FASTQ file, when one was given
    output_metadata : dict
        Metadata objects for the files above, keyed by the same roles

    A pair of empty dicts is returned when the filter output is missing an
    expected entry.
    """
    collected_files = {}
    collected_meta = {}

    logger.info("BS-Filter")

    read_filter = filterReadsTool(self.configuration)

    def _filter(source_role, target_role):
        """
        Filter the FASTQ stored under source_role and record the result
        under target_role. Returns False when the filter output lacks an
        expected key, True otherwise.
        """
        filtered, filtered_meta = read_filter.run(
            {"fastq": input_files[source_role]},
            {"fastq": metadata[source_role]},
            {"fastq_filtered": output_files[target_role]})

        try:
            collected_files[target_role] = filtered["fastq_filtered"]
            entry = filtered_meta["fastq_filtered"]
            collected_meta[target_role] = entry

            # Keep the filter's own name as the description and report
            # the pipeline as the tool.
            original_tool = entry.meta_data["tool"]
            entry.meta_data["tool_description"] = original_tool
            entry.meta_data["tool"] = "process_wgbs"
        except KeyError:
            return False

        return True

    if not _filter("fastq1", "fastq1_filtered"):
        logger.fatal("WGBS - FILTER: Error while filtering")
        return {}, {}

    if "fastq2" in input_files:
        logger.info("WGBS - Filter background")
        if not _filter("fastq2", "fastq2_filtered"):
            logger.fatal(
                "WGBS - FILTER (background): Error while filtering")
            return {}, {}

    return (collected_files, collected_meta)
def _filter_reads_mp(self, fastq_in, fastq_out, filter_path):
    """
    Filter a FASTQ file and then sort the filtered file.

    Parameters
    ----------
    fastq_in : str
        Location of the FASTQ file to filter
    fastq_out : str
        Location of the filtered FASTQ file; this is also the file that
        gets sorted
    filter_path : str
        Passed to the filter tool as its third argument (presumably the
        location of the BS-Seeker install - confirm against the tool)
    """
    filterReadsTool().bss_seeker_filter(fastq_in, fastq_out, filter_path)
    fastqUtils().fastq_sort_file(fastq_out)