def work(self):
    """
    Worker function for calling peaks with MACS2 without a background file

    The inputs are read from attributes of ``self``:

    macs2_params : str
        Comma separated list of extra MACS2 parameters. An empty or unset
        value means that no extra parameters are passed
    in_bam_file : str
        Location of the bam file handed to MACS2
    narrowPeak_file, summits_file, broadPeak_file, gappedPeak_file : str
        Forwarded, in this order, to ``macs2_peak_calling_nobgd``
    """
    macs2_handle = macs2()

    # Only split when there is something to split: "".split(",") would give
    # [""] rather than an empty parameter list.
    real_params = self.macs2_params.split(",") if self.macs2_params else []

    macs2_handle.macs2_peak_calling_nobgd(
        "luigi_lsf",
        self.in_bam_file,
        real_params,
        self.narrowPeak_file,
        self.summits_file,
        self.broadPeak_file,
        self.gappedPeak_file)
def test_macs2_background():
    """
    Run the MACS2 tool with a background file and check the peak files.

    The same filtered bam file is supplied as both the sample ("bam") and the
    background ("bam_bg"). The test passes when the narrowPeak and summits
    files exist and are not empty.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data/")

    # The two outputs that get verified once the tool has run.
    narrow_peak_path = data_dir + "macs2.Human.DRR000150.22_peaks.narrowPeak"
    summits_path = data_dir + "macs2.Human.DRR000150.22_peaks.summits.bed"

    input_files = {
        "bam": data_dir + "macs2.Human.DRR000150.22_aln_filtered.bam",
        "bam_bg": data_dir + "macs2.Human.DRR000150.22_aln_filtered.bam"
    }

    output_files = {
        "narrow_peak": data_dir + "macs2.Human.DRR000150.22_peaks.narrowPeak",
        "summits": data_dir + "macs2.Human.DRR000150.22_peaks.summits.bed",
        "broad_peak": data_dir + "macs2.Human.DRR000150.22_peaks.broadPeak",
        "gapped_peak": data_dir + "macs2.Human.DRR000150.22_peaks.gappedPeak"
    }

    metadata = {
        "bam": Metadata(
            "data_chipseq", "fastq", [], None, {'assembly': 'test'}),
        "bam_bg": Metadata(
            "data_chipseq", "fastq", [], None, {'assembly': 'test'}),
    }

    peak_caller = macs2({"macs_nomodel_param": True})
    peak_caller.run(input_files, metadata, output_files)

    for expected_path in (narrow_peak_path, summits_path):
        assert os.path.isfile(expected_path) is True
        assert os.path.getsize(expected_path) > 0
def run(self, input_files, metadata, output_files):
    """
    Main run function for calling peaks on aligned ChIP-seq reads.

    MACS 2 is used for peak calling on an already aligned bam file,
    optionally together with an aligned background file. Currently this can
    only handle a single data file and a single background file.

    Parameters
    ----------
    input_files : dict
        Location of the initial input files required by the workflow

        bam : str
            Location of the aligned reads file

        bam_bg : str
            Location of the background aligned reads file [OPTIONAL]

    metadata : dict
        Input file meta data associated with their roles

        bam : Metadata

        bam_bg : Metadata
            [OPTIONAL] Required whenever ``bam_bg`` is in ``input_files``

    output_files : dict
        Output file locations

        narrow_peak : str
        summits : str
        broad_peak : str
        gapped_peak : str

    Returns
    -------
    output_files : dict
        Output file locations associated with their roles. Only the roles
        reported by the MACS2 tool are present

        narrow_peak : str
        summits : str
        broad_peak : str
        gapped_peak : str

    output_metadata : dict
        Output metadata for the associated files in output_files. For each
        entry the original ``tool`` value is moved to ``tool_description``
        and ``tool`` is set to "process_chipseq"

        narrow_peak : Metadata
        summits : Metadata
        broad_peak : Metadata
        gapped_peak : Metadata
    """
    output_files_generated = {}
    output_metadata = {}

    # MACS2 to call peaks
    macs_caller = macs2(self.configuration)
    macs_inputs = {"bam": input_files["bam"]}
    macs_metadt = {"bam": metadata['bam']}

    # The background is an additional input under its own role; it must not
    # replace the sample, and its metadata comes from the caller since this
    # workflow has not generated any metadata yet.
    if "bam_bg" in input_files:
        macs_inputs["bam_bg"] = input_files["bam_bg"]
        macs_metadt["bam_bg"] = metadata['bam_bg']

    m_results_files, m_results_meta = macs_caller.run(
        macs_inputs, macs_metadt,
        # Outputs of the final step may match workflow outputs;
        # Extra entries in output_files will be disregarded.
        remap(output_files,
              'narrow_peak', 'summits', 'broad_peak', 'gapped_peak'))

    for role in ('narrow_peak', 'summits', 'broad_peak', 'gapped_peak'):
        if role not in m_results_meta:
            continue

        output_files_generated[role] = m_results_files[role]
        output_metadata[role] = m_results_meta[role]

        # Keep the name of the tool that made the file as the description
        # and attribute the file to this workflow.
        tool_name = output_metadata[role].meta_data['tool']
        output_metadata[role].meta_data['tool_description'] = tool_name
        output_metadata[role].meta_data['tool'] = "process_chipseq"

    return output_files_generated, output_metadata
def run(self, input_files, metadata, output_files):  # pylint: disable=too-many-branches
    """
    Main run function for processing ChIP-seq FastQ data.

    Pipeline aligns the FASTQ files to the genome using BWA, filters the
    resulting bam files with BioBamBam and then uses MACS 2 for peak
    calling. Currently this can only handle a single data file and a single
    background file.

    Parameters
    ----------
    input_files : dict
        Location of the initial input files required by the workflow

        genome : str
            Genome FASTA file

        index : str
            Location of the BWA archived index files

        loc : str
            Location of the FASTQ reads files

        fastq2 : str
            Location of the paired end FASTQ file [OPTIONAL]

        bg_loc : str
            Location of the background FASTQ reads files [OPTIONAL]

        fastq2_bg : str
            Location of the paired end background FASTQ reads files
            [OPTIONAL]

    metadata : dict
        Input file meta data associated with their roles. An entry is
        needed for every role that is present in ``input_files``

        genome : Metadata
        index : Metadata
        loc : Metadata
        fastq2 : Metadata [OPTIONAL]
        bg_loc : Metadata [OPTIONAL]
        fastq2_bg : Metadata [OPTIONAL]

    output_files : dict
        Output file locations

        bam [, "bam_bg"] : str
        filtered [, "filtered_bg"] : str
        narrow_peak : str
        summits : str
        broad_peak : str
        gapped_peak : str

    Returns
    -------
    output_files : dict
        Output file locations associated with their roles, for the output

        bam [, "bam_bg"] : str
            Aligned FASTQ short read file [ and aligned background file]
            locations
        filtered [, "filtered_bg"] : str
            Filtered versions of the respective bam files
        narrow_peak : str
        summits : str
        broad_peak : str
        gapped_peak : str
            Peak files reported by the MACS2 tool; only the roles it
            reports are present

    output_metadata : dict
        Output metadata for the associated files in output_files. For each
        entry the original ``tool`` value is moved to ``tool_description``
        and ``tool`` is set to "process_chipseq"

        bam [, "bam_bg"] : Metadata
        filtered [, "filtered_bg"] : Metadata
        narrow_peak : Metadata
        summits : Metadata
        broad_peak : Metadata
        gapped_peak : Metadata

    Raises
    ------
    KeyError
        A failed alignment or filtering step is only logged; the next step
        that looks up the missing result then raises
    """
    output_files_generated = {}
    output_metadata = {}

    def _adopt(role, files, meta, key):
        """Record the result ``key`` of a tool under the workflow ``role``."""
        output_files_generated[role] = files[key]
        output_metadata[role] = meta[key]

        # Keep the name of the tool that made the file as the description
        # and attribute the file to this workflow.
        tool_name = output_metadata[role].meta_data['tool']
        output_metadata[role].meta_data['tool_description'] = tool_name
        output_metadata[role].meta_data['tool'] = "process_chipseq"

    # NOTE(review): the path is passed as an extra argument although the
    # message has no placeholder; whether it ends up in the log depends on
    # the logger implementation - confirm.
    logger.info("PROCESS CHIPSEQ - DEFINED OUTPUT:", output_files["bam"])

    align_input_files = remap(input_files, "genome", "loc", "index")
    align_input_file_meta = remap(metadata, "genome", "loc", "index")

    if "fastq2" in input_files:
        align_input_files["fastq2"] = input_files["fastq2"]
        align_input_file_meta["fastq2"] = metadata["fastq2"]

    bwa = bwaAlignerTool(self.configuration)
    bwa_files, bwa_meta = bwa.run(
        align_input_files, align_input_file_meta,
        {"output": output_files["bam"]})

    try:
        _adopt("bam", bwa_files, bwa_meta, "bam")
    except KeyError:
        logger.fatal("BWA aligner failed")

    if "bg_loc" in input_files:
        # Align background files
        align_input_files_bg = remap(
            input_files, "genome", "index", loc="bg_loc")
        align_input_file_meta_bg = remap(
            metadata, "genome", "index", loc="bg_loc")

        # The background has its own optional mate file, independent of
        # whether the sample is paired end.
        if "fastq2_bg" in input_files:
            align_input_files_bg["fastq2"] = input_files["fastq2_bg"]
            align_input_file_meta_bg["fastq2"] = metadata["fastq2_bg"]

        bwa_bg_files, bwa_bg_meta = bwa.run(
            align_input_files_bg, align_input_file_meta_bg,
            {"output": output_files["bam_bg"]})

        try:
            # The aligner reports its result under "bam" for every run;
            # only the workflow role differs for the background.
            _adopt("bam_bg", bwa_bg_files, bwa_bg_meta, "bam")
        except KeyError:
            logger.fatal("Background BWA aligner failed")

    # Filter the bams
    b3f = biobambam(self.configuration)
    b3f_files, b3f_meta = b3f.run(
        {"input": bwa_files['bam']},
        {"input": bwa_meta['bam']},
        {"output": output_files["filtered"]})

    try:
        _adopt("filtered", b3f_files, b3f_meta, "bam")
    except KeyError:
        logger.fatal("BioBamBam filtering failed")

    if "bg_loc" in input_files:
        # Filter background aligned files
        b3f_bg_files, b3f_bg_meta = b3f.run(
            {"input": bwa_bg_files['bam']},
            {"input": bwa_bg_meta['bam']},
            {"output": output_files["filtered_bg"]})

        try:
            _adopt("filtered_bg", b3f_bg_files, b3f_bg_meta, "bam")
        except KeyError:
            logger.fatal("Background BioBamBam filtering failed")

    # MACS2 to call peaks
    macs_caller = macs2(self.configuration)
    macs_inputs = {"bam": output_files_generated["filtered"]}
    macs_metadt = {"bam": output_metadata['filtered']}

    if "bg_loc" in input_files:
        macs_inputs["bam_bg"] = output_files_generated["filtered_bg"]
        macs_metadt["bam_bg"] = output_metadata['filtered_bg']

    m_results_files, m_results_meta = macs_caller.run(
        macs_inputs, macs_metadt,
        # Outputs of the final step may match workflow outputs;
        # Extra entries in output_files will be disregarded.
        remap(output_files,
              'narrow_peak', 'summits', 'broad_peak', 'gapped_peak'))

    if not m_results_meta:
        logger.fatal("MACS2 peak calling failed")

    for role in ('narrow_peak', 'summits', 'broad_peak', 'gapped_peak'):
        if role in m_results_meta:
            _adopt(role, m_results_files, m_results_meta, role)

    return output_files_generated, output_metadata