示例#1
0
    def work(self):
        """
        Worker function for calling peaks with MACS2 without a background
        sample.

        The method takes no arguments; everything is read from attributes of
        the task instance:

        in_bam_file
            Passed to MACS2 as the alignment file to call peaks on
        macs2_params
            Extra parameters handed to MACS2. Split on "," into a list;
            an empty value results in an empty list.
        narrowPeak_file, summits_file, broadPeak_file, gappedPeak_file
            Passed to MACS2 as the four result file arguments, in this order
        """
        macs2_handle = macs2()

        # Only split when there is something to split: "".split(",") would
        # yield [""] rather than an empty parameter list.
        real_params = self.macs2_params.split(",") if self.macs2_params else []

        macs2_handle.macs2_peak_calling_nobgd(
            "luigi_lsf", self.in_bam_file, real_params, self.narrowPeak_file,
            self.summits_file, self.broadPeak_file, self.gappedPeak_file)
def test_macs2_background():
    """
    Run the MACS2 tool with a background alignment and check its outputs.

    The same filtered BAM file from the ``data/`` directory next to this
    module is supplied as both the sample ("bam") and the background
    ("bam_bg"). The test passes when the narrowPeak and summits files exist
    and are not empty after the run.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data/")

    aligned_bam = data_dir + "macs2.Human.DRR000150.22_aln_filtered.bam"
    narrow_peak = data_dir + "macs2.Human.DRR000150.22_peaks.narrowPeak"
    summits = data_dir + "macs2.Human.DRR000150.22_peaks.summits.bed"

    input_files = {"bam": aligned_bam, "bam_bg": aligned_bam}

    output_files = {
        "narrow_peak": narrow_peak,
        "summits": summits,
        "broad_peak": data_dir + "macs2.Human.DRR000150.22_peaks.broadPeak",
        "gapped_peak": data_dir + "macs2.Human.DRR000150.22_peaks.gappedPeak",
    }

    # One independent Metadata object per input role.
    metadata = {
        role: Metadata("data_chipseq", "fastq", [], None, {'assembly': 'test'})
        for role in ("bam", "bam_bg")
    }

    macs_handle = macs2({"macs_nomodel_param": True})
    macs_handle.run(input_files, metadata, output_files)

    # Only the narrowPeak and summits results are verified.
    for expected_file in (narrow_peak, summits):
        assert os.path.isfile(expected_file)
        assert os.path.getsize(expected_file) > 0
示例#3
0
    def run(self, input_files, metadata, output_files):
        """
        Main run function for calling peaks on aligned ChIP-seq reads.
        MACS 2 is run on the supplied alignment (and, when given, a
        background alignment), and the metadata of every result it returns
        is re-tagged as produced by the "process_chipseq" workflow.

        Currently this can only handle a single data file and a single
        background file.

        Parameters
        ----------
        input_files : dict
            Location of the initial input files required by the workflow

            bam : str
                Location of the aligned reads file

            bam_bg : str
                Location of the background aligned reads file [OPTIONAL]

        metadata : dict
            Input file meta data associated with their roles

            bam
                Metadata for the aligned reads file

            bam_bg
                Metadata for the background file [OPTIONAL]; required
                whenever ``bam_bg`` is present in ``input_files``

        output_files : dict
            Output file locations

            narrow_peak : str
            summits : str
            broad_peak : str
            gapped_peak : str

        Returns
        -------
        output_files : dict
            Output file locations associated with their roles, for the output.
            Only roles reported by the MACS 2 step are included.

            narrow_peak : str
                Results files in bed4+1 format

            summits : str
                Results files in bed6+4 format

            broad_peak : str
                Results files in bed6+3 format

            gapped_peak : str
                Results files in bed12+3 format

            NOTE(review): the MACS2 documentation describes narrowPeak as
            BED6+4 and the summits file as plain BED, so the first two format
            labels may be swapped - confirm.

        output_metadata : dict
            Output metadata for the associated files in output_files. For each
            entry the tool name set by the MACS 2 step is copied to
            ``meta_data['tool_description']`` and ``meta_data['tool']`` is
            set to "process_chipseq".

            narrow_peak : Metadata
            summits : Metadata
            broad_peak : Metadata
            gapped_peak : Metadata

        Raises
        ------
        KeyError
            If ``bam`` is missing from ``input_files`` or ``metadata``, or if
            ``bam_bg`` is given in ``input_files`` but not in ``metadata``.
        """
        output_files_generated = {}
        output_metadata = {}

        # MACS2 to call peaks
        macs_caller = macs2(self.configuration)
        macs_inputs = {"bam": input_files["bam"]}
        macs_metadt = {"bam": metadata['bam']}

        # The background is an additional input: it must be added under its
        # own role (not replace the sample) and its metadata comes from the
        # caller, since nothing has been produced by this workflow yet.
        if "bam_bg" in input_files:
            macs_inputs["bam_bg"] = input_files["bam_bg"]
            macs_metadt["bam_bg"] = metadata['bam_bg']

        m_results_files, m_results_meta = macs_caller.run(
            macs_inputs,
            macs_metadt,
            # Outputs of the final step may match workflow outputs;
            # Extra entries in output_files will be disregarded.
            remap(output_files, 'narrow_peak', 'summits', 'broad_peak',
                  'gapped_peak'))

        # Copy across whichever results MACS2 reported and mark them as
        # belonging to this workflow, keeping the originating tool name.
        for role in ('narrow_peak', 'summits', 'broad_peak', 'gapped_peak'):
            if role not in m_results_meta:
                continue

            output_files_generated[role] = m_results_files[role]
            output_metadata[role] = m_results_meta[role]

            tool_name = output_metadata[role].meta_data['tool']
            output_metadata[role].meta_data['tool_description'] = tool_name
            output_metadata[role].meta_data['tool'] = "process_chipseq"

        return output_files_generated, output_metadata
    def run(self, input_files, metadata, output_files):  # pylint: disable=too-many-branches
        """
        Main run function for processing ChIP-seq FastQ data. Pipeline aligns
        the FASTQ files to the genome using BWA, filters the alignments with
        BioBamBam, and then uses MACS 2 for peak calling to identify
        transcription factor binding sites within the genome.

        Currently this can only handle a single data file and a single
        background file.

        A failing step is reported through ``logger.fatal`` but execution
        continues, so a later step that needs the missing result will raise
        ``KeyError``.

        Parameters
        ----------
        input_files : dict
            Location of the initial input files required by the workflow

            genome : str
                Genome FASTA file

            index : str
                Location of the BWA archived index files

            loc : str
                Location of the FASTQ reads files

            fastq2 : str
                Location of the paired end FASTQ file [OPTIONAL]

            bg_loc : str
                Location of the background FASTQ reads files [OPTIONAL]

            fastq2_bg : str
                Location of the paired end background FASTQ reads files
                [OPTIONAL]; only used when ``bg_loc`` is also given

        metadata : dict
            Input file meta data associated with their roles. An entry is
            needed for every role present in ``input_files``.

            genome : str
            index : str
            loc : str

            fastq2 : str
                [OPTIONAL]

            bg_loc : str
                [OPTIONAL]

            fastq2_bg : str
                [OPTIONAL]

        output_files : dict
            Output file locations

            bam [, "bam_bg"] : str
            filtered [, "filtered_bg"] : str
            narrow_peak : str
            summits : str
            broad_peak : str
            gapped_peak : str

        Returns
        -------
        output_files : dict
            Output file locations associated with their roles, for the output

            bam [, "bam_bg"] : str
                Aligned FASTQ short read file [ and aligned background file]
                locations
            filtered [, "filtered_bg"] : str
                Filtered versions of the respective bam files
            narrow_peak : str
                Results files in bed4+1 format
            summits : str
                Results files in bed6+4 format
            broad_peak : str
                Results files in bed6+3 format
            gapped_peak : str
                Results files in bed12+3 format

            NOTE(review): the MACS2 documentation describes narrowPeak as
            BED6+4 and the summits file as plain BED, so the first two format
            labels may be swapped - confirm.
        output_metadata : dict
            Output metadata for the associated files in output_files. For each
            entry the tool name set by the producing step is copied to
            ``meta_data['tool_description']`` and ``meta_data['tool']`` is
            set to "process_chipseq".

            bam [, "bam_bg"] : Metadata
            filtered [, "filtered_bg"] : Metadata
            narrow_peak : Metadata
            summits : Metadata
            broad_peak : Metadata
            gapped_peak : Metadata
        """
        output_files_generated = {}
        output_metadata = {}

        def _record(role, step_files, step_meta, key):
            """
            Store the ``key`` result of a step under the workflow ``role``
            and re-tag its metadata as produced by "process_chipseq".

            Raises KeyError when the step did not return ``key`` or its
            metadata carries no 'tool' entry.
            """
            output_files_generated[role] = step_files[key]
            output_metadata[role] = step_meta[key]

            tool_name = output_metadata[role].meta_data['tool']
            output_metadata[role].meta_data['tool_description'] = tool_name
            output_metadata[role].meta_data['tool'] = "process_chipseq"

        # Build the complete message up front: the logger's formatting style
        # is not visible here, and an argument without a matching placeholder
        # in the message would not be rendered.
        logger.info(
            "PROCESS CHIPSEQ - DEFINED OUTPUT: " + str(output_files["bam"]))

        align_input_files = remap(input_files, "genome", "loc", "index")
        align_input_file_meta = remap(metadata, "genome", "loc", "index")

        if "fastq2" in input_files:
            align_input_files["fastq2"] = input_files["fastq2"]
            align_input_file_meta["fastq2"] = metadata["fastq2"]

        bwa = bwaAlignerTool(self.configuration)
        bwa_files, bwa_meta = bwa.run(align_input_files, align_input_file_meta,
                                      {"output": output_files["bam"]})
        try:
            _record("bam", bwa_files, bwa_meta, "bam")
        except KeyError:
            logger.fatal("BWA aligner failed")

        if "bg_loc" in input_files:
            # Align background files; the background reads are handed to the
            # aligner under the "loc" role (presumably what the keyword form
            # of remap does - verify against its definition).
            align_input_files_bg = remap(input_files,
                                         "genome",
                                         "index",
                                         loc="bg_loc")
            align_input_file_meta_bg = remap(metadata,
                                             "genome",
                                             "index",
                                             loc="bg_loc")

            # Guard on the background mate file itself: a paired-end sample
            # does not imply that the background is paired-end as well.
            if "fastq2_bg" in input_files:
                align_input_files_bg["fastq2"] = input_files["fastq2_bg"]
                align_input_file_meta_bg["fastq2"] = metadata["fastq2_bg"]

            bwa_bg_files, bwa_bg_meta = bwa.run(
                align_input_files_bg, align_input_file_meta_bg,
                {"output": output_files["bam_bg"]})

            try:
                # The aligner returns its result under "bam" whichever reads
                # were aligned (the filtering step below reads the same key);
                # only the workflow role is "bam_bg".
                _record("bam_bg", bwa_bg_files, bwa_bg_meta, "bam")
            except KeyError:
                logger.fatal("Background BWA aligner failed")

        # Filter the bams
        b3f = biobambam(self.configuration)
        b3f_files, b3f_meta = b3f.run({"input": bwa_files['bam']},
                                      {"input": bwa_meta['bam']},
                                      {"output": output_files["filtered"]})

        try:
            _record("filtered", b3f_files, b3f_meta, "bam")
        except KeyError:
            logger.fatal("BioBamBam filtering failed")

        if "bg_loc" in input_files:
            # Filter background aligned files
            b3f_bg_files, b3f_bg_meta = b3f.run(
                {"input": bwa_bg_files['bam']}, {"input": bwa_bg_meta['bam']},
                {"output": output_files["filtered_bg"]})

            try:
                _record("filtered_bg", b3f_bg_files, b3f_bg_meta, "bam")
            except KeyError:
                logger.fatal("Background BioBamBam filtering failed")

        # MACS2 to call peaks
        macs_caller = macs2(self.configuration)
        macs_inputs = {"bam": output_files_generated["filtered"]}
        macs_metadt = {"bam": output_metadata['filtered']}

        if "bg_loc" in input_files:
            macs_inputs["bam_bg"] = output_files_generated["filtered_bg"]
            macs_metadt["bam_bg"] = output_metadata['filtered_bg']

        m_results_files, m_results_meta = macs_caller.run(
            macs_inputs,
            macs_metadt,
            # Outputs of the final step may match workflow outputs;
            # Extra entries in output_files will be disregarded.
            remap(output_files, 'narrow_peak', 'summits', 'broad_peak',
                  'gapped_peak'))

        if not m_results_meta:
            logger.fatal("MACS2 peak calling failed")

        # Copy across whichever peak files MACS2 reported.
        for role in ('narrow_peak', 'summits', 'broad_peak', 'gapped_peak'):
            if role in m_results_meta:
                _record(role, m_results_files, m_results_meta, role)

        return output_files_generated, output_metadata