示例#1
0
    def __init__(self, peaks, oracle, strand="+"):
        """
        Handle running IDR with more than 2 replicates.

        Default IDR does not handle more than 2 replicates. Here we run all
        pairwise IDR, then select the min number of peaks under the IDR
        threshold, and then return that many from the provided oracle.

        Parameters
        ----------
        peaks : list
            List of narrowPeak files or pybedtools.BedTool objects pointing to
            narrowPeak files

        oracle : string or pybedtools.BedTool
            Peaks to pull from, generally from original peaks that have been
            merged in some way.

        strand : +, -, .
            Assumes the entire object represents a single strand; specify it
            here.
        """
        #: list of peaks
        self.peaks = peaks

        #: BedTool of merged peaks to use as oracle
        self.oracle = pybedtools.BedTool(oracle)

        #: This object represents a single strand indicated here
        self.strand = strand

        # Simplified from idr.load_samples()
        # NOTE(review): indexes look like 0-based narrowPeak columns
        # (6 = signalValue, 9 = summit offset) -- confirm against the
        # narrowPeak spec.
        self.signal_type = "signal.value"
        self.signal_index = 6
        self.peak_merge_fn = sum
        self.summit_index = 9

        def _load(source):
            # Accept either a path or an object exposing its path as `.fn`
            # (as pybedtools.BedTool does), so that the documented "files or
            # BedTool objects" contract actually holds.
            path = getattr(source, "fn", source)
            # Close the handle as soon as loading is done instead of leaking
            # it. This relies on idr.load_bed consuming the whole file before
            # returning its in-memory structure.
            with open(path) as handle:
                return idr.load_bed(handle, self.signal_index,
                                    self.summit_index)

        #: Peaks loaded as internal IDR data structures
        self.fps = [_load(fn) for fn in self.peaks]

        #: Oracle peaks loaded as an internal IDR data structure
        self.oracle_peaks = _load(self.oracle)

        # self._build_oracle()

        #: Holds information from running IDR.
        #: Keys are frozenset([i, j]) indicating the pairwise IDRs between
        #: peaks i and j.
        self.idrs = {}
示例#2
0
    def _build_merged(self, idx1, idx2):
        """
        Initial stage used by IDR.

        Uses IDR's internal routine for merging peaks. This is intended to be
        called by self.idr, which only works with 2 replicates at a time, hence
        the hard-coding of idx1 and idx2. See self._multiway_merge() for
        merging more than 2 replicates.

        Parameters
        ----------

        idx1, idx2 : int
            Indexes into self.peaks


        Returns
        -------
        The result of ``idr.merge_peaks`` for the two selected replicates,
        merged with ``self.peak_merge_fn`` against ``self.oracle_peaks`` and
        with ``use_nonoverlapping_peaks=False``.
        """
        logger.info(
            f"Merging peaks for {self.peaks[idx1]} and {self.peaks[idx2]}")

        replicates = []
        for source in (self.peaks[idx1], self.peaks[idx2]):
            # Entries of self.peaks may be paths or objects exposing their
            # path as `.fn` (as pybedtools.BedTool does).
            path = getattr(source, "fn", source)
            # Use a context manager so the file handle is closed once the
            # replicate is loaded. This relies on idr.load_bed consuming the
            # whole file before returning.
            with open(path) as handle:
                replicates.append(
                    idr.load_bed(handle, self.signal_index,
                                 self.summit_index))

        merged_peaks = idr.merge_peaks(
            replicates,
            self.peak_merge_fn,
            self.oracle_peaks,
            use_nonoverlapping_peaks=False,
        )
        return merged_peaks
示例#3
0
    def _build_oracle(self):
        """
        Attempts at building an oracle. Deprecated, but retaining as fodder.

        Several alternative strategies are kept side by side and toggled with
        literal ``if 0:`` / ``if 1:`` switches; only the "clustered" strategy
        is currently enabled. Whichever strategy runs last binds ``oracle``,
        which is then stored on ``self.oracle`` and loaded into
        ``self.oracle_peaks``.
        """
        logger.info("Building oracle peaks...")

        # cat-and-merge strategy (disabled)
        if 0:
            oracle = (pybedtools.BedTool.from_dataframe(
                pybedtools.BedTool(self.peaks[0]).cat(
                    *self.peaks[1:], o="sum", c=5).to_dataframe().sort_values(
                        "name", ascending=False)).each(to_narrowpeak).saveas())

        # multiintersect strategy (disabled)
        if 0:
            h = pybedtools.BedTool().multi_intersect(i=self.peaks,
                                                     cluster=True)

            lim = str(len(self.peaks))

            def filt(x):
                # Keep only intervals whose 4th field equals the number of
                # replicates; returning None drops the interval.
                if x[3] != lim:
                    return
                return pybedtools.create_interval_from_list(
                    [x.chrom, str(x.start), str(x.stop)])

            oracle = h.each(filt).saveas()

        # clustered strategy (the one currently enabled)
        if 1:
            clustered = (pybedtools.BedTool(self.peaks[0]).cat(
                *self.peaks[1:],
                postmerge=False).sort().cluster().to_dataframe())

            def gen():
                # NOTE(review): "blockSizes" is presumably the cluster-id
                # column appended by .cluster(), which to_dataframe() labels
                # with that name for 10-column narrowPeak input -- confirm.
                for _, group in clustered.groupby("blockSizes"):
                    # Collapse each cluster to one interval spanning all of
                    # its members, with the member scores summed.
                    score = group["score"].sum()
                    start = group["start"].min()
                    stop = group["end"].max()
                    chrom = group["chrom"].unique()[0]
                    yield pybedtools.create_interval_from_list([
                        chrom,
                        str(start),
                        str(stop),
                        ".",
                        ".",
                        self.strand,
                        str(score),
                        "-1",
                        "-1",
                        "-1",
                    ])

            oracle = sort_by_score(pybedtools.BedTool(gen()).saveas())

        # IDR internal strategy (disabled)
        if 0:
            oracle = self._multiway_merge()

        # By the time we get here, should have `oracle`
        self.oracle = oracle
        # Close the handle once loaded rather than leaking it. This relies on
        # idr.load_bed consuming the whole file before returning.
        with open(oracle.fn) as handle:
            self.oracle_peaks = idr.load_bed(handle, self.signal_index,
                                             self.summit_index)