def __init__(self, peaks, oracle, strand="+"):
    """
    Class to handle running IDR with more than 2 replicates, which default
    IDR does not handle.

    Here we run all pairwise IDR, and then select the min number of peaks
    under the IDR threshold and then return that many from the provided
    oracle.

    Parameters
    ----------
    peaks : list
        List of narrowPeak files or pybedtools.BedTool objects pointing to
        narrowPeak files

    oracle : string or pybedtools.BedTool
        Peaks to pull from, generally from original peaks that have been
        merged in some way.

    strand : +, -, .
        Assumes the entire object represents a single strand; specify it
        here.
    """
    #: list of peaks
    self.peaks = peaks

    #: BedTool of merged peaks to use as oracle
    self.oracle = pybedtools.BedTool(oracle)

    #: This object represents a single strand indicated here
    self.strand = strand

    # Simplified from idr.load_samples(). narrowPeak columns (0-based):
    # signalValue is column 6, summit offset is column 9.
    self.signal_type = "signal.value"
    self.signal_index = 6
    self.peak_merge_fn = sum
    self.summit_index = 9

    #: Peaks loaded as internal IDR data structures.
    # Use context managers so each file handle is closed deterministically
    # instead of leaking until garbage collection (the original passed
    # open(fn) directly to idr.load_bed and never closed it).
    self.fps = []
    for fn in self.peaks:
        with open(fn) as fh:
            self.fps.append(
                idr.load_bed(fh, self.signal_index, self.summit_index)
            )

    with open(self.oracle.fn) as fh:
        self.oracle_peaks = idr.load_bed(
            fh, self.signal_index, self.summit_index
        )

    # self._build_oracle()

    #: Holds information from running IDR.
    #: Keys are frozenset([i, j]) indicating the pairwise IDRs between
    #: peaks i and j.
    self.idrs = {}
def _build_merged(self, idx1, idx2):
    """
    Initial stage used by IDR.

    Uses IDR's internal routine for merging peaks. This is intended to be
    called by self.idr, which only works with 2 replicates at a time, hence
    the hard-coding of idx1 and idx2. See self._multiway_merge() for
    merging more than 2 replicates.

    Parameters
    ----------
    idx1, idx2 : int
        Indexes into self.peaks

    Returns
    -------
    Merged peaks in IDR's internal data structure, as returned by
    idr.merge_peaks().
    """
    logger.info(
        f"Merging peaks for {self.peaks[idx1]} and {self.peaks[idx2]}")
    fn1 = self.peaks[idx1]
    fn2 = self.peaks[idx2]

    # Close each file handle deterministically rather than leaking it
    # (the original passed bare open(fp) calls to idr.load_bed).
    loaded = []
    for fp in (fn1, fn2):
        with open(fp) as fh:
            loaded.append(
                idr.load_bed(fh, self.signal_index, self.summit_index)
            )
    f1, f2 = loaded

    merged_peaks = idr.merge_peaks(
        [f1, f2],
        self.peak_merge_fn,
        self.oracle_peaks,
        use_nonoverlapping_peaks=False,
    )
    return merged_peaks
def _build_oracle(self):
    """
    Attempt at building an oracle peak set from self.peaks.

    Deprecated (not called from __init__; the call there is commented out),
    but retained as fodder. Several alternative strategies are kept below
    behind ``if 0:`` guards; only the "clustered" strategy is live.

    Side effects: overwrites self.oracle and self.oracle_peaks.
    """
    logger.info("Building oracle peaks...")

    # cat-and-merge strategy (disabled): concatenate and merge all peak
    # files, summing scores (c=5), then reformat rows to narrowPeak.
    if 0:
        oracle = (pybedtools.BedTool.from_dataframe(
            pybedtools.BedTool(self.peaks[0]).cat(
                *self.peaks[1:], o="sum", c=5).to_dataframe().sort_values(
                    "name", ascending=False)).each(to_narrowpeak).saveas())

    # multiintersect strategy (disabled): keep only intervals present in
    # every input file (column 3 of multi_intersect output is the count of
    # files overlapping the interval).
    if 0:
        h = pybedtools.BedTool().multi_intersect(i=self.peaks, cluster=True)
        lim = str(len(self.peaks))

        def filt(x):
            # Drop intervals not supported by all len(self.peaks) files.
            if x[3] != lim:
                return
            return pybedtools.create_interval_from_list(
                [x.chrom, str(x.start), str(x.stop)])

        oracle = h.each(filt).saveas()

    # clustered strategy (ACTIVE): concatenate without merging, sort,
    # assign cluster IDs to overlapping intervals, then collapse each
    # cluster into one interval spanning the cluster, with summed score.
    if 1:
        clustered = (pybedtools.BedTool(self.peaks[0]).cat(
            *self.peaks[1:], postmerge=False).sort().cluster().to_dataframe())

        def gen():
            # NOTE(review): "blockSizes" appears to be the dataframe column
            # holding the cluster ID appended by cluster() — confirm against
            # pybedtools' to_dataframe() column naming for this field count.
            for _, group in clustered.groupby("blockSizes"):
                score = group["score"].sum()
                start = group["start"].min()
                stop = group["end"].max()
                chrom = group["chrom"].unique()[0]
                # Emit a narrowPeak-shaped interval: score goes in the
                # signalValue slot (column 6); summit is unset (-1).
                yield pybedtools.create_interval_from_list([
                    chrom, str(start), str(stop), ".", ".", self.strand,
                    str(score), "-1", "-1", "-1",
                ])

        oracle = sort_by_score(pybedtools.BedTool(gen()).saveas())

    # IDR internal strategy (disabled): let IDR's own merge build the oracle.
    if 0:
        oracle = self._multiway_merge()

    # By the time we get here, should have `oracle`
    self.oracle = oracle
    self.oracle_peaks = idr.load_bed(open(oracle.fn), self.signal_index,
                                     self.summit_index)