def coolerInfo(cool: cooler.api.Cooler, k: str):
    """Look up a metadata field of a Cooler file, deriving it if absent

    The standard metadata fields are documented in:
    https://cooler.readthedocs.io/en/latest/schema.html#metadata

    When `k` is stored in `cool.info` that stored value is returned as is.
    Otherwise the fields 'sum', 'nbins', 'nnz' and 'nchroms' are computed
    from the pixel, bin and chromosome tables of `cool`.

    Args:
        cool (cooler.api.Cooler): Input Cooler object
        k (str): Key of the metadata field

    Returns:
        The stored or derived value of the requested metadata field

    Raises:
        KeyError: `k` is neither stored in `cool.info` nor one of the
            derivable fields
    """
    # Stored metadata always takes precedence over derived values.
    if k in cool.info:
        return cool.info[k]

    # Fallbacks are wrapped in lambdas so that only the requested one runs;
    # some of them have to touch every pixel record.
    derived = {
        'sum': lambda: cool.pixels()['count'][:].sum(),
        'nbins': lambda: cool.bins().shape[0],
        'nnz': lambda: cool.pixels().shape[0],
        'nchroms': lambda: cool.chroms().shape[0],
    }
    compute = derived.get(k)
    if compute is None:
        raise KeyError(f'Unable to retrieve metadata field \'{k}\'')
    return compute()
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool,
              chrNames: list = None, excludeChr: set = None):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
            input matrices. No smoothing is applied when `h` is not positive
        dBPMax `int` Only include contacts that are at most this genomic
            distance (bp) away. -1 means the number of bins of `cool1` is
            used as the (exclusive) upper bound on the diagonal index
        bDownSample: `bool` Down sample the input with more contacts
            to the same number of contacts as in the other input. When
            False both inputs are divided by their total contact counts
        chrNames: `list` List of chromosome names whose SCC to compute.
            Default to None, which means all chromosomes in the genome are
            used to compute SCC. Must not contain duplicates
        excludeChr: `set` Set of chromosome names to exclude from SCC
            computation. Default to None.

    Returns:
        `np.ndarray` of float SCC scores, one per selected chromosome, in
        the order of `chrNames` (or of the chromosome table of `cool1` when
        `chrNames` is None) after dropping the names in `excludeChr`

    Raises:
        AssertionError: the two inputs disagree in bin size, number of
            bins, number of chromosomes or chromosome names; `dBPMax` is
            smaller than the bin size; `chrNames` has duplicates; or a
            per-chromosome contact matrix is empty, non-square or differs
            in shape between the inputs
    """
    # NOTE(review): another `hicrepSCC` is defined further down in this file;
    # if both stay at module level the later one rebinds the name and this
    # version is never called -- confirm which definition is intended.
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # Compare the name columns directly. Indexing the result of
    # `DataFrame.all()` with `[0]` relies on positional lookup in a
    # string-labelled Series, which pandas has deprecated.
    chroms1 = cool1.chroms()[:]
    chroms2 = cool2.chroms()[:]
    assert (chroms1['name'] == chroms2['name']).all(),\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        # Materialise each bin table once; it is needed for both the
        # equality check and the median bin size.
        binTable1 = bins1[:]
        binTable2 = bins2[:]
        assert np.all(binTable1 == binTable2),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((binTable1["end"] - binTable1["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use median bin size from the first cooler file "
                      "to determine maximal diagonal index to include",
                      RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # The total number of contacts is only used as normalizing constant when
    # no down-sampling is requested, and may require a pass over all pixels,
    # so skip it otherwise.
    if not bDownSample:
        n1 = coolerInfo(cool1, 'sum')
        n2 = coolerInfo(cool2, 'sum')
    # Use dict here so that the chrNames don't duplicate
    if chrNames is None:
        chrNamesDict = dict.fromkeys(chroms1['name'].tolist())
    else:
        chrNamesDict = dict.fromkeys(chrNames)
    # It's important to preserve the order of the input chrNames so that the
    # user knows the order of the output SCC scores so we bail when encounter
    # duplicate names rather than implicit pruning the names.
    assert chrNames is None or len(chrNamesDict) == len(chrNames), f""" Found Duplicates in {chrNames}. Please remove them. """
    # filter out excluded chromosomes
    if excludeChr is None:
        excludeChr = set()
    chrNames = [
        chrName for chrName in chrNamesDict if chrName not in excludeChr
    ]
    # -2.0 is a placeholder; every entry is overwritten in the loop below
    scc = np.full(len(chrNames), -2.0)
    for iChr, chrName in enumerate(chrNames):
        # per-chromosome contact matrices; presumably the intra-chromosomal
        # block of each input -- see getSubCoo
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0, "Contact matrix 1 of chromosome %s is empty" % (
            chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0, "Contact matrix 2 of chromosome %s is empty" % (
            chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different input shape" % (chrName)
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: only the input with more contacts is resampled
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool,
              chrNames: list = None, excludeChr=frozenset({'M'})):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
            input matrices. No smoothing is applied when `h` is not positive
        dBPMax `int` Only include contacts that are at most this genomic
            distance (bp) away. -1 means the number of bins of `cool1` is
            used as the (exclusive) upper bound on the diagonal index
        bDownSample: `bool` Down sample the input with more contacts
            to the same number of contacts as in the other input. When
            False both inputs are divided by their total contact counts
        chrNames: `list` List of chromosome names whose SCC to compute.
            Default to None, which means all chromosomes of `cool1` are
            used. Must not contain duplicates
        excludeChr: set-like collection of chromosome names to exclude from
            SCC computation. Defaults to excluding only the mitochondria
            chromosome 'M', which is what this function always did before
            the parameter existed. Pass None or an empty set to exclude
            nothing

    Returns:
        `np.ndarray` of float SCC scores, one per selected chromosome, in
        the order of `chrNames` (or of the chromosome table of `cool1` when
        `chrNames` is None) after dropping the names in `excludeChr`

    Raises:
        AssertionError: the two inputs disagree in bin size, number of
            bins, number of chromosomes or chromosome names; `dBPMax` is
            smaller than the bin size; `chrNames` has duplicates; or a
            per-chromosome contact matrix is empty, non-square or differs
            in shape between the inputs
    """
    # NOTE(review): an earlier `hicrepSCC` in this file takes `chrNames` and
    # `excludeChr`; this later definition is the one that stays bound to the
    # name, so it accepts the same optional arguments to keep such calls
    # working. Consider removing one of the two definitions.
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # Compare the name columns directly. Indexing the result of
    # `DataFrame.all()` with `[0]` relies on positional lookup in a
    # string-labelled Series, which pandas has deprecated.
    chroms1 = cool1.chroms()[:]
    chroms2 = cool2.chroms()[:]
    assert (chroms1['name'] == chroms2['name']).all(),\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        # Materialise each bin table once; it is needed for both the
        # equality check and the median bin size.
        binTable1 = bins1[:]
        binTable2 = bins2[:]
        assert np.all(binTable1 == binTable2),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((binTable1["end"] - binTable1["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use median bin size from the first cooler file "
                      "to determine maximal diagonal index to include",
                      RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # The total number of contacts is only used as normalizing constant when
    # no down-sampling is requested, and may require a pass over all pixels,
    # so skip it otherwise.
    if not bDownSample:
        n1 = coolerInfo(cool1, 'sum')
        n2 = coolerInfo(cool2, 'sum')
    if chrNames is None:
        selected = chroms1['name'].tolist()
    else:
        selected = list(chrNames)
        # Output order follows the input order, so refuse duplicates rather
        # than silently pruning them.
        assert len(set(selected)) == len(selected),\
            f"Found Duplicates in {chrNames}. Please remove them."
    # filter out excluded chromosomes (by default the mitochondria one)
    if excludeChr is None:
        excludeChr = frozenset()
    chrNames = [name for name in selected if name not in excludeChr]
    # -2.0 is a placeholder; every entry is overwritten in the loop below
    scc = np.full(len(chrNames), -2.0)
    for iChr, chrName in enumerate(chrNames):
        # per-chromosome contact matrices; presumably the intra-chromosomal
        # block of each input -- see getSubCoo
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0, "Contact matrix 1 of chromosome %s is empty" % (
            chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0, "Contact matrix 2 of chromosome %s is empty" % (
            chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different input shape" % (chrName)
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: only the input with more contacts is resampled
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc