import warnings

import cooler
import numpy as np
import pandas as pd

# Project-level helpers referenced below (cool2pixels, getSubCoo, trimDiags,
# resample, meanFilterSparse, sccByDiag, zoomArray, SnippetTooLarge, hss,
# logger) are assumed to be defined or imported elsewhere in this module.


def coolerInfo(cool: cooler.api.Cooler, k: str):
    """Retrieve metadata from a Cooler file

    The required metadata fields are documented in:
    https://cooler.readthedocs.io/en/latest/schema.html#metadata

    This function attempts to return the field requested by the key `k`
    directly from the Cooler object `cool`; if the field is absent, it
    computes the value from the contact matrix for certain types of metadata.

    Args:
        cool (cooler.api.Cooler): Input Cooler object
        k (str): Key of the metadata field

    Returns:
        Requested metadata
    """
    if k in cool.info:
        return cool.info[k]
    elif k == 'sum':
        return cool.pixels()['count'][:].sum()
    elif k == 'nbins':
        return cool.bins().shape[0]
    elif k == 'nnz':
        return cool.pixels().shape[0]
    elif k == 'nchroms':
        return cool.chroms().shape[0]
    else:
        raise KeyError(f"Unable to retrieve metadata field '{k}'")
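

# Usage sketch for coolerInfo; the file path is hypothetical and stands in
# for any single-resolution .cool file that follows the Cooler schema.
def _demo_coolerInfo():
    cool = cooler.Cooler('examples/sample.cool')
    # 'sum' is read from the file metadata when present; otherwise it is
    # computed by summing the 'count' column of the pixel table.
    totalContacts = coolerInfo(cool, 'sum')
    # 'nbins' likewise falls back to the number of rows in the bin table.
    nBins = coolerInfo(cool, 'nbins')
    print(totalContacts, nBins)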


def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool,
              chrNames: list = None, excludeChr: set = None):
    """Compute the HiCRep SCC score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices
        dBPMax: `int` Only include contacts that are at most this genomic
        distance (bp) away
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input
        chrNames: `list` List of chromosome names whose SCC to compute.
        Defaults to None, which means all chromosomes in the genome are
        used to compute SCC
        excludeChr: `set` Set of chromosome names to exclude from SCC
        computation. Defaults to None.

    Returns:
        `np.ndarray` of SCC scores, one per chromosome
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different numbers of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different numbers of chromosomes"
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all()[0],\
        "Input files have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # Sometimes the bin size can be None, e.g., when the input cool file
        # has non-uniform bins.
        assert np.all(bins1[:] == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use the median bin size from the first cooler "
                      "file to determine the maximal diagonal index to "
                      "include", RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPMax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # Get the total number of contacts as the normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    # Use a dict here so that the chrNames don't duplicate
    if chrNames is None:
        chrNamesDict = dict.fromkeys(cool1.chroms()[:]['name'].tolist())
    else:
        chrNamesDict = dict.fromkeys(chrNames)
    # It's important to preserve the order of the input chrNames so that the
    # user knows the order of the output SCC scores, so we bail when we
    # encounter duplicate names rather than implicitly pruning them.
    assert chrNames is None or len(chrNamesDict) == len(chrNames),\
        f"Found duplicates in {chrNames}. Please remove them."
    # Filter out excluded chromosomes
    if excludeChr is None:
        excludeChr = set()
    chrNames = [chrName for chrName in chrNamesDict
                if chrName not in excludeChr]
    scc = np.full(len(chrNames), -2.0)
    for iChr, chrName in enumerate(chrNames):
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            "Contact matrix 1 of chromosome %s is empty" % (chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            "Contact matrix 2 of chromosome %s is empty" % (chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different shapes" % (chrName)
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # Remove the major diagonal and all diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        del mS1
        del mS2
        if bDownSample:
            # Down sample the input with more contacts to match the other
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # Just normalize by the total number of contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # Smooth the matrices with a (2h + 1) x (2h + 1) mean filter
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc


def get_frag(c: cooler.api.Cooler,
             resolution: int,
             offsets: pd.core.series.Series,
             chrom1: str,
             start1: int,
             end1: int,
             chrom2: str,
             start2: int,
             end2: int,
             width: int = 22,
             height: int = -1,
             padding: int = 10,
             normalize: bool = True,
             balanced: bool = True,
             percentile: float = 100.0,
             ignore_diags: int = 0,
             no_normalize: bool = False) -> np.ndarray:
    """
    Retrieve a matrix fragment.

    Args:
        c: Cooler object.
        resolution: Matrix resolution in base pairs per bin.
        offsets: Pandas Series of chromosome offsets in bins.
        chrom1: Chromosome 1. E.g.: `1` or `chr1`.
        start1: First start position in base pairs relative to `chrom1`.
        end1: First end position in base pairs relative to `chrom1`.
        chrom2: Chromosome 2. E.g.: `1` or `chr1`.
        start2: Second start position in base pairs relative to `chrom2`.
        end2: Second end position in base pairs relative to `chrom2`.
        width: Width of the fragment in pixels.
        height: Height of the fragment in pixels. If `-1`, `height` will
            equal `width`. Defaults to `-1`.
        padding: Percentage padding relative to the dimension of the
            fragment. E.g., 10 = 10% padding (5% per side). Defaults to `10`.
        normalize: If `True` the fragment will be normalized to [0, 1].
            Defaults to `True`.
        balanced: If `True` the fragment will be balanced using Cooler.
            Defaults to `True`.
        percentile: Percentile clip. E.g., for 99 the maximum will be capped
            at the 99th percentile. Defaults to `100.0`.
        ignore_diags: Number of diagonals to be ignored, i.e., set to 0.
            Defaults to `0`.
        no_normalize: If `True` the returned matrix is not normalized.
            Defaults to `False`.

    Returns:
        The fragment as a 2D NumPy array.
    """
    if height == -1:
        height = width

    # Restrict padding to [0, 100]%
    padding = min(100, max(0, padding)) / 100

    try:
        offset1 = offsets[chrom1]
        offset2 = offsets[chrom2]
    except KeyError:
        # One more try before we fail miserably
        offset1 = offsets['chr{}'.format(chrom1)]
        offset2 = offsets['chr{}'.format(chrom2)]

    start_bin1 = offset1 + int(round(float(start1) / resolution))
    end_bin1 = offset1 + int(round(float(end1) / resolution)) + 1
    start_bin2 = offset2 + int(round(float(start2) / resolution))
    end_bin2 = offset2 + int(round(float(end2) / resolution)) + 1

    # Apply percentage padding
    padding1 = int(round(((end_bin1 - start_bin1) / 2) * padding))
    padding2 = int(round(((end_bin2 - start_bin2) / 2) * padding))
    start_bin1 -= padding1
    start_bin2 -= padding2
    end_bin1 += padding1
    end_bin2 += padding2

    # Get the size of the region
    dim1 = end_bin1 - start_bin1
    dim2 = end_bin2 - start_bin2

    # Get additional absolute padding if needed
    padding1 = 0
    if dim1 < width:
        padding1 = int((width - dim1) / 2)
        start_bin1 -= padding1
        end_bin1 += padding1

    padding2 = 0
    if dim2 < height:
        padding2 = int((height - dim2) / 2)
        start_bin2 -= padding2
        end_bin2 += padding2

    # In case the final dimension does not match the desired dimension we
    # increase the end bin. This can happen when the padding is not divisible
    # by 2, since the padding is rounded to the nearest integer.
    abs_dim1 = abs(start_bin1 - end_bin1)
    if abs_dim1 < width:
        end_bin1 += width - abs_dim1
        abs_dim1 = width

    abs_dim2 = abs(start_bin2 - end_bin2)
    if abs_dim2 < height:
        end_bin2 += height - abs_dim2
        abs_dim2 = height

    # Maximum width / height is restricted by the server configuration
    if abs_dim1 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()
    if abs_dim2 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()

    # Finally, adjust for negative values.
    # Since relative bin IDs are adjusted by the start this will lead to a
    # white offset.
    real_start_bin1 = start_bin1 if start_bin1 >= 0 else 0
    real_start_bin2 = start_bin2 if start_bin2 >= 0 else 0

    # Get the data
    data = c.matrix(as_pixels=True, balance=False, max_chunk=np.inf)[
        real_start_bin1:end_bin1, real_start_bin2:end_bin2
    ]

    # Annotate pixels for balancing
    bins = c.bins(convert_enum=False)[['weight']]
    data = cooler.annotate(data, bins, replace=False)

    # Calculate relative bin IDs
    rel_bin1 = np.add(data['bin1_id'].values, -start_bin1)
    rel_bin2 = np.add(data['bin2_id'].values, -start_bin2)

    # Balance counts
    if balanced:
        values = data['count'].values.astype(np.float32)
        values *= data['weight1'].values * data['weight2'].values
    else:
        values = data['count'].values

    # Get pixel IDs for the upper triangle
    idx1 = np.add(np.multiply(rel_bin1, abs_dim1), rel_bin2)

    # Mirror matrix
    idx2_1 = np.add(data['bin2_id'].values, -start_bin1)
    idx2_2 = np.add(data['bin1_id'].values, -start_bin2)
    idx2 = np.add(np.multiply(idx2_1, abs_dim1), idx2_2)
    validBins = np.where((idx2_1 < abs_dim1) & (idx2_2 >= 0))

    # Ignore diagonals
    diags_start_row = None
    if ignore_diags > 0:
        try:
            diags_start_idx = np.min(
                np.where(data['bin1_id'].values == data['bin2_id'].values)
            )
            diags_start_row = (rel_bin1[diags_start_idx]
                               - rel_bin2[diags_start_idx])
        except ValueError:
            pass

    # Copy pixel values onto the final array
    frag_len = abs_dim1 * abs_dim2
    frag = np.zeros(frag_len, dtype=np.float32)
    # Make sure we're within the bounds
    idx1_f = np.where(idx1 < frag_len)
    frag[idx1[idx1_f]] = values[idx1_f]
    frag[idx2[validBins]] = values[validBins]
    frag = frag.reshape((abs_dim1, abs_dim2))

    # Store low quality bins
    low_quality_bins = np.where(np.isnan(frag))

    # Assign 0 for now to avoid influencing the max values
    frag[low_quality_bins] = 0

    # Scale the fragment down if needed
    scaled = False
    scale_x = width / frag.shape[0]
    if frag.shape[0] > width or frag.shape[1] > height:
        scaledFrag = np.zeros((width, height), float)
        frag = scaledFrag + zoomArray(frag, scaledFrag.shape, order=1)
        scaled = True

    # Normalize by the minimum
    if not no_normalize:
        min_val = np.min(frag)
        frag -= min_val

    ignored_idx = None

    # Remove diagonals
    if ignore_diags > 0 and diags_start_row is not None:
        if width == height:
            scaled_row = int(np.rint(diags_start_row / scale_x))
            idx = np.diag_indices(width)
            scaled_idx = (idx if scaled_row == 0
                          else [idx[0][scaled_row:], idx[0][:-scaled_row]])

            for i in range(ignore_diags):
                # First set all cells to be ignored to `-1` so that we can
                # easily query for them later.
                if i == 0:
                    frag[scaled_idx] = -1
                else:
                    dist_to_diag = scaled_row - i
                    dist_neg = min(0, dist_to_diag)
                    off = 0 if dist_to_diag >= 0 else i - scaled_row

                    # Above the diagonal
                    frag[((scaled_idx[0] - i)[off:],
                          (scaled_idx[1])[off:])] = -1

                    # Extra cutoff at the bottom right
                    frag[(range(scaled_idx[0][-1] - i,
                                scaled_idx[0][-1] + 1 + dist_neg),
                          range(scaled_idx[1][-1],
                                scaled_idx[1][-1] + i + 1 + dist_neg))] = -1

                    # Below the diagonal
                    frag[((scaled_idx[0] + i)[:-i],
                          (scaled_idx[1])[:-i])] = -1

            # Save the final selection of ignored cells for fast access later
            # and set those values to `0` now.
            ignored_idx = np.where(frag == -1)
            frag[ignored_idx] = 0
        else:
            logger.warning(
                'Ignoring the diagonal is only supported for square features'
            )

    # Cap at the given percentile
    max_val = np.percentile(frag, percentile)
    frag = np.clip(frag, 0, max_val)

    # Normalize by the maximum
    if not no_normalize and max_val > 0:
        frag /= max_val

    # Set the ignored diagonal to the maximum
    if ignored_idx is not None:
        frag[ignored_idx] = 1.0

    if not scaled:
        # Recover low quality bins
        frag[low_quality_bins] = -1

    return frag
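

# Usage sketch for get_frag. The file path is hypothetical. The offsets
# Series maps each chromosome name to the index of its first genome-wide
# bin, which can be built from Cooler's own offset() lookup.
def _demo_get_frag():
    c = cooler.Cooler('examples/matrix.cool')
    offsets = pd.Series({name: c.offset(name) for name in c.chromnames})
    frag = get_frag(c, c.binsize, offsets,
                    chrom1='chr1', start1=1000000, end1=2000000,
                    chrom2='chr1', start2=1000000, end2=2000000,
                    width=64, ignore_diags=1)
    # frag is a 64 x 64 float array, normalized to [0, 1] by default.
    print(frag.shape)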


# Earlier variant of hicrepSCC without the chrNames/excludeChr parameters:
# it computes SCC for every chromosome except the mitochondrial
# chromosome 'M'.
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool):
    """Compute the HiCRep SCC score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices
        dBPMax: `int` Only include contacts that are at most this genomic
        distance (bp) away
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input

    Returns:
        `np.ndarray` of SCC scores, one per chromosome
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different numbers of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different numbers of chromosomes"
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all()[0],\
        "Input files have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # Sometimes the bin size can be None, e.g., when the input cool file
        # has non-uniform bins.
        assert np.all(bins1[:] == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use the median bin size from the first cooler "
                      "file to determine the maximal diagonal index to "
                      "include", RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPMax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # Get the total number of contacts as the normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    chrNames = cool1.chroms()[:]['name'].to_numpy()
    # Filter out the mitochondrial chromosome
    chrNames = np.array([name for name in chrNames if name != 'M'])
    scc = np.full(chrNames.shape[0], -2.0)
    for iChr in range(chrNames.shape[0]):
        chrName = chrNames[iChr]
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            "Contact matrix 1 of chromosome %s is empty" % (chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            "Contact matrix 2 of chromosome %s is empty" % (chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different shapes" % (chrName)
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # Remove the major diagonal and all diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        del mS1
        del mS2
        if bDownSample:
            # Down sample the input with more contacts to match the other
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # Just normalize by the total number of contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # Smooth the matrices with a (2h + 1) x (2h + 1) mean filter
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc
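

# Illustration of the smoothing controlled by h. This is not the actual
# meanFilterSparse implementation (which operates on sparse matrices and may
# treat boundaries differently); it only shows the (2h + 1) x (2h + 1) mean
# filter that h parameterizes, applied to a small dense toy matrix.
def _demo_mean_filter():
    from scipy.ndimage import uniform_filter
    h = 1
    m = np.arange(25, dtype=float).reshape(5, 5)
    # Each output entry is the mean over a (2h + 1) x (2h + 1) neighborhood.
    smoothed = uniform_filter(m, size=2 * h + 1, mode='constant')
    print(smoothed)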