def cool2pixels(cool: cooler.api.Cooler):
    """Select the contact matrix of a cooler in "pixels" format.

    The selector is requested unbalanced (raw counts) and sparse.

    Args:
        cool: Input cooler object.

    Returns:
        Whatever ``cool.matrix`` returns for these options; according to the
        original documentation a ``cooler.core.RangeSelector2D`` object.
    """
    selector_options = {
        'as_pixels': True,
        'balance': False,
        'sparse': True,
    }
    return cool.matrix(**selector_options)
def get_frag(
    c: cooler.api.Cooler,
    resolution: int,
    offsets: pd.core.series.Series,
    chrom1: str,
    start1: int,
    end1: int,
    chrom2: str,
    start2: int,
    end2: int,
    width: int = 22,
    height: int = -1,
    padding: int = 10,
    normalize: bool = True,
    balanced: bool = True,
    percentile: float = 100.0,
    ignore_diags: int = 0,
    no_normalize: bool = False
) -> np.ndarray:
    """Retrieve a matrix fragment.

    The requested genomic region is converted to bin coordinates, padded,
    read from the cooler as pixels, mirrored, optionally balanced, scaled
    down to `width` x `height` if it is larger, and finally normalized.

    Args:
        c: Cooler object.
        resolution: Divisor used to convert base pair positions into bin
            indices (``position / resolution``).
        offsets: Pandas Series of chromosome offsets in bins.
        chrom1: Chromosome 1. E.g.: `1` or `chr1`.
        start1: First start position in base pairs relative to `chrom1`.
        end1: First end position in base pairs relative to `chrom1`.
        chrom2: Chromosome 2. E.g.: `1` or `chr1`.
        start2: Second start position in base pairs relative to `chrom2`.
        end2: Second end position in base pairs relative to `chrom2`.
        width: Width of the fragment in pixels.
        height: Height of the fragments in pixels. If `-1` `height` will
            equal `width`. Defaults to `-1`.
        padding: Percental padding related to the dimension of the fragment.
            E.g., 10 = 10% padding (5% per side). Clamped to [0, 100].
            Defaults to `10`.
        normalize: Not read by this function; kept for backward
            compatibility. Use `no_normalize` to disable normalization.
        balanced: If `True` the counts are multiplied by the bin weights
            of both bins. Defaults to `True`.
        percentile: Percentile clip. E.g., For 99 the maximum will be capped
            at the 99-percentile. Defaults to `100.0`.
        ignore_diags: Number of diagonals to be ignored. Only supported for
            square fragments (`width == height`). Defaults to `0`.
        no_normalize: If `True` the returned matrix is not normalized.
            Defaults to `False`.

    Returns:
        2D array holding the fragment. Its shape is `(width, height)` when
        the fragment had to be scaled down, otherwise the shape of the
        padded bin region. Ignored diagonal cells are set to `1.0`; when the
        fragment was not scaled, cells that were NaN are set to `-1`.

    Raises:
        KeyError: If a chromosome is found in `offsets` neither as given nor
            with a `chr` prefix.
        SnippetTooLarge: If either dimension of the padded region exceeds
            `hss.SNIPPET_MAT_MAX_DATA_DIM`.
    """
    # Compare by value; identity comparison against an int literal is
    # implementation-dependent.
    if height == -1:
        height = width

    # Restrict padding to be [0, 100]%
    padding = min(100, max(0, padding)) / 100

    try:
        offset1 = offsets[chrom1]
        offset2 = offsets[chrom2]
    except KeyError:
        # One more try before we will fail miserably
        offset1 = offsets['chr{}'.format(chrom1)]
        offset2 = offsets['chr{}'.format(chrom2)]

    start_bin1 = offset1 + int(round(float(start1) / resolution))
    end_bin1 = offset1 + int(round(float(end1) / resolution)) + 1

    start_bin2 = offset2 + int(round(float(start2) / resolution))
    end_bin2 = offset2 + int(round(float(end2) / resolution)) + 1

    # Apply percentile padding
    padding1 = int(round(((end_bin1 - start_bin1) / 2) * padding))
    padding2 = int(round(((end_bin2 - start_bin2) / 2) * padding))
    start_bin1 -= padding1
    start_bin2 -= padding2
    end_bin1 += padding1
    end_bin2 += padding2

    # Get the size of the region
    dim1 = end_bin1 - start_bin1
    dim2 = end_bin2 - start_bin2

    # Get additional absolute padding if needed
    padding1 = 0
    if dim1 < width:
        padding1 = int((width - dim1) / 2)
        start_bin1 -= padding1
        end_bin1 += padding1

    padding2 = 0
    if dim2 < height:
        padding2 = int((height - dim2) / 2)
        start_bin2 -= padding2
        end_bin2 += padding2

    # In case the final dimension does not match the desired dimension we
    # increase the end bin. This can be caused when the padding is not
    # divisible by 2, since the padding is rounded to the nearest integer.
    abs_dim1 = abs(start_bin1 - end_bin1)
    if abs_dim1 < width:
        end_bin1 += width - abs_dim1
        abs_dim1 = width

    abs_dim2 = abs(start_bin2 - end_bin2)
    if abs_dim2 < height:
        end_bin2 += height - abs_dim2
        abs_dim2 = height

    # Enforce the maximum data dimension (hss.SNIPPET_MAT_MAX_DATA_DIM)
    if abs_dim1 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()

    if abs_dim2 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()

    # Finally, adjust to negative values.
    # Since relative bin IDs are adjusted by the start this will lead to a
    # white offset.
    real_start_bin1 = start_bin1 if start_bin1 >= 0 else 0
    real_start_bin2 = start_bin2 if start_bin2 >= 0 else 0

    # Get the data
    data = c.matrix(
        as_pixels=True, balance=False, max_chunk=np.inf
    )[real_start_bin1:end_bin1, real_start_bin2:end_bin2]

    # Annotate pixels for balancing
    bins = c.bins(convert_enum=False)[['weight']]
    data = cooler.annotate(data, bins, replace=False)

    # Calculate relative bin IDs
    rel_bin1 = np.add(data['bin1_id'].values, -start_bin1)
    rel_bin2 = np.add(data['bin2_id'].values, -start_bin2)

    # Balance counts
    if balanced:
        values = data['count'].values.astype(np.float32)
        values *= data['weight1'].values * data['weight2'].values
    else:
        values = data['count'].values

    # Get pixel IDs for the upper triangle
    # NOTE(review): the flat index uses `abs_dim1` as row stride while the
    # array is reshaped to (abs_dim1, abs_dim2); this looks like it assumes
    # a square region -- confirm for non-square fragments.
    idx1 = np.add(np.multiply(rel_bin1, abs_dim1), rel_bin2)

    # Mirror matrix
    idx2_1 = np.add(data['bin2_id'].values, -start_bin1)
    idx2_2 = np.add(data['bin1_id'].values, -start_bin2)
    idx2 = np.add(np.multiply(idx2_1, abs_dim1), idx2_2)
    # NOTE(review): only the upper bound of the mirrored row and the lower
    # bound of the mirrored column are checked here -- confirm that the
    # other two bounds cannot be violated.
    validBins = np.where((idx2_1 < abs_dim1) & (idx2_2 >= 0))

    # Ignore diagonals
    diags_start_row = None
    if ignore_diags > 0:
        try:
            diags_start_idx = np.min(
                np.where(data['bin1_id'].values == data['bin2_id'].values))
            diags_start_row = (
                rel_bin1[diags_start_idx] - rel_bin2[diags_start_idx])
        except ValueError:
            # No pixel on the main diagonal within the fetched data
            pass

    # Copy pixel values onto the final array
    frag_len = abs_dim1 * abs_dim2
    frag = np.zeros(frag_len, dtype=np.float32)

    # Make sure we're within the bounds
    idx1_f = np.where(idx1 < frag_len)
    frag[idx1[idx1_f]] = values[idx1_f]
    frag[idx2[validBins]] = values[validBins]
    frag = frag.reshape((abs_dim1, abs_dim2))

    # Store low quality bins
    low_quality_bins = np.where(np.isnan(frag))

    # Assign 0 for now to avoid influencing the max values
    frag[low_quality_bins] = 0

    # Scale fragment down if needed
    scaled = False
    scale_x = width / frag.shape[0]
    if frag.shape[0] > width or frag.shape[1] > height:
        scaledFrag = np.zeros((width, height), float)
        frag = scaledFrag + zoomArray(frag, scaledFrag.shape, order=1)
        scaled = True

    # Normalize by minimum
    if not no_normalize:
        min_val = np.min(frag)
        frag -= min_val

    ignored_idx = None

    # Remove diagonals
    if ignore_diags > 0 and diags_start_row is not None:
        if width == height:
            # NOTE(review): `scale_x` is target size / source size, so
            # dividing by it enlarges the row offset when the fragment was
            # scaled down -- confirm whether a multiplication was intended.
            scaled_row = int(np.rint(diags_start_row / scale_x))

            idx = np.diag_indices(width)
            # Must be a tuple: NumPy interprets a list of index arrays as a
            # single index array on the first axis instead of one index per
            # axis, which would overwrite whole rows.
            scaled_idx = (
                idx
                if scaled_row == 0
                else (idx[0][scaled_row:], idx[0][:-scaled_row])
            )

            for i in range(ignore_diags):
                # First set all cells to be ignored to `-1` so that we can
                # easily query for them later.
                if i == 0:
                    frag[scaled_idx] = -1
                else:
                    dist_to_diag = scaled_row - i
                    dist_neg = min(0, dist_to_diag)
                    off = 0 if dist_to_diag >= 0 else i - scaled_row

                    # Above diagonal
                    frag[
                        ((scaled_idx[0] - i)[off:], (scaled_idx[1])[off:])
                    ] = -1

                    # Extra cutoff at the bottom right
                    frag[
                        (
                            range(
                                scaled_idx[0][-1] - i,
                                scaled_idx[0][-1] + 1 + dist_neg,
                            ),
                            range(
                                scaled_idx[1][-1],
                                scaled_idx[1][-1] + i + 1 + dist_neg,
                            ),
                        )
                    ] = -1

                    # Below diagonal
                    frag[
                        ((scaled_idx[0] + i)[:-i], (scaled_idx[1])[:-i])
                    ] = -1

            # Save the final selection of ignored cells for fast access
            # later and set those values to `0` now.
            ignored_idx = np.where(frag == -1)
            frag[ignored_idx] = 0

        else:
            logger.warning(
                'Ignoring the diagonal only supported for squared features')

    # Cap by percentile
    max_val = np.percentile(frag, percentile)
    frag = np.clip(frag, 0, max_val)

    # Normalize by maximum
    if not no_normalize and max_val > 0:
        frag /= max_val

    # Set the ignored diagonal to the maximum
    if ignored_idx is not None:
        frag[ignored_idx] = 1.0

    if not scaled:
        # Recover low quality bins
        frag[low_quality_bins] = -1

    return frag