import warnings

import cooler
import numpy as np
import pandas as pd

# Project-level helpers referenced below (cool2pixels, getSubCoo, trimDiags,
# resample, meanFilterSparse, sccByDiag, zoomArray, SnippetTooLarge, hss,
# logger) are assumed to be defined or imported elsewhere in this module.


def coolerInfo(cool: cooler.api.Cooler, k: str):
    """Retrieve metadata from a Cooler file

    The required metadata fields are documented in:
    https://cooler.readthedocs.io/en/latest/schema.html#metadata

    This function attempts to return the field requested by the key `k`
    directly from the Cooler object `cool`; if the field is absent, it
    computes the value from the contact matrix for certain types of metadata.

    Args:
        cool (cooler.api.Cooler): Input Cooler object
        k (str): Key of the metadata field

    Returns:
        Requested metadata
    """
    if k in cool.info:
        return cool.info[k]
    elif k == 'sum':
        return cool.pixels()['count'][:].sum()
    elif k == 'nbins':
        return cool.bins().shape[0]
    elif k == 'nnz':
        return cool.pixels().shape[0]
    elif k == 'nchroms':
        return cool.chroms().shape[0]
    else:
        raise KeyError(f"Unable to retrieve metadata field '{k}'")
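

# Usage sketch for coolerInfo; the file path is hypothetical and stands in
# for any single-resolution .cool file that follows the Cooler schema.
def _demo_coolerInfo():
    cool = cooler.Cooler('examples/sample.cool')
    # 'sum' is read from the file metadata when present; otherwise it is
    # computed by summing the 'count' column of the pixel table.
    totalContacts = coolerInfo(cool, 'sum')
    # 'nbins' likewise falls back to the number of rows in the bin table.
    nBins = coolerInfo(cool, 'nbins')
    print(totalContacts, nBins)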


def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool,
              chrNames: list = None, excludeChr: set = None):
    """Compute the HiCRep SCC score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices
        dBPMax: `int` Only include contacts that are at most this genomic
        distance (bp) away
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input
        chrNames: `list` List of chromosome names whose SCC to compute.
        Defaults to None, which means all chromosomes in the genome are
        used to compute SCC
        excludeChr: `set` Set of chromosome names to exclude from SCC
        computation. Defaults to None.

    Returns:
        `np.ndarray` of SCC scores, one per chromosome
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different numbers of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different numbers of chromosomes"
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all()[0],\
        "Input files have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # Sometimes the bin size can be None, e.g., when the input cool file
        # has non-uniform bins.
        assert np.all(bins1[:] == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use the median bin size from the first cooler "
                      "file to determine the maximal diagonal index to "
                      "include", RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPMax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # Get the total number of contacts as the normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    # Use a dict here so that the chrNames don't duplicate
    if chrNames is None:
        chrNamesDict = dict.fromkeys(cool1.chroms()[:]['name'].tolist())
    else:
        chrNamesDict = dict.fromkeys(chrNames)
    # It's important to preserve the order of the input chrNames so that the
    # user knows the order of the output SCC scores, so we bail when we
    # encounter duplicate names rather than implicitly pruning them.
    assert chrNames is None or len(chrNamesDict) == len(chrNames),\
        f"Found duplicates in {chrNames}. Please remove them."
    # Filter out excluded chromosomes
    if excludeChr is None:
        excludeChr = set()
    chrNames = [chrName for chrName in chrNamesDict
                if chrName not in excludeChr]
    scc = np.full(len(chrNames), -2.0)
    for iChr, chrName in enumerate(chrNames):
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            "Contact matrix 1 of chromosome %s is empty" % (chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            "Contact matrix 2 of chromosome %s is empty" % (chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different shapes" % (chrName)
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # Remove the major diagonal and all diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        del mS1
        del mS2
        if bDownSample:
            # Down sample the input with more contacts to match the other
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # Just normalize by the total number of contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # Smooth the matrices with a (2h + 1) x (2h + 1) mean filter
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc


def get_frag(c: cooler.api.Cooler,
             resolution: int,
             offsets: pd.core.series.Series,
             chrom1: str,
             start1: int,
             end1: int,
             chrom2: str,
             start2: int,
             end2: int,
             width: int = 22,
             height: int = -1,
             padding: int = 10,
             normalize: bool = True,
             balanced: bool = True,
             percentile: float = 100.0,
             ignore_diags: int = 0,
             no_normalize: bool = False) -> np.ndarray:
    """
    Retrieve a matrix fragment.

    Args:
        c: Cooler object.
        resolution: Matrix resolution in base pairs per bin.
        offsets: Pandas Series of chromosome offsets in bins.
        chrom1: Chromosome 1. E.g.: `1` or `chr1`.
        start1: First start position in base pairs relative to `chrom1`.
        end1: First end position in base pairs relative to `chrom1`.
        chrom2: Chromosome 2. E.g.: `1` or `chr1`.
        start2: Second start position in base pairs relative to `chrom2`.
        end2: Second end position in base pairs relative to `chrom2`.
        width: Width of the fragment in pixels.
        height: Height of the fragment in pixels. If `-1`, `height` will
            equal `width`. Defaults to `-1`.
        padding: Percentage padding relative to the dimension of the
            fragment. E.g., 10 = 10% padding (5% per side). Defaults to `10`.
        normalize: If `True` the fragment will be normalized to [0, 1].
            Defaults to `True`.
        balanced: If `True` the fragment will be balanced using Cooler.
            Defaults to `True`.
        percentile: Percentile clip. E.g., for 99 the maximum will be capped
            at the 99th percentile. Defaults to `100.0`.
        ignore_diags: Number of diagonals to be ignored, i.e., set to 0.
            Defaults to `0`.
        no_normalize: If `True` the returned matrix is not normalized.
            Defaults to `False`.

    Returns:
        The fragment as a 2D NumPy array.
    """
    if height == -1:
        height = width

    # Restrict padding to [0, 100]%
    padding = min(100, max(0, padding)) / 100

    try:
        offset1 = offsets[chrom1]
        offset2 = offsets[chrom2]
    except KeyError:
        # One more try before we fail miserably
        offset1 = offsets['chr{}'.format(chrom1)]
        offset2 = offsets['chr{}'.format(chrom2)]

    start_bin1 = offset1 + int(round(float(start1) / resolution))
    end_bin1 = offset1 + int(round(float(end1) / resolution)) + 1
    start_bin2 = offset2 + int(round(float(start2) / resolution))
    end_bin2 = offset2 + int(round(float(end2) / resolution)) + 1

    # Apply percentage padding
    padding1 = int(round(((end_bin1 - start_bin1) / 2) * padding))
    padding2 = int(round(((end_bin2 - start_bin2) / 2) * padding))
    start_bin1 -= padding1
    start_bin2 -= padding2
    end_bin1 += padding1
    end_bin2 += padding2

    # Get the size of the region
    dim1 = end_bin1 - start_bin1
    dim2 = end_bin2 - start_bin2

    # Get additional absolute padding if needed
    padding1 = 0
    if dim1 < width:
        padding1 = int((width - dim1) / 2)
        start_bin1 -= padding1
        end_bin1 += padding1

    padding2 = 0
    if dim2 < height:
        padding2 = int((height - dim2) / 2)
        start_bin2 -= padding2
        end_bin2 += padding2

    # In case the final dimension does not match the desired dimension we
    # increase the end bin. This can happen when the padding is not divisible
    # by 2, since the padding is rounded to the nearest integer.
    abs_dim1 = abs(start_bin1 - end_bin1)
    if abs_dim1 < width:
        end_bin1 += width - abs_dim1
        abs_dim1 = width

    abs_dim2 = abs(start_bin2 - end_bin2)
    if abs_dim2 < height:
        end_bin2 += height - abs_dim2
        abs_dim2 = height

    # Maximum width / height is restricted by the server configuration
    if abs_dim1 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()
    if abs_dim2 > hss.SNIPPET_MAT_MAX_DATA_DIM:
        raise SnippetTooLarge()

    # Finally, adjust for negative values.
    # Since relative bin IDs are adjusted by the start this will lead to a
    # white offset.
    real_start_bin1 = start_bin1 if start_bin1 >= 0 else 0
    real_start_bin2 = start_bin2 if start_bin2 >= 0 else 0

    # Get the data
    data = c.matrix(as_pixels=True, balance=False, max_chunk=np.inf)[
        real_start_bin1:end_bin1, real_start_bin2:end_bin2
    ]

    # Annotate pixels for balancing
    bins = c.bins(convert_enum=False)[['weight']]
    data = cooler.annotate(data, bins, replace=False)

    # Calculate relative bin IDs
    rel_bin1 = np.add(data['bin1_id'].values, -start_bin1)
    rel_bin2 = np.add(data['bin2_id'].values, -start_bin2)

    # Balance counts
    if balanced:
        values = data['count'].values.astype(np.float32)
        values *= data['weight1'].values * data['weight2'].values
    else:
        values = data['count'].values

    # Get pixel IDs for the upper triangle
    idx1 = np.add(np.multiply(rel_bin1, abs_dim1), rel_bin2)

    # Mirror matrix
    idx2_1 = np.add(data['bin2_id'].values, -start_bin1)
    idx2_2 = np.add(data['bin1_id'].values, -start_bin2)
    idx2 = np.add(np.multiply(idx2_1, abs_dim1), idx2_2)
    validBins = np.where((idx2_1 < abs_dim1) & (idx2_2 >= 0))

    # Ignore diagonals
    diags_start_row = None
    if ignore_diags > 0:
        try:
            diags_start_idx = np.min(
                np.where(data['bin1_id'].values == data['bin2_id'].values)
            )
            diags_start_row = (rel_bin1[diags_start_idx]
                               - rel_bin2[diags_start_idx])
        except ValueError:
            pass

    # Copy pixel values onto the final array
    frag_len = abs_dim1 * abs_dim2
    frag = np.zeros(frag_len, dtype=np.float32)
    # Make sure we're within the bounds
    idx1_f = np.where(idx1 < frag_len)
    frag[idx1[idx1_f]] = values[idx1_f]
    frag[idx2[validBins]] = values[validBins]
    frag = frag.reshape((abs_dim1, abs_dim2))

    # Store low quality bins
    low_quality_bins = np.where(np.isnan(frag))

    # Assign 0 for now to avoid influencing the max values
    frag[low_quality_bins] = 0

    # Scale the fragment down if needed
    scaled = False
    scale_x = width / frag.shape[0]
    if frag.shape[0] > width or frag.shape[1] > height:
        scaledFrag = np.zeros((width, height), float)
        frag = scaledFrag + zoomArray(frag, scaledFrag.shape, order=1)
        scaled = True

    # Normalize by the minimum
    if not no_normalize:
        min_val = np.min(frag)
        frag -= min_val

    ignored_idx = None

    # Remove diagonals
    if ignore_diags > 0 and diags_start_row is not None:
        if width == height:
            scaled_row = int(np.rint(diags_start_row / scale_x))
            idx = np.diag_indices(width)
            scaled_idx = (idx if scaled_row == 0
                          else [idx[0][scaled_row:], idx[0][:-scaled_row]])

            for i in range(ignore_diags):
                # First set all cells to be ignored to `-1` so that we can
                # easily query for them later.
                if i == 0:
                    frag[scaled_idx] = -1
                else:
                    dist_to_diag = scaled_row - i
                    dist_neg = min(0, dist_to_diag)
                    off = 0 if dist_to_diag >= 0 else i - scaled_row

                    # Above the diagonal
                    frag[((scaled_idx[0] - i)[off:],
                          (scaled_idx[1])[off:])] = -1

                    # Extra cutoff at the bottom right
                    frag[(range(scaled_idx[0][-1] - i,
                                scaled_idx[0][-1] + 1 + dist_neg),
                          range(scaled_idx[1][-1],
                                scaled_idx[1][-1] + i + 1 + dist_neg))] = -1

                    # Below the diagonal
                    frag[((scaled_idx[0] + i)[:-i],
                          (scaled_idx[1])[:-i])] = -1

            # Save the final selection of ignored cells for fast access later
            # and set those values to `0` now.
            ignored_idx = np.where(frag == -1)
            frag[ignored_idx] = 0
        else:
            logger.warning(
                'Ignoring the diagonal is only supported for square features'
            )

    # Cap at the given percentile
    max_val = np.percentile(frag, percentile)
    frag = np.clip(frag, 0, max_val)

    # Normalize by the maximum
    if not no_normalize and max_val > 0:
        frag /= max_val

    # Set the ignored diagonal to the maximum
    if ignored_idx is not None:
        frag[ignored_idx] = 1.0

    if not scaled:
        # Recover low quality bins
        frag[low_quality_bins] = -1

    return frag
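

# Usage sketch for get_frag. The file path is hypothetical. The offsets
# Series maps each chromosome name to the index of its first genome-wide
# bin, which can be built from Cooler's own offset() lookup.
def _demo_get_frag():
    c = cooler.Cooler('examples/matrix.cool')
    offsets = pd.Series({name: c.offset(name) for name in c.chromnames})
    frag = get_frag(c, c.binsize, offsets,
                    chrom1='chr1', start1=1000000, end1=2000000,
                    chrom2='chr1', start2=1000000, end2=2000000,
                    width=64, ignore_diags=1)
    # frag is a 64 x 64 float array, normalized to [0, 1] by default.
    print(frag.shape)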


# Earlier variant of hicrepSCC without the chrNames/excludeChr parameters:
# it computes SCC for every chromosome except the mitochondrial
# chromosome 'M'.
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler,
              h: int, dBPMax: int, bDownSample: bool):
    """Compute the HiCRep SCC score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices
        dBPMax: `int` Only include contacts that are at most this genomic
        distance (bp) away
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input

    Returns:
        `np.ndarray` of SCC scores, one per chromosome
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different numbers of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different numbers of chromosomes"
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all()[0],\
        "Input files have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # Sometimes the bin size can be None, e.g., when the input cool file
        # has non-uniform bins.
        assert np.all(bins1[:] == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use the median bin size from the first cooler "
                      "file to determine the maximal diagonal index to "
                      "include", RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPMax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # Get the total number of contacts as the normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    chrNames = cool1.chroms()[:]['name'].to_numpy()
    # Filter out the mitochondrial chromosome
    chrNames = np.array([name for name in chrNames if name != 'M'])
    scc = np.full(chrNames.shape[0], -2.0)
    for iChr in range(chrNames.shape[0]):
        chrName = chrNames[iChr]
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            "Contact matrix 1 of chromosome %s is empty" % (chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            "Contact matrix 2 of chromosome %s is empty" % (chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different shapes" % (chrName)
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # Remove the major diagonal and all diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        del mS1
        del mS2
        if bDownSample:
            # Down sample the input with more contacts to match the other
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # Just normalize by the total number of contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # Smooth the matrices with a (2h + 1) x (2h + 1) mean filter
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc
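

# Illustration of the smoothing controlled by h. This is not the actual
# meanFilterSparse implementation (which operates on sparse matrices and may
# treat boundaries differently); it only shows the (2h + 1) x (2h + 1) mean
# filter that h parameterizes, applied to a small dense toy matrix.
def _demo_mean_filter():
    from scipy.ndimage import uniform_filter
    h = 1
    m = np.arange(25, dtype=float).reshape(5, 5)
    # Each output entry is the mean over a (2h + 1) x (2h + 1) neighborhood.
    smoothed = uniform_filter(m, size=2 * h + 1, mode='constant')
    print(smoothed)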