예제 #1
0
def coolerInfo(cool: cooler.api.Cooler, k: str):
    """Look up a single metadata field of a Cooler file

    The standard metadata fields are described in:
    https://cooler.readthedocs.io/en/latest/schema.html#metadata

    A value stored under key `k` in `cool.info` always takes precedence.
    When the key is not stored, a small set of fields is recomputed from
    the Cooler tables instead:

    * 'sum': sum of the 'count' column of the pixel table
    * 'nbins': number of rows of the bin table
    * 'nnz': number of rows of the pixel table
    * 'nchroms': number of rows of the chromosome table

    Args:
        cool (cooler.api.Cooler): Input Cooler object
        k (str): Key of the metadata field
    Returns: Requested metadata
    Raises:
        KeyError: `k` is neither stored in `cool.info` nor one of the
        fields that can be recomputed
    """
    # Stored metadata wins over anything we could recompute
    if k in cool.info:
        return cool.info[k]
    # Fallbacks are wrapped in lambdas so only the requested table is read
    fallbacks = {
        'sum': lambda: cool.pixels()['count'][:].sum(),
        'nbins': lambda: cool.bins().shape[0],
        'nnz': lambda: cool.pixels().shape[0],
        'nchroms': lambda: cool.chroms().shape[0],
    }
    compute = fallbacks.get(k)
    if compute is None:
        raise KeyError(f'Unable to retrieve metadata field \'{k}\'')
    return compute()
예제 #2
0
def hicrepSCC(cool1: cooler.api.Cooler,
              cool2: cooler.api.Cooler,
              h: int,
              dBPMax: int,
              bDownSample: bool,
              chrNames: list = None,
              excludeChr: set = None):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices. Smoothing is skipped when `h` is not positive
        dBPMax: `int` Only include contacts that are at most this genomic
        distance (bp) away. The special value -1 lifts the limit, i.e.,
        the number of bins of `cool1` is used as the (exclusive) upper
        bound of the diagonal index
        bDownSample: `bool` If True, down sample the per-chromosome input
        with more contacts to the same number of contacts as in the other
        input. If False, each matrix is instead divided by the total
        number of contacts of its cooler file
        chrNames: `list` List of chromosome names whose SCC to
        compute. Default to None, which means all chromosomes in the
        genome are used to compute SCC. Must not contain duplicates
        excludeChr: `set` Set of chromosome names to exclude from SCC
        computation. Default to None, which excludes nothing.

    Returns:
        `numpy.ndarray` of `float` scc scores, one per chromosome, in the
        order of `chrNames` (or of the chromosomes of `cool1` when
        `chrNames` is None) with the excluded chromosomes removed

    Raises:
        AssertionError: the two inputs are inconsistent (bin size, number
        of bins, number or names of chromosomes, bin definitions), `dBPMax`
        is smaller than the bin size, `chrNames` has duplicates, or a
        per-chromosome contact matrix is empty, non-square or of mismatched
        shape. NOTE: these checks are `assert` statements and hence are
        skipped when Python runs with `-O`.
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    nBins1 = coolerInfo(cool1, 'nbins')
    assert nBins1 == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # `.all()` reduces the element-wise comparison to one boolean per column,
    # indexed by column label. Pick the first column by position explicitly:
    # plain `[0]` on a label-indexed Series is deprecated in pandas.
    # NOTE(review): the first column is presumably the chromosome name as
    # in the cooler schema -- confirm
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all().iloc[0],\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        assert np.all(bins1[:] == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use median bin size from the first cooler file "
                      "to determine maximal diagonal index to include",
                      RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = nBins1
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # get the total number of contacts as normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    # Use dict here so that the chrNames don't duplicate
    if chrNames is None:
        chrNamesDict = dict.fromkeys(cool1.chroms()[:]['name'].tolist())
    else:
        chrNamesDict = dict.fromkeys(chrNames)
    # It's important to preserve the order of the input chrNames so that the
    # user knows the order of the output SCC scores so we bail when encounter
    # duplicate names rather than implicit pruning the names.
    assert chrNames is None or len(chrNamesDict) == len(chrNames), f"""
        Found Duplicates in {chrNames}. Please remove them.
        """
    # filter out excluded chromosomes
    if excludeChr is None:
        excludeChr = set()
    chrNames = [
        chrName for chrName in chrNamesDict if chrName not in excludeChr
    ]
    # -2.0 is the initial fill value; every entry is overwritten in the loop
    scc = np.full(len(chrNames), -2.0)
    for iChr, chrName in enumerate(chrNames):
        # fetch the contact matrix of this chromosome from each input and
        # make sure the two are comparable
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            f"Contact matrix 1 of chromosome {chrName} is empty"
        assert mS1.shape[0] == mS1.shape[1],\
            f"Contact matrix 1 of chromosome {chrName} is not square"
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            f"Contact matrix 2 of chromosome {chrName} is empty"
        assert mS2.shape[0] == mS2.shape[1],\
            f"Contact matrix 2 of chromosome {chrName} is not square"
        assert mS1.shape == mS2.shape,\
            f"Contact matrices of chromosome {chrName} have different input shape"
        # never ask for more diagonals than the chromosome matrix has
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        # the untrimmed matrices are no longer needed; release them early
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: the matrix with the larger number of contacts
            # is resampled to the number of contacts of the other one
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc
예제 #3
0
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler, h: int,
              dBPMax: int, bDownSample: bool):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices. Smoothing is skipped when `h` is not positive
        dBPMax: `int` Only include contacts that are at most this genomic
        distance (bp) away. The special value -1 lifts the limit, i.e.,
        the number of bins of `cool1` is used as the (exclusive) upper
        bound of the diagonal index
        bDownSample: `bool` If True, down sample the per-chromosome input
        with more contacts to the same number of contacts as in the other
        input. If False, each matrix is instead divided by the total
        number of contacts of its cooler file

    Returns:
        `numpy.ndarray` of `float` scc scores, one per chromosome, in the
        order of the chromosomes of `cool1` with the chromosome named 'M'
        left out

    Raises:
        AssertionError: the two inputs are inconsistent (bin size, number
        of bins, number or names of chromosomes, bin definitions), `dBPMax`
        is smaller than the bin size, or a per-chromosome contact matrix is
        empty, non-square or of mismatched shape. NOTE: these checks are
        `assert` statements and hence are skipped when Python runs with
        `-O`.
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    nBins1 = coolerInfo(cool1, 'nbins')
    assert nBins1 == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # `.all()` reduces the element-wise comparison to one boolean per column,
    # indexed by column label. Pick the first column by position explicitly:
    # plain `[0]` on a label-indexed Series is deprecated in pandas.
    # NOTE(review): the first column is presumably the chromosome name as
    # in the cooler schema -- confirm
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all().iloc[0],\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        assert np.all(bins1[:] == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((bins1[:]["end"] - bins1[:]["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "
                      "likely because non-uniform bin size was used. HicRep "
                      "will use median bin size from the first cooler file "
                      "to determine maximal diagonal index to include",
                      RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = nBins1
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # get the total number of contacts as normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    chrNames = cool1.chroms()[:]['name'].to_numpy()
    # filter out mitochondria chromosome
    # NOTE: only the exact name 'M' is matched; other spellings such as
    # 'chrM' or 'MT' are not filtered out here
    chrNames = np.array([name for name in chrNames if name != 'M'])
    # -2.0 is the initial fill value; every entry is overwritten in the loop
    scc = np.full(chrNames.shape[0], -2.0)
    for iChr, chrName in enumerate(chrNames):
        # fetch the contact matrix of this chromosome from each input and
        # make sure the two are comparable
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0,\
            f"Contact matrix 1 of chromosome {chrName} is empty"
        assert mS1.shape[0] == mS1.shape[1],\
            f"Contact matrix 1 of chromosome {chrName} is not square"
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0,\
            f"Contact matrix 2 of chromosome {chrName} is empty"
        assert mS2.shape[0] == mS2.shape[1],\
            f"Contact matrix 2 of chromosome {chrName} is not square"
        assert mS1.shape == mS2.shape,\
            f"Contact matrices of chromosome {chrName} have different input shape"
        # never ask for more diagonals than the chromosome matrix has
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        # the untrimmed matrices are no longer needed; release them early
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: the matrix with the larger number of contacts
            # is resampled to the number of contacts of the other one
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc