예제 #1
0
def testFlyHiC():
    """Check coolerInfo() against values read directly from two fly Hi-C coolers.

    For each metadata field, the value reported by coolerInfo() must equal
    the one derived from the cooler object itself (or, for 'bin-size', the
    bin size handed back by readMcool()).
    """
    paths = (
        "tests/data/fly_hi-c/4DNFI8DRD739_bin100kb.cool",
        "tests/data/fly_hi-c/4DNFIZ1ZVXC8_bin100kb.cool",
    )
    # Load both inputs before running any check
    loaded = []
    for path in paths:
        clr, binSize = readMcool(path, -1)
        loaded.append((path, clr, binSize))

    # Metadata field -> how to obtain the reference value for one input
    referenceOf = {
        'bin-size': lambda clr, binSize: binSize,
        'sum': lambda clr, binSize: clr.pixels()['count'][:].sum(),
        'nbins': lambda clr, binSize: clr.bins().shape[0],
        'nnz': lambda clr, binSize: clr.pixels().shape[0],
        'nchroms': lambda clr, binSize: clr.chroms().shape[0],
    }

    # Check various .info() fields for consistency, field by field and,
    # within each field, file by file
    for field, getReference in referenceOf.items():
        for path, clr, binSize in loaded:
            assert coolerInfo(clr, field) == getReference(clr, binSize),\
                f"coolerInfo() failed to retrieve metadata '{field}' from {path}"
예제 #2
0
def testHumanHiCInfo():
    """Check coolerInfo() against values read directly from two human Hi-C coolers.

    Each metadata field returned by coolerInfo() is compared with the same
    quantity computed from the cooler object (or, for 'bin-size', with the
    bin size returned by readMcool()).
    """
    inputs = []
    for fmcool in ("tests/data/human_hi-c/4DNFITKCX2DO.cool",
                   "tests/data/human_hi-c/4DNFIQ5XCHDB.cool"):
        cool, binSize = readMcool(fmcool, -1)
        inputs.append((fmcool, cool, binSize))

    # (field name, function producing the reference value), in check order
    checks = (
        ('bin-size', lambda cool, binSize: binSize),
        ('sum', lambda cool, binSize: cool.pixels()['count'][:].sum()),
        ('nbins', lambda cool, binSize: cool.bins().shape[0]),
        ('nnz', lambda cool, binSize: cool.pixels().shape[0]),
        ('nchroms', lambda cool, binSize: cool.chroms().shape[0]),
    )

    # Check various .info() fields for consistency
    for key, reference in checks:
        for fmcool, cool, binSize in inputs:
            reported = coolerInfo(cool, key)
            assert reported == reference(cool, binSize),\
                f"coolerInfo() failed to retrieve metadata '{key}' from {fmcool}"
예제 #3
0
def testFlyHiC():
    fmcool1 = "tests/data/fly_hi-c/4DNFI8DRD739_bin100kb.cool"
    fmcool2 = "tests/data/fly_hi-c/4DNFIZ1ZVXC8_bin100kb.cool"
    binSize = -1
    h = 1
    dBPMax = 500000
    bDownSample = False
    cool1, binSize1 = readMcool(fmcool1, binSize)
    cool2, binSize2 = readMcool(fmcool2, binSize)
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        f"Input cool files {fmcool1} and {fmcool2} have different number of bins"
    assert binSize1 == binSize2,\
        f"Input cool files {fmcool1} and {fmcool2} have different bin sizes"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        f"Input cool files {fmcool1} and {fmcool2} have different number of chromosomes"
    assert (cool1.chroms()[:] == cool2.chroms()[:]).all()[0],\
        f"Input file {fmcool1} and {fmcool2} have different chromosome names"
    results = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
    expected = np.array([
        9.936753824600870e-01, 9.950138992224218e-01, 9.951519844417879e-01,
        9.935973973292749e-01, 9.933660605077106e-01, 9.927681695925705e-01,
        6.238132870270471e-01
    ])
    assert np.isclose(results, expected).all()

    # Test the computation of a subset of chromosomes give the same results as
    # the whole set
    chrNames = ['chr2L', 'chr2R', 'chrX']
    resultsSub = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample, chrNames)
    chrNamesAll = cool1.chroms()[:]['name'].tolist()
    chrNamesAll = [name for name in chrNamesAll if name != 'M']
    iChrs = np.where(np.isin(chrNamesAll, chrNames))[0]
    assert (results[iChrs] == resultsSub).all(), f"""
        SCC scores between {fmcool1} and {fmcool2} on chromosome subset
        {chrNames} differ from those computed from the whole set. The whole
        genome results are: {results} and the subset indices are {iChrs}.
        """

    # Test the computation when excluding chromosomes give the same results as
    # the whole set
    exclNames = set(['chr2L', 'chr2R', 'chrX'])
    resultsExcl = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample, chrNamesAll,
                            exclNames)
    chrNamesRemain = [name for name in chrNamesAll if name not in exclNames]
    ieChrs = np.where(np.isin(chrNamesAll, chrNamesRemain))[0]
    assert (results[ieChrs] == resultsExcl).all(), f"""
예제 #4
0
def testFlyHiC():
    """Regression test of hicrepSCC() on two fly Hi-C coolers.

    Verifies that the two inputs are comparable (same number of bins, bin
    size, number of chromosomes and chromosome names) and that the SCC
    scores match previously recorded reference values.
    """
    fmcool1 = "tests/data/fly_hi-c/4DNFI8DRD739_bin100kb.cool"
    fmcool2 = "tests/data/fly_hi-c/4DNFIZ1ZVXC8_bin100kb.cool"
    binSize = -1
    h = 1
    dBPMax = 500000
    bDownSample = False
    cool1, binSize1 = readMcool(fmcool1, binSize)
    cool2, binSize2 = readMcool(fmcool2, binSize)
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        f"Input cool files {fmcool1} and {fmcool2} have different number of bins"
    assert binSize1 == binSize2,\
        f"Input cool files {fmcool1} and {fmcool2} have different bin sizes"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        f"Input cool files {fmcool1} and {fmcool2} have different number of chromosomes"
    # Compare the chromosome names by column label. Indexing the result of
    # DataFrame.all() with the integer 0 relies on pandas' deprecated
    # positional fallback for label-indexed Series.
    names1 = cool1.chroms()[:]['name']
    names2 = cool2.chroms()[:]['name']
    assert (names1 == names2).all(),\
        f"Input file {fmcool1} and {fmcool2} have different chromosome names"
    results = hicrepSCC(cool1, cool2, h, dBPMax, bDownSample)
    # Reference scores, one per entry returned by hicrepSCC()
    expected = np.array([
        9.936753824600870e-01, 9.950138992224218e-01, 9.951519844417879e-01,
        9.935973973292749e-01, 9.933660605077106e-01, 9.927681695925705e-01,
        6.238132870270471e-01
    ])
    assert np.isclose(results, expected).all()
예제 #5
0
def hicrepSCC(cool1: cooler.api.Cooler,
              cool2: cooler.api.Cooler,
              h: int,
              dBPMax: int,
              bDownSample: bool,
              chrNames: list = None,
              excludeChr: set = None):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices. Smoothing is skipped when h <= 0
        dBPMax `int` Only include contacts that are at most this genomic
        distance (bp) away. -1 means the number of bins of `cool1` is used
        as the (exclusive) upper bound on the diagonal index
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input. When False,
        each matrix is instead divided by its total number of contacts
        chrNames: `list` List of chromosome names whose SCC to
        compute. Default to None, which means all chromosomes in the
        genome are used to compute SCC. Must not contain duplicates
        excludeChr: `set` Set of chromosome names to exclude from SCC
        computation. Default to None.

    Returns:
        `numpy.ndarray` of float SCC scores, one per chromosome, in the
        order of `chrNames` (or of the chromosomes of `cool1` when
        `chrNames` is None) after removing the excluded chromosomes

    Raises:
        AssertionError: if the two inputs are not comparable (bin size,
        number of bins, number or names of chromosomes), if `dBPMax` is
        smaller than the bin size, if `chrNames` has duplicates, or if a
        per-chromosome contact matrix is empty, non-square or of a
        different shape in the two inputs
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # Compare chromosome names by column label; indexing the result of
    # DataFrame.all() with the integer 0 relies on pandas' deprecated
    # positional fallback for label-indexed Series.
    chromNames1 = cool1.chroms()[:]['name']
    chromNames2 = cool2.chroms()[:]['name']
    assert (chromNames1 == chromNames2).all(),\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        binTable1 = bins1[:]
        assert np.all(binTable1 == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((binTable1["end"] - binTable1["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "\
                      "likely because non-uniform bin size was used. HicRep "\
                      "will use median bin size from the first cooler file "\
                      "to determine maximal diagonal index to include", RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # get the total number of contacts as normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    # Use dict here so that the chrNames don't duplicate
    if chrNames is None:
        chrNamesDict = dict.fromkeys(chromNames1.tolist())
    else:
        chrNamesDict = dict.fromkeys(chrNames)
    # It's important to preserve the order of the input chrNames so that the
    # user knows the order of the output SCC scores so we bail when encounter
    # duplicate names rather than implicitly pruning the names.
    assert chrNames is None or len(chrNamesDict) == len(chrNames), f"""
        Found Duplicates in {chrNames}. Please remove them.
        """
    # filter out excluded chromosomes
    if excludeChr is None:
        excludeChr = set()
    chrNames = [
        chrName for chrName in chrNamesDict if chrName not in excludeChr
    ]
    # -2.0 marks entries that have not been computed yet
    scc = np.full(len(chrNames), -2.0)
    for iChr, chrName in enumerate(chrNames):
        # extract the per-chromosome contact matrices and validate them
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0, "Contact matrix 1 of chromosome %s is empty" % (
            chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0, "Contact matrix 2 of chromosome %s is empty" % (
            chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different input shape" % (chrName)
        # never ask for more diagonals than the matrix has
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        # release the untrimmed matrices before further processing
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: only the input with more contacts is resampled
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc
예제 #6
0
def hicrepSCC(cool1: cooler.api.Cooler, cool2: cooler.api.Cooler, h: int,
              dBPMax: int, bDownSample: bool):
    """Compute hicrep score between two input Cooler contact matrices

    Args:
        cool1: `cooler.api.Cooler` Input Cooler contact matrix 1
        cool2: `cooler.api.Cooler` Input Cooler contact matrix 2
        h: `int` Half-size of the mean filter used to smooth the
        input matrices. Smoothing is skipped when h <= 0
        dBPMax `int` Only include contacts that are at most this genomic
        distance (bp) away. -1 means the number of bins of `cool1` is used
        as the (exclusive) upper bound on the diagonal index
        bDownSample: `bool` Down sample the input with more contacts
        to the same number of contacts as in the other input. When False,
        each matrix is instead divided by its total number of contacts

    Returns:
        `numpy.ndarray` of float SCC scores, one per chromosome of `cool1`
        in file order, skipping the chromosome named 'M'

    Raises:
        AssertionError: if the two inputs are not comparable (bin size,
        number of bins, number or names of chromosomes), if `dBPMax` is
        smaller than the bin size, or if a per-chromosome contact matrix is
        empty, non-square or of a different shape in the two inputs
    """
    binSize1 = cool1.binsize
    binSize2 = cool2.binsize
    assert binSize1 == binSize2,\
        "Input cool files have different bin sizes"
    assert coolerInfo(cool1, 'nbins') == coolerInfo(cool2, 'nbins'),\
        "Input cool files have different number of bins"
    assert coolerInfo(cool1, 'nchroms') == coolerInfo(cool2, 'nchroms'),\
        "Input cool files have different number of chromosomes"
    # Compare chromosome names by column label; indexing the result of
    # DataFrame.all() with the integer 0 relies on pandas' deprecated
    # positional fallback for label-indexed Series.
    chromNames1 = cool1.chroms()[:]['name']
    chromNames2 = cool2.chroms()[:]['name']
    assert (chromNames1 == chromNames2).all(),\
        "Input file have different chromosome names"
    binSize = binSize1
    bins1 = cool1.bins()
    bins2 = cool2.bins()
    if binSize is None:
        # sometimes bin size can be None, e.g., input cool file has
        # non-uniform size bins.
        binTable1 = bins1[:]
        assert np.all(binTable1 == bins2[:]),\
            "Input cooler files don't have a unique bin size most likely "\
            "because non-uniform bin size was used and the bins are defined "\
            "differently for the two input cooler files"
        # In that case, use the median bin size
        binSize = int(np.median((binTable1["end"] - binTable1["start"]).values))
        warnings.warn("Input cooler files don't have a unique bin size most "\
                      "likely because non-uniform bin size was used. HicRep "\
                      "will use median bin size from the first cooler file "\
                      "to determine maximal diagonal index to include", RuntimeWarning)
    if dBPMax == -1:
        # this is the exclusive upper bound
        dMax = coolerInfo(cool1, 'nbins')
    else:
        dMax = dBPMax // binSize + 1
    assert dMax > 1, "Input dBPmax is smaller than binSize"
    p1 = cool2pixels(cool1)
    p2 = cool2pixels(cool2)
    # get the total number of contacts as normalizing constant
    n1 = coolerInfo(cool1, 'sum')
    n2 = coolerInfo(cool2, 'sum')
    # filter out mitochondria chromosome
    chrNames = np.array(
        [name for name in chromNames1.to_numpy() if name != 'M'])
    # -2.0 marks entries that have not been computed yet
    scc = np.full(chrNames.shape[0], -2.0)
    for iChr, chrName in enumerate(chrNames):
        # extract the per-chromosome contact matrices and validate them
        mS1 = getSubCoo(p1, bins1, chrName)
        assert mS1.size > 0, "Contact matrix 1 of chromosome %s is empty" % (
            chrName)
        assert mS1.shape[0] == mS1.shape[1],\
            "Contact matrix 1 of chromosome %s is not square" % (chrName)
        mS2 = getSubCoo(p2, bins2, chrName)
        assert mS2.size > 0, "Contact matrix 2 of chromosome %s is empty" % (
            chrName)
        assert mS2.shape[0] == mS2.shape[1],\
            "Contact matrix 2 of chromosome %s is not square" % (chrName)
        assert mS1.shape == mS2.shape,\
            "Contact matrices of chromosome %s have different input shape" % (chrName)
        # never ask for more diagonals than the matrix has
        nDiags = mS1.shape[0] if dMax < 0 else min(dMax, mS1.shape[0])
        # remove major diagonal and all the diagonals >= nDiags
        # to save computation time
        m1 = trimDiags(mS1, nDiags, False)
        m2 = trimDiags(mS2, nDiags, False)
        # release the untrimmed matrices before further processing
        del mS1
        del mS2
        if bDownSample:
            # do downsampling: only the input with more contacts is resampled
            size1 = m1.sum()
            size2 = m2.sum()
            if size1 > size2:
                m1 = resample(m1, size2).astype(float)
            elif size2 > size1:
                m2 = resample(m2, size1).astype(float)
        else:
            # just normalize by total contacts
            m1 = m1.astype(float) / n1
            m2 = m2.astype(float) / n2
        if h > 0:
            # apply smoothing
            m1 = meanFilterSparse(m1, h)
            m2 = meanFilterSparse(m2, h)
        scc[iChr] = sccByDiag(m1, m2, nDiags)
    return scc