Example #1
def sparse_kl_divergence(p, q):
    """Compute the Kullback-Leibler (KL) divergence on sparse matrix.

    Parameters
    ----------
    p : scipy csr_matrix (with 1 row)
        "Ideal"/"true" Probability distribution
    q : scipy csr_matrix (with 1 row)
        Approximation of probability distribution p

    Returns
    -------
    kl : float
        KL divergence of approximating p with the distribution q
    """
    # Find the positions of the indices that appear in both sparse matrices
    # (assumes both index arrays are sorted).
    p_idx = su.searchsorted(p.indices, q.indices)
    q_idx = su.searchsorted(q.indices, p.indices)

    # Keep only the entries where BOTH distributions have a value.
    p_val = p.data[p_idx]
    q_val = q.data[q_idx]

    kl = np.sum(p_val * np.log2(p_val / q_val))

    return kl
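
A minimal usage sketch for the function above. The `su.searchsorted` calls appear to return, for each query value present in the target array, its position in the target (the tests below pin this contract down); here `np.intersect1d` is used as a stand-in to show the same computation on hypothetical data:

import numpy as np
from scipy.sparse import csr_matrix

# Two hypothetical 1-row sparse distributions over 6 outcomes.
p = csr_matrix(np.array([[0.5, 0.0, 0.3, 0.0, 0.2, 0.0]]))
q = csr_matrix(np.array([[0.4, 0.0, 0.4, 0.1, 0.1, 0.0]]))

# Shared support is columns 0, 2, and 4; the divergence is computed there.
_, p_pos, q_pos = np.intersect1d(p.indices, q.indices,
                                 assume_unique=True, return_indices=True)
p_val, q_val = p.data[p_pos], q.data[q_pos]
kl = np.sum(p_val * np.log2(p_val / q_val))  # same value sparse_kl_divergence(p, q) returns
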
Example #2
def test_searchsorted_all2all():
    """ Shouldn't miss any
    """
    for _ in range(20):
        sample = np.sort(
            np.random.choice(np.arange(0, 1000), 100, replace=False))
        expected = np.arange(0, 100)
        answer = su.searchsorted(sample, sample)
        assert np.array_equal(answer, expected)
Example #3
def test_searchsorted_subset():
    """ When query is a subset of the target
    """
    for _ in range(20):
        sample = np.sort(
            np.random.choice(np.arange(0, 1000), 100, replace=False))
        remove = np.random.choice(np.arange(0, 100), 10, replace=False)
        subsample = np.delete(sample, remove)
        expected = np.delete(np.arange(0, 100), remove)
        answer = su.searchsorted(sample, subsample)
        assert np.array_equal(answer, expected)
Example #4
def _calculate_re_vectorize(array0, array1, array2):
    """ Calculate relative entropy using vectorize

    Args:
        array1 (TODO): Main array
        array2 (TODO): -1 order array
        array3 (TODO): -2 order array

    Returns: TODO

    """
    def _calculate_limit(ksize):
        # Bin boundaries between hashes whose k-mers start with A, C, G,
        # and T (this assumes a 2-bit A < C < G < T encoding in kmerutil).
        A_lim = kmerutil.encode("A" + ("T" * (ksize - 1)))[0]
        C_lim = int((A_lim * 2) + 1)
        G_lim = int((A_lim * 3) + 2)
        return np.array([A_lim + 1, C_lim + 1, G_lim + 1])

    # Recover k from the array width: array1 has one column per (k-1)-mer,
    # i.e. 4**(k-1) columns, so log base 4 of the width plus one gives k.
    ksize = int(math.log(array1.shape[1], 4)) + 1

    # Gather, for every observed k-mer, the matching counts at each level.
    ARes = array0.data  # Observed k-mer counts
    # All front-trimmed (k-1)-mers. Trimming the front base does not preserve
    # sort order, so sort the hashes, look them up, then undo the permutation.
    bins = _calculate_limit(ksize)
    fHash = kmerutil.trimFront(array0.indices, ksize)
    idx = np.argsort(fHash)
    inv_idx = np.argsort(idx)
    fIdx_sorted = su.searchsorted(array1.indices, fHash[idx])
    fIdx = fIdx_sorted[inv_idx]
    FRes = array1.data[fIdx]
    # All back-trimmed (k-1)-mers. Trimming the back base preserves sort
    # order, so no re-sorting is needed here.
    bHash = kmerutil.trimBack(array0.indices)
    bIdx = su.searchsorted(array1.indices, bHash)
    BRes = array1.data[bIdx]

    # All middle (k-2)-mers: trim the back base off the already
    # front-trimmed hashes.
    # convertMidVec = np.vectorize(lambda khash:_convert_to_middle(khash, ksize))
    # mHash = convertMidVec(array0.indices)
    mHash = kmerutil.trimBack(fHash)
    idx = np.argsort(mHash)
    inv_idx = np.argsort(idx)
    mIdx_sorted = su.searchsorted(array2.indices, mHash[idx])
    mIdx = mIdx_sorted[inv_idx]
    MRes = array2.data[mIdx]

    # Sanity check: every level yielded exactly one count per observed k-mer.
    assert len(ARes) == len(FRes)
    assert len(FRes) == len(BRes)
    assert len(BRes) == len(MRes)

    # Compute a factored version of the formula (normalizations pulled out of the log).
    norm0 = array0.data.sum()
    norm1 = array1.data.sum()
    norm2 = array2.data.sum()
    expectation = (FRes * BRes) / MRes
    observation = ARes
    normFactor = (norm1**2) / (norm2 * norm0)
    rhs = np.log2(observation / expectation) + np.log2(normFactor)
    lhs = ARes / norm0
    relativeEntropy = (lhs * rhs).sum()

    ## Version that follows the formula as written in the paper; roughly the
    ## same speed as the factored version above.
    # norm0 = array0.data.sum()
    # norm1 = array1.data.sum()
    # norm2 = array2.data.sum()
    # expectation = (FRes/norm1) * (BRes/norm1) / (MRes/norm2)
    # observation = ARes / norm0
    # relativeEntropy = (observation * np.log2(observation/expectation)).sum()

    return relativeEntropy
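
The factored and paper-style forms are algebraically identical: pulling the normalizations out of the logarithm turns (obs/norm0) / ((FRes/norm1) * (BRes/norm1) / (MRes/norm2)) into (obs / (FRes * BRes / MRes)) * (norm1**2 / (norm2 * norm0)), which is exactly what `rhs` computes. A quick sanity check on hypothetical positive values (not data from the source):

import numpy as np

rng = np.random.default_rng(0)
ARes, FRes, BRes, MRes = (rng.integers(1, 100, 50).astype(float)
                          for _ in range(4))
norm0, norm1, norm2 = 1000.0, 900.0, 800.0

# Factored version, as in the function above.
expectation = FRes * BRes / MRes
factored = (ARes / norm0 * (np.log2(ARes / expectation)
                            + np.log2(norm1**2 / (norm2 * norm0)))).sum()

# Direct, paper-style version.
obs = ARes / norm0
exp_ = (FRes / norm1) * (BRes / norm1) / (MRes / norm2)
direct = (obs * np.log2(obs / exp_)).sum()

assert np.isclose(factored, direct)
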
Example #5
def test_searchsorted():
    test_1 = np.array([1, 3, 5, 7, 9])
    test_2 = np.array([3, 6, 9])
    assert np.array_equal(su.searchsorted(test_1, test_2), [1, 4])
    assert np.array_equal(su.searchsorted(test_2, test_1), [0, 2])
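
Taken together, the tests pin down the contract: `su.searchsorted(target, query)` returns, for each query value that is present in the sorted, duplicate-free target, its position in the target, silently dropping query values that are absent. A minimal NumPy sketch with that behavior (an assumption reconstructed from the tests, not `su`'s actual implementation):

import numpy as np

def searchsorted(target, query):
    """Positions in `target` of the `query` values that occur in it.

    Both arrays are assumed sorted and free of duplicates; query values
    missing from `target` are dropped, matching the tests above.
    """
    pos = np.searchsorted(target, query)    # candidate insertion points
    pos = np.clip(pos, 0, len(target) - 1)  # guard against out-of-range hits
    hit = target[pos] == query              # keep only exact matches
    return pos[hit]

# Reproduces Example #5:
t1, t2 = np.array([1, 3, 5, 7, 9]), np.array([3, 6, 9])
assert np.array_equal(searchsorted(t1, t2), [1, 4])
assert np.array_equal(searchsorted(t2, t1), [0, 2])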