def sparse_kl_divergence(p, q):
    """Compute the Kullback-Leibler (KL) divergence between two sparse vectors.

    Parameters
    ----------
    p : scipy.sparse.csr_matrix (with 1 row)
        "Ideal"/"true" probability distribution.
    q : scipy.sparse.csr_matrix (with 1 row)
        Approximation of probability distribution p.

    Returns
    -------
    kl : float
        KL divergence of approximating p with the distribution q.
    """
    # Get indices that appear in both sparse matrices (assumes both are sorted).
    p_idx = su.searchsorted(p.indices, q.indices)
    q_idx = su.searchsorted(q.indices, p.indices)
    # Keep only the entries where BOTH matrices have a value.
    p_val = p.data[p_idx]
    q_val = q.data[q_idx]
    kl = np.sum(p_val * np.log2(p_val / q_val))
    return kl
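# A minimal usage sketch (not part of the original module). It assumes the
# documented interface above: p and q are 1-row CSR probability vectors and
# su.searchsorted returns, for each index shared by both vectors, its position
# in the first argument's index array. The numbers below are illustrative only.
def _example_sparse_kl_divergence():
    import numpy as np
    from scipy.sparse import csr_matrix

    p = csr_matrix(np.array([[0.5, 0.0, 0.3, 0.2]]))
    q = csr_matrix(np.array([[0.4, 0.1, 0.3, 0.2]]))
    # q is non-zero everywhere p is, so the sum over shared indices
    # equals the full KL divergence: 0.5 * log2(0.5 / 0.4) ~= 0.161 bits.
    return sparse_kl_divergence(p, q)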
def test_searchsorted_all2all():
    """ Shouldn't miss any entry when the query equals the target. """
    for i in range(20):
        sample = np.sort(
            np.random.choice(np.arange(0, 1000), 100, replace=False))
        expected = np.arange(0, 100)
        answer = su.searchsorted(sample, sample)
        print(answer)
        assert np.array_equal(answer, expected)
def test_searchsorted_subset():
    """ When the query is a subset of the target. """
    for i in range(20):
        sample = np.sort(
            np.random.choice(np.arange(0, 1000), 100, replace=False))
        remove = np.random.choice(np.arange(0, 100), 10, replace=False)
        subsample = np.delete(sample, remove)
        expected = np.delete(np.arange(0, 100), remove)
        answer = su.searchsorted(sample, subsample)
        assert np.array_equal(answer, expected)
def _calculate_re_vectorize(array0, array1, array2):
    """ Calculate relative entropy using vectorized operations.

    Args:
        array0: Main (observed) k-mer count array.
        array1: (k-1)-order k-mer count array.
        array2: (k-2)-order k-mer count array.

    Returns:
        float: Relative entropy of the observed k-mer distribution against
        the expectation built from the two lower-order arrays.
    """

    def _calculate_limit(ksize):
        # Bin boundaries used to work out how much to trim from a k-mer hash.
        A_lim = kmerutil.encode("A" + ("T" * (ksize - 1)))[0]
        C_lim = int((A_lim * 2) + 1)
        G_lim = int((A_lim * 3) + 2)
        return np.array([A_lim + 1, C_lim + 1, G_lim + 1])

    # Infer the k-mer size from the array width needed to hold all (k-1)-mers.
    ksize = int(math.log(array1.shape[1], 4)) + 1

    # Gather counts for each level.
    ARes = array0.data  # Observed k-mer counts.

    # All front (k-1)-mers.
    bins = _calculate_limit(ksize)  # Currently computed but not used below.
    fHash = kmerutil.trimFront(array0.indices, ksize)
    idx = np.argsort(fHash)
    inv_idx = np.argsort(idx)
    fIdx_sorted = su.searchsorted(array1.indices, fHash[idx])
    fIdx = fIdx_sorted[inv_idx]
    FRes = array1.data[fIdx]

    # All back (k-1)-mers.
    bHash = kmerutil.trimBack(array0.indices)
    bIdx = su.searchsorted(array1.indices, bHash)
    BRes = array1.data[bIdx]

    # All middle (k-2)-mers: reuse the front hash and trim its back.
    # convertMidVec = np.vectorize(lambda khash: _convert_to_middle(khash, ksize))
    # mHash = convertMidVec(array0.indices)
    mHash = kmerutil.trimBack(fHash)
    idx = np.argsort(mHash)
    inv_idx = np.argsort(idx)
    mIdx_sorted = su.searchsorted(array2.indices, mHash[idx])
    mIdx = mIdx_sorted[inv_idx]
    MRes = array2.data[mIdx]

    # Sanity check: every observed k-mer must have matching lower-order counts.
    assert len(ARes) == len(FRes)
    assert len(FRes) == len(BRes)
    assert len(BRes) == len(MRes)

    # Calculate using a factorized version of the formula.
    norm0 = array0.data.sum()
    norm1 = array1.data.sum()
    norm2 = array2.data.sum()
    expectation = (FRes * BRes) / MRes
    observation = ARes
    normFactor = (norm1**2) / (norm2 * norm0)
    rhs = np.log2(observation / expectation) + np.log2(normFactor)
    lhs = ARes / norm0
    relativeEntropy = (lhs * rhs).sum()

    # Version which follows the formula as written in the paper.
    # Roughly the same speed as the factorized version above.
    # norm0 = array0.data.sum()
    # norm1 = array1.data.sum()
    # norm2 = array2.data.sum()
    # expectation = (FRes / norm1) * (BRes / norm1) / (MRes / norm2)
    # observation = ARes / norm0
    # relativeEntropy = (observation * np.log2(observation / expectation)).sum()
    return relativeEntropy
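# A small numerical sketch (assumed helper, not part of the original module)
# checking that the factorized expression used in _calculate_re_vectorize
# matches the paper-style form kept in the comments above:
#
#   RE = sum_w (A_w / N0) * log2( (A_w / N0) / ((F_w / N1) * (B_w / N1) / (M_w / N2)) )
#      = sum_w (A_w / N0) * [ log2(A_w * M_w / (F_w * B_w)) + log2(N1**2 / (N2 * N0)) ]
#
# Here A/F/B/M play the roles of ARes/FRes/BRes/MRes, and N0/N1/N2 the norms.
# The count values are illustrative only.
def _check_re_factorization():
    import numpy as np

    A = np.array([4.0, 2.0, 6.0])
    F = np.array([5.0, 3.0, 7.0])
    B = np.array([6.0, 2.0, 8.0])
    M = np.array([7.0, 4.0, 9.0])
    N0, N1, N2 = A.sum(), 30.0, 25.0  # totals of the full arrays, not just the aligned slices
    paper = ((A / N0) * np.log2((A / N0) / ((F / N1) * (B / N1) / (M / N2)))).sum()
    factored = ((A / N0) * (np.log2(A * M / (F * B)) + np.log2(N1**2 / (N2 * N0)))).sum()
    assert np.isclose(paper, factored)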
def test_searchsorted():
    test_1 = np.array([1, 3, 5, 7, 9])
    test_2 = np.array([3, 6, 9])
    assert np.array_equal(su.searchsorted(test_1, test_2), [1, 4])
    assert np.array_equal(su.searchsorted(test_2, test_1), [0, 2])
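# A reference sketch of the behaviour these tests assume for su.searchsorted.
# This is inferred from the assertions above, not the project's actual
# implementation: given two sorted integer arrays, return the positions in
# `target` of the query elements that are present, silently skipping misses
# (e.g. 6 above). The name _searchsorted_reference is hypothetical.
def _searchsorted_reference(target, query):
    import numpy as np

    pos = np.searchsorted(target, query)   # candidate insertion points
    valid = pos < len(target)              # drop queries past the last element
    pos, q = pos[valid], query[valid]
    return pos[target[pos] == q]           # keep only exact matches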