def test_whitening_tensor_e2_m1():
    """Compare ``whitening_tensor_e2_m1`` with ``_compute_e2_m1_directly``.

    A random sparse document-word matrix is generated from a fixed seed, the
    whitening matrix is derived from the top eigenpairs of M2, and the
    optimized result is asserted to be almost equal to the directly
    computed one.
    """
    random_state = np.random.RandomState(12)
    n_features = 300
    n_components = 25
    min_count = 3
    alpha0 = 10.

    # The sample count is drawn first so the RNG stream order is preserved.
    n_samples = random_state.randint(100, 150)
    word_counts = random_state.randint(0, 3, size=(n_samples, n_features))
    doc_word_mtx = sp.csr_matrix(word_counts.astype('float'))

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # Build M2 directly as a dense array: scale E2 by (alpha0 + 1), then
    # subtract the alpha0-scaled product of m1 broadcast against itself.
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]

    # Whitening matrix from the n_components eigenpairs returned by eigsh.
    eig_vals, eig_vecs = sp.linalg.eigsh(m2, k=n_components)
    whiten_mtx = whitening(eig_vals, eig_vecs)

    # Optimized method.
    wt_m1 = np.dot(whiten_mtx.T, m1)
    actual = whitening_tensor_e2_m1(wt_m1, alpha0)

    # Reference value computed directly.
    expected = _compute_e2_m1_directly(doc_word_mtx, whiten_mtx, wt_m1)

    assert_array_almost_equal(expected, actual)
def test_whitening():
    """Check the shape and whitening property of the ``whitening`` output.

    M2 is built directly from the first-order moments and the co-occurrence
    expectation of a random document-word matrix. The whitening matrix W
    derived from its top eigenpairs must have shape
    ``(n_features, n_components)`` and satisfy ``W.T * M2 * W ~= I``.
    """
    random_state = np.random.RandomState(1)
    n_features = 500
    n_components = 50
    min_count = 3
    alpha0 = 10.

    # The sample count is drawn first so the RNG stream order is preserved.
    n_samples = random_state.randint(100, 150)
    word_counts = random_state.randint(0, 3, size=(n_samples, n_features))
    doc_word_mtx = sp.csr_matrix(word_counts.astype('float'))

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # Build M2 directly as a dense array: scale E2 by (alpha0 + 1), then
    # subtract the alpha0-scaled product of m1 broadcast against itself.
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]

    # Whitening matrix from the n_components eigenpairs returned by eigsh.
    eig_vals, eig_vecs = sp.linalg.eigsh(m2, k=n_components)
    whiten_mtx = whitening(eig_vals, eig_vecs)

    # Whitening matrix must be (n_features, n_components).
    n_rows, n_cols = whiten_mtx.shape[0], whiten_mtx.shape[1]
    assert_equal(n_features, n_rows)
    assert_equal(n_components, n_cols)

    # M2(W, W) should be the identity matrix.
    left_product = np.dot(whiten_mtx.T, m2)
    whitened_m2 = np.dot(left_product, whiten_mtx)
    assert_array_almost_equal(np.eye(n_components), whitened_m2)