def test_dummy_precondition():
    """Check preconditioned dummies across the array, csc and csr formats.

    The dense and csc outputs must have unit squared column sums, the
    conditioning vectors must agree across all three formats, and the csr
    request must actually produce a ``csr_matrix``.
    """
    c1 = pd.Series(pd.Categorical(["a"] * 5 + ["b"] * 5 + ["c"] * 5))
    c2 = pd.Series(pd.Categorical(["A", "B", "C", "D", "E"] * 3))
    # ``axis`` has to be passed by keyword: pandas 2.0 no longer accepts
    # positional arguments other than ``objs`` in ``concat``.
    cats = pd.concat([c1, c2], axis=1)

    out_arr, cond_arr = dummy_matrix(
        cats, output_format="array", drop="last", precondition=True
    )

    csc = dummy_matrix(cats, output_format="csc", drop="last", precondition=True)
    out_csc: csc_matrix = csc[0]
    cond_csc: np.ndarray = csc[1]

    csr = dummy_matrix(cats, output_format="csr", drop="last", precondition=True)
    out_csr: csr_matrix = csr[0]
    cond_csr: np.ndarray = csr[1]

    # Squared column sums equal to one <=> every column has unit norm.
    assert_allclose((out_arr**2).sum(0), np.ones(out_arr.shape[1]))
    assert_allclose(
        (out_csc.multiply(out_csc)).sum(0).A1, np.ones(out_arr.shape[1])
    )
    assert_allclose(cond_arr, cond_csc)
    assert_allclose(cond_csr, cond_csc)
    assert isinstance(out_csr, scipy.sparse.csr_matrix)
def test_absorbing_regressors(cat, cont, interact, weights):
    """Compare ``AbsorbingRegressor`` output with a hand-assembled matrix.

    The expected matrix stacks the categorical dummies, the continuous
    columns and any interaction matrices, then rescales rows by the square
    root of the weights when weights are supplied.  The expected rank is
    accumulated alongside and compared with ``approx_rank``.
    """
    areg = AbsorbingRegressor(
        cat=cat, cont=cont, interactions=interact, weights=weights
    )
    rank = areg.approx_rank

    # Each categorical contributes its number of distinct codes; every
    # categorical after the first contributes one fewer.
    expected_rank = sum(
        pd.Series(cat[name].cat.codes).nunique() - (position > 0)
        for position, name in enumerate(cat)
    )
    blocks = [dummy_matrix(cat, precondition=False)[0]]

    expected_rank += cont.shape[1]
    blocks.append(csc_matrix(cont))

    if interact is not None:
        for inter in interact:
            sparse_block = inter.sparse
            expected_rank += sparse_block.shape[1]
            blocks.append(sparse_block)

    expected = sp.hstack(blocks, format="csc")
    if weights is not None:
        root_weights = sp.diags(np.sqrt(weights))
        expected = root_weights.dot(expected).asformat("csc")

    actual = areg.regressors
    assert expected.shape == actual.shape
    assert_array_equal(expected.indptr, actual.indptr)
    assert_array_equal(expected.indices, actual.indices)
    assert_allclose(expected.toarray(), actual.toarray())
    assert expected_rank == rank
def test_drop_singletons_slow():
    """Validate ``in_2core_graph_slow`` against iterative one-column pruning.

    Three checks are made: no retained row carries a value that occurs only
    once in the full column, repeatedly pruning each column on its own
    reaches the same retained set, and every dropped row has exactly two
    non-zero dummies.
    """
    nobs = 40000
    rng = np.random.RandomState(0)
    c1 = rng.randint(0, 10000, (nobs, 1))
    c2 = rng.randint(0, 20000, (nobs, 1))
    cats = np.concatenate([c1, c2], axis=1)

    retain = in_2core_graph_slow(cats)
    nonsingletons = cats[retain]

    for column in (c1, c2):
        values, freq = np.unique(column, return_counts=True)
        once_only = values[freq == 1]
        assert not np.isin(column[retain], once_only).any()

    idx = np.arange(nobs)
    names = ("c1", "c2")
    remaining = {"c1": c1.copy(), "c2": c2.copy()}
    for _ in range(nobs):
        previous = remaining["c1"].shape[0]
        for name in names:
            keep = in_2core_graph_slow(remaining[name])
            # Apply the same row filter to every column and to the index.
            remaining = {key: col[keep] for key, col in remaining.items()}
            idx = idx[keep]
        # Stop once a full pass removes nothing.
        if remaining["c1"].shape[0] == previous:
            break

    expected = np.concatenate([c1[idx], c2[idx]], axis=1)
    assert_array_equal(nonsingletons, expected)
    expected = np.concatenate([remaining["c1"], remaining["c2"]], axis=1)
    assert_array_equal(nonsingletons, expected)

    dummies, _ = dummy_matrix(cats, output_format="csr", precondition=False)
    dropped = dummies[~retain]
    assert dropped.sum() == 2 * (~retain).sum()
def test_dummy_pandas():
    """Check ``dummy_matrix`` on a DataFrame of two pandas categoricals.

    With ``drop="last"`` the result must be a csc matrix with
    ``3 + 5 - 1`` columns whose column sums match the category counts.
    """
    c1 = pd.Series(pd.Categorical(["a"] * 5 + ["b"] * 5 + ["c"] * 5))
    c2 = pd.Series(pd.Categorical(["A", "B", "C", "D", "E"] * 3))
    # ``axis`` has to be passed by keyword: pandas 2.0 no longer accepts
    # positional arguments other than ``objs`` in ``concat``.
    cats = pd.concat([c1, c2], axis=1)

    out, _ = dummy_matrix(cats, drop="last", precondition=False)

    # Use the public class location; the ``scipy.sparse.csc`` submodule
    # path is deprecated.
    assert isinstance(out, scipy.sparse.csc_matrix)
    assert out.shape == (15, 3 + 5 - 1)
    expected = np.array([5, 5, 5, 3, 3, 3, 3], dtype=np.int32)
    assert_array_equal(
        np.squeeze(np.asarray(out.sum(0), dtype=np.int32)), expected
    )
def test_dummy_pandas():
    """Check ``dummy_matrix`` on a DataFrame of two pandas categoricals.

    With ``drop='last'`` the result must be a csc matrix with
    ``3 + 5 - 1`` columns whose column sums match the category counts.
    """
    c1 = pd.Series(pd.Categorical(['a'] * 5 + ['b'] * 5 + ['c'] * 5))
    c2 = pd.Series(pd.Categorical(['A', 'B', 'C', 'D', 'E'] * 3))
    # ``axis`` has to be passed by keyword: pandas 2.0 no longer accepts
    # positional arguments other than ``objs`` in ``concat``.
    cats = pd.concat([c1, c2], axis=1)

    out, _ = dummy_matrix(cats, drop='last', precondition=False)

    # Use the public class location; the ``scipy.sparse.csc`` submodule
    # path is deprecated.
    assert isinstance(out, scipy.sparse.csc_matrix)
    assert out.shape == (15, 3 + 5 - 1)
    expected = np.array([5, 5, 5, 3, 3, 3, 3], dtype=np.int32)
    assert_array_equal(
        np.squeeze(np.asarray(out.sum(0), dtype=np.int32)), expected
    )
def test_dummy_last():
    """Check ``drop="last"`` on integer-coded categories.

    The first column holds three codes with five rows each; the second
    cycles through five codes, with the final row overwritten to code 0.
    The expected column sums are those listed in ``expected`` below.
    """
    cats = np.zeros([15, 2], dtype=np.int8)
    cats[5:, 0] = 1
    cats[10:, 0] = 2
    cats[:, 1] = np.arange(15) % 5
    # Move the last observation from code 4 to code 0 in the second column.
    cats[-1, 1] = 0

    out, _ = dummy_matrix(cats, drop="last", precondition=False)

    # Use the public class location; the ``scipy.sparse.csc`` submodule
    # path is deprecated.
    assert isinstance(out, scipy.sparse.csc_matrix)
    assert out.shape == (15, 3 + 5 - 1)
    expected = np.array([5, 5, 5, 4, 3, 3, 3], dtype=np.int32)
    assert_array_equal(
        np.squeeze(np.asarray(out.sum(0), dtype=np.int32)), expected
    )
def test_dummy_format(dummy_format):
    """Check the container type and contents for one output format.

    ``dummy_format`` supplies a ``(format code, expected type)`` pair.
    Without preconditioning the conditioning vector must be all ones.
    """
    code, expected_type = dummy_format

    # Column 0: codes 0, 1, 2 in runs of five; column 1: 0..4 repeated.
    first = np.repeat(np.arange(3), 5)
    second = np.tile(np.arange(5), 3)
    cats = np.column_stack([first, second]).astype(np.int8)

    out, cond = dummy_matrix(cats, output_format=code, precondition=False)

    assert isinstance(out, expected_type)
    assert out.shape == (15, 3 + 5 - 1)
    column_totals = np.asarray(out.sum(0), dtype=np.int32).squeeze()
    expected = np.array([5, 5, 5, 3, 3, 3, 3], dtype=np.int32)
    assert_array_equal(column_totals, expected)
    assert_array_equal(cond, np.ones(out.shape[1]))
def test_dummy_precondition():
    """Check preconditioned dummies across the array, csc and csr formats.

    The dense and csc outputs must have unit squared column sums, the
    conditioning vectors must agree across all three formats, and the csr
    request must actually produce a ``csr_matrix``.
    """
    c1 = pd.Series(pd.Categorical(['a'] * 5 + ['b'] * 5 + ['c'] * 5))
    c2 = pd.Series(pd.Categorical(['A', 'B', 'C', 'D', 'E'] * 3))
    # ``axis`` has to be passed by keyword: pandas 2.0 no longer accepts
    # positional arguments other than ``objs`` in ``concat``.
    cats = pd.concat([c1, c2], axis=1)

    # NOTE(review): other call sites in this file pass ``output_format=``
    # rather than ``format=`` -- confirm which keyword ``dummy_matrix``
    # accepts before relying on this test.
    out_arr, cond_arr = dummy_matrix(
        cats, format='array', drop='last', precondition=True
    )
    out_csc, cond_csc = dummy_matrix(
        cats, format='csc', drop='last', precondition=True
    )
    out_csr, cond_csr = dummy_matrix(
        cats, format='csr', drop='last', precondition=True
    )

    # Squared column sums equal to one <=> every column has unit norm.
    assert_allclose((out_arr**2).sum(0), np.ones(out_arr.shape[1]))
    assert_allclose(
        (out_csc.multiply(out_csc)).sum(0).A1, np.ones(out_arr.shape[1])
    )
    assert_allclose(cond_arr, cond_csc)
    assert_allclose(cond_csr, cond_csc)
    assert isinstance(out_csr, scipy.sparse.csr_matrix)
def category_interaction(cat: Series, precondition: bool = True) -> csc_matrix:
    """
    Build sparse dummies from the product of the supplied categories.

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    dummies : csc_matrix
        Sparse matrix of dummies with unit column norm

    Notes
    -----
    NOTE(review): the unit-column-norm statement presumably only holds when
    ``precondition`` is True -- confirm against ``dummy_matrix``.
    """
    combined = category_product(cat)
    codes = get_codes(combined.cat)
    # dummy_matrix expects a 2-d input, so the codes become one column.
    code_column = codes[:, None]
    dummies, _ = dummy_matrix(code_column, precondition=precondition)
    return dummies
def category_interaction(cat: Series, precondition: bool = True) -> sp.csc_matrix:
    """
    Build sparse dummies from the product of the supplied categories.

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    csc_matrix
        Sparse matrix of dummies with unit column norm

    Notes
    -----
    NOTE(review): the unit-column-norm statement presumably only holds when
    ``precondition`` is True -- confirm against ``dummy_matrix``.
    """
    combined = category_product(cat)
    # dummy_matrix expects a 2-d input, so the codes become one column.
    code_column = asarray(combined.cat.codes)[:, None]
    result = dummy_matrix(code_column, precondition=precondition)
    dummies = result[0]
    # Narrow the type for static checkers; csc is what this call yields.
    assert isinstance(dummies, sp.csc_matrix)
    return dummies
def test_against_ols(ols_data):
    """Compare ``AbsorbingLS`` with OLS on explicitly added absorbed terms.

    The absorbed continuous variables, categorical dummies and interaction
    matrices are densified and appended to ``x``; the OLS fit on that
    augmented design must match the absorbing fit.
    """
    mod = AbsorbingLS(
        ols_data.y,
        ols_data.x,
        absorb=ols_data.absorb,
        interactions=ols_data.interactions,
        weights=ols_data.weights,
    )
    res = mod.fit()

    pieces = []
    has_dummy = False
    absorbed = ols_data.absorb
    if absorbed is not None:
        pieces.append(absorbed.cont.to_numpy())
        has_dummy = absorbed.cat.shape[1] > 0
        if has_dummy:
            dummies = dummy_matrix(absorbed.cat, precondition=False)[0]
            assert isinstance(dummies, sp.csc_matrix)
            pieces.append(dummies.toarray())
    if ols_data.interactions is not None:
        for interact in ols_data.interactions:
            pieces.append(interact.sparse.toarray())

    _x = ols_data.x
    if pieces:
        absorb_mat = np.column_stack(pieces)

        # x has a zero-range (constant) column and dummies are present:
        # pass the absorbed block through ``annihilate`` against a column
        # of ones (or the root weights) -- presumably to remove the
        # constant it would otherwise duplicate; confirm with the helper.
        x_has_constant = np.any(np.ptp(_x, 0) == 0)
        if x_has_constant and has_dummy:
            if ols_data.weights is None:
                ones = np.ones((absorb_mat.shape[0], 1))
                absorb_mat = annihilate(absorb_mat, ones)
            else:
                root_w = np.sqrt(mod.weights.ndarray)
                weighted = annihilate(root_w * absorb_mat, root_w)
                absorb_mat = (1.0 / root_w) * weighted

        # Rank deficient: rotate onto the eigenvectors of the cross-product
        # (largest eigenvalues first) and keep the leading ``rank`` columns.
        rank = np.linalg.matrix_rank(absorb_mat)
        if rank < absorb_mat.shape[1]:
            eigvals, eigvecs = np.linalg.eig(absorb_mat.T @ absorb_mat)
            order = np.argsort(eigvals)[::-1]
            eigvals = eigvals[order]
            eigvecs = eigvecs[:, order]
            rotated = absorb_mat @ eigvecs
            absorb_mat = rotated[:, :rank]

        _x = np.column_stack([_x, absorb_mat])

    ols_mod = _OLS(ols_data.y, _x, weights=ols_data.weights)
    ols_res = ols_mod.fit()
    assert_results_equal(ols_res, res)
def _regressors(self) -> csc_matrix:
    """Assemble the sparse regressor matrix and record its column count.

    Categorical dummies, continuous columns and interaction matrices are
    stacked horizontally in that order.  ``self._approx_rank`` is set to the
    number of stacked columns (0 when there is nothing to stack).  When
    weights are present, rows are scaled by the square root of the weights.

    Returns
    -------
    csc_matrix
        The stacked (and possibly weighted) regressors, or an empty
        ``(0, 0)`` matrix when no component is available.
    """
    blocks = []

    if self._cat is not None and self._cat.shape[1] > 0:
        dummies = dummy_matrix(self._cat, precondition=False)[0]
        blocks.append(dummies)
    if self._cont is not None and self._cont.shape[1] > 0:
        blocks.append(csc_matrix(to_numpy(self._cont)))
    if self._interactions is not None:
        for interact in self._interactions:
            blocks.append(interact.sparse)

    if not blocks:
        self._approx_rank = 0
        return csc_matrix(empty((0, 0)))

    stacked = sp.hstack(blocks, format='csc')
    self._approx_rank = stacked.shape[1]
    if self._weights is None:
        return stacked

    # Left-multiplying by diag(sqrt(w)) scales each row by its root weight.
    root_weights = sp.diags(sqrt(self._weights.squeeze()))
    return root_weights.dot(stacked).asformat('csc')
def test_invalid_format():
    """An unrecognised ``output_format`` must raise ``ValueError``."""
    # Single column: five rows of code 0 followed by five rows of code 1.
    cats = np.repeat(np.array([0, 1], dtype=np.int8), 5).reshape(10, 1)
    with pytest.raises(ValueError):
        dummy_matrix(cats, output_format="unknown", precondition=False)