def test_1d_mask_int(self):
    """1-D integer contingency: mask=[1] computes the table, mask=[0] zeroes it."""
    values = np.array([0, 1, 1, 2, 1])
    groups = [0, 1, 1, 0, 0]

    counts, nans = bn.contingency(values, groups, 2, 1, mask=[1])
    np.testing.assert_almost_equal(counts, [[1, 1, 1], [0, 2, 0]])
    np.testing.assert_almost_equal(nans, [0, 0])

    counts, nans = bn.contingency(values, groups, 2, 1, mask=[0])
    np.testing.assert_almost_equal(counts, np.zeros((2, 3)))
    np.testing.assert_almost_equal(nans, [0, 0])
def test_1d_mask_float(self):
    """1-D float contingency with a NaN: NaNs are tallied per group, mask honoured."""
    nan = float("nan")
    values = np.array([0, 1, nan, 2, 1], dtype=float)
    groups = [0, 1, 1, 0, 0]

    counts, nans = bn.contingency(values, groups, 2, 1, mask=[1])
    np.testing.assert_almost_equal(counts, [[1, 1, 1], [0, 1, 0]])
    np.testing.assert_almost_equal(nans, [0, 1])

    counts, nans = bn.contingency(values, groups, 2, 1, mask=[0])
    np.testing.assert_almost_equal(counts, np.zeros((2, 3)))
    np.testing.assert_almost_equal(nans, [0, 0])
def test_1d_int(self):
    """1-D contingency gives the same result for list, int8 and float group vectors."""
    values = np.array([0, 1, 1, 2, 1])
    raw_groups = [0, 1, 1, 0, 0]
    variants = (raw_groups,
                np.array(raw_groups, dtype=np.int8),
                np.array(raw_groups, dtype=float))
    for groups in variants:
        counts, nans = bn.contingency(values, groups, 2, 1)
        np.testing.assert_almost_equal(counts, [[1, 1, 1], [0, 2, 0]])
        np.testing.assert_almost_equal(nans, np.zeros(2))
def test_1d_weighted_float(self):
    """Weighted 1-D contingency on float data: NaN weight is accumulated per group.

    NOTE(review): this test was named ``test_1d_weighted_int`` although the
    data is float and contains a NaN; that name also collides with the true
    integer variant defined elsewhere in this class, so only one of the two
    methods would ever be collected by the test runner.  Renamed to match
    the data type and remove the shadowing.
    """
    nan = float("nan")
    data = np.array([0, 1, nan, 2, 1], dtype=float)
    bb = [0, 1, 1, 0, 0]
    # The group vector may be a list, an int8 array or a float array.
    for b in [bb, np.array(bb, dtype=np.int8), np.array(bb, dtype=float)]:
        counts, nans = bn.contingency(data, b, 2, 1, weights=[1, 2, 3, 4, 5])
        # Group 0 (rows 0, 3, 4): value 0 w=1, value 2 w=4, value 1 w=5.
        np.testing.assert_almost_equal(counts, [[1, 5, 4], [0, 2, 0]])
        # Group 1 (rows 1, 2): the NaN at row 2 contributes its weight 3.
        np.testing.assert_almost_equal(nans, [0, 3])
def test_sparse_mask_float(self):
    """Contingency over a sparse CSR matrix honours the per-column mask."""
    stored = np.array([1, 1, 2, 2, 1, 3], dtype=float)
    indptr = [0, 3, 4, 6]
    indices = [0, 1, 2, 0, 1, 2]
    matrix = sp.csr_matrix((stored, indices, indptr), shape=(3, 4))

    counts, nans = bn.contingency(matrix, [1, 0, 1], 3, 1, mask=[1, 0, 0, 1])

    # Only column 0 is masked in; the remaining columns stay all-zero.
    np.testing.assert_almost_equal(counts[0], [[0, 0, 1, 0], [0, 1, 0, 0]])
    for col in (1, 2, 3):
        np.testing.assert_almost_equal(counts[col], np.zeros((2, 4)))
def test_weighted_int(self):
    """Row weights scale the per-column contingency counts on integer data."""
    data = np.array([[0, 1, 1, 2, 1],
                     [1, 1, 1, 0, 1],
                     [0, 0, 3, 0, 0]], dtype=int)
    counts, nans = bn.contingency(data, [1, 0, 1], 3, 1, weights=[1, 2, 3])
    expected = (
        [[0, 2, 0, 0], [4, 0, 0, 0]],
        [[0, 2, 0, 0], [3, 1, 0, 0]],
        [[0, 2, 0, 0], [0, 1, 0, 3]],
        [[2, 0, 0, 0], [3, 0, 1, 0]],
        [[0, 2, 0, 0], [3, 1, 0, 0]],
    )
    for col, table in enumerate(expected):
        np.testing.assert_almost_equal(counts[col], table)
    np.testing.assert_almost_equal(nans, np.zeros((5, 2)))
def test_simple_float(self):
    """Per-column contingency on float data; NaNs are counted separately per group."""
    nan = float("nan")
    data = np.array([[0, 1, 1, 2, 1],
                     [1, 1, 1, nan, 1],
                     [0, 0, 3, nan, nan]], dtype=float)
    counts, nans = bn.contingency(data, [1, 0, 1], 3, 1)
    expected = (
        [[0, 1, 0, 0], [2, 0, 0, 0]],
        [[0, 1, 0, 0], [1, 1, 0, 0]],
        [[0, 1, 0, 0], [0, 1, 0, 1]],
        [[0, 0, 0, 0], [0, 0, 1, 0]],
        [[0, 1, 0, 0], [0, 1, 0, 0]],
    )
    for col, table in enumerate(expected):
        np.testing.assert_almost_equal(counts[col], table)
    np.testing.assert_almost_equal(nans,
                                   [[0, 0], [0, 0], [0, 0], [1, 1], [0, 1]])
def test_simple_float(self):
    """Contingency of float columns against two groups, with per-group NaN counts."""
    nan = float("nan")
    data = np.array([[0, 1, 1, 2, 1],
                     [1, 1, 1, nan, 1],
                     [0, 0, 3, nan, nan]],
                    dtype=float)
    counts, nans = bn.contingency(data, [1, 0, 1], 3, 1)
    expected_counts = [
        [[0, 1, 0, 0], [2, 0, 0, 0]],
        [[0, 1, 0, 0], [1, 1, 0, 0]],
        [[0, 1, 0, 0], [0, 1, 0, 1]],
        [[0, 0, 0, 0], [0, 0, 1, 0]],
        [[0, 1, 0, 0], [0, 1, 0, 0]],
    ]
    for observed, wanted in zip(counts, expected_counts):
        np.testing.assert_almost_equal(observed, wanted)
    np.testing.assert_almost_equal(
        nans, [[0, 0], [0, 0], [0, 0], [1, 1], [0, 1]])
def test_mask_weighted_float(self):
    """Weights and column mask combine: unmasked columns stay zero, NaN weights sum."""
    nan = float("nan")
    data = np.array([[0, 1, 1, 2, 1],
                     [1, 1, 1, nan, 1],
                     [0, 0, 3, nan, nan]], dtype=float)
    counts, nans = bn.contingency(data, [1, 0, 1], 3, 1,
                                  weights=[1, 2, 3], mask=[1, 1, 0, 0, 1])

    np.testing.assert_almost_equal(counts[0], [[0, 2, 0, 0], [4, 0, 0, 0]])
    np.testing.assert_almost_equal(counts[1], [[0, 2, 0, 0], [3, 1, 0, 0]])
    # Columns 2 and 3 are masked out.
    np.testing.assert_almost_equal(counts[2], np.zeros((2, 4)))
    np.testing.assert_almost_equal(counts[3], np.zeros((2, 4)))
    np.testing.assert_almost_equal(counts[4], [[0, 2, 0, 0], [0, 1, 0, 0]])
    np.testing.assert_almost_equal(nans,
                                   [[0, 0], [0, 0], [0, 0], [0, 0], [0, 3]])
def test_simple_int(self):
    """Integer contingency is identical for int8, float and list group vectors."""
    data = np.array([[0, 1, 1, 2, 1],
                     [1, 1, 1, 0, 1],
                     [0, 0, 3, 0, 0]], dtype=int)
    variants = (np.array([1, 0, 1], dtype=np.int8),
                np.array([1, 0, 1], dtype=float),
                [1, 0, 1])
    for groups in variants:
        counts, nans = bn.contingency(data, groups, 3, 1)
        np.testing.assert_almost_equal(counts[0], [[0, 1, 0, 0], [2, 0, 0, 0]])
        np.testing.assert_almost_equal(counts[1], [[0, 1, 0, 0], [1, 1, 0, 0]])
        np.testing.assert_almost_equal(counts[2], [[0, 1, 0, 0], [0, 1, 0, 1]])
        np.testing.assert_almost_equal(counts[3], [[1, 0, 0, 0], [1, 0, 1, 0]])
        np.testing.assert_almost_equal(counts[4], [[0, 1, 0, 0], [1, 1, 0, 0]])
        np.testing.assert_almost_equal(nans, np.zeros((5, 2)))
def test_mask_int(self):
    """Masked integer contingency: masked-out columns yield all-zero tables."""
    data = np.array([[0, 1, 1, 2, 1],
                     [1, 1, 1, 0, 1],
                     [0, 0, 3, 0, 0]], dtype=int)
    variants = (np.array([1, 0, 1], dtype=np.int8),
                np.array([1, 0, 1], dtype=float),
                [1, 0, 1])
    for groups in variants:
        counts, nans = bn.contingency(data, groups, 3, 1,
                                      mask=[1, 1, 0, 0, 1])
        np.testing.assert_almost_equal(counts[0], [[0, 1, 0, 0], [2, 0, 0, 0]])
        np.testing.assert_almost_equal(counts[1], [[0, 1, 0, 0], [1, 1, 0, 0]])
        # Columns 2 and 3 are excluded by the mask.
        np.testing.assert_almost_equal(counts[2], np.zeros((2, 4)))
        np.testing.assert_almost_equal(counts[3], np.zeros((2, 4)))
        np.testing.assert_almost_equal(counts[4], [[0, 1, 0, 0], [1, 1, 0, 0]])
        np.testing.assert_almost_equal(nans, np.zeros((5, 2)))
def test_mask_weighted_float(self):
    """Weighted, masked contingency on float data with NaNs."""
    nan = float("nan")
    data = np.array([[0, 1, 1, 2, 1],
                     [1, 1, 1, nan, 1],
                     [0, 0, 3, nan, nan]],
                    dtype=float)
    counts, nans = bn.contingency(
        data, [1, 0, 1], 3, 1, weights=[1, 2, 3], mask=[1, 1, 0, 0, 1])

    zeros = np.zeros((2, 4))
    expected_counts = [
        [[0, 2, 0, 0], [4, 0, 0, 0]],
        [[0, 2, 0, 0], [3, 1, 0, 0]],
        zeros,                          # masked out
        zeros,                          # masked out
        [[0, 2, 0, 0], [0, 1, 0, 0]],
    ]
    for observed, wanted in zip(counts, expected_counts):
        np.testing.assert_almost_equal(observed, wanted)
    np.testing.assert_almost_equal(
        nans, [[0, 0], [0, 0], [0, 0], [0, 0], [0, 3]])
def _compute_contingency(self, col_vars=None, row_var=None):
    """Compute contingency tables of the given columns against a discrete row variable.

    Parameters
    ----------
    col_vars : iterable of variable indices/descriptors, optional
        Columns to tabulate; defaults to all variables in the domain.
    row_var : variable index/descriptor, optional
        Discrete variable defining the table rows; defaults to the
        domain's class variable.

    Returns
    -------
    list
        One entry per column, parallel to ``col_vars``:

        * discrete column -> ``(counts, nans)`` as returned by
          ``bn.contingency``;
        * continuous column -> ``([U, C], unknown)`` with ``U``/``C``
          produced by ``_contingency.contingency_floatarray``.

    Raises
    ------
    ValueError
        If there is no row variable, a column is neither discrete nor
        continuous, or the row data contains NaNs while a continuous
        column is requested.
    TypeError
        If the row variable is not discrete.
    """
    n_atts = self.X.shape[1]
    if col_vars is None:
        col_vars = range(len(self.domain.variables))
        single_column = False
    else:
        col_vars = [self.domain.index(var) for var in col_vars]
        single_column = len(col_vars) == 1 and len(self.domain) > 1
    # NOTE(review): single_column is computed but never used in this method.
    if row_var is None:
        row_var = self.domain.class_var
        if row_var is None:
            raise ValueError("No row variable")
    row_desc = self.domain[row_var]
    if not isinstance(row_desc, DiscreteVariable):
        raise TypeError("Row variable must be discrete")
    row_indi = self.domain.index(row_var)
    n_rows = len(row_desc.values)
    # Domain index convention used throughout: [0, n_atts) -> X column,
    # negative -> metas column stored at -1 - index, >= n_atts -> Y column.
    if 0 <= row_indi < n_atts:
        row_data = self.X[:, row_indi]
    elif row_indi < 0:
        row_data = self.metas[:, -1 - row_indi]
    else:
        row_data = self.Y[:, row_indi - n_atts]
    W = self.W if self.has_weights() else None
    col_desc = [self.domain[var] for var in col_vars]
    col_indi = [self.domain.index(var) for var in col_vars]
    if any(not isinstance(var, (ContinuousVariable, DiscreteVariable))
           for var in col_desc):
        raise ValueError("contingency can be computed only for discrete "
                         "and continuous values")
    if any(isinstance(var, ContinuousVariable) for var in col_desc):
        # Continuous columns cannot tolerate missing class labels below,
        # so reject NaNs in the row data up front.
        # NOTE(review): "contigencies" is a typo in this user-facing message.
        if bn.countnans(row_data):
            raise ValueError("cannot compute contigencies with missing "
                             "row data")
    contingencies = [None] * len(col_desc)
    # Handle X, Y and metas separately: f_cond selects the domain indices
    # that live in that array, f_ind maps a domain index to its column there.
    for arr, f_cond, f_ind in (
            (self.X, lambda i: 0 <= i < n_atts, lambda i: i),
            (self.Y, lambda i: i >= n_atts, lambda i: i - n_atts),
            (self.metas, lambda i: i < 0, lambda i: -1 - i)):
        arr_indi = [e for e, ind in enumerate(col_indi) if f_cond(ind)]
        # Triples: (position in the result, column within arr, descriptor).
        vars = [(e, f_ind(col_indi[e]), col_desc[e]) for e in arr_indi]
        disc_vars = [v for v in vars if isinstance(v[2], DiscreteVariable)]
        if disc_vars:
            if sp.issparse(arr):
                # One bn.contingency call over the whole sparse matrix;
                # mask marks the discrete columns.  max_vals - 1 and
                # n_rows - 1 look like zero-based maxima — presumably
                # bn.contingency's contract; confirm against bottlechest.
                max_vals = max(len(v[2].values) for v in disc_vars)
                disc_indi = {i for _, i, _ in disc_vars}
                mask = [i in disc_indi for i in range(arr.shape[1])]
                conts, nans = bn.contingency(arr, row_data, max_vals - 1,
                                             n_rows - 1, W, mask)
                for col_i, arr_i, _ in disc_vars:
                    contingencies[col_i] = (conts[arr_i], nans[arr_i])
            else:
                for col_i, arr_i, var in disc_vars:
                    contingencies[col_i] = bn.contingency(
                        arr[:, arr_i], row_data,
                        len(var.values) - 1, n_rows - 1, W)
        cont_vars = [v for v in vars if isinstance(v[2], ContinuousVariable)]
        if cont_vars:
            classes = row_data.astype(dtype=np.int8)
            if W is not None:
                W = W.astype(dtype=np.float64)
            if sp.issparse(arr):
                # CSC gives cheap per-column data/indices slices below.
                arr = sp.csc_matrix(arr)
            for col_i, arr_i, _ in cont_vars:
                if sp.issparse(arr):
                    # Only the stored (explicit) entries of the column are
                    # used; weights and classes are gathered for those rows.
                    col_data = arr.data[arr.indptr[arr_i]:
                                        arr.indptr[arr_i + 1]]
                    rows = arr.indices[arr.indptr[arr_i]:
                                       arr.indptr[arr_i + 1]]
                    W_ = None if W is None else W[rows]
                    classes_ = classes[rows]
                else:
                    col_data, W_, classes_ = arr[:, arr_i], W, classes
                col_data = col_data.astype(dtype=np.float64)
                U, C, unknown = _contingency.contingency_floatarray(
                    col_data, classes_, n_rows, W_)
                contingencies[col_i] = ([U, C], unknown)
    return contingencies
def test_1d_weighted_int(self):
    """Weighted 1-D contingency on integer data without missing values."""
    values = np.array([0, 1, 1, 2, 1])
    groups = [0, 1, 1, 0, 0]
    counts, nans = bn.contingency(values, groups, 2, 1,
                                  weights=[1, 2, 3, 4, 5])
    np.testing.assert_almost_equal(counts, [[1, 5, 3], [0, 3, 0]])
    np.testing.assert_almost_equal(nans, np.zeros(2))
def _compute_contingency(self, col_vars=None, row_var=None):
    """Compute contingency tables of the given columns against a discrete row variable.

    Parameters
    ----------
    col_vars : iterable of variable indices/descriptors, optional
        Columns to tabulate; defaults to all variables in the domain.
    row_var : variable index/descriptor, optional
        Discrete variable defining the table rows; defaults to the
        domain's class variable.

    Returns
    -------
    list
        One entry per column, parallel to ``col_vars``:

        * discrete column -> ``(counts, nans)`` as returned by
          ``bn.contingency``;
        * continuous column -> ``(dists, nans)`` where ``dists`` is a list
          with one ``valuecount`` distribution per class and ``nans`` the
          per-class NaN counts.

    Raises
    ------
    ValueError
        If there is no row variable, a column is neither discrete nor
        continuous, or the row data contains NaNs while a continuous
        column is requested.
    TypeError
        If the row variable is not discrete.
    """
    n_atts = self.X.shape[1]
    if col_vars is None:
        col_vars = range(len(self.domain.variables))
        single_column = False
    else:
        col_vars = [self.domain.index(var) for var in col_vars]
        single_column = len(col_vars) == 1 and len(self.domain) > 1
    # NOTE(review): single_column is computed but never used in this method.
    if row_var is None:
        row_var = self.domain.class_var
        if row_var is None:
            raise ValueError("No row variable")
    row_desc = self.domain[row_var]
    if not isinstance(row_desc, DiscreteVariable):
        raise TypeError("Row variable must be discrete")
    row_indi = self.domain.index(row_var)
    n_rows = len(row_desc.values)
    # Domain index convention: [0, n_atts) -> X column, negative -> metas
    # column stored at -1 - index, >= n_atts -> Y column.
    if 0 <= row_indi < n_atts:
        row_data = self.X[:, row_indi]
    elif row_indi < 0:
        row_data = self.metas[:, -1 - row_indi]
    else:
        row_data = self.Y[:, row_indi - n_atts]
    W = self.W if self.has_weights() else None
    col_desc = [self.domain[var] for var in col_vars]
    col_indi = [self.domain.index(var) for var in col_vars]
    if any(not isinstance(var, (ContinuousVariable, DiscreteVariable))
           for var in col_desc):
        raise ValueError("contingency can be computed only for discrete "
                         "and continuous values")
    if any(isinstance(var, ContinuousVariable) for var in col_desc):
        # Order the rows by class once; dep_sizes[k] is the number of rows
        # in class k, so dep_indices can be walked in contiguous chunks.
        dep_indices = np.argsort(row_data)
        dep_sizes, nans = bn.bincount(row_data, n_rows - 1)
        dep_sizes = dep_sizes.astype(int, copy=False)
        if nans:
            # Fixed typo in the message ("contigencies").
            raise ValueError("cannot compute contingencies with missing "
                             "row data")
    else:
        dep_indices = dep_sizes = None
    contingencies = [None] * len(col_desc)
    # Handle X, Y and metas separately: f_cond selects the domain indices
    # that live in that array, f_ind maps a domain index to its column there.
    for arr, f_cond, f_ind in (
            (self.X, lambda i: 0 <= i < n_atts, lambda i: i),
            (self.Y, lambda i: i >= n_atts, lambda i: i - n_atts),
            (self.metas, lambda i: i < 0, lambda i: -1 - i),
    ):
        arr_indi = [e for e, ind in enumerate(col_indi) if f_cond(ind)]
        # Triples: (position in the result, column within arr, descriptor).
        vars = [(e, f_ind(col_indi[e]), col_desc[e]) for e in arr_indi]
        disc_vars = [v for v in vars if isinstance(v[2], DiscreteVariable)]
        if disc_vars:
            if sp.issparse(arr):
                # One bn.contingency call over the whole sparse matrix;
                # mask marks the discrete columns.
                max_vals = max(len(v[2].values) for v in disc_vars)
                disc_indi = {i for _, i, _ in disc_vars}
                mask = [i in disc_indi for i in range(arr.shape[1])]
                conts, nans = bn.contingency(arr, row_data, max_vals - 1,
                                             n_rows - 1, W, mask)
                for col_i, arr_i, _ in disc_vars:
                    contingencies[col_i] = (conts[arr_i], nans[arr_i])
            else:
                for col_i, arr_i, var in disc_vars:
                    contingencies[col_i] = bn.contingency(
                        arr[:, arr_i], row_data, len(var.values) - 1,
                        n_rows - 1, W
                    )
        cont_vars = [v for v in vars
                     if isinstance(v[2], ContinuousVariable)]
        if cont_vars:
            for col_i, _, _ in cont_vars:
                contingencies[col_i] = ([], np.empty(n_rows))
            fr = 0
            # Walk the class-sorted row indices chunk by chunk.
            for clsi, cs in enumerate(dep_sizes):
                to = fr + cs
                grp_rows = dep_indices[fr:to]
                grp_data = arr[grp_rows, :]
                # BUG FIX: the original `grp_W = W and W[grp_rows]` raises
                # "The truth value of an array with more than one element
                # is ambiguous" whenever W is a weight array of length > 1;
                # test explicitly for None instead.
                grp_W = W[grp_rows] if W is not None else None
                if sp.issparse(grp_data):
                    # CSC gives cheap per-column data slices below.
                    grp_data = sp.csc_matrix(grp_data)
                for col_i, arr_i, _ in cont_vars:
                    if sp.issparse(grp_data):
                        col_data = grp_data.data[
                            grp_data.indptr[arr_i]:grp_data.indptr[arr_i + 1]]
                    else:
                        col_data = grp_data[:, arr_i]
                    if W is not None:
                        # valuecount expects values sorted, paired with
                        # their weights.
                        ranks = np.argsort(col_data)
                        vals = np.vstack((col_data[ranks], grp_W[ranks]))
                        nans = bn.countnans(col_data, grp_W)
                    else:
                        # Unweighted: every occurrence carries weight 1.
                        col_data.sort()
                        vals = np.ones((2, len(col_data)))
                        vals[0, :] = col_data
                        nans = bn.countnans(col_data)
                    dist = np.array(_valuecount.valuecount(vals))
                    contingencies[col_i][0].append(dist)
                    contingencies[col_i][1][clsi] = nans
                fr = to
    return contingencies
def _compute_contingency(self, col_vars=None, row_var=None):
    """Compute contingency tables of the given columns against a discrete row variable.

    Parameters
    ----------
    col_vars : iterable of variable indices/descriptors, optional
        Columns to tabulate; defaults to all variables in the domain.
    row_var : variable index/descriptor, optional
        Discrete variable defining the table rows; defaults to the
        domain's class variable.

    Returns
    -------
    list
        One entry per column, parallel to ``col_vars``:

        * discrete column -> ``(counts, nans)`` as returned by
          ``bn.contingency``;
        * continuous column -> ``([U, C], unknown)`` with ``U``/``C``
          produced by ``_contingency.contingency_floatarray``.

    Raises
    ------
    ValueError
        If there is no row variable, a column is neither discrete nor
        continuous, or the row data contains NaNs while a continuous
        column is requested.
    TypeError
        If the row variable is not discrete.
    """
    n_atts = self.X.shape[1]
    if col_vars is None:
        col_vars = range(len(self.domain.variables))
        single_column = False
    else:
        col_vars = [self.domain.index(var) for var in col_vars]
        single_column = len(col_vars) == 1 and len(self.domain) > 1
    # NOTE(review): single_column is computed but never used in this method.
    if row_var is None:
        row_var = self.domain.class_var
        if row_var is None:
            raise ValueError("No row variable")
    row_desc = self.domain[row_var]
    if not isinstance(row_desc, DiscreteVariable):
        raise TypeError("Row variable must be discrete")
    row_indi = self.domain.index(row_var)
    n_rows = len(row_desc.values)
    # Domain index convention: [0, n_atts) -> X column, negative -> metas
    # column stored at -1 - index, >= n_atts -> Y column.
    if 0 <= row_indi < n_atts:
        row_data = self.X[:, row_indi]
    elif row_indi < 0:
        row_data = self.metas[:, -1 - row_indi]
    else:
        row_data = self.Y[:, row_indi - n_atts]
    W = self.W if self.has_weights() else None
    col_desc = [self.domain[var] for var in col_vars]
    col_indi = [self.domain.index(var) for var in col_vars]
    if any(not isinstance(var, (ContinuousVariable, DiscreteVariable))
           for var in col_desc):
        raise ValueError("contingency can be computed only for discrete "
                         "and continuous values")
    if any(isinstance(var, ContinuousVariable) for var in col_desc):
        # contingency_floatarray below cannot handle missing class labels,
        # so reject NaNs in the row data up front.
        # NOTE(review): "contigencies" is a typo in this user-facing message.
        if bn.countnans(row_data):
            raise ValueError("cannot compute contigencies with missing "
                             "row data")
    contingencies = [None] * len(col_desc)
    # Handle X, Y and metas separately: f_cond selects the domain indices
    # that live in that array, f_ind maps a domain index to its column there.
    for arr, f_cond, f_ind in ((self.X, lambda i: 0 <= i < n_atts,
                                lambda i: i),
                               (self.Y, lambda i: i >= n_atts,
                                lambda i: i - n_atts),
                               (self.metas, lambda i: i < 0,
                                lambda i: -1 - i)):
        arr_indi = [e for e, ind in enumerate(col_indi) if f_cond(ind)]
        # Triples: (position in the result, column within arr, descriptor).
        vars = [(e, f_ind(col_indi[e]), col_desc[e]) for e in arr_indi]
        disc_vars = [v for v in vars if isinstance(v[2], DiscreteVariable)]
        if disc_vars:
            if sp.issparse(arr):
                # One bn.contingency call over the whole sparse matrix;
                # mask marks the discrete columns.  max_vals - 1 and
                # n_rows - 1 look like zero-based maxima — presumably
                # bn.contingency's contract; confirm against bottlechest.
                max_vals = max(len(v[2].values) for v in disc_vars)
                disc_indi = {i for _, i, _ in disc_vars}
                mask = [i in disc_indi for i in range(arr.shape[1])]
                conts, nans = bn.contingency(arr, row_data, max_vals - 1,
                                             n_rows - 1, W, mask)
                for col_i, arr_i, _ in disc_vars:
                    contingencies[col_i] = (conts[arr_i], nans[arr_i])
            else:
                for col_i, arr_i, var in disc_vars:
                    contingencies[col_i] = bn.contingency(
                        arr[:, arr_i], row_data, len(var.values) - 1,
                        n_rows - 1, W)
        cont_vars = [
            v for v in vars if isinstance(v[2], ContinuousVariable)
        ]
        if cont_vars:
            classes = row_data.astype(dtype=np.int8)
            if W is not None:
                W = W.astype(dtype=np.float64)
            if sp.issparse(arr):
                # CSC gives cheap per-column data/indices slices below.
                arr = sp.csc_matrix(arr)
            for col_i, arr_i, _ in cont_vars:
                if sp.issparse(arr):
                    # Only the stored (explicit) entries of the column are
                    # used; weights and classes are gathered for those rows.
                    col_data = arr.data[arr.indptr[arr_i]:
                                        arr.indptr[arr_i + 1]]
                    rows = arr.indices[arr.indptr[arr_i]:
                                       arr.indptr[arr_i + 1]]
                    W_ = None if W is None else W[rows]
                    classes_ = classes[rows]
                else:
                    col_data, W_, classes_ = arr[:, arr_i], W, classes
                col_data = col_data.astype(dtype=np.float64)
                U, C, unknown = _contingency.contingency_floatarray(
                    col_data, classes_, n_rows, W_)
                contingencies[col_i] = ([U, C], unknown)
    return contingencies