def test_bincount(self):
    """bincount returns per-value counts plus a separate NaN tally."""
    counts, nan_count = bincount([0., 1., np.nan, 3])
    self.assertEqual(nan_count, 1)
    np.testing.assert_equal(counts, [1, 1, 0, 1])

    # With max_val given, trailing empty bins are still produced.
    counts, nan_count = bincount([0., 1., 3], max_val=3)
    self.assertEqual(nan_count, 0)
    np.testing.assert_equal(counts, [1, 1, 0, 1])
def test_all_zeros_or_nans(self, array):
    """Sparse arrays holding only NaNs and implicit zeros have no stored
    non-zero entries; the zeros must still be counted correctly."""
    data = array([np.nan] * 5 + [0] * 5)
    np.testing.assert_equal(bincount(data)[0], [5])
def _get_bin_distributions(self, bin_indices):
    """Compute the distribution of instances within bins.

    Parameters
    ----------
    bin_indices : np.ndarray
        An array with same shape as `x` but containing the bin index of
        the instance.

    Returns
    -------
    np.ndarray
        A 2d array; the first dimension represents different bins, the
        second - the counts of different target values.

    """
    if self.target_var and self.target_var.is_discrete:
        y = self.y
        # TODO This probably also isn't the best handling of sparse data...
        if sp.issparse(y):
            y = np.squeeze(np.array(y.todense()))

        # Since y can contain missing values, we need to filter them out as
        # well as their corresponding `x` values
        y_nan_mask = np.isnan(y)
        y, bin_indices = y[~y_nan_mask], bin_indices[~y_nan_mask]

        y = one_hot(y)
        # In the event that y does not take up all the values and the
        # largest discrete value does not appear at all, one hot encoding
        # will produce too few columns. This causes problems, so we need to
        # pad y with zeros to properly compute the distribution
        if y.shape[1] != len(self.target_var.values):
            n_missing_columns = len(self.target_var.values) - y.shape[1]
            y = np.hstack((y, np.zeros((y.shape[0], n_missing_columns))))

        # Broadcast bin indices against every bin id to get, per bin, a
        # boolean mask of the instances that fall into that bin.
        bins = np.arange(self.n_bins)[:, np.newaxis]
        mask = bin_indices == bins
        distributions = np.zeros((self.n_bins, y.shape[1]))
        for bin_idx in range(self.n_bins):
            # Column-wise sum of one-hot rows = per-class counts in the bin.
            distributions[bin_idx] = y[mask[bin_idx]].sum(axis=0)
    else:
        # Continuous / no target: just count instances per bin.
        distributions, _ = ut.bincount(bin_indices.astype(np.int64))
        # To keep things consistent across different variable types, we
        # want to return a 2d array where the first dim represent different
        # bins, and the second the distributions.
        distributions = distributions[:, np.newaxis]

    return distributions
def _discrete_counts():
    """
    Generate pairs similar to _string_counts, except that the arrays
    contain bin counts for the attribute's values matching the pattern.
    """
    # NOTE(review): closure — `attr`, `data`, `self`, `_lower_if_needed`
    # and `_matcher` come from the enclosing scope (not visible here).
    attr_vals = np.array(attr.values)
    attr_vals = _lower_if_needed(attr_vals)
    # Counts per discrete value; max_val pads bins for values absent in data.
    bins = bincount(data, max_val=len(attr.values) - 1)[0]
    # `remaining` tracks counts not yet claimed by an earlier rule.
    remaining = np.array(bins)
    for _, pattern in self.active_rules:
        matching = _matcher(attr_vals, pattern)
        yield remaining[matching], bins[matching]
        # Values matched by this rule are consumed for later rules.
        remaining[matching] = 0
        if not np.any(remaining):
            break
def get_discrete_stats(self, column, n_bins):
    """
    Return tables used computing distance between missing discrete values.

    Args:
        column (np.ndarray): column data
        n_bins (int): maximal number of bins in the data set

    Returns:
        dist_missing_disc (np.ndarray): `dist_missing_disc[value]` is
            1 - probability of `value`, which is used as the distance added
            for the given `value` in the column `col` if the value for the
            other row is missing
        dist_missing2_disc (float): the distance between two missing
            values in this columns
    """
    counts = util.bincount(column, minlength=n_bins)[0]
    # Normalize to probabilities; guard against an all-empty column.
    probs = counts / max(1, sum(counts))
    return 1 - probs, 1 - np.sum(probs ** 2)
def test_adds_empty_bins(self, array):
    """Values that never occur still get (zero-count) bins."""
    values = array([0, 1, 3, 5])
    np.testing.assert_equal(bincount(values)[0], [1, 1, 0, 1, 0, 1])
def test_count_nans(self, array):
    """The second return value is the number of NaNs encountered."""
    values = array([0, 0, 1, 2, np.nan, 2])
    np.testing.assert_equal(bincount(values)[1], 1)
def _categorical_entropy(x):
    """Compute the entropy of a dense/sparse matrix, column-wise.

    Assuming categorical values."""
    entropies = []
    for column in x.T:
        counts = ut.bincount(column)[0]
        probs = counts / np.sum(counts)
        entropies.append(ss.entropy(probs))
    return np.array(entropies, dtype=np.float64)
def get_column(self, attr, filter_valid=True, max_categories=None,
               return_labels=False):
    """
    Retrieve the data from the given column in the data table

    The method:
    - densifies sparse data,
    - converts arrays with dtype object to floats if the attribute is
      actually primitive,
    - filters out invalid data (if `filter_valid` is `True`),
    - merges infrequent (discrete) values into a single value
      (if `max_categories` is set).

    Tha latter feature is used for shapes and labels, where only a
    specified number of different values is shown, and others are
    merged into category 'Other'. In this case, the method may return
    either the data (e.g. color indices, shape indices) or the list
    of retained values, followed by `['Other']`.

    Args:
        attr (:obj:~Orange.data.Variable): the column to extract
        filter_valid (bool): filter out invalid data (default: `True`)
        max_categories (int): merge infrequent values (default: `None`);
            ignored for non-discrete attributes
        return_labels (bool): return a list of labels instead of data
            (default: `False`)

    Returns:
        (np.ndarray): (valid) data from the column, or a list of labels
    """
    if attr is None:
        return None

    # Merging only applies to discrete attributes with too many values.
    needs_merging = attr.is_discrete \
        and max_categories is not None \
        and len(attr.values) >= max_categories
    if return_labels and not needs_merging:
        assert attr.is_discrete
        return attr.values

    all_data = self.data.get_column_view(attr)[0]
    # Object dtype can hide primitive (numeric) values; coerce to float.
    if all_data.dtype == object and attr.is_primitive():
        all_data = all_data.astype(float)
    if filter_valid and self.valid_data is not None:
        all_data = all_data[self.valid_data]
    if not needs_merging:
        return all_data

    # Mark all but the (max_categories - 1) most frequent values as
    # infrequent; those collapse into a single trailing 'Other' category.
    dist = bincount(all_data, max_val=len(attr.values) - 1)[0]
    infrequent = np.zeros(len(attr.values), dtype=bool)
    infrequent[np.argsort(dist)[:-(max_categories - 1)]] = True
    if return_labels:
        return [
            value
            for value, infreq in zip(attr.values, infrequent)
            if not infreq
        ] + ["Other"]
    else:
        result = all_data.copy()
        # Frequent values are re-indexed to consecutive ids; infrequent
        # ones all map onto the last index ('Other').
        freq_vals = [i for i, f in enumerate(infrequent) if not f]
        for i, infreq in enumerate(infrequent):
            if infreq:
                result[all_data == i] = max_categories - 1
            else:
                result[all_data == i] = freq_vals.index(i)
        return result
def test_weights_with_transposed_x(self, array):
    """Weighted counting also works on a transposed input array."""
    values = array([0, 0, 1, 1, 2, 2, 3, 3]).T
    weights = np.array([1, 2, 0, 0, 1, 1, 0, 1])
    np.testing.assert_equal(bincount(values, weights)[0], [3, 0, 2, 1])
def test_minlength_adds_empty_bins(self, array):
    """`minlength` pads the result with empty bins."""
    values = array([1, 1, 1, 2, 3, 2])
    result = bincount(values, minlength=5)[0]
    np.testing.assert_equal(result, [0, 3, 2, 1, 0])
def test_weights(self, array):
    """Counts are summed weights rather than occurrences when weights given."""
    values = array([0, 0, 1, 1, 2, 2, 3, 3])
    weights = np.array([1, 2, 0, 0, 1, 1, 0, 1])
    np.testing.assert_equal(bincount(values, weights)[0], [3, 0, 2, 1])
def test_minlength_adds_empty_bins(self, array):
    """Passing `minlength` extends the histogram with zero-count bins."""
    data = array([1, 1, 1, 2, 3, 2])
    expected = [0, 3, 2, 1, 0]
    np.testing.assert_equal(bincount(data, minlength=5)[0], expected)
def test_maxval_doesnt_truncate_values_when_too_small(self, array):
    """A `max_val` below the data maximum must not drop any bins."""
    values = array([1, 1, 1, 2, 3, 2])
    result = bincount(values, max_val=1)[0]
    np.testing.assert_equal(result, [0, 3, 2, 1])
def test_maxval_adds_empty_bins(self, array):
    """`max_val` above the data maximum pads the result with empty bins."""
    values = array([1, 1, 1, 2, 3, 2])
    result = bincount(values, max_val=5)[0]
    np.testing.assert_equal(result, [0, 3, 2, 1, 0, 0])
def test_adds_empty_bins(self, array):
    """Gaps between observed values show up as zero-count bins."""
    data = array([0, 1, 3, 5])
    expected = [1, 1, 0, 1, 0, 1]
    np.testing.assert_equal(bincount(data)[0], expected)
def test_count_nans(self, array):
    """NaNs are tallied separately and reported as the second result."""
    data = array([0, 0, 1, 2, np.nan, 2])
    expected = 1
    np.testing.assert_equal(bincount(data)[1], expected)
def test_maxval_adds_empty_bins(self, array):
    """A `max_val` larger than the data maximum yields trailing empty bins."""
    data = array([1, 1, 1, 2, 3, 2])
    expected = [0, 3, 2, 1, 0, 0]
    np.testing.assert_equal(bincount(data, max_val=5)[0], expected)
def test_maxval_doesnt_truncate_values_when_too_small(self, array):
    """Too-small `max_val` is ignored; no observed bins are cut off."""
    data = array([1, 1, 1, 2, 3, 2])
    expected = [0, 3, 2, 1]
    np.testing.assert_equal(bincount(data, max_val=1)[0], expected)
def test_all_nans(self, array):
    """An all-NaN input produces an empty histogram."""
    data = array([np.nan] * 5)
    np.testing.assert_equal(bincount(data)[0], [])
def test_weights_with_nans(self, array):
    """Weights belonging to NaN entries are excluded from the counts."""
    values = array([0, 0, 1, 1, np.nan, 2, np.nan, 3])
    weights = np.array([1, 2, 0, 0, 1, 1, 0, 1])
    np.testing.assert_equal(bincount(values, weights)[0], [3, 0, 1, 1])
def majority(x):
    """Return the index of the most frequent value in `x`.

    Returns NaN for an empty input or when no bins are produced
    (e.g. all values are NaN).
    """
    if x.shape[0] == 0:
        return np.nan
    counts = bincount(x)[0]
    if counts.shape[0]:
        return np.argmax(counts)
    return np.nan
def test_all_nans(self, array):
    """When every value is NaN the histogram has no bins at all."""
    data = array([np.nan] * 5)
    expected = []
    np.testing.assert_equal(bincount(data)[0], expected)
def _categorical_entropy(x):
    """Compute the entropy of a dense/sparse matrix, column-wise.

    Assuming categorical values."""
    counts_per_column = (ut.bincount(column)[0] for column in x.T)
    probabilities = [counts / np.sum(counts) for counts in counts_per_column]
    return np.fromiter(
        (ss.entropy(pk) for pk in probabilities), dtype=np.float64)
def get_column(self, attr, filter_valid=True, merge_infrequent=False,
               return_labels=False):
    """
    Retrieve the data from the given column in the data table

    The method:
    - densifies sparse data,
    - converts arrays with dtype object to floats if the attribute is
      actually primitive,
    - filters out invalid data (if `filter_valid` is `True`),
    - merges infrequent (discrete) values into a single value
      (if `merge_infrequent` is `True`).

    Tha latter feature is used for shapes and labels, where only a set
    number (`MAX`) of different values is shown, and others are merged
    into category 'Other'. In this case, the method may return either
    the data (e.g. color indices, shape indices) or the list of
    retained values, followed by `['Other']`.

    Args:
        attr (:obj:~Orange.data.Variable): the column to extract
        filter_valid (bool): filter out invalid data (default: `True`)
        merge_infrequent (bool): merge infrequent values (default: `False`);
            ignored for non-discrete attributes
        return_labels (bool): return a list of labels instead of data
            (default: `False`)

    Returns:
        (np.ndarray): (valid) data from the column, or a list of labels
    """
    if attr is None:
        return None

    # Only discrete attributes with at least MAX_CATEGORIES values are
    # candidates for merging (MAX_CATEGORIES defined at module level).
    needs_merging = \
        attr.is_discrete \
        and merge_infrequent and len(attr.values) >= MAX_CATEGORIES
    if return_labels and not needs_merging:
        assert attr.is_discrete
        return attr.values

    all_data = self.data.get_column_view(attr)[0]
    # Object dtype may wrap primitive (numeric) values; coerce to float.
    if all_data.dtype == object and attr.is_primitive():
        all_data = all_data.astype(float)
    if filter_valid and self.valid_data is not None:
        all_data = all_data[self.valid_data]
    if not needs_merging:
        return all_data

    # Keep the (MAX_CATEGORIES - 1) most frequent values; everything else
    # is flagged infrequent and merged into a trailing 'Other' category.
    dist = bincount(all_data, max_val=len(attr.values) - 1)[0]
    infrequent = np.zeros(len(attr.values), dtype=bool)
    infrequent[np.argsort(dist)[:-(MAX_CATEGORIES-1)]] = True
    if return_labels:
        return [value for value, infreq in zip(attr.values, infrequent)
                if not infreq] + ["Other"]
    else:
        result = all_data.copy()
        # Frequent values get consecutive new indices; infrequent ones all
        # map to the final index (the 'Other' slot).
        freq_vals = [i for i, f in enumerate(infrequent) if not f]
        for i, infreq in enumerate(infrequent):
            if infreq:
                result[all_data == i] = MAX_CATEGORIES - 1
            else:
                result[all_data == i] = freq_vals.index(i)
        return result
def test_count_nans_objectarray(self):
    """NaNs are counted even when the input array has object dtype."""
    values = np.array([0, 0, 1, 2, np.nan, 2], dtype=object)
    np.testing.assert_equal(bincount(values)[1], 1)