def test_digitize(self):
    """Check `digitize` against `np.digitize` on dense and sparse input.

    Every array in `self.data` is digitized both as-is and after conversion
    to a CSR matrix; both results must equal numpy's result reshaped to the
    shape of the input.
    """
    edges = np.arange(-2, 2)
    for dense in self.data:
        sparse = csr_matrix(dense)
        # Reference: numpy digitizes the flattened values, which are then
        # folded back into the original shape.
        expected = np.digitize(dense.flatten(), edges).reshape(dense.shape)
        cases = (
            (dense, 'Digitize fails on dense data'),
            (sparse, 'Digitize fails on sparse data'),
        )
        for candidate, message in cases:
            np.testing.assert_array_equal(
                expected, digitize(candidate, edges), message)
def test_digitize_1d_array(self):
    """Dense and sparse 1d input must both produce the same result shape."""
    values = np.array([0, 1, 1, 0, np.nan, 0, 1])
    sparse_values = csr_matrix(values)
    edges = np.arange(-2, 2)

    # The numpy reference is wrapped in a list, i.e. the expected result has
    # one extra leading dimension (a single row).
    reference = np.digitize(values.flatten(), edges).reshape(values.shape)
    expected = [reference]

    np.testing.assert_array_equal(
        expected, digitize(values, edges),
        'Digitize fails on 1d dense data')
    np.testing.assert_array_equal(
        expected, digitize(sparse_values, edges),
        'Digitize fails on 1d sparse data')
def test_digitize_right(self):
    """Check `digitize(..., right=True)` against `np.digitize`.

    Every array in `self.data` is checked in its dense form and as a CSR
    matrix; both must match numpy's right-closed result in the input shape.
    """
    edges = np.arange(-2, 2)
    for dense in self.data:
        sparse = csr_matrix(dense)
        flat_reference = np.digitize(dense.flatten(), edges, right=True)
        expected = flat_reference.reshape(dense.shape)
        cases = (
            (dense, 'Digitize fails on dense data'),
            (sparse, 'Digitize fails on sparse data'),
        )
        for candidate, message in cases:
            np.testing.assert_array_equal(
                expected, digitize(candidate, edges, right=True), message)
def _get_colors(self):
    """Build the per-bin color lists for the histogram.

    Returns a list with `self.n_bins` entries, each a list of colors:

    * discrete target: one `QColor` per entry of `target_var.colors`; the
      same inner list object is repeated for every bin,
    * continuous target: a single palette color per bin, picked by the
      mean target value in that bin relative to the maximum of `self.y`,
    * otherwise: a single grey (`#ccc`) color for every bin.
    """
    target = self.target_var

    if target and target.is_discrete:
        return [[QColor(*rgb) for rgb in target.colors]] * self.n_bins

    if not (target and target.is_continuous):
        return [[QColor('#ccc')]] * self.n_bins

    palette = ContinuousPaletteGenerator(*target.colors)
    # Column vector of bin numbers; comparing it with the digitized samples
    # gives one boolean row per bin.
    bin_numbers = np.arange(self.n_bins)[:, np.newaxis]
    if self.attribute.is_discrete:
        edges = self.edges
    else:
        edges = self.edges[1:-1]
    # Digitizing with `right=True` is required here so that samples end up
    # in the correct bin for coloring.
    sample_bins = ut.digitize(self.x, bins=edges, right=True)
    membership = sample_bins == bin_numbers

    colors = []
    for bin_number in range(self.n_bins):
        bin_y = self.y[membership[bin_number]]
        if not np.isfinite(bin_y).any():
            # No finite target values in this bin (e.g. the bin is empty),
            # so the color does not matter.
            relative_mean = 0
        else:
            relative_mean = ut.nanmean(bin_y) / ut.nanmax(self.y)
        colors.append([palette[relative_mean]])
    return colors
def _get_colors(self):
    """Build the per-bin color lists for the histogram.

    Returns a list with `self.n_bins` entries, each a list of colors:

    * discrete target: the first `len(target.values)` colors of the
      target's palette; the same inner list object is repeated per bin,
    * continuous target: a single color per bin, obtained from the target
      palette's `value_to_qcolor` for the mean target value in that bin
      relative to the maximum of `self.y`,
    * otherwise: a single grey (`#ccc`) color for every bin.
    """
    target = self.target_var

    if target and target.is_discrete:
        value_colors = list(target.palette)[:len(target.values)]
        return [value_colors] * self.n_bins

    if not (self.target_var and self.target_var.is_continuous):
        return [[QColor('#ccc')]] * self.n_bins

    palette = self.target_var.palette
    # Column vector of bin numbers; comparing it with the digitized samples
    # gives one boolean row per bin.
    bin_numbers = np.arange(self.n_bins)[:, np.newaxis]
    if self.attribute.is_discrete:
        edges = self.edges
    else:
        edges = self.edges[1:-1]
    sample_bins = ut.digitize(self.x, bins=edges)
    membership = sample_bins == bin_numbers

    colors = []
    for bin_number in range(self.n_bins):
        bin_y = self.y[membership[bin_number]]
        if not np.isfinite(bin_y).any():
            # No finite target values in this bin (e.g. the bin is empty),
            # so the color does not matter.
            relative_mean = 0
        else:
            relative_mean = ut.nanmean(bin_y) / ut.nanmax(self.y)
        colors.append([palette.value_to_qcolor(relative_mean)])
    return colors
def test_digitize_sparse_zeroth_bin(self):
    """Sparse input whose zeros land in bin 0 must give a sparse result."""
    # With a single edge at 1, the zeros of the matrix fall into bin 0.
    rows = [[0, 0, 0, 1, 1, 0, 0, 1, 0],
            [0, 0, 1, 1, 0, 0, 1, 0, 0]]
    sparse_data = csr_matrix(rows)
    edges = np.array([1])

    digitized = digitize(sparse_data, edges)

    # The result is expected to stay sparse.
    self.assertTrue(issparse(digitized))
def _get_colors(self):
    """Return the list of colors to use for each of the `n_bins` bins.

    The outer list has one entry per bin and every entry is itself a list:

    * discrete target: a `QColor` for each item of `target_var.colors`
      (one shared inner list, repeated for all bins),
    * continuous target: one palette color per bin, chosen by the bin's
      mean target value divided by the maximum of `self.y`,
    * no usable target: one grey (`#ccc`) color per bin.
    """
    target_var = self.target_var

    if target_var and target_var.is_discrete:
        class_colors = [QColor(*components) for components in target_var.colors]
        return [class_colors] * self.n_bins

    if not (target_var and target_var.is_continuous):
        return [[QColor('#ccc')]] * self.n_bins

    palette = ContinuousPaletteGenerator(*target_var.colors)
    # One row per bin after broadcasting against the digitized samples.
    bin_column = np.arange(self.n_bins)[:, np.newaxis]
    edges = self.edges if self.attribute.is_discrete else self.edges[1:-1]
    # `right=True` is needed so the samples are assigned to the correct
    # bin for coloring.
    digitized = ut.digitize(self.x, bins=edges, right=True)
    in_bin = digitized == bin_column

    colors = []
    for idx in range(self.n_bins):
        y_in_bin = self.y[in_bin[idx]]
        # A bin without any finite target value (e.g. an empty bin) gets
        # value 0; its color does not matter.
        value = (
            ut.nanmean(y_in_bin) / ut.nanmax(self.y)
            if np.isfinite(y_in_bin).any()
            else 0
        )
        colors.append([palette[value]])
    return colors
def test_digitize_1d_array(self):
    """The result shape for 1d input must not depend on dense vs. sparse."""
    dense = np.array([0, 1, 1, 0, np.nan, 0, 1])
    sparse = csr_matrix(dense)
    bin_edges = np.arange(-2, 2)

    # Expected value: numpy's result for the 1d data, wrapped in a list so
    # that it carries one extra leading dimension.
    expected = [np.digitize(dense.flatten(), bin_edges).reshape(dense.shape)]

    dense_result = digitize(dense, bin_edges)
    np.testing.assert_array_equal(
        expected, dense_result, 'Digitize fails on 1d dense data')

    sparse_result = digitize(sparse, bin_edges)
    np.testing.assert_array_equal(
        expected, sparse_result, 'Digitize fails on 1d sparse data')
def test_digitize_sparse_zeroth_bin(self):
    """`digitize` must return a sparse matrix when zeros map to bin 0."""
    # The only edge is 1, so the zero entries belong to bin 0.
    edges = np.array([1])
    matrix = csr_matrix([
        [0, 0, 0, 1, 1, 0, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0],
    ])
    # Digitizing such data should still yield a sparse matrix.
    result_is_sparse = issparse(digitize(matrix, edges))
    self.assertTrue(result_is_sparse)
def test_digitize_right(self, array):
    """Check `digitize(..., right=True)` for data converted with `array`.

    `array` is a callable applied to each item of `self.data` before it is
    passed to `digitize`; the reference is numpy's right-closed result on
    the unconverted data, reshaped to the shape of the converted input.
    """
    edges = np.arange(-2, 2)
    for source in self.data:
        converted = array(source)
        flat_reference = np.digitize(source.flatten(), edges, right=True)
        expected = flat_reference.reshape(converted.shape)
        np.testing.assert_array_equal(
            expected, digitize(converted, edges, right=True))
def test_digitize_right(self, array):
    """Compare right-closed `digitize` output with `np.digitize`.

    Each item of `self.data` is converted with the `array` callable and
    digitized; numpy's result on the original item, reshaped to the
    converted item's shape, serves as the expected value.
    """
    bin_edges = np.arange(-2, 2)
    for raw in self.data:
        data = array(raw)
        target_shape = data.shape
        expected = np.digitize(raw.flatten(), bin_edges, right=True)
        expected = expected.reshape(target_shape)
        actual = digitize(data, bin_edges, right=True)
        np.testing.assert_array_equal(expected, actual)
def test_digitize_1d_array(self, array):
    """1d input must give the same result shape for every `array` type."""
    source = np.array([0, 1, 1, 0, np.nan, 0, 1])
    converted = array(source)
    edges = np.arange(-2, 2)

    # The numpy reference is wrapped in a list, i.e. the expected result has
    # one extra leading dimension (a single row).
    reference = np.digitize(source.flatten(), edges).reshape(source.shape)
    np.testing.assert_array_equal([reference], digitize(converted, edges))
def test_digitize_1d_array(self, array):
    """The shape returned for 1d data must match across input containers."""
    raw = np.array([0, 1, 1, 0, np.nan, 0, 1])
    data = array(raw)
    bin_edges = np.arange(-2, 2)

    # Expected value: numpy's result for the raw 1d data, placed in a list
    # so that it carries one extra leading dimension.
    expected = [np.digitize(raw.flatten(), bin_edges).reshape(raw.shape)]
    actual = digitize(data, bin_edges)

    np.testing.assert_array_equal(expected, actual)
def _histogram(self):
    """Compute the histogram edges and the distribution within each bin.

    Returns a tuple `(edges, distributions)`, where `edges` comes from
    `_get_histogram_edges` and `distributions` from
    `_get_bin_distributions` applied to the bin index of every sample.
    """
    assert self.x.size > 0, 'Cannot calculate histogram on empty array'

    edges = self._get_histogram_edges()

    if self.attribute.is_discrete:
        # For discrete attributes the values themselves are used as bin
        # indices.
        bin_indices = self.x
        # TODO It probably isn't a very good idea to convert a sparse row
        # to a dense array... Converts sparse to 1d numpy array
        if sp.issparse(bin_indices):
            as_dense = np.asarray(bin_indices.todense(), dtype=np.int64)
            bin_indices = np.squeeze(as_dense)
    elif self.attribute.is_continuous:
        # Only the inner edges are passed to digitize.
        inner_edges = edges[1:-1]
        bin_indices = ut.digitize(self.x, bins=inner_edges).flatten()

    return edges, self._get_bin_distributions(bin_indices)
def _histogram(self):
    """Compute the histogram edges and the distribution within each bin.

    Returns a tuple `(edges, distributions)`, where `edges` comes from
    `_get_histogram_edges` and `distributions` from
    `_get_bin_distributions` applied to the bin index of every sample.
    """
    edges = self._get_histogram_edges()

    if self.attribute.is_discrete:
        # For discrete attributes the values themselves are used as bin
        # indices.
        bin_indices = self.x
        # TODO It probably isn't a very good idea to convert a sparse row
        # to a dense array... Converts sparse to 1d numpy array
        if sp.issparse(bin_indices):
            as_dense = np.asarray(bin_indices.todense(), dtype=np.int64)
            bin_indices = np.squeeze(as_dense)
    elif self.attribute.is_continuous:
        # TODO: Digitize throws nans into first bin. This is incorrect.
        inner_edges = edges[1:-1]
        bin_indices = ut.digitize(self.x, bins=inner_edges).flatten()

    return edges, self._get_bin_distributions(bin_indices)
def _histogram(self):
    """Return the bin edges together with the per-bin distributions.

    The bin index of every sample is determined first (directly from the
    values for a discrete attribute, via `ut.digitize` on the inner edges
    for a continuous one) and then handed to `_get_bin_distributions`.
    """
    assert self.x.size > 0, 'Cannot calculate histogram on empty array'

    histogram_edges = self._get_histogram_edges()

    if self.attribute.is_discrete:
        values = self.x
        # TODO It probably isn't a very good idea to convert a sparse row
        # to a dense array... Converts sparse to 1d numpy array
        if sp.issparse(values):
            values = np.asarray(values.todense(), dtype=np.int64)
            values = np.squeeze(values)
        bin_indices = values
    elif self.attribute.is_continuous:
        digitized = ut.digitize(self.x, bins=histogram_edges[1:-1])
        bin_indices = digitized.flatten()

    distributions = self._get_bin_distributions(bin_indices)
    return histogram_edges, distributions
def _get_colors(self):
    """Compute colors for different kinds of histograms.

    Returns a list with `self.n_bins` entries, each a list of colors:

    * discrete target: one `QColor` per entry of `target_var.colors`; the
      same inner list object is repeated for every bin,
    * continuous target: a single palette color per bin, picked by the
      mean target value in that bin relative to the maximum of `self.y`,
    * otherwise: a single grey (`#ccc`) color for every bin.
    """
    if self.target_var and self.target_var.is_discrete:
        colors = [[QColor(*color) for color in self.target_var.colors]] * self.n_bins
    elif self.target_var and self.target_var.is_continuous:
        palette = ContinuousPaletteGenerator(*self.target_var.colors)
        # Column vector of bin numbers; comparing it with the digitized
        # samples gives one boolean row per bin.
        bins = np.arange(self.n_bins)[:, np.newaxis]
        edges = self.edges if self.attribute.is_discrete else self.edges[1:-1]
        bin_indices = ut.digitize(self.x, bins=edges)
        mask = bin_indices == bins

        colors = []
        for bin_idx in range(self.n_bins):
            biny = self.y[mask[bin_idx]]
            if np.isfinite(biny).any():
                # Use the NaN-aware maximum: a plain `max()` is NaN as soon
                # as `y` has a single missing value, which would make every
                # palette index NaN.
                mean = ut.nanmean(biny, axis=0) / ut.nanmax(self.y)
            else:
                # No finite target values in this bin (e.g. the bin is
                # empty): the mean would be NaN, and the color does not
                # matter anyway.
                mean = 0
            colors.append([palette[mean]])
    else:
        colors = [[QColor('#ccc')]] * self.n_bins
    return colors